diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 0000000..b3f4d6c --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,37 @@ +name: Python Tests + +on: [pull_request] + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + # Checkout the latest commit associated with the PR + - uses: actions/checkout@v4 + + - name: Debug matrix value + run: echo "Python version is ${{ matrix.python-version }}" + + # Set up Miniconda + - name: Set up Miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + auto-update-conda: true # Optional: update Conda to the latest version + python-version: ${{ matrix.python-version }} + + # Install the package itself plus the "tests" extra declared in pyproject.toml + - name: Install additional dependencies + run: | + pip install '.[tests]' # Install all dependencies, including test-specific ones + shell: bash -l {0} + + # Run pytest on the specified directory + - name: Run tests + run: | + pytest tests + shell: bash -l {0} \ No newline at end of file diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml new file mode 100644 index 0000000..e947faa --- /dev/null +++ b/.github/workflows/ruff.yml @@ -0,0 +1,23 @@ +name: Ruff Formatting +on: [pull_request] +jobs: + ruff: + if: ${{ github.actor != 'dependabot[bot]' }} # Do not run on commits created by dependabot + runs-on: ubuntu-latest + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the changed files. 
+ contents: write # Allows reading and writing repository contents (e.g., commits) + pull-requests: write # Allows reading and writing pull requests + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} # Check out the PR branch itself (not the detached merge commit) so the auto-commit step can push to it + token: ${{ secrets.GITHUB_TOKEN }} + - uses: chartboost/ruff-action@v1 + with: + src: './src/flexidot' + args: 'format --target-version py310' + - uses: stefanzweifel/git-auto-commit-action@v5 + id: auto-commit-action + with: + commit_message: 'Style fixes by Ruff' \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eeb5ed3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ +# Mac stuff +.DS_Store + +# Versioning +src/flexidot/_version.py + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# mypy +.mypy_cache/ diff --git a/code/README.md b/CHANGELOG.md similarity index 79% rename from code/README.md rename to CHANGELOG.md index 94489a1..315e914 100644 --- a/code/README.md +++ b/CHANGELOG.md @@ -1,6 +1,39 @@ # FlexiDot version changes -![alt text](https://github.com/molbio-dresden/flexidot/blob/master/images/Selfdotplots_banner4.png "FlexiDot self dotplots") +![alt text](https://github.com/molbio-dresden/flexidot/blob/master/docs/images/Selfdotplots_banner4.png "FlexiDot self dotplots") + +## Version 2.0.0 +*Jan 2025* + +This release is a major refactor of the Flexidot codebase that migrates to Python 3 and a modern package structure. + +**[Faster run time]:** +- When comparing a sequence to an identical sequence `find_match_pos_diag()` will recycle kmer counts from the first seq. Saves 33% runtime. + +**[New features]:** + +- Flexidot and its dependencies are now pip installable - uses Hatch and pyproject.toml +- Versioning is now managed dynamically using git tags +- Some basic tests for the core `find_match_pos_diag()` function have been added +- cmd line options are now managed with argparse +- Repo includes env yaml to set up conda env for flexidot +- Check that input files exist +- Auto cleanup temp files +- Add action to run pytests +- Add action to format code with Ruff +- Uses logging module to manage status logging (removed time logging) + +**[Changed defaults]:** +- Several cmd line options have been renamed or have changes to their expected input formatting. See --help. +- If not using the `--wobble_conversion` option then kmers containing any Ns will be skipped by default. 
+- If `--wobble_conversion` is set then `--max_n` determines the max percentage of Ns that will be tolerated in a kmer. Default changed to 10% from hard-coded 49%. + +**[Bugfixes]:** +- Fix deprecation issue with numpy creating an ndarray from ragged nested sequences in `find_match_pos_diag()`. Closes issue #15 +- Read files with r instead of rb +- Fix unicode issue referenced in #10 + +
## Version 1.06 *14.04.2019* diff --git a/README.md b/README.md index 9c2b99f..f82be90 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,36 @@ # FlexiDot: Highly customizable, ambiguity-aware dotplots for visual sequence analyses -![alt text](https://github.com/molbio-dresden/flexidot/blob/master/images/Selfdotplots_banner4.png "FlexiDot self dotplots") +![alt text](https://github.com/molbio-dresden/flexidot/blob/master/docs/images/Selfdotplots_banner4.png "FlexiDot self dotplots") FlexiDot is a cross-platform dotplot suite generating high quality self, pairwise and all-against-all visualizations. To improve dotplot suitability for comparison of consensus and error-prone sequences, FlexiDot harbors routines for strict and relaxed handling of mismatches and ambiguous residues. The custom shading modules facilitate dotplot interpretation and motif identification by adding information on sequence annotations and sequence similarities to the images. Combined with collage-like outputs, FlexiDot supports simultaneous visual screening of a large sequence sets, allowing dotplot use for routine screening. - ## Citation If you use FlexiDot in your research, please cite us: -**Kathrin M. Seibt, Thomas Schmidt, and Tony Heitkam** (2018) "FlexiDot: Highly customizable, ambiguity-aware dotplots for visual sequence analyses". *Bioinformatics* 34 (20), 3575–3577, doi: 10.1093/bioinformatics/bty395 - [**Read article**](https://doi.org/10.1093/bioinformatics/bty395) - [**Preprint**](https://github.com/molbio-dresden/flexidot/blob/master/documentation/FlexiDot__Highly_customizable_ambiguity_aware_dotplots__preprint.pdf) - +**Kathrin M. Seibt, Thomas Schmidt, and Tony Heitkam** (2018) "FlexiDot: Highly customizable, ambiguity-aware dotplots for visual sequence analyses". 
*Bioinformatics* 34 (20), 3575–3577, doi: 10.1093/bioinformatics/bty395 - [**Read article**](https://doi.org/10.1093/bioinformatics/bty395) - [**Preprint**](https://github.com/molbio-dresden/flexidot/blob/master/documentation/FlexiDot__Highly_customizable_ambiguity_aware_dotplots__preprint.pdf) ## FlexiDot versions and updates - - -*We are currently working on a new version, including the long requested Python3 support. Please stay tuned.* + -**Current version (14.04.2019): [FlexiDot v1.06](https://github.com/molbio-dresden/flexidot/blob/master/code/flexidot_v1.06.py)** +**Current version (Jan 2025): FlexiDot v2.0.0** -For an overview of FlexiDot version updates please see the [code history](https://github.com/molbio-dresden/flexidot/blob/master/code/README.md). - -Older versions can be accessed in the [code directory](https://github.com/molbio-dresden/flexidot/tree/master/code). Corresponding [parameter cheat sheets](https://github.com/molbio-dresden/flexidot/tree/master/documentation) are available as well. +For an overview of FlexiDot version updates please see the [code history](https://github.com/molbio-dresden/flexidot/blob/master/CHANGELOG.md). +Corresponding [parameter cheat sheets](https://github.com/molbio-dresden/flexidot/tree/master/documentation) are available as well. ## Documentation -* [in depth documentation](https://github.com/molbio-dresden/flexidot/blob/master/documentation/SupplementaryData.pdf) (This readme gives an overview, and more detail is in the documentation.) +* [in depth documentation](https://github.com/molbio-dresden/flexidot/blob/master/documentation/SupplementaryData.pdf) (This readme gives an overview, and more detail is in the documentation.) 
* [parameter cheat sheet](https://github.com/molbio-dresden/flexidot/blob/master/documentation/usage_v1.06.pdf) * [artificial test sequences used for the examples](https://github.com/molbio-dresden/flexidot/tree/master/test-data) * [example: adding annotation-based shading to a dotplot](https://github.com/molbio-dresden/flexidot/blob/master/documentation/tutorial_add_annotation.md) * [presentation slides introducing dotplots and our FlexiDot tool](https://zenodo.org/record/2558556) - ## Implementation -FlexiDot is implemented in [Python 2.7](https://www.python.org/), using +FlexiDot is implemented in [Python 3](https://www.python.org/), with dependencies: * [numpy](https://pypi.python.org/pypi/numpy) * [matplotlib](https://pypi.python.org/pypi/matplotlib) @@ -46,122 +40,139 @@ FlexiDot is implemented in [Python 2.7](https://www.python.org/), using * [easydev](https://pypi.python.org/pypi/easydev) (required for colormap) * [colour](https://pypi.python.org/pypi/colour) -Upon **first starting FlexiDot**, the program calls all needed modules. If absent, it installs them automatically using Python’s install manager pip. If this fails, please try again with **administrator** privileges. +You can create a Conda environment with these dependencies using the YAML file in this repo. -Please note, that the dependency **Biopython** requires a C compiler. In case of errors during Biopython installation, installing Microsoft Visual C++ Compiler (Windows), GCC (Linux) or Apple’s XCode suite (Mac OS) may help. +```bash +conda env create -f environment.yml +conda activate flexidot +``` -## Use FlexiDot - -Download the [FlexiDot script](https://github.com/molbio-dresden/flexidot/blob/master/code/flexidot_v1.06.py). With a right click on the field `Raw` you can download the script easily via `Save as`. +After activating the flexidot environment you can use pip to install the latest version of Flexidot. 
-To run FlexiDot, [**Python 2.7**](https://www.python.org/download/releases/2.7/) must be installed on the machine. -FlexiDot is started via **command line** in the console. For a brief introduction to the command line interface, check out this nice [tutorial](https://tutorial.djangogirls.org/en/intro_to_command_line/). +## Installing Flexidot -In brief, the console can be started the following way: +Installation options: -* **Windows** - * start console: WINDOWS key + type `CMD` + ENTER (Shift + ENTER starts console as administrator) - * prepare directory - * select directory and add python script "flexidot.py" and sequence files - * copy userpath from address bar (e.g.: C:\Users\Documents\Test) - * navigate to directory in console: type `cd userpath` + ENTER (paste userpath using right click) - * start Flexidot with the command below (with your specific fasta file name) -* **Linux/MacOS** - * start console: Applications → Utilities [Linux] or Accessories [MacOS] → Terminal - * prepare directory (see above, e.g. /Users/Documents/Test) - * navigate to directory in console: type `cd userpath` + ENTER (paste userpath using right click) - * start Flexidot with the command below (with your specific fasta file name) - -The general FlexiDot command depends on whether one or multiple fasta files are used as input via: +pip install the latest development version directly from this repo. +```bash +% pip install git+https://github.com/molbio-dresden/flexidot.git ``` -# use individual fasta file (can contain multiple sequences) -python flexidot.py -i input.fas [optional arguments] -# use multiple fasta files -python flexidot.py -i input1.fas,input2.fas [optional arguments] +Flexidot is not currently available via PyPi or Bioconda. Watch this space. -# use all fasta files in current directory -python flexidot.py -a [optional arguments] -``` +## Use FlexiDot + +Flexidot accepts one or more uncompressed fasta files as input. The files can contain multiple sequences. 
+ +```bash +# Use individual fasta file (can contain multiple sequences) +flexidot -i input.fasta [optional arguments] -Optional arguments are explained below and in detail in the [**usage**](https://github.com/molbio-dresden/flexidot/blob/master/documentation/usage_v1.06.pdf). Importantly, `-k` defines the word size (e.g. `-k 10`) and `-t` specifies the sequence type (`-t y` for DNA [default]; `-t n` for proteins). The plotting mode is chosen via `-p` and described below. +# Use multiple fasta files +flexidot -i input1.fasta input2.fasta [optional arguments] + +# Use all fasta files in current directory +flexidot -i *.fasta [optional arguments] +``` +Optional arguments are explained below and in detail with the `--help` option. +Importantly, `-k` defines the word size (e.g. `-k 10`) and `-t` specifies the sequence type (`-t nuc` for DNA [default]; `-t aa` for proteins). The plotting mode is chosen via `-m` and described below. ## Plotting modes -FlexiDot allows sequence investigation in three run modes via the option `-p/--plotting_mode`: +FlexiDot allows sequence investigation in three run modes via the option `-m/--mode`: -`-p 0` self sequence comparison -`-p 1` pairwise sequence comparison -`-p 2` all-to-all sequence comparison +`-m 0` self sequence comparison +`-m 1` pairwise sequence comparison +`-m 2` all-to-all sequence comparison +To run multiple plotting modes, call the option multiple times i.e. `-m 0 -m 1 -m 2`. ### Self dotplots -with `-p/--plotting_mode 0` +with `-m/--mode 0` -In **self** dotplot mode, each sequence is compared with itself. The resulting dotplots can be combined to form a **collage** [default] or written to separate files. +In **self** dotplot mode, each sequence is compared with itself. The resulting dotplots can be combined to form a **collage** (with `--collage`) or written to separate files. 
-![alt text](https://github.com/molbio-dresden/flexidot/blob/master/images/Selfdotplots_banner.png "FlexiDot self dotplots") +![alt text](https://github.com/molbio-dresden/flexidot/blob/master/docs/images/Selfdotplots_banner.png "FlexiDot self dotplots") -``` -python flexidot.py -i test-seqs.fas -p 0 -D y -f 1 -k 10 -w y -r y -x n -m 6 -P 15 -g example.gff3 -G gff_color.config -``` +```bash +# A single sequence compared to itself +flexidot -i Seq2.fasta -m 0 -k 10 -P 15 +# Single sequence with annotations +flexidot -i Seq2.fasta -m 0 -k 10 -P 15 -g example.gff3 -G gff_color.config + +# Collage of 6 sequences each compared to themselves with Seq2 annotated (shown above) +flexidot -i test-seqs.fasta -m 0 -k 10 --n_col 6 -P 15 -g example.gff3 -G gff_color.config --collage +``` ### Pairwise comparisons -with `-p/--plotting_mode 1` +with `-m/--mode 1` -For **pairwise** dotplots, the collage output is recommended for larger numbers of sequences. The collage output of the 15 pairwise dotplots for the test sequences is shown below. By default, dotplot images are in square format (panel A). This maximizes the visibility of matches, if the compared sequences differ drastically in length. To enable scaling according to the respective sequence lengths, the FlexiDot scaling feature is callable via option `-L/--length_scaling` (panel B). If scaling is enabled, a red line indicates the end of the shorter sequence in the collage output. +For **pairwise** dotplots, the collage output is recommended for larger numbers of sequences. The collage output of the 15 pairwise dotplots for the test sequences is shown below. By default, dotplot images are in square format (panel A). This maximizes the visibility of matches, if the compared sequences differ drastically in length. To enable scaling according to the respective sequence lengths, the FlexiDot scaling feature is callable via option `-L/--length_scaling` (panel B). 
If scaling is enabled, a red line indicates the end of the shorter sequence in the collage output. - +Pairwise comparisons can be limited to only pairs that contain the first sequence in a fasta file using `--only_vs_first_seq`. -``` -Panel A$ python flexidot.py -i test-seqs.fas -p 1 -D y -f 0 -k 10 -w y -r y -m 5 -c y -L n -Panel B$ python flexidot.py -i test-seqs.fas -p 1 -D y -f 0 -k 10 -w y -r y -m 5 -c y -L y -``` + +```bash +# Panel A +flexidot -i test-seqs.fasta -m 1 -k 10 --n_col 3 -c +# Panel B (with length scaling) +flexidot -i test-seqs.fasta -m 1 -k 10 --n_col 3 -c -L +``` ### All-against-all comparisons -with `-p/--plotting_mode 2` +with `-m/--mode 2` -In **all-against-all** mode, FlexiDot compares each pair from a set of input sequences. To enable the identification of long shared subsequences at a glance, FlexiDot offers similarity shading (switched on/off via option `-x/--lcs_shading`) based on the LCS length in all-against-all comparisons (see below). +In **all-against-all** mode, FlexiDot compares each pair from a set of input sequences. To enable the identification of long shared subsequences at a glance, FlexiDot offers similarity shading (switched on/off via option `-x/--lcs_shading`) based on the LCS length in all-against-all comparisons (see below). - + +```bash +# All-by-all plot, LCS shading using maximal LCS length +# -y/--lcs_shading_ref: 0 = maximal LCS length +# -x/--lcs_shading +flexidot -i test-seqs.fasta -m 2 -k 10 -y 0 -x ``` -python flexidot.py -i test-seqs.fas -p 2 -D y -f 0 -t y -k 10 -w y -r y -x y -y 0 -``` - ## Major features ### Mismatch and ambiguity handling -In diverged or distantly related sequences matches may be interrupted by mismatches or residues might be represented as ambiguities to refer to frequent variants or mutations. Similarly, relaxed matching is helpful when analyzing error-prone sequences like SMRT reads. 
The achieved relaxation of the matching conditions thus increases sensitivity, while decreasing specificity. +In diverged or distantly related sequences matches may be interrupted by mismatches, or residues might be represented as ambiguities to refer to frequent variants or mutations. Similarly, relaxed matching is helpful when analyzing error-prone sequences like SMRT reads. Relaxation of the matching conditions thus increases sensitivity, while decreasing specificity. -Firstly, FlexiDot handles **ambiguous residues**, often found in consensus sequences. This allows the comparison of species-specific representations of multigene or repeat families as well as common variants or sequence subfamilies. The ambiguity handling is controlled via`-w/--wobble_conversion Y/N`. +Firstly, FlexiDot handles **ambiguous residues**, often found in consensus sequences. This allows the comparison of species-specific representations of multigene or repeat families as well as common variants or sequence subfamilies. The ambiguity handling is controlled via`-w/--wobble_conversion`. -Secondly, a defined number of **mismatches** within the window can be allowed with `-S/--substitution_count [number of allowed mismatches (substitutions)]`. This is even less stringent than the ambiguity handling. Please note, that only substitution mutations are allowed but not indels. +Secondly, a defined number of **mismatches** within the window can be allowed with `-S/--substitution_count [number of allowed mismatches (substitutions)]`. This is even less stringent than the ambiguity handling. Please note, that only substitution mutations are allowed but not indels. Lastly, both mismatch and ambiguity handling can be combined for the analysis. 
- + -``` -Panel tl$ python flexidot.py -i Seq4.fas,Seq1.fas -p 1 -D n -f 0 -c n -k 10 -w n -r y -x n -Panel tm$ python flexidot.py -i Seq4.fas,Seq1.fas -p 1 -D n -f 0 -c n -k 10 -w n -r y -x n -S 1 -Panel tr$ python flexidot.py -i Seq4.fas,Seq1.fas -p 1 -D n -f 0 -c n -k 10 -w n -r y -x n -S 2 -Panel bl$ python flexidot.py -i Seq4.fas,Seq1.fas -p 1 -D n -f 0 -c n -k 10 -w y -r y -x n -Panel bm$ python flexidot.py -i Seq4.fas,Seq1.fas -p 1 -D n -f 0 -c n -k 10 -w y -r y -x n -S 1 -Panel br$ python flexidot.py -i Seq4.fas,Seq1.fas -p 1 -D n -f 0 -c n -k 10 -w y -r y -x n -S 2 -``` +```bash +# Mismatch tolerance -S +#Panel tl +flexidot -i Seq1.fasta Seq4.fasta -m 1 -k 10 +#Panel tm +flexidot -i Seq1.fasta Seq4.fasta -m 1 -k 10 -S 1 +#Panel tr +flexidot -i Seq1.fasta Seq4.fasta -m 1 -k 10 -S 2 +# Wobble -w (tolerate ambiguities) +#Panel bl +flexidot -i Seq1.fasta Seq4.fasta -m 1 -k 10 -w +#Panel bm +flexidot -i Seq1.fasta Seq4.fasta -m 1 -k 10 -w -S 1 +#Panel br +flexidot -i Seq1.fasta Seq4.fasta -m 1 -k 10 -w -S 2 +``` ### Annotation-based shading @@ -171,62 +182,71 @@ In FlexiDot self dotplots, annotated sequence regions can be highlighted by **sh If you wish to find out more on the gff3 file format used here, Ensembl provides a [good overview](https://www.ensembl.org/info/website/upload/gff3.html). - + -``` -python flexidot.py -i Seq2.fas -p 0 -D y -f 0 -k 10 -w y -r y -x n -m 12 -P 5 -g example.gff3 -G gff_color.config +```bash +flexidot -i Seq2.fasta -m 0 -k 10 -w -P 5 -g example.gff3 -G gff_color.config ``` ### [since FlexiDot_v1.03] Annotation-based shading also available for all-against-all dotplots Previously only available for self dotplots, we added annotation-based shading to all-against-all dotplots, allowing for many new visualizations. As before, annotation information is provided as general feature file (GFF3). These features are added to the middle diagonal (see our example below). 
- + Basic command: -``` -python flexidot.py -i test-seqs.fas -g example2.gff3 -G gff_color.config -p 2 + +```bash +flexidot -i test-seqs.fasta -g example2.gff3 -G gff_color.config -m 2 ``` Command plus aesthetics as shown here (+ LCS shading, wordsize 10, change of subplot spacing and line width): -``` -python flexidot.py -i test-seqs.fas -g example2.gff3 -G gff_color.config -p 2 -x y -k 10 -F 0.06 -A 1.5 + +```bash +flexidot -i test-seqs.fasta -g example2.gff3 -G gff_color.config -m 2 -x -k 10 -F 0.06 -A 1.5 ``` -The test files used here are provided: -* [test-seqs.fas](https://github.com/molbio-dresden/flexidot/blob/master/test-data/test-seqs.fas) -* [example2.gff3](https://github.com/molbio-dresden/flexidot/blob/master/test-data/example2.gff3) -* [gff_color.config](https://github.com/molbio-dresden/flexidot/blob/master/test-data/gff_color.config) +The test files used here are [provided](https://github.com/molbio-dresden/flexidot/tree/master/tests/test-data): +* [test-seqs.fasta](https://github.com/molbio-dresden/flexidot/blob/master/tests/test-data/test-seqs.fasta) +* [example2.gff3](https://github.com/molbio-dresden/flexidot/blob/master/tests/test-data/example2.gff3) +* [gff_color.config](https://github.com/molbio-dresden/flexidot/blob/master/tests/test-data/gff_color.config) ### Similarity shading In all-against-all mode, FlexiDot compares each pair from a set of input sequences. To enable the identification of long shared subsequences at a glance, FlexiDot offers similarity shading (switched on/off via option `-x/--lcs_shading`) based on the **LCS length** (longest common subsequence, or longest match if mismatches are considered) in all-against-all comparisons. Longer matches are represented by darker background shading. A separate shading **legend** output file is created written according to mathematical interval notation, where interval boundaries are represented by a pair of numbers. 
Consequently, the symbols “(” or “)” represent exclusion, whereas “[” or “]” represent inclusion of the respective number. FlexiDot similarity shading is highly customizable with the following parameters, explained in depth in the documentation: + * Reference for shading (option `-y/--lcs_shading_ref`) * Number of shading intervals (option `-X/--lcs_shading_num`) * Shading based on sequence orientation (option `-z/--lcs_shading_ori`) Shading examples based on sequence orientation (forward, panel A; reverse, panel B; both, panel C) are shown: -![alt text](https://github.com/molbio-dresden/flexidot/blob/master/images/all_against_all_shaded_orientation2.png "FlexiDot shaded dotplots") +![alt text](https://github.com/molbio-dresden/flexidot/blob/master/docs/images/all_against_all_shaded_orientation2.png "FlexiDot shaded dotplots") -``` -Panel A$ python flexidot.py -i test-seqs.fas -p 2 -D y -f 0 -t y -k 10 -w n -r y -x y -y 0 -z 0 -Panel B$ python flexidot.py -i test-seqs.fas -p 2 -D y -f 0 -t y -k 10 -w n -r y -x y -y 0 -z 1 -Panel C$ python flexidot.py -i test-seqs.fas -p 2 -D y -f 0 -t y -k 10 -w n -r y -x y -y 0 -z 2 +```bash +#Panel A - lcs_shading_ori: 0 = forward +flexidot -i test-seqs.fasta -m 2 -k 10 -x -y 0 -z 0 +#Panel B - lcs_shading_ori: 1 = reverse +flexidot -i test-seqs.fasta -m 2 -k 10 -x -y 0 -z 1 +#Panel C - lcs_shading_ori: 2 = both +flexidot -i test-seqs.fasta -m 2 -k 10 -x -y 0 -z 2 ``` ### Custom matrix shading -When comparing related sequences, multiple sequence alignments are frequently applied. The resulting pairwise **sequence similarities** can be integrated in the FlexiDot images by providing a **matrix file** via `-u/--input_user_matrix_file `. This allows a shading of the upper right triangle according to the matrix (here orange). With `-U/--user_matrix_print y` the matrix values can be printed into the respective fields. Besides, also **text** information can be provided in the matrix, but then shading is suppressed. 
+When comparing related sequences, multiple sequence alignments are frequently applied. The resulting pairwise **sequence similarities** can be integrated in the FlexiDot images by providing a **matrix file** via `-u/--user_matrix_file `. This allows a shading of the upper right triangle according to the matrix (here orange). With `-U/--user_matrix_print` the matrix values can be printed into the respective fields. Besides, also **text** information can be provided in the matrix, but then shading is suppressed. -In the example, LCS and matrix shading are combined to visualize the relationships between different members of a repeat family. +In the example, LCS and matrix shading are combined to visualize the relationships between different members of a repeat family. - + -``` -python flexidot.py -i Beetle.fas -p 2 -x y -k 10 -S 1 -r n -u custom_matrix.txt -U y -``` +```bash +# Beetle TE plot +flexidot -i Beetle.fas -m 2 -k 10 -S 1 -r -x -u custom_matrix.txt -U +# Example with test dataset +flexidot -i test-seqs.fasta -m 2 -k 10 -S 1 -x -u custom_matrix.txt -U +``` diff --git a/archive/args.py b/archive/args.py new file mode 100644 index 0000000..d6d93d0 --- /dev/null +++ b/archive/args.py @@ -0,0 +1,1048 @@ +# Old arg handling from flexidot.py +def usage(): + """ + usage and help + """ + + print("""\n\n FLEXIDOT + ------------------------------------------------------------------- + + Version: + 1.06 + + Citation: + Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (2018) + "FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation" + Bioinformatics 34 (20), 3575–3577, doi: 10.1093/bioinformatics/bty395 + + + General usage: + $ python flexidot.py -a [ARGUMENTS] + $ python flexidot.py -i [ARGUMENTS] + + + ARGUMENTS + ------------------------------------------------------------------- + + + INPUT/OUTPUT OPTIONS... 
required are [-a] OR [-i] + + -a, --auto_fas Imports all fasta files from current directory (*.fasta, *.fas, *.fa, *.fna) + -i is not needed, if -a is activated + [inactive by default] + + -i, --in_file Input fasta file (fasta file name or comma-separated file list) + > Provide multiple files: Recall -i or provide comma-separated file names + + -o, --output_file_prefix File prefix to be added to the generated filenames [default = NONE] + + -c, --collage_output Multiple dotplots are combined in a collage + Y or 1 = ON [default] + N or 0 = OFF + + -m, --m_col Number of columns per page [default = 4] (only if --collage_output is ON) + + -n, --n_row Number of rows per page [default = 5] (only if --collage_output is ON) + + -f, --filetype Output file format + 0 = PNG [default] + 1 = PDF + 2 = SVG + + -s, --alphabetic_sorting Sort sequences alphabetically according to titles + Y or 1 = ON + N or 0 = OFF [default] + + + CALCULATION PARAMETERS... + + -k, --wordsize Wordsize (kmer length) for dotplot comparison [default = 10] + + -p, --plotting_mode Mode of FlexiDot dotplotting + 0 = self [default] + 1 = paired + 2 = poly (matrix with all-against-all dotplots) + > Run multiple plotting modes: Recall -p or provide comma-separated numbers + + -t, --type_nuc Type of residue is nucleotide + Y or 1 = nucleotide [default] + N or 0 = amino acid + + -w, --wobble_conversion Ambiguity handling for relaxed matching + Y or 1 = ON + N or 0 = OFF [default] + + -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching + [default = 0] + + -r, --rc_option Find reverse complementary matches (only if type_nuc=y) + Y or 1 = ON [default] + N or 0 = OFF + + -O, --only_vs_first_seq Limit pairwise comparisons to match all sequences to 1st sequence only + (only if --plotting_mode=1) + Y or 1 = ON + N or 0 = OFF [default] + + GRAPHIC FORMATTING... 
+ + -A, --line_width Line width [default = 1] + + -B, --line_col_for Line color [default = black] + + -C, --line_col_rev Reverse line color [default = green] + + -D, --x_label_pos Position of the X-label + Y or 1 = top [default] + N or 0 = bottom + + -E, --label_size Font size [default = 10] + + -F, --spacing Spacing between all-against-all dotplots (only if --plotting_mode=2) + [default = 0.04] + + -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) + Y or 1 = Scaling ON (axes scaled according to sequence length) + N or 0 = Scaling OFF (squared plots) [default] + + -M, --mirror_y_axis Flip y-axis bottom to top (cartesian coordinate system) + Y or 1 = y-axis bottom to top + N or 0 = y-axis top to bottom [default] + + -P, --plot_size Plotsize [default = 10] + + -R, --representation Region of plot to display (only if --plotting_mode=2) + 0 = full [default] + 1 = upper + 2 = lower + + -T, --title_length Limit title length for dotplot comparisons + [default = 20] + Position of selection can be specified by appending a letter (e.g. -T 20E) + B = beginning [default] + E = end + + + GFF SHADING (for -p/--plotting_mode=0 only)... + + -g, --input_gff_files GFF3 file used for markup in self-dotplots + (provide multiple files: Recall -g or provide comma-separated file names) + + -G, --gff_color_config_file Tab-delimited config file for custom gff shading + column 1: feature type + column 2: color + column 3: alpha + column 4: zoom factor (for small regions) + + + LCS SHADING OPTIONS (for -p/--plotting_mode=2 only)... 
+ + -x, --lcs_shading Shade subdotplot based on the length of the longest common substring (LCS) + Y or 1 = ON + N or 0 = OFF [default] + + -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) + [default = 5] + + -y, --lcs_shading_ref Reference for LCS shading + 0 = maximal LCS length [default] + 1 = maximally possible length (length of shorter sequence in pairwise comparison) + 2 = given interval sizes - DNA [default 100 bp] or proteins [default 10 aa] - see -Y + + -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) + [default for nucleotides = 50; default for amino acids = 10] + + -z, --lcs_shading_ori Shade subdotplots according to LCS on + 0 = forward [default], + 1 = reverse, or + 2 = both strands (forward shading above diagonal, reverse shading on diagonal and below; + if using --input_user_matrix_file, best LCS is used below diagonal) + + + CUSTOM USER MATRIX SHADING OPTIONS (for -p/--plotting_mode=2 only)... + + -u, --input_user_matrix_file Shading above diagonal according to values in matrix file specified by the user + (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n + e.g. identity matrix from multiple sequence alignment - strings are ignored) + + -U, --user_matrix_print Display provided matrix entries in the fields above diagonal of all-against-all dotplot + Y or 1 = ON + N or 0 = OFF [default] + + + OTHERS... 
# (short flag, long name, takes a value) for every supported command-line option.
# The getopt short/long specifications are derived from this table so that the
# ":" / "=" markers can never get out of sync with each other.
_OPTION_SPECS = (
    ("a", "auto_fas", False),
    ("i", "input_fasta", True),
    ("o", "output_file_prefix", True),
    ("c", "collage_output", True),
    ("m", "m_col", True),
    ("n", "n_row", True),
    ("f", "filetype", True),
    ("t", "type_nuc", True),
    ("g", "input_gff_files", True),
    ("G", "gff_color_config_file", True),
    ("k", "wordsize", True),
    ("p", "plotting_mode", True),
    ("w", "wobble_conversion", True),
    ("S", "substitution_count", True),
    ("r", "rc_option", True),
    ("O", "only_vs_first_seq", True),
    ("s", "alphabetic_sorting", True),
    ("x", "lcs_shading", True),
    ("X", "lcs_shading_num", True),
    ("y", "lcs_shading_ref", True),
    ("Y", "lcs_shading_interval_len", True),
    ("z", "lcs_shading_ori", True),
    ("u", "input_user_matrix_file", True),
    ("U", "user_matrix_print", True),
    ("P", "plot_size", True),
    ("A", "line_width", True),
    ("B", "line_col_for", True),
    ("C", "line_col_rev", True),
    ("D", "x_label_pos", True),
    ("E", "label_size", True),
    ("F", "spacing", True),
    ("L", "length_scaling", True),
    ("M", "mirror_y_axis", True),
    ("R", "representation", True),
    ("T", "title_length", True),
    ("h", "help", False),
    ("v", "verbose", False),
)

# Options whose value is interpreted by check_bools().
_BOOL_OPTIONS = frozenset(
    (
        "collage_output",
        "type_nuc",
        "wobble_conversion",
        "rc_option",
        "only_vs_first_seq",
        "alphabetic_sorting",
        "lcs_shading",
        "user_matrix_print",
        "x_label_pos",
        "length_scaling",
        "mirror_y_axis",
    )
)

# Plain numeric options and the constructor used to convert their value.
_NUMBER_OPTIONS = {
    "m_col": int,
    "n_row": int,
    "wordsize": int,
    "substitution_count": int,
    "lcs_shading_interval_len": int,
    "plot_size": float,
    "line_width": float,
    "label_size": float,
    "spacing": float,
}

# Options restricted to the integers 0, 1 or 2, with the flag named in messages.
_CHOICE_FLAGS = {
    "filetype": "-f",
    "lcs_shading_ref": "-y",
    "lcs_shading_ori": "-z",
    "representation": "-R",
}

_COLOR_OPTIONS = ("line_col_for", "line_col_rev")

_FILETYPES = {0: "png", 1: "pdf", 2: "svg"}
_PLOTTING_MODE_NAMES = {0: "self", 1: "paired", 2: "all-against-all"}
_LCS_SHADING_REF_NAMES = {
    0: "maximal LCS length",
    1: "maximally possible length",
    2: "given interval sizes",
}
_LCS_SHADING_ORI_NAMES = {0: "forward", 1: "reverse complement", 2: "both"}
_REPRESENTATION_NAMES = {0: "full", 1: "upper", 2: "lower"}

# Width of the label column in the settings summary; equals the 51-space
# continuation indent used for the plotting-mode names.
_LABEL_WIDTH = 51


def _default_settings():
    """Return the default settings, keyed in the order check_input() returns them."""
    return {
        "auto_fas": False,
        "input_fasta": [],
        "output_file_prefix": None,
        "collage_output": True,
        "m_col": 4,
        "n_row": 5,
        "filetype": 0,
        "type_nuc": True,
        "input_gff_files": [],
        "gff_color_config_file": "",
        "wordsize": 10,
        "plotting_modes": [0],
        "wobble_conversion": False,
        "substitution_count": 0,
        "rc_option": True,
        "alphabetic_sorting": False,
        "only_vs_first_seq": False,
        "lcs_shading": False,
        # stored as "number of intervals - 1": -X subtracts 1, the summary adds 1
        "lcs_shading_num": 4,
        "lcs_shading_ref": 0,
        # nucleotide default; replaced by 10 for amino acids unless -Y was given
        "lcs_shading_interval_len": 50,
        "lcs_shading_ori": 0,
        "input_user_matrix_file": "",
        "user_matrix_print": False,
        "plot_size": 10,
        "line_width": 1,
        "line_col_for": "black",
        "line_col_rev": "#009243",
        "x_label_pos": True,
        "label_size": 10,
        "spacing": 0.04,
        "length_scaling": False,
        "title_length": 20,
        "title_clip_pos": "B",  # B (begin), E (end)
        "max_N_percentage": 49,  # fixed value, no user input
        "mirror_y_axis": False,
        "representation": 0,
        "verbose": False,
    }


def _as_parameter_list(commandline, cfg):
    """Flatten the settings into the positional list returned by check_input()."""
    values = dict(cfg, filetype=_FILETYPES[cfg["filetype"]])
    return [commandline] + list(values.values())


def _log_note(message):
    """Print ``message`` and append it, on its own line, to the global ``log_txt``."""
    global log_txt
    print(message)
    log_txt += message if message.startswith("\n") else "\n" + message


def _parse_choice(arg, name, flag, current):
    """Return ``arg`` as an int in 0..2, or ``current`` (with a report) if invalid."""
    try:
        value = int(arg)
    except ValueError:
        _log_note("%s - invalid argument - using default value" % name)
        return current
    if 0 <= value <= 2:
        return value
    _log_note(
        "\nERROR: Please provide valid %s argument. %s is out of range. It will be set to %s 0 [default]."
        % (name, arg, flag)
    )
    return current


def _settings_summary(cfg, unit):
    """Build the human-readable overview of the chosen settings."""

    def row(label, value):
        return "\n" + label.ljust(_LABEL_WIDTH) + str(value)

    modes = cfg["plotting_modes"]
    rule = "\n%s\n" % (70 * "-")

    text = rule
    text += "\n" + "INPUT/OUTPUT OPTIONS...\n"
    text += row("Input fasta file:", ", ".join(cfg["input_fasta"]))
    text += row("Automatic fasta collection from current directory:", cfg["auto_fas"])
    text += row("Collage output:", cfg["collage_output"])
    text += row("Number of columns per page:", cfg["m_col"])
    text += row("Number of rows per page:", cfg["n_row"])
    text += row("File format:", _FILETYPES[cfg["filetype"]])
    text += row("Residue type is nucleotide:", cfg["type_nuc"])

    text += "\n" + "\n\nCALCULATION PARAMETERS...\n"
    text += row("Wordsize:", cfg["wordsize"])
    text += row("Substitution count:", cfg["substitution_count"])
    text += row("Plotting mode:", ", ".join(str(mode) for mode in modes))
    # second line: the mode names, aligned with the value column
    text += "\n" + _LABEL_WIDTH * " "
    text += "".join(_PLOTTING_MODE_NAMES[mode] + " " for mode in modes)
    text += row("Ambiguity handling:", cfg["wobble_conversion"])
    text += row("Reverse complement scanning:", cfg["rc_option"])
    text += row("Alphabetic sorting:", cfg["alphabetic_sorting"])

    if 1 in modes:
        text += row("Only matching sequences to first entry:", cfg["only_vs_first_seq"])

    if 0 in modes and cfg["input_gff_files"]:
        text += row("Input gff files:", ", ".join(cfg["input_gff_files"]))
        if cfg["gff_color_config_file"] != "":
            text += row("GFF color config file:", cfg["gff_color_config_file"])
    text += row("Prefix for output files:", cfg["output_file_prefix"])

    if 2 in modes:
        text += (
            "\n" + "\n\nLCS SHADING OPTIONS (plotting_mode 'all-against-all' only)...\n"
        )
        text += row("LCS shading:", cfg["lcs_shading"])
        text += row("LCS shading interval number:", cfg["lcs_shading_num"] + 1)
        text += row(
            "LCS shading reference:", _LCS_SHADING_REF_NAMES[cfg["lcs_shading_ref"]]
        )
        if cfg["lcs_shading_ref"] == 2:
            text += row(
                "LCS shading interval size [%s]:" % unit,
                cfg["lcs_shading_interval_len"],
            )
        text += row(
            "LCS shading orientation:", _LCS_SHADING_ORI_NAMES[cfg["lcs_shading_ori"]]
        )
        if cfg["input_user_matrix_file"] != "":
            text += row("Custom user shading matrix file:", cfg["input_user_matrix_file"])
            text += row(
                "Print user matrix values (instead of dotplot):",
                cfg["user_matrix_print"],
            )
        text += row("Displayed plot region:", _REPRESENTATION_NAMES[cfg["representation"]])

    text += "\n" + "\n\nGRAPHIC FORMATTING...\n"
    text += row("Plot size:", cfg["plot_size"])
    text += row("Line width:", cfg["line_width"])
    text += row("Line color:", cfg["line_col_for"])
    text += row("Reverse line color:", cfg["line_col_rev"])
    text += row("X label position:", cfg["x_label_pos"])
    text += row("Label size:", cfg["label_size"])
    text += row("Spacing:", cfg["spacing"])
    if cfg["mirror_y_axis"]:
        text += row("Y-axis mirrored (bottom to top)", cfg["mirror_y_axis"])
    side = "last" if cfg["title_clip_pos"] == "E" else "first"
    text += row(
        "Title length (limit number of characters):",
        "%s %s characters" % (side, cfg["title_length"]),
    )
    text += row("Length scaling:", cfg["length_scaling"])
    text += rule
    return text


def check_input(argv, trial_mode=False):
    """
    Parse the FlexiDot command line and return the resulting settings.

    The options are read from ``sys.argv``; the ``argv`` parameter is kept for
    interface compatibility but is not consulted.

    Parameters:
        argv: unused (see above).
        trial_mode: if True, print a notice and return the default settings
            without reading the command line or touching the log.

    Returns:
        A list of 39 entries: the command line string followed by the settings
        in the order defined by ``_default_settings()`` (``filetype`` is
        returned as its file extension, e.g. "png").

    Side effects:
        Sets the module globals ``log_txt`` (messages collected while parsing)
        and ``aa_bp_unit`` ("bp" or "aa"), prints progress messages and writes
        to the log through ``logprint``.

    Raises:
        SystemExit: after the help screen (status 0), on missing/unknown
            arguments (status 1) or when an input fasta file does not exist.

    Relies on ``usage``, ``check_bools``, ``logprint`` and ``mcolors`` being
    available at module level (they are defined outside this block).
    """

    global log_txt, aa_bp_unit

    cfg = _default_settings()
    aa_bp_unit = "bp"

    # return default parameters for testing purposes
    if trial_mode:
        print("ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n")
        return _as_parameter_list("trial_mode\n", cfg)

    # helpers for argument parsing
    ######################################

    known_flags = set()
    canonical = {}  # "-k" / "--wordsize" -> "wordsize"
    for short, long_name, _ in _OPTION_SPECS:
        for flag in ("-" + short, "--" + long_name):
            known_flags.add(flag)
            canonical[flag] = long_name
    short_spec = "".join(
        short + (":" if takes_value else "") for short, _, takes_value in _OPTION_SPECS
    )
    long_spec = [
        long_name + ("=" if takes_value else "")
        for _, long_name, takes_value in _OPTION_SPECS
    ]

    # read arguments
    ######################################

    commandline = "".join(item + " " for item in sys.argv)

    log_txt = "\n...reading input arguments..."
    print(log_txt)

    if len(sys.argv) < 2:
        _log_note("\nERROR: More arguments are needed. Exit...")
        usage()
        sys.exit(1)

    first = sys.argv[1]
    if first.startswith("--"):
        # accept the "--option=value" spelling as first argument, too
        first = first.split("=", 1)[0]
    if first not in known_flags:
        _log_note(
            "\nINPUT ERROR: Input argument %s unknown. Please check the help screen."
            % sys.argv[1]
        )
        sys.exit(1)

    try:
        opts, _ = getopt.getopt(sys.argv[1:], short_spec, long_spec)
    except getopt.GetoptError as err:
        # report getopt's own diagnosis (unknown option, missing value, ...)
        _log_note(
            "\nINPUT ERROR (getopt): %s. Please check the help screen." % err
        )
        sys.exit(1)

    explicit = set()  # numeric options that were successfully set by the user

    for opt, arg in opts:
        name = canonical[opt]

        if name == "help":
            _log_note("...fetch help screen")
            usage()
            sys.exit()

        elif name == "verbose":
            _log_note("...verbose output")
            cfg["verbose"] = True

        elif name == "auto_fas":
            cfg["auto_fas"] = True

        elif name == "input_fasta":
            # a missing fasta file is fatal
            for fasta in arg.split(","):
                if not os.path.exists(fasta):
                    message = "\nERROR: fasta_file '%s' was not found!" % fasta
                    log_txt += message
                    sys.exit(message)
                cfg["input_fasta"].append(fasta)
                _log_note("fasta file #%i: %s" % (len(cfg["input_fasta"]), fasta))

        elif name == "input_gff_files":
            # a missing gff file is only skipped
            for gff in arg.split(","):
                if os.path.exists(gff):
                    cfg["input_gff_files"].append(gff)
                    _log_note("GFF file #%i: %s" % (len(cfg["input_gff_files"]), gff))
                else:
                    _log_note("\nERROR: gff_file '%s' was not found!" % gff)
                    _log_note(" -->Running FlexiDot without this gff file!")

        elif name == "gff_color_config_file":
            if os.path.exists(arg):
                cfg["gff_color_config_file"] = arg
            else:
                _log_note(
                    "\nERROR: gff_color_config_file '%s' was not found!" % arg
                    + "\n -->Running FlexiDot with default gff coloring specification!"
                )

        elif name == "input_user_matrix_file":
            if os.path.exists(arg):
                cfg["input_user_matrix_file"] = arg
            else:
                _log_note(
                    "\nERROR: input_user_matrix_file '%s' was not found!" % arg
                    + "\n -->Running FlexiDot without input_user_matrix_file %s!" % arg
                )

        elif name == "output_file_prefix":
            cfg["output_file_prefix"] = arg

        elif name in _BOOL_OPTIONS:
            cfg[name] = check_bools(str(arg), default=cfg[name])

        elif name in _NUMBER_OPTIONS:
            try:
                cfg[name] = _NUMBER_OPTIONS[name](arg)
            except ValueError:
                _log_note("%s - invalid argument - using default value" % name)
            else:
                explicit.add(name)

        elif name in _CHOICE_FLAGS:
            cfg[name] = _parse_choice(arg, name, _CHOICE_FLAGS[name], cfg[name])

        elif name in _COLOR_OPTIONS:
            if mcolors.is_color_like(arg):
                cfg[name] = arg
            else:
                _log_note("%s - invalid argument - using default value" % name)

        elif name == "plotting_mode":
            # the given modes replace the default instead of being added to it
            modes = []
            for item in arg.split(","):
                if item in ("0", "1", "2") and int(item) not in modes:
                    modes.append(int(item))
            if modes:
                cfg["plotting_modes"] = modes
            else:
                _log_note(
                    "Please provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]"
                )

        elif name == "lcs_shading_num":
            try:
                cfg["lcs_shading_num"] = int(arg) - 1
            except ValueError:
                _log_note("lcs_shading_num - invalid argument - using default value")

        elif name == "title_length":
            # plain number, or number followed by a position letter (e.g. 20E)
            try:
                cfg["title_length"] = int(arg)
            except ValueError:
                try:
                    cfg["title_length"] = int(arg[:-1])
                except ValueError:
                    _log_note("title_length - invalid argument - using default value")
                else:
                    position = arg[-1].upper()
                    if position in ("B", "E"):  # B (beginning), E (end)
                        cfg["title_clip_pos"] = position
                    else:
                        _log_note(
                            "title_length position information invalid - using default value"
                        )

    # amino acid specifics are applied once all options are known, so that the
    # result does not depend on the order of -t and -Y on the command line
    if not cfg["type_nuc"]:
        aa_bp_unit = "aa"
        if "lcs_shading_interval_len" not in explicit:
            cfg["lcs_shading_interval_len"] = 10

    # start logging file
    logprint(
        commandline, start=True, printing=False, prefix=cfg["output_file_prefix"]
    )
    logprint(log_txt, start=False, printing=False)

    # print chosen arguments
    ######################################

    logprint(_settings_summary(cfg, aa_bp_unit))

    # collect settings
    return _as_parameter_list(commandline, cfg)
similarity index 70% rename from code/flexidot_v1.06.py rename to archive/flexidot.py index cb223c0..98e7e54 100644 --- a/code/flexidot_v1.06.py +++ b/archive/flexidot.py @@ -1,18 +1,19 @@ -#!/usr/bin/python2.7 +#!/usr/bin/python3 # -*- coding: utf-8 -*- """ -FlexiDot Version 1.06 +FlexiDot Version 2.01 FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation -Kathrin M. Seibt, Thomas Schmidt and Tony Heitkam +Kathrin M. Seibt, Thomas Schmidt and Tony Heitkam Institute of Botany, TU Dresden, Dresden, 01277, Germany Bioinformatics (2018) Vol. 34 (20), 3575–3577, doi 10.1093/bioinformatics/bty395 """ + ############################### # Requirements # ############################### @@ -23,20 +24,21 @@ import sys import shutil, getopt import unicodedata +import math def module_install_command(module_name, upgrade=False): """ - create installation commands for Python modules and print information + create installation commands for Python modules and print information """ if upgrade: - load_command = "python -m pip install --upgrade %s" % module_name + load_command = "python -m pip install --upgrade %s" % (module_name) else: - load_command = "python -m pip install %s" % module_name + load_command = "python -m pip install %s" % (module_name) try: logprint("Installing Python module: %s\n\t%s\n" % (module_name, load_command)) except: - print "Installing Python module: %s\n\t%s\n" % (module_name, load_command) + print("Installing Python module: %s\n\t%s\n" % (module_name, load_command)) return load_command @@ -45,7 +47,7 @@ def load_modules(): load Python modules, if possible - otherwise try to install them """ # make module names global - global cllct, gridspec, patches, rcParams, mplrc, P, Color, SeqIO, np, mcolors, rgb2hex, regex + global cllct, gridspec, patches, rcParams, mplrc, P, Color, SeqIO, np, mcolors, rgb2hex, regex, PdfPages # matplotlib try: @@ -53,16 +55,28 @@ def load_modules(): except: command = 
module_install_command("matplotlib", upgrade=True) try: - os.system(command) - print "\n" + os.system(command) + print("\n") import matplotlib.collections as cllct except: - print "Please install module matplotlib manually" + print("Please install module matplotlib manually") import matplotlib.colors as mcolors import matplotlib.gridspec as gridspec import matplotlib.patches as patches - import pylab as P - P.switch_backend('agg') # bugfix for _tkinter.TclError on CentOs 7 servers, see Github Issue #5 + from matplotlib.backends.backend_pdf import PdfPages + try: + import pylab as P + except: + command = module_install_command("tkinter", upgrade=True) + try: + os.system(command) + print("\n") + import pylab as P + except: + print("Please install module matplotlib manually") + print("\n>>> In case of error with 'tkinter' under linux: try the following command:") + print(">>> \tsudo apt-get install python-tk\n\n") + P.switch_backend('agg') # bugfix for _tkinter.TclError on CentOs 7 servers, see Github Issue #5 # specify matplotlib font settings from matplotlib import rc as mplrc @@ -70,6 +84,7 @@ def load_modules(): from matplotlib import rcParams rcParams['font.family'] = 'sans-serif' rcParams['font.sans-serif'] = ['Helvetica', 'Verdana', 'Tahoma' , 'DejaVu Sans', 'Droid Sans Mono', 'Sans', 'Liberation', 'Ubuntu', 'Arial', ] + # rcParams['figure.max_open_warning'] = 0 # colour for color gradient palette try: @@ -77,11 +92,11 @@ def load_modules(): except: command = module_install_command("colour") try: - os.system(command) - print "\n" + os.system(command) + print("\n") from colour import Color except: - print "Please install module colour manually" + print("Please install module colour manually") # color converter try: @@ -91,12 +106,12 @@ def load_modules(): # additional module easydev.tools required by colormap command2 = module_install_command("easydev") try: - os.system(command) - os.system(command2) - print "\n" + os.system(command) + os.system(command2) + 
print("\n") from colormap import rgb2hex except: - print "Please install module colormap manually" + print("Please install module colormap manually") # biopython try: @@ -104,11 +119,11 @@ def load_modules(): except: command = module_install_command("biopython") try: - os.system(command) - print "\n" + os.system(command) + print("\n") from Bio import SeqIO except: - print "Please install module biopython manually" + print("Please install module biopython manually") # numpy try: @@ -116,11 +131,11 @@ def load_modules(): except: command = module_install_command("numpy") try: - os.system(command) - print "\n" + os.system(command) + print("\n") import numpy as np except: - print "Please install module numpy manually" + print("Please install module numpy manually") # regex for pattern matching try: @@ -128,12 +143,11 @@ def load_modules(): except: command = module_install_command("regex") try: - os.system(command) - print "\n" + os.system(command) + print("\n") import regex except: - print "Please install module regex manually" - + print("Please install module regex manually") ############################### @@ -145,27 +159,27 @@ def usage(): usage and help """ - print """\n\n FLEXIDOT + print("""\n\n FLEXIDOT ------------------------------------------------------------------- Version: - 1.06 + 1.07 - Citation: - Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (2018) + Citation: + Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (2018) "FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation" Bioinformatics 34 (20), 3575–3577, doi: 10.1093/bioinformatics/bty395 - - General usage: + + General usage: $ python flexidot.py -a [ARGUMENTS] $ python flexidot.py -i [ARGUMENTS] - + ARGUMENTS ------------------------------------------------------------------- - + INPUT/OUTPUT OPTIONS... 
required are [-a] OR [-i] -a, --auto_fas Imports all fasta files from current directory (*.fasta, *.fas, *.fa, *.fna) @@ -173,28 +187,28 @@ def usage(): [inactive by default] -i, --in_file Input fasta file (fasta file name or comma-separated file list) - > Provide multiple files: Recall -i or provide comma-separated file names + > To provide multiple files, recall -i or provide comma-separated file names -o, --output_file_prefix File prefix to be added to the generated filenames [default = NONE] - -c, --collage_output Multiple dotplots are combined in a collage + -c, --collage_output Multiple dotplots are combined in a collage Y or 1 = ON [default] - N or 0 = OFF + N or 0 = OFF -m, --m_col Number of columns per page [default = 4] (only if --collage_output is ON) -n, --n_row Number of rows per page [default = 5] (only if --collage_output is ON) -f, --filetype Output file format - 0 = PNG [default] - 1 = PDF - 2 = SVG + 0 or png = PNG [default] + 1 or pdf = PDF + 2 or svg = SVG - -s, --alphabetic_sorting Sort sequences alphabetically according to titles + -s, --alphabetic_sorting Sort sequences alphabetically according to titles Y or 1 = ON N or 0 = OFF [default] - + CALCULATION PARAMETERS... 
-k, --wordsize Wordsize (kmer length) for dotplot comparison [default = 10] @@ -203,7 +217,7 @@ def usage(): 0 = self [default] 1 = paired 2 = poly (matrix with all-against-all dotplots) - > Run multiple plotting modes: Recall -p or provide comma-separated numbers + > To run multiple plotting modes, recall -p or provide comma-separated numbers -t, --type_nuc Type of residue is nucleotide Y or 1 = nucleotide [default] @@ -213,18 +227,23 @@ def usage(): Y or 1 = ON N or 0 = OFF [default] - -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching + -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching [default = 0] -r, --rc_option Find reverse complementary matches (only if type_nuc=y) Y or 1 = ON [default] - N or 0 = OFF + N or 0 = OFF - -O, --only_vs_first_seq Limit pairwise comparisons to match all sequences to 1st sequence only - (only if --plotting_mode=1) - Y or 1 = ON + -O, --only_vs_first_seq Limit pairwise comparisons to match all sequences to 1st sequence only + (only if --plotting_mode=1) + Y or 1 = ON N or 0 = OFF [default] - + + -N, --narrow_diagonal_interval Interval size [integer] for quicker narrow self-dotplot - limits calculation to + interval of given size along the main diagonal (only if --plotting_mode=0) + [inactive by default] + + GRAPHIC FORMATTING... 
-A, --line_width Line width [default = 1] @@ -242,12 +261,12 @@ def usage(): -F, --spacing Spacing between all-against-all dotplots (only if --plotting_mode=2) [default = 0.04] - -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) + -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) Y or 1 = Scaling ON (axes scaled according to sequence length) N or 0 = Scaling OFF (squared plots) [default] -M, --mirror_y_axis Flip y-axis bottom to top (cartesian coordinate system) - Y or 1 = y-axis bottom to top + Y or 1 = y-axis bottom to top N or 0 = y-axis top to bottom [default] -P, --plot_size Plotsize [default = 10] @@ -261,15 +280,15 @@ def usage(): [default = 20] Position of selection can be specified by appending a letter (e.g. -T 20E) B = beginning [default] - E = end + E = end - GFF SHADING (for -p/--plotting_mode=0 only)... + GFF SHADING (not for -p/--plotting_mode=1)... - -g, --input_gff_files GFF3 file used for markup in self-dotplots - (provide multiple files: Recall -g or provide comma-separated file names) + -g, --input_gff_files GFF3 file used for markup in self-dotplots and poly-dotplots + > To provide multiple files, recall -g or provide comma-separated file names) - -G, --gff_color_config_file Tab-delimited config file for custom gff shading + -G, --gff_color_config_file Tab-delimited config file for custom GFF-based shading column 1: feature type column 2: color column 3: alpha @@ -282,16 +301,16 @@ def usage(): Y or 1 = ON N or 0 = OFF [default] - -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) + -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) [default = 5] -y, --lcs_shading_ref Reference for LCS shading 0 = maximal LCS length [default] 1 = maximally possible length (length of shorter sequence in pairwise comparison) - 2 = given interval sizes - DNA [default 100 bp] or proteins 
[default 10 aa] - see -Y + 2 = given interval sizes - DNA [default 100 bp] or proteins [default 10 aa] -> see -Y - -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) - [default for nucleotides = 50; default for amino acids = 10] + -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) + [default for nucleotides = 50; default for amino acids = 10] -z, --lcs_shading_ori Shade subdotplots according to LCS on 0 = forward [default], @@ -303,7 +322,7 @@ def usage(): CUSTOM USER MATRIX SHADING OPTIONS (for -p/--plotting_mode=2 only)... -u, --input_user_matrix_file Shading above diagonal according to values in matrix file specified by the user - (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n + (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n e.g. identity matrix from multiple sequence alignment - strings are ignored) -U, --user_matrix_print Display provided matrix entries in the fields above diagonal of all-against-all dotplot @@ -318,9 +337,7 @@ def usage(): -v, --verbose Verbose - - - """ + """) def check_input(argv, trial_mode=False): """ @@ -365,6 +382,7 @@ def check_input(argv, trial_mode=False): "-F", "--spacing", "F:", "spacing=", "-L", "--length_scaling", "L:", "length_scaling=", "-M", "--mirror_y_axis", "M:", "mirror_y_axis=", + "-N", "--narrow_diagonal_interval", "N:", "narrow_diagonal_interval=", "-R", "--representation", "R:", "representation=", "-T", "--title_length", "T:", "title_length=", "-h", "--help", "h", "help", @@ -379,7 +397,7 @@ def check_input(argv, trial_mode=False): ###################################### auto_fas = False # 0 - input_fasta = [] + input_fasta = [] output_file_prefix = None collage_output = True # 1 m_col = 4 @@ -396,6 +414,7 @@ def check_input(argv, trial_mode=False): rc_option = True # 1 alphabetic_sorting = False # 0 only_vs_first_seq = False # 
0 + narrow_diagonal_interval = False lcs_shading = False # 0 lcs_shading_num = 4 @@ -416,15 +435,16 @@ def check_input(argv, trial_mode=False): length_scaling = False # 0 title_length = 20 # float("Inf") title_clip_pos = "B" # B (begin), E (end) - max_N_percentage = 49 # fixed value, no user input + max_N_percentage = 49 # fixed value, no user input mirror_y_axis = False representation = 0 - + aa_bp_unit = "bp" verbose = False # 0 - filetype_dict = {0: "png", 1: "pdf", 2: "svg"} + filetype_dict = {0: "png", 1: "pdf", 2: "svg", + "png": "png", "pdf": "pdf", "svg": "svg"} lcs_shading_ref_dict = {0: "maximal LCS length", 1: "maximally possible length", 2: "given interval sizes"} plotting_mode_dict = {0: "self", 1: "paired", 2: "all-against-all"} lcs_shading_ori_dict = {0: "forward", 1: "reverse complement", 2: "both"} @@ -432,11 +452,11 @@ def check_input(argv, trial_mode=False): # return default parameters for testing purposes if trial_mode: - print "ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n" + print("ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n") commandline = "trial_mode\n" - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose] + parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, 
narrow_diagonal_interval, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose] return parameters @@ -448,17 +468,17 @@ def check_input(argv, trial_mode=False): commandline += arg + " " log_txt = "\n...reading input arguments..." - print log_txt + print(log_txt) if len(sys.argv) < 2: - print "\nERROR: More arguments are needed. Exit..." + print("\nERROR: More arguments are needed. Exit...") log_txt += "\nERROR: More arguments are needed. Exit..." usage() sys.exit() elif sys.argv[1] not in arguments_sysargv: - print "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - log_txt += "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] + print("\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % (sys.argv[1])) + log_txt += "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % (sys.argv[1]) # usage() sys.exit() @@ -466,93 +486,90 @@ def check_input(argv, trial_mode=False): opts, args = getopt.getopt(sys.argv[1:], arguments_opts, arguments_args) except getopt.GetoptError: - print "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - log_txt += "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] + print("\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % (sys.argv[1:])) + log_txt += "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." 
% (sys.argv[1:]) # usage() sys.exit() for opt, arg in opts: + arg = arg.strip() if opt in ("-h", "--help"): - print "...fetch help screen" + print("...fetch help screen") log_txt += "\n...fetch help screen" usage(), sys.exit() if opt in ("-v", "--verbose"): - print "...verbose output" + print("...verbose output") log_txt += "\n...verbose output" verbose = True elif opt in ("-i", "--input_fasta"): if "," in arg: arg_list = arg.split(",") - for temp_file in arg_list: + for temp_file in arg_list: if not os.path.exists(str(temp_file)): - message = "\nERROR: fasta_file '%s' was not found!" % str(temp_file) + message = "\nERROR: fasta_file '%s' was not found!" % (str(temp_file)) sys.exit(message) else: input_fasta.append(str(temp_file)) - print "fasta file #%i: %s" % (len(input_fasta), str(temp_file)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(temp_file)) + print("fasta file #%d: %s" % (len(input_fasta), str(temp_file))) + log_txt += "\nfasta file #%d: %s" % (len(input_fasta), str(temp_file)) else: if not os.path.exists(str(arg)): - message = "\nERROR: fasta_file '%s' was not found!" % str(arg) + message = "\nERROR: fasta_file '%s' was not found!" % (str(arg)) log_txt += message sys.exit(message) else: input_fasta.append(str(arg)) - print "fasta file #%i: %s" % (len(input_fasta), str(arg)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(arg)) - + print("fasta file #%d: %s" % (len(input_fasta), str(arg))) + log_txt += "\nfasta file #%d: %s" % (len(input_fasta), str(arg)) elif opt in ("-a", "--auto_fas"): auto_fas = True - # multiple gff files: reads them into a list elif opt in ("-g", "--input_gff_files"): - # append gff file only if existing + # append gff file only if existing if "," in arg: arg_list = arg.split(",") - for temp_file in arg_list: + for temp_file in arg_list: if not os.path.exists(str(temp_file)): - message = "\nERROR: gff_file '%s' was not found!" 
% str(temp_file) - print message + message = "\nERROR: gff_file '%s' was not found!" % (str(temp_file)) + message += "\n -->Running FlexiDot without this gff file!" + print(message) log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" else: - print "GFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(temp_file)) + print("GFF file #%d: %s" % (len(input_gff_files), str(temp_file))) + log_txt += "\nGFF file #%d: %s" % (len(input_gff_files), str(temp_file)) input_gff_files.append(str(temp_file)) else: if not os.path.exists(str(arg)): - message = "\nERROR: gff_file '%s' was not found!" % str(arg) - print message + message = "\nERROR: gff_file '%s' was not found!" % (str(arg)) + message += "\n -->Running FlexiDot without this gff file!" + print(message) log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" else: input_gff_files.append(str(arg)) - print "GFF file #%i: %s" %(len(input_gff_files), str(arg)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(arg)) - + print("GFF file #%d: %s" % (len(input_gff_files), str(arg))) + log_txt += "\nGFF file #%d: %s" % (len(input_gff_files), str(arg)) elif opt in ("-G", "--gff_color_config_file"): if not os.path.exists(str(arg)): - message = "\nERROR: gff_color_config_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot with default gff coloring specification!" - log_txt += message + "\n -->Running FlexiDot with default gff coloring specification!" + message = "\nERROR: gff_color_config_file '%s' was not found!" % (str(arg)) + message += "\n -->Running FlexiDot with default gff coloring specification!" 
+ print(message) + log_txt += message else: gff_color_config_file = str(arg) - elif opt in ("-u", "--input_user_matrix_file"): if not os.path.exists(str(arg)): - message = "\nERROR: input_user_matrix_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot without input_user_matrix_file %s!" % arg - log_txt += message + "\n -->Running FlexiDot withdefault matrix shading file!" + message = "\nERROR: input_user_matrix_file '%s' was not found!" % (str(arg)) + message += "\n -->Running FlexiDot with default input_user_matrix_file! Invalid file %s" % (str(arg)) + print(message) + log_txt += message else: input_user_matrix_file = str(arg) @@ -567,22 +584,24 @@ def check_input(argv, trial_mode=False): elif opt in ("-m", "--m_col"): try: m_col = int(arg) - except: - print "m_col - invalid argument - using default value" - log_txt += "\nm_col - invalid argument - using default value" + except: + print("Invalid argument for m_col - using default value") + log_txt += "\nInvalid argument for m_col - using default value" elif opt in ("-n", "--n_row"): try: n_row = int(arg) - except: - print "n_row - invalid argument - using default value" - log_txt += "\nn_row - invalid argument - using default value" + except: + print("Invalid argument for n_row - using default value") + log_txt += "\nInvalid argument for n_row - using default value" elif opt in ("-f", "--filetype"): - if 0 <= int(arg) <= 2: + if arg.isdigit() and 0 <= int(arg) <= 2: filetype = int(arg) - else: - print "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - log_txt += "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) + elif arg.lower() in ("pdf", "png", "jpg"): + filetype = arg.lower() + else: + print("\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." 
% (filetype)) + log_txt += "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." % (filetype) elif opt in ("-t", "--type_nuc"): type_nuc = check_bools(str(arg), default=type_nuc) @@ -594,9 +613,9 @@ def check_input(argv, trial_mode=False): elif opt in ("-k", "--wordsize"): try: wordsize = int(arg) - except: - print "wordsize - invalid argument - using default value" - log_txt += "\nwordsize - invalid argument - using default value" + except: + print("Invalid argument for wordsize - using default value") + log_txt += "\nInvalid argument for wordsize - default value" elif opt in ("-p", "--plotting_mode"): if "," in arg: @@ -607,7 +626,7 @@ def check_input(argv, trial_mode=False): elif arg in ["0","1","2"]: plotting_modes = [int(arg)] else: - print "Please provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" + print("Please provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]") log_txt += "\nPlease provide valid plotting_modes argument - e.g. 
1 or 0,1,2 - using default [0]" elif opt in ("-w", "--wobble_conversion"): @@ -615,9 +634,9 @@ def check_input(argv, trial_mode=False): elif opt in ("-S", "--substitution_count"): try: substitution_count = int(arg) - except: - print "substitution_count - invalid argument - using default value" - log_txt += "\nsubstitution_count - invalid argument - using default value" + except: + print("Invalid argument for substitution_count - using default value") + log_txt += "\nInvalid argument for substitution_count - using default value" elif opt in ("-r", "--rc_option"): rc_option = check_bools(str(arg), default=rc_option) @@ -628,80 +647,91 @@ def check_input(argv, trial_mode=False): elif opt in ("-O", "--only_vs_first_seq"): only_vs_first_seq = check_bools(str(arg), default=only_vs_first_seq) + elif opt in ("-N", "--narrow_diagonal_interval"): + if str(arg).isdigit(): + narrow_diagonal_interval + try: + narrow_diagonal_interval = int(arg) + except: + log_txt += "\nInvalid argument for narrow_diagonal_interval - set as False [default]" + print("\nInvalid argument for narrow_diagonal_interval - set as False [default]." % (lcs_shading_ref)) + else: + log_txt += "\nInvalid argument for narrow_diagonal_interval - set as False [default]" + print("\nInvalid argument for narrow_diagonal_interval - set as False [default]." 
% (lcs_shading_ref)) + elif opt in ("-x", "--lcs_shading"): lcs_shading = check_bools(str(arg), default=lcs_shading) elif opt in ("-X", "--lcs_shading_num"): try: lcs_shading_num = int(arg) - 1 - except: - print "lcs_shading_num - invalid argument - using default value" - log_txt += "\nlcs_shading_num - invalid argument - using default value" + except: + print("Invalid argument for lcs_shading_num - using default value") + log_txt += "\nInvalid argument for lcs_shading_num - using default value" elif opt in ("-y", "--lcs_shading_ref"): try: - if 0 <= int(arg) <= 2: + if 0 <= int(arg) <= 2: lcs_shading_ref = int(arg) - else: - print "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - log_txt += "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - except: - print "lcs_shading_ref - invalid argument - using default value" - log_txt += "\nlcs_shading_ref - invalid argument - using default value" + else: + print("\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." % (lcs_shading_ref)) + log_txt += "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." % (lcs_shading_ref) + except: + print("Invalid argument for lcs_shading_ref - using default value") + log_txt += "\nInvalid argument for lcs_shading_ref - using default value" elif opt in ("-Y", "--lcs_shading_interval_len"): try: lcs_shading_interval_len = int(arg) - except: - print "lcs_shading_interval_len - invalid argument - using default value" - log_txt += "\nlcs_shading_interval_len - invalid argument - using default value" + except: + print("Invalid argument for lcs_shading_interval_len - using default value") + log_txt += "\nInvalid argument for lcs_shading_interval_len - using default value" elif opt in ("-z", "--lcs_shading_ori"): if 0 <= int(arg) <= 2: lcs_shading_ori = int(arg) - else: - print "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. 
It will be set to -z 0 [default]." %(lcs_shading_ori) - log_txt += "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." %(lcs_shading_ori) + else: + print("\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." % (lcs_shading_ori)) + log_txt += "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." % (lcs_shading_ori) elif opt in ("-P", "--plot_size"): try: plot_size = float(arg) - except: - print "plot_size - invalid argument - using default value" - log_txt += "\nplot_size - invalid argument - using default value" - + except: + print("Invalid argument for plot_size - using default value") + log_txt += "\nInvalid argument for plot_size - using default value" elif opt in ("-A", "--line_width"): try: line_width = float(arg) - except: - print "line_width - invalid argument - using default value" - log_txt += "\nline_width - invalid argument - using default value" + except: + print("Invalid argument for line_width - using default value") + log_txt += "\nInvalid argument for line_width - using default value" elif opt in ("-B", "--line_col_for"): if mcolors.is_color_like(arg): line_col_for = arg - else: - print "line_col_for - invalid argument - using default value" - log_txt += "\nline_col_for - invalid argument - using default value" + else: + print("Invalid argument for line_col_for - using default value") + log_txt += "\nInvalid argument for line_col_for - using default value" elif opt in ("-C", "--line_col_rev"): if mcolors.is_color_like(arg): line_col_rev = arg - else: - print "line_col_rev - invalid argument - using default value" - log_txt += "\nline_col_rev - invalid argument - using default value" + else: + print("Invalid argument for line_col_rev - using default value") + log_txt += "\nInvalid argument for line_col_rev - using default value" elif opt in ("-D", "--x_label_pos"): x_label_pos 
= check_bools(str(arg), default=x_label_pos) elif opt in ("-E", "--label_size"): try: label_size = float(arg) - except: - print "label_size - invalid argument - using default value" - log_txt += "\nlabel_size - invalid argument - using default value" + except: + print("Invalid argument for label_size - using default value") + log_txt += "\nInvalid argument for label_size - using default value" elif opt in ("-F", "--spacing"): try: spacing = float(arg) - except: - print "spacing - invalid argument - using default value" - log_txt += "\nspacing - invalid argument - using default value" + except: + print("Invalid argument for spacing - using default value") + log_txt += "\nInvalid argument for spacing - using default value" elif opt in ("-L", "--length_scaling"): length_scaling = check_bools(str(arg), default=length_scaling) @@ -712,23 +742,23 @@ def check_input(argv, trial_mode=False): elif opt in ("-R", "--representation"): if 0 <= int(arg) <= 2: representation = int(arg) - else: - print "\nERROR: Please provide valid representation argument. %s is out of range. It will be set to -R 0 [default]." %(representation) - log_txt += "\nERROR: Please provide valid representation argument. %s is out of range. It will be set to -R 0 [default]." %(representation) + else: + print("\nERROR: Please provide valid representation argument. %s is out of range. It will be set to -R 0 [default]." % (representation)) + log_txt += "\nERROR: Please provide valid representation argument. %s is out of range. It will be set to -R 0 [default]." 
% (representation) elif opt in ("-T", "--title_length"): try: title_length = int(arg) except: - try: + try: title_length = int(str(arg)[:-1]) if arg[-1].upper() in ["B", "E"]: # B (beginning), E (end) - title_clip_pos = arg[-1].upper() + title_clip_pos = arg[-1].upper() else: - print "title_length position information invalid - using default value" - log_txt += "\ntitle_length position information invalid - using default value" + print("Invalid title_length position information - using default value") + log_txt += "\nInvalid title_length position information - using default value" except: - print "title_length - invalid argument - using default value" - log_txt += "\ntitle_length - invalid argument - using default value" + print("Invalid argument for title_length - using default value") + log_txt += "\nInvalid argument for title_length - using default value" # start logging file logprint(commandline, start=True, printing=False, prefix=output_file_prefix) @@ -738,33 +768,38 @@ def check_input(argv, trial_mode=False): # print chosen arguments ###################################### - text = "\n%s\n" % (70 * "-") + text = "\n%s\n" % (80*"-") text += "\n" + "INPUT/OUTPUT OPTIONS...\n" text += "\n" + "Input fasta file: " + ", ".join(input_fasta) text += "\n" + "Automatic fasta collection from current directory: " + str(auto_fas) - text += "\n" + "Collage output: " + str(collage_output) - text += "\n" + "Number of columns per page: " + str(m_col) - text += "\n" + "Number of rows per page: " + str(n_row) text += "\n" + "File format: " + filetype_dict[filetype] text += "\n" + "Residue type is nucleotide: " + str(type_nuc) + if 0 in plotting_modes or 1 in plotting_modes: + text += "\n" + "Collage output: " + str(collage_output) + text += "\n" + "Number of columns per page: " + str(m_col) + text += "\n" + "Number of rows per page: " + str(n_row) text += "\n" + "\n\nCALCULATION PARAMETERS...\n" text += "\n" + "Wordsize: " + str(wordsize) text += "\n" + "Sustitution count: " + 
str(substitution_count) - text += "\n" + "Plotting mode: " + str(plotting_modes).replace("[", "").replace("]", "") + "\n" + 51 * " " + text += "\n" + "Plotting mode: " + str(plotting_modes).replace("[", "").replace("]", "") + "\n" + 51*" " for item in plotting_modes: text += plotting_mode_dict[item] + " " text += "\n" + "Ambiguity handling: " + str(wobble_conversion) text += "\n" + "Reverse complement scanning: " + str(rc_option) text += "\n" + "Alphabetic sorting: " + str(alphabetic_sorting) - + if 1 in plotting_modes: text += "\n" + "Only matching sequences to first entry: " + str(only_vs_first_seq) - if 0 in plotting_modes and input_gff_files != []: + if len(input_gff_files) != 0 and (0 in plotting_modes or 2 in plotting_modes): text += "\n" + "Input gff files: " + ", ".join(input_gff_files) - if gff_color_config_file != "": + if gff_color_config_file != "": text += "\n" + "GFF color config file: " + gff_color_config_file + + if 0 in plotting_modes and narrow_diagonal_interval != False: + text += "\n" + "Narrowed window for fast self-dotplots: " + str(narrow_diagonal_interval) + text += "\n" + "Prefix for output files: " + str(output_file_prefix) if 2 in plotting_modes: @@ -790,17 +825,17 @@ def check_input(argv, trial_mode=False): text += "\n" + "Spacing: " + str(spacing) if mirror_y_axis: text += "\n" + "Y-axis mirrored (bottom to top) " + str(mirror_y_axis) - if title_clip_pos == "E": + if title_clip_pos == "E": text += "\n" + "Title length (limit number of characters): " + "last" + str(title_length) + "characters" else: text += "\n" + "Title length (limit number of characters): " + "first" + str(title_length) + "characters" text += "\n" + "Length scaling: " + str(length_scaling) - text += "\n%s\n" % (70 * "-") + text += "\n%s\n" % (80*"-") logprint(text) # collect settings - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, 
plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose] + parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, narrow_diagonal_interval, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose] return parameters @@ -816,7 +851,7 @@ def alphabets(type_nuc=True): nucleotide_alphabet = ["A", "C", "G", "T"] - nucleotide_alphabet_full = ["A", "C", "G", "T", "N", "B", "D", "H", + nucleotide_alphabet_full = ["A", "C", "G", "T", "N", "B", "D", "H", "V", "Y", "R", "W", "S", "K", "M"] nucleotide_ambiguity_code = {"N": ["A", "C", "G", "T"], # any @@ -845,43 +880,27 @@ def alphabets(type_nuc=True): "A": "[ANDHVRWM]", "C": "[CNBHVYSM]", "G": "[GNBDVRSK]", - "T": "[TNBDHYWK]"} - - # nucleotide_match_dict = {"N": ".", # any - # "B": "[^A]", # not A - # "D": "[^C]", # not C - # "H": "[^G]", # not G - # "V": "[^T]", # not T - # "K": "[^ACM]", # keto - not A,C,M - # "M": "[^GTK]", # amino - not G,T,K - # "W": "[^CGS]", # weak - not C,G,S - # "S": "[^AGW]", # strong - not A,G,W - # "Y": "[^AGR]", # pyrimidine - not A,G,R - # "R": "[^CTY]", # purine - not C,T,Y - # "A": "[ANDHVRWM]", - # "C": "[CNBHVYSM]", - # "G": "[GNBDVRSK]", 
- # "T": "[TNBDHYWK]"} - - aminoacid_alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", + "T": "[TNBDHYWK]"} + + aminoacid_alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", + "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", "U", "O", "*"] - aminoacid_alphabet_full = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*", "J", + aminoacid_alphabet_full = ["A", "R", "N", "D", "C", "E", "Q", "G", + "H", "I", "L", "K", "M", "F", "P", "S", + "T", "W", "Y", "V", "U", "O", "*", "J", "Z", "B", "X"] aminoacid_ambiguity_code = {"J": ["I", "L"], - "Z": ["Q", "E"], - "B": ["N", "D"], - "X": ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", + "Z": ["Q", "E"], + "B": ["N", "D"], + "X": ["A", "R", "N", "D", "C", "E", "Q", "G", + "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", "U", "O", "*"]} # any aminoacid_match_dict = {"J": "[ILJ]", - "Z": "[QEZ]", - "B": "[NDB]", + "Z": "[QEZ]", + "B": "[NDB]", # "X": ".", "X": "[ARNDCEQGHILKMFPSTWYVUO*XBZJ]", "A": "[AX]", @@ -923,31 +942,26 @@ def logprint(text, start=False, printing=True, prefix=""): # define log file name and open file global log_file_name - if start and trial_mode: - log_file_name = "log_file.txt" - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - elif start: - date = datetime.date.today() - time = str(datetime.datetime.now()).split(" ")[1].split(".")[0].replace(":", "-") - log_file_name = "%s_%s_log_file.txt" % (date, time) + if start: + if not trial_mode: + date = datetime.date.today() + time = str(datetime.datetime.now()).split(" ")[1].split(".")[0].replace(":", "-") + log_file_name = "%s_%s_log_file.txt" % (date, time) + else: + log_file_name = "Log_file.txt" if prefix != 
"" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name + if prefix[-1] not in ["-", "_"]: + prefix = prefix + "_" + log_file_name = prefix + log_file_name.lower() log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) + log_file.write("Date: %s\n\n" % (str(datetime.datetime.now()))) else: log_file = open(log_file_name, 'a') # write log (and print) log_file.write(text + "\n") if printing: - print text + print(text) log_file.close() def time_track(starting_time, show=True): @@ -957,13 +971,13 @@ def time_track(starting_time, show=True): now = time.time() delta = now - starting_time if show: - text = "\n\t %s seconds\n" % str(delta) + text = "\n\t %s seconds\n" % (str(delta)) logprint(text, start=False, printing=True) return now def calc_fig_ratio(ncols, nrows, plot_size, verbose=False): """ - calculate size ratio for given number of columns (ncols) and rows (nrows) + calculate size ratio for given number of columns (ncols) and rows (nrows) with plot_size as maximum width and length """ ratio = ncols*1./nrows @@ -972,7 +986,7 @@ def calc_fig_ratio(ncols, nrows, plot_size, verbose=False): logprint(text, start=False, printing=True) if ncols >= nrows: figsize_x = plot_size - figsize_y = plot_size / ratio + figsize_y = plot_size * 1. / ratio else: figsize_x = plot_size * ratio figsize_y = plot_size @@ -983,6 +997,7 @@ def shorten_name(seq_name, max_len=20, title_clip_pos="B"): #, delim="_"): shorten sequence names (for diagram titles) """ + # check title length if len(seq_name) <= max_len: return seq_name @@ -994,19 +1009,6 @@ def shorten_name(seq_name, max_len=20, title_clip_pos="B"): #, delim="_"): else: name = seq_name[:max_len] - """# keep first and last part if multiple parts separated by delimiter (e.g. species_prefix + sequence_id) - if delim in seq_name: - if seq_name.count(delim) >= 2: - name = "%s..." 
% delim.join(seq_name.split(delim)[:1]) + seq_name.split(delim)[-1] # .replace("_000", "-") - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - if len(name) > max_len: - name = name[:((max_len-2)//2)] + "..." + name[((max_len-2)//2):] - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - """ - return name def unicode_name(name): @@ -1014,18 +1016,17 @@ def unicode_name(name): replace non-ascii characters in string (e.g. for use in matplotlib) """ unicode_string = eval('u"%s"' % name) - return unicodedata.normalize('NFKD', unicode_string).encode('ascii','ignore') + return unicodedata.normalize('NFKD', unicode_string)#.encode('ascii','ignore') def check_bools(arg, update_log_txt = True, default=None): """ converts commandline arguments into boolean """ - # convert valid arguments - if str(arg).lower() == "y" or str(arg) == "1": + if str(arg).lower() == "y" or str(arg) == "1": return True - elif str(arg).lower() == "n" or str(arg) == "0": + elif str(arg).lower() == "n" or str(arg) == "0": return False # use default in case of invalid argument @@ -1037,13 +1038,13 @@ def check_bools(arg, update_log_txt = True, default=None): try: logprint("using default for " + str(arg)) except: - print "using default for " + str(arg) + print("using default for " + str(arg)) return default def create_color_list(number, color_map=None, logging=False, max_grey="#595959"): """ - create color list with given number of entries - grey by default, matplotlib color_map can be provided + create color list with given number of entries + grey by default, matplotlib color_map can be provided """ try: @@ -1052,8 +1053,8 @@ def create_color_list(number, color_map=None, logging=False, max_grey="#595959") # get descrete color list from pylab cmaplist = [cmap(i) for i in range(cmap.N)] # extract colors from map # determine positions for number of colors required - steps = (len(cmaplist)-1)/(number) - numbers = range(0, len(cmaplist), 
steps) + steps = (len(cmaplist)-1) // (number) + numbers = list(range(0, len(cmaplist), steps)) # extract color and convert to hex code colors = [] @@ -1065,15 +1066,15 @@ def create_color_list(number, color_map=None, logging=False, max_grey="#595959") # grey except: if not color_map == None: - logprint("Invalid color_map (%s) provided! - Examples: jet, Blues, OrRd, bwr,..." % color_map) + logprint("Invalid color_map (%s) provided! - Examples: jet, Blues, OrRd, bwr,..." % (color_map)) logprint("See https://matplotlib.org/users/colormaps.html\n") old_max_grey = "#373737" old_max_grey = "#444444" colors = list(Color("#FFFFFF").range_to(Color(max_grey), number)) # grey - for idx in range(len(colors)): - colors[idx] = str(colors[idx]).replace("Color ", "") + for idx in range(len(colors)): + colors[idx] = str(colors[idx]).replace("Color ", "") if "#" in colors[idx] and len(colors[idx]) != 7: - # print colors[idx] + # print(colors[idx]) colors[idx] = colors[idx] + colors[idx][-(7-len(colors[idx])):] text = "%d Colors: %s" % (len(colors), ", ".join(colors)) @@ -1095,8 +1096,8 @@ def read_seq(input_fasta, verbose=False): """ # check if file provided - if input_fasta == [] or input_fasta == "": - text = "Attention: No valid file names provided: >%s<" % input_fasta + if len(input_fasta) == 0: + text = "Attention: No valid file names provided: >%s<" % (input_fasta) logprint(text, start=False, printing=True) return {}, [] @@ -1105,11 +1106,11 @@ def read_seq(input_fasta, verbose=False): # concatenate fasta files if len(input_fasta) > 1: if verbose: - print "concatenating fastas...", + print("concatenating fastas...", end=" ") text = "concatenating fastas..." 
input_fasta_combi = concatenate_files(input_fasta) if verbose: - print "done" + print("done") text += "done" logprint(text, start=False, printing=False) else: @@ -1119,10 +1120,10 @@ def read_seq(input_fasta, verbose=False): # read sequences if verbose: - print "reading fasta...", + print("reading fasta...", end=" ") text = "reading fasta...", try: - seq_dict = SeqIO.index(input_fasta_combi, "fasta") + seq_dict = SeqIO.index(input_fasta_combi, "fasta") except ValueError: logprint("Error reading fasta sequences - please check input files, e.g. for duplicate names!") return {}, [] @@ -1131,7 +1132,7 @@ def read_seq(input_fasta, verbose=False): return {}, [] if verbose: - print "done" + print("done") text += "done" logprint(text, start=False, printing=False) @@ -1183,8 +1184,8 @@ def read_gff_color_config(gff_color_config_file=""): text = "Updating GFF color configuration with custom specifications\n" logprint(text, start=False, printing=True) - # read custom gff_color_config_file - in_file = open(gff_color_config_file, 'rb') + # read custom gff_color_config_file + in_file = open(gff_color_config_file, 'r') overwritten = set([]) for line in in_file: if not line.startswith("#") and len(line.strip().split("\t")) >= 4: @@ -1212,7 +1213,7 @@ def read_gff_color_config(gff_color_config_file=""): # track changes of predefined settings if feat in gff_feat_colors.keys(): - overwritten.add(data[0].lower()) + overwritten.add(data[0].lower()) gff_feat_colors[feat] = (color, alpha, zoom) in_file.close() @@ -1223,18 +1224,18 @@ def read_gff_color_config(gff_color_config_file=""): if verbose: # print configuration - text = "\n\nGFF color specification:\n%s\n" % (60 * ".") + text = "\n\nGFF color specification:\n%s\n" % (60*".") for item in sorted(gff_feat_colors.keys()): text += "%-30s\t%-10s\t%-5s\t%s\n" % (item, str(gff_feat_colors[item][0]), str(gff_feat_colors[item][1]), str(gff_feat_colors[item][2])) - logprint (text, printing=True) + logprint(text, printing=True) # print 
overwritting feature type specifications if len(overwritten) != 0: - text = "%d feature type specifications overwritten:" % len(overwritten) + text = "%d feature type specifications overwritten:" % (len(overwritten)) text += "\n\t"+ ", ".join(overwritten) + "\n" logprint(text, start=False, printing=True) - text = "GFF color specification updated acc. to %s\n\t%s\n\n" % (gff_color_config_file, ", ".join(gff_feat_colors)) + text = "GFF color specification updated acc. to %s\n\t%s\n\n" % (gff_color_config_file, ", ".join(sorted(gff_feat_colors.keys()))) logprint(text, start=False, printing=True) return gff_feat_colors @@ -1247,7 +1248,7 @@ def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=T if type(input_gff_files) != list: input_gff_files = [input_gff_files] - # create dictionary with seq_name as key and (type, start and stop) as value + # create dictionary with seq_name as key and (type, start and stop) as value unknown_feats = set([]) used_feats = set([]) feat_dict = {} @@ -1255,7 +1256,7 @@ def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=T text = "...reading " + input_gff logprint(text, start=False, printing=True) - in_file = open(input_gff, 'rb') + in_file = open(input_gff, 'r') for line in in_file: if not line.startswith("#") and line.strip() != "": data = line.strip().split("\t") @@ -1274,13 +1275,13 @@ def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=T else: feat_dict[data[0]].append((feat_type, int(data[3]), int(data[4]))) # feature type, start, stop if verbose: - text = "\nAnnotations for: %s\n" % ", ".join(feat_dict.keys()[:10]) + text = "\nAnnotations for: %s\n" % (", ".join(sorted(feat_dict.keys())[:10])) if len(feat_dict.keys()) > 10: text = text[:-1] + ", ...\n" logprint(text, start=False, printing=True) in_file.close() - # print feature types without specific shading settings + # print feature types without specific shading settings if len(unknown_feats) != 0: 
text = "Missing shading specification for %d feature type(s):\n\t%s\n" % (len(unknown_feats), ", ".join(sorted(unknown_feats))) logprint(text, start=False, printing=True) @@ -1299,7 +1300,7 @@ def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=T return feat_dict def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, verbose=False): - input_file = open(matrix_file_name, 'rb') + input_file = open(matrix_file_name, 'r') # read sequence names from first column names = [] @@ -1309,20 +1310,20 @@ def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, v logprint("Delimiter '%s': %d names - %s\n" % (delim, len(names), ", ".join(names))) # check if names were found - otherwise try another delimiter - if names == [] and not recursion: + if len(names) == 0 and not recursion: if delim == "\t": new_delim = "," else: new_delim = "\t" - logprint("\nMatrix file not containing data delimited by '%s' - trying to read matrix with delimiter '%s'" % (delim.replace("\t", "\\t"), new_delim)) + logprint("\nMatrix file not containing data delimited by '%s' - trying to read matrix with delimiter '%s'" % (delim.replace("\t", "\\t"), new_delim)) info_dict = read_matrix(matrix_file_name, delim=new_delim, symmetric=symmetric, recursion=True, verbose=verbose) return info_dict - elif names == []: + elif len(names) == 0: logprint("Empty matrix file with alternative delimiter!") return info_dict input_file.close() - input_file = open(matrix_file_name, 'rb') + input_file = open(matrix_file_name, 'r') # read matrix entries as values in dictionary with tuple(names) as key info_dict = {} contradictory_entries = [] @@ -1330,11 +1331,10 @@ def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, v if not line.startswith("#") and not line.startswith(delim) and delim in line: data = line.strip().split(delim) for idx in range(len(data[1:])): - # print tuple(sorted([data[0], names[idx]])), data[idx+1] if 
symmetric: key = tuple(sorted([names[idx], data[0]])) else: - key = tuple(names[idx], data[0]) + key = tuple([names[idx], data[0]]) if key in info_dict.keys(): if symmetric and info_dict[key] != data[idx+1] and data[idx+1] not in ["", "-"] and info_dict[key] not in ["", "-"]: contradictory_entries.append(key) @@ -1352,7 +1352,7 @@ def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, v logprint(log_txt) logprint("Using value from bottom left triangle!") if verbose: - logprint("\nMatrix information for Sequences named: " % ", ".join(names)) + logprint("\nMatrix information for Sequences named: " % (", ".join(names))) return info_dict @@ -1365,9 +1365,9 @@ def concatenate_files(file_list, combi_filename="temp_combined.fasta", verbose=F for item in file_list: if verbose: text += item + " " - print item, + print(item, end=" ") # read in_file linewise and write to out_file - in_file = open(item, 'rb') + in_file = open(item, 'r') for line in in_file: out_file.write(line.strip()+"\n") in_file.close() @@ -1387,7 +1387,7 @@ def degap_fasta(input_fasta): input_fasta = list(input_fasta) for input_fas in input_fasta: output_fas = input_fas[:input_fas.rfind(".")] + "_degapped.fas" - in_file = open(input_fas, 'rb') + in_file = open(input_fas, 'r') out_file = open(output_fas, 'w') for line in in_file: if line.startswith(">"): @@ -1410,13 +1410,13 @@ def legend_figure(colors, lcs_shading_num, type_nuc=True, unit="%", filetype="pn if filetype not in ["png", "pdf", "svg"]: text = "Provide valid file type - png, pdf, or svg" logprint(text, start=False, printing=True) - filetype="png" + filetype="png" # check if length of information fit - if not gff_legend and ((bins != [] and len(colors) != lcs_shading_num+1) or (bins != [] and len(colors) != len(bins)+1)): - if bins != [] and len(colors) != lcs_shading_num+1: + if not gff_legend and ((len(bins) != 0 and len(colors) != lcs_shading_num+1) or (len(bins) != 0 and len(colors) != len(bins)+1)): + if len(bins) != 
0 and len(colors) != lcs_shading_num+1: text = "**Attention**\nlcs_shading_num (%d) does not match number of colors (%d)!\n"% (lcs_shading_num, len(bins)) - elif bins != [] and len(colors) != len(bins)+1: + elif len(bins) != 0 and len(colors) != len(bins)+1: text = "**Attention**\nnumber of LCS length bins (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) logprint(text, start=False, printing=True) elif gff_legend and len(bins) != len(colors): @@ -1424,12 +1424,12 @@ def legend_figure(colors, lcs_shading_num, type_nuc=True, unit="%", filetype="pn logprint(text, start=False, printing=True) # set alpha values to opaque if none are provided - if alphas == []: + if len(alphas) == 0: for item in colors: alphas.append(1) # legend data points - data_points = range(len(colors)) + data_points = list(range(len(colors))) if not gff_legend: # specify intervals, if max_len provided @@ -1444,9 +1444,9 @@ def legend_figure(colors, lcs_shading_num, type_nuc=True, unit="%", filetype="pn for idx in range(lcs_shading_num): len_pos.append(float("%.2f" % (len_pos[-1] + len_interval_size))) - if prefix.startswith("custom-matrix") and (0 <= max_len <= 100 and 0 <= min_len <= 100): + if "custom-matrix" in prefix.lower() and (0 <= max_len <= 100 and 0 <= min_len <= 100): unit = "%" - elif prefix.startswith("custom-matrix"): + elif "custom-matrix" in prefix.lower(): unit = "" text = "\n%d Legend intervals from %.2f to %.2f: \n\t%s - number: %d, step: %.2f, unit: %s\n" % (lcs_shading_num+1, min_len, max_len, str(len_pos), len(len_pos), len_interval_size, unit) @@ -1463,7 +1463,7 @@ def legend_figure(colors, lcs_shading_num, type_nuc=True, unit="%", filetype="pn pos.append(float("%.2f" % (pos[-1] + len_interval_size))) # interval_size = 100 // lcs_shading_num - # pos = range(interval_size, 101+interval_size, interval_size) + # pos = list(range(interval_size, 101+interval_size, interval_size)) # remove unneccessary zeros in decimal places (i.e. 
if x.x00 in all entries) while True: @@ -1498,13 +1498,13 @@ def legend_figure(colors, lcs_shading_num, type_nuc=True, unit="%", filetype="pn if "." in str(pos[idx]): rounded_unit = True # round values up to next integer (keep integer, if not a fraction) - pos[idx] = int(pos[idx] / 1) + int(pos[idx] % 1 > 0) + pos[idx] = int(pos[idx] // 1) + int(pos[idx] % 1 > 0) if idx == len(pos) - 1 and pos[idx] == 101: pos[idx] = 100 if rounded_unit: logprint("Fractions not permitted for unit '%s': %s -> %s" % (unit, temp_pos, pos)) - if bins != []: # labels provided + if len(bins) != 0: # labels provided legend_labels = bins[:] legend_labels.append("max") legend_labels_lengths = [] @@ -1540,28 +1540,28 @@ def legend_figure(colors, lcs_shading_num, type_nuc=True, unit="%", filetype="pn if gff_legend: label_text = bins[:] edge_col = None - legend_file_name = "GFF_Shading_Legend_n%d." % lcs_shading_num + filetype - elif max_len != None: + legend_file_name = "Legend_GFF_Shading_n%d.%s" % (lcs_shading_num, filetype) + elif max_len != None: label_text = legend_labels_lengths[:] edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_max%d%s_n%d." % (max_len, unit, lcs_shading_num+1) + filetype - elif bins != []: + legend_file_name = "Legend_LCS_Shading_max%d%s_n%d.%s" % (max_len, unit, lcs_shading_num+1, filetype) + elif len(bins) != 0: label_text = legend_labels_lengths[:] edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%d%s_n%d." % (bins[0], unit, lcs_shading_num+1) + filetype + legend_file_name = "Legend_LCS_Shading_%d%s_n%d.%s" % (bins[0], unit, lcs_shading_num+1, filetype) else: label_text = legend_labels[:] edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%%len_n%d." 
% (lcs_shading_num+1) + filetype + legend_file_name = "Legend_LCS_Shading_%%len_n%d.%s" % (lcs_shading_num+1, filetype) if prefix != None and prefix != "": - if not prefix.endswith("-"): - prefix = prefix + "-" - legend_type = "LCS" - if prefix.startswith("custom-matrix"): + if prefix[-1] not in ["-", "_"]: + prefix = prefix + "_" + if "custom-matrix" in prefix.lower(): prefix = prefix.replace("custom-matrix", "")[1:] - legend_type = "CustomMatrix" - legend_file_name = prefix + legend_file_name.replace("LCS", legend_type) + legend_file_name = prefix + legend_file_name.replace("LCS", "CustomMatrix") + elif "GFF" in legend_file_name or "LCS" in legend_file_name: + legend_file_name = prefix + legend_file_name # plot legend figure fig, ax = P.subplots(3, 1, figsize=(len(colors)*2, len(colors)*2)) @@ -1602,15 +1602,15 @@ def wobble_replacement(sequence, general_ambiguity_code, verbose=False): (only residues considered that are keys in wobble_dictionary) """ - # get positions of ambiguous residues + # get positions of ambiguous residues wobble_pos = [] for idx in range(len(sequence)): letter = sequence[idx] if letter in general_ambiguity_code.keys(): wobble_pos.append(idx) - if verbose: - text = "\t%d wobbles" % len(wobble_pos) + if verbose: + text = "\t%d wobbles" % (len(wobble_pos)) logprint(text, start=False, printing=True) # replace one wobble through each iteration by all possible residues @@ -1618,7 +1618,7 @@ def wobble_replacement(sequence, general_ambiguity_code, verbose=False): kmer_variants = [sequence] while True: if verbose: - text = "\t\t%d kmer variants" % len(kmer_variants) + text = "\t\t%d kmer variants" % (len(kmer_variants)) logprint(text, start=False, printing=True) temp_kmers = set([]) for kmer in kmer_variants: @@ -1635,10 +1635,10 @@ def wobble_replacement(sequence, general_ambiguity_code, verbose=False): if letter in general_ambiguity_code.keys(): wobble = True break - if wobble: + if wobble: break kmer_variants = set(list(temp_kmers)[:]) - if not 
wobble: + if not wobble: break return kmer_variants @@ -1679,11 +1679,13 @@ def lcs_from_x_values(x_values): # Matching Functions # ############################### -def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): +def find_match_pos_diag(seq1, seq2, wordsize, rc_option=True, narrow_diagonal_interval=False, report_lcs=False, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): """ find all matching positions with matches >= wordsize convert matching points into lines of the length of the match (+ optional handling of ambiguities) + + if narrow_diagonal_interval, only consider diagonal lines within a certain distance (narrow_diagonal_interval) to the middle diagonal """ global t1 # timer @@ -1693,7 +1695,7 @@ def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, else: any_residue = "X" - # read sequences + # read sequences seq_one = seq1.upper(); len_one = len(seq_one) seq_two = seq2.upper(); len_two = len(seq_two) @@ -1741,7 +1743,6 @@ def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, else: kmer_variants = wobble_replacement(kmer, general_ambiguity_code) for new_kmer in kmer_variants: - # print "\t", new_kmer try: kmer_pos_dict[new_kmer].append(i) except KeyError: @@ -1752,29 +1753,71 @@ def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, matches_rc = set(kmer_pos_dict_three).intersection(kmer_pos_dict_four) # reverse complement if verbose: - text = "[matches: %i for; %.i rc]" % (len(matches_for), len(matches_rc)) + if rc_option: + text = "[matches: %d for; %d rc]" % (len(matches_for), len(matches_rc)) + else: + text = "[matches: %d for; no rc]" % (len(matches_for)) logprint(text, start=False, printing=True) + if narrow_diagonal_interval not in [None, "", 0, False]: + rc_threshold = narrow_diagonal_interval // 2 + (narrow_diagonal_interval % 2) + # create 
lists of x and y co-ordinates for scatter plot # keep all coordinates of all shared kmers (may match multiple times) diag_dict_for = {} diag_dict_rc = {} - for (match_list, pos_dict1, pos_dict2, diag_dict) in [(matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for), - (matches_rc, kmer_pos_dict_three, kmer_pos_dict_four, diag_dict_rc)]: + for (match_list, pos_dict1, pos_dict2, diag_dict, diag_ori) in [(matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for, "for"), + (matches_rc, kmer_pos_dict_three, kmer_pos_dict_four, diag_dict_rc, "rc")]: for kmer in match_list: for i in pos_dict1[kmer]: for j in pos_dict2[kmer]: diag = i-j - points = set(range(i+1, i+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points + # no narrow window or forward within narrow window + if narrow_diagonal_interval in [None, "", 0, False] or (diag_ori == "for" and (abs(diag) <= narrow_diagonal_interval)): + lower_bound = i+1 + upper_bound = i+1+wordsize + update_dict = True + # recalculate values for reverse if overlapping window + elif diag_ori == "rc": + lower_bound = i+1 + upper_bound = i+1+wordsize + + min_val = int((len_two + diag)//2 + (len_two + diag)%2 - rc_threshold + 1) + max_val = int((len_two + diag)//2 + rc_threshold + 1) + + # check, if lower bounds within narrowed window + check1 = min_val <= lower_bound <= max_val + check2 = min_val <= upper_bound <= max_val + + # both bounds within window + if check1 and check2: + update_dict = True + # lower_bound in window - upper_bound not + elif check1: + upper_bound = max_val + update_dict = True + # upper_bound in window - lower_bound not + elif check2: + lower_bound = min_val + update_dict = True + else: + update_dict = False else: - diag_dict[diag].update(points) + update_dict = False + + # calculate x values for plotting + if update_dict: + points = set(range(lower_bound, upper_bound)) + if len(points) != 0: + if not diag in diag_dict.keys(): + diag_dict[diag] = points + else: + 
diag_dict[diag].update(points) # convert coordinate points to line start and stop positions x1 = [] # x values reverse y1 = [] # y values forward - for diag in diag_dict_for.keys(): + for diag in sorted(diag_dict_for.keys()): x_values = np.array(sorted(diag_dict_for[diag])) x1.extend(split_diagonals(x_values)) y_values = split_diagonals(x_values - diag) @@ -1783,8 +1826,8 @@ def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, x2 = [] # x values rc y2 = [] # y values rc if rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 + for diag in sorted(diag_dict_rc.keys()): + factor = len_two + diag + 1 x_values = np.array(sorted(diag_dict_rc[diag])) x2.extend(split_diagonals(x_values)) y_values = split_diagonals(factor - x_values, -1) @@ -1794,31 +1837,34 @@ def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, t1 = time_track(t1) if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) + return np.array([np.array(x) for x in x1], dtype=object), np.array([np.array(y) for y in y1], dtype=object), np.array([np.array(x) for x in x2], dtype=object), np.array([np.array(y) for y in y2], dtype=object) else: # get length of longest common substring based on match lengths lcs_for = lcs_from_x_values(x1) lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev + return np.array([np.array(x) for x in x1], dtype=object), np.array([np.array(y) for y in y1], dtype=object), np.array([np.array(x) for x in x2], dtype=object), np.array([np.array(y) for y in y2], dtype=object), lcs_for, lcs_rev -def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): + +def find_match_pos_regex(seq1, seq2, wordsize, rc_option=True, narrow_diagonal_interval=False, report_lcs=False, convert_wobbles=False, max_N_percentage=49, 
type_nuc=True, verbose=False, substitution_count=0): """ find all matching positions with matches >= wordsize via regular expression search - fuzzy matching - allow up to substitution_count substitutions + fuzzy matching - allow up to substitution_count substitutions convert matching points into lines of the length of the match (+ optional handling of ambiguities) + + if narrow_diagonal_interval, only consider diagonal lines within a certain distance (narrow_diagonal_interval) to the middle diagonal """ global t1 # timer - # read sequences + # read sequences seq_one = seq1.upper(); len_one = len(seq_one) seq_two = seq2.upper(); len_two = len(seq_two) # set ambiguity code for wobble replacement general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - ambiguity_match_dict = alphabets(type_nuc)[3] + ambiguity_match_dict = alphabets(type_nuc)[3] - ambiq_residues = "[%s]" % "".join(general_ambiguity_code.keys()) + ambiq_residues = "[%s]" % ("".join(sorted(general_ambiguity_code.keys()))) # look for Ns in DNA or Xs in proeins (minimum word size) if type_nuc == True: @@ -1837,24 +1883,27 @@ def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs= diag_dict_rc = {} counter = [0, 0] + if narrow_diagonal_interval not in [None, "", 0, False]: + rc_threshold = narrow_diagonal_interval // 2 + (narrow_diagonal_interval % 2) + # one-way matching if rc_option: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0), - (str(seq_one), str(seq_two.reverse_complement()), diag_dict_rc, 1)] + data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0, "for"), + (str(seq_one), str(seq_two.reverse_complement()), diag_dict_rc, 1, "rc")] else: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0)] + data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0, "for")] - for seq_query, seq_target, diag_dict, counter_pos in data_list: + for (seq_query, seq_target, diag_dict, counter_pos, diag_ori) in 
data_list: # split query sequence into kmers if not rc_option and counter_pos == 1: - break + break for idx in range(len(str(seq_query))-wordsize+1): kmer = str(seq_query)[idx:idx+wordsize] # skip excessive N/X stretches (big black areas) if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - # convert kmer to regular expression for wobble_matching + # convert kmer to regular expression for wobble_matching if convert_wobbles and wobble_found: kmer_string = "" # replace each residue with matching residues or wobbles @@ -1881,11 +1930,48 @@ def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs= # skip excessive N/X stretches (big black areas) if kmer2.count(any_residue)*100./wordsize <= max_N_percentage: diag = idx-(kdx+result.start()) - points = set(range(idx+1, idx+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points + + # no narrow window or forward within narrow window + if narrow_diagonal_interval in [None, "", 0, False] or (diag_ori == "for" and (abs(diag) <= narrow_diagonal_interval)): + lower_bound = idx+1 + upper_bound = idx+1+wordsize + update_dict = True + # recalculate values for reverse if overlapping window + elif diag_ori == "rc": + lower_bound = idx+1 + upper_bound = idx+1+wordsize + + min_val = int((len_two + diag)//2 + (len_two + diag)%2 - rc_threshold + 1) + max_val = int((len_two + diag)//2 + rc_threshold + 1) + + # check, if lower bounds within narrowed window + check1 = min_val <= lower_bound <= max_val + check2 = min_val <= upper_bound <= max_val + + # both bounds within window + if check1 and check2: + update_dict = True + # lower_bound in window - upper_bound not + elif check1: + upper_bound = max_val + update_dict = True + # upper_bound in window - lower_bound not + elif check2: + lower_bound = min_val + update_dict = True + else: + update_dict = False else: - diag_dict[diag].update(points) + update_dict = False + + # calculate x values for plotting + if update_dict: + points = 
set(range(lower_bound, upper_bound)) + if len(points) != 0: + if not diag in diag_dict.keys(): + diag_dict[diag] = points + else: + diag_dict[diag].update(points) kdx += result.start() + 1 if kdx >= len(seq_target): @@ -1894,14 +1980,16 @@ def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs= counter[counter_pos] += 1 if verbose: - text = "%5.i \tforward matches" % counter[0] - text += "\n%5.i \treverse complementary matches" % counter[1] + if rc_option: + text = "[matches: %d for; %d rc]" % (counter[0], counter[1]) + else: + text = "[matches: %d for; no rc]" % (counter[0]) logprint(text, start=False, printing=True) # convert coordinate points to line start and stop positions x1 = [] # x values reverse y1 = [] # y values forward - for diag in diag_dict_for.keys(): + for diag in sorted(diag_dict_for.keys()): x_values = np.array(sorted(diag_dict_for[diag])) x1.extend(split_diagonals(x_values)) y_values = split_diagonals(x_values - diag) @@ -1910,8 +1998,8 @@ def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs= x2 = [] # x values rc y2 = [] # y values rc if rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 + for diag in sorted(diag_dict_rc.keys()): + factor = len_two + diag + 1 x_values = np.array(sorted(diag_dict_rc[diag])) x2.extend(split_diagonals(x_values)) y_values = split_diagonals(factor - x_values, -1) @@ -1921,19 +2009,20 @@ def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs= t1 = time_track(t1) if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) + return np.array([np.array(x) for x in x1], dtype=object), np.array([np.array(y) for y in y1], dtype=object), np.array([np.array(x) for x in x2], dtype=object), np.array([np.array(y) for y in y2], dtype=object) else: # get length of longest common substring based on match lengths lcs_for = lcs_from_x_values(x1) lcs_rev = lcs_from_x_values(x2) - return np.array(x1), 
np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev + return np.array([np.array(x) for x in x1], dtype=object), np.array([np.array(y) for y in y1], dtype=object), np.array([np.array(x) for x in x2], dtype=object), np.array([np.array(y) for y in y2], dtype=object), lcs_for, lcs_rev + ############################### # Dot Plot Functions # ############################### -def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}): +def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, rc_option=True, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, narrow_diagonal_interval=False): """ self-against-self dotplot partially from biopython cookbook @@ -1950,19 +2039,19 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # check if at least one input sequence if len(sequences) == 0: - text = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") + text = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (60*"=", len(sequences), 40*"-") text += " No sequences provided for selfdotplot!\n\nTerminating polydotplot!" logprint(text, start=False, printing=True) return elif len(sequences) == 1 and multi: text = "\n\nCreating collage output for single selfdotplot!" 
- text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" + text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n" logprint(text, start=False, printing=True) if multi and (ncols == 0 or nrows == 0): ncols = max(ncols, 1) nrows = max(nrows, 1) - text = "\n\nSelfdotplot Collage: Invalid collage - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) + text = "\n\nSelfdotplot Collage: Invalid collage settings - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) logprint(text, start=False, printing=True) if multi and ncols > len(sequences): @@ -1970,42 +2059,47 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, nrows = 1 text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences): + elif multi and ncols*(nrows-1) > len(sequences): nrows = ((len(sequences)-1) // ncols) + 1 text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) logprint(text, start=False, printing=True) - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: + if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size // 2: label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size + text = "Reducing label size for better visualization to %d\n" % (label_size) logprint(text, start=False, printing=True) - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) + # prepare prefix and prefix for legend files, if required + if prefix != None: + if prefix[-1] not in ["-", "_"]: + prefix = 
prefix + "_" + else: + prefix = "" + + # read gff annotation data if provided for shading + if gff_files != None and len(gff_files) != 0: + legend_prefix = prefix + "Self" + text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (60*"=", len(gff_files), 40*"-", ", ".join(gff_files)) logprint(text, start=False, printing=True) - if prefix != None and prefix != "": - legend_prefix = prefix + "-Selfdotplot" - else: legend_prefix = "Selfdotplot" feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=legend_prefix, filetype=filetype, verbose=verbose) global t1 - print "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-"), - log_txt = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") + print("\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (60*"=", len(sequences), 40*"-"), end=" ") + log_txt = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (60*"=", len(sequences), 40*"-") # preparations for file name - name_graph = "Selfdotplots" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" + if not narrow_diagonal_interval: + name_graph = "Selfdotplot" else: - prefix = "" + name_graph = "NarrowSelfdotplot" suffix = "" + if substitution_count != 0: + suffix += "_S%d" % (substitution_count) if convert_wobbles: suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count + if rc_option: + suffix += "_rc" if multi: suffix += "_collage" @@ -2019,11 +2113,17 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, if multi: fig = P.figure(figsize=(figsize_x, figsize_y)) page_counter = 1 - list_of_png_names = [] + list_of_fig_names = [] + + # create multi-page pdf + if filetype == "pdf": + fig_name = "%s%s_ws%d%s.%s" % (prefix, name_graph, wordsize, suffix, filetype) + list_of_fig_names.append(fig_name) + pdf = PdfPages(fig_name) counter = 0 for seq_name in sequences: - print seq_name, + 
print(seq_name, end=" ") log_txt += " " + seq_name counter += 1 @@ -2038,11 +2138,38 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # get positions of matches if substitution_count != 0: - # print "RE" - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex(seq_one, seq_one, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) + # print("RE") + x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex(seq_one, seq_one, wordsize, rc_option=rc_option, substitution_count=substitution_count, convert_wobbles=convert_wobbles, narrow_diagonal_interval=narrow_diagonal_interval, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) else: - # print "DIAG", - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag(seq_one, seq_one, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) + # print("DIAG") + x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag(seq_one, seq_one, wordsize, rc_option=rc_option, convert_wobbles=convert_wobbles, narrow_diagonal_interval=narrow_diagonal_interval, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) + + # shade annotated regions + gff_shade_list = [] + if gff_files != None and len(gff_files) != 0: + if seq_name in feat_dict.keys(): + features = feat_dict[seq_name] + for item in features: + feat_type, start, stop = item + feat_color, strength, zoom = gff_color_dict[feat_type.lower()] + start = max(0, start - zoom - 0.5) + stop = min(length_seq+1, stop + zoom + 0.5) + width = stop - start + gff_shade_list.append(tuple([feat_type, start, width, feat_color, strength, zoom])) + + # collect lines + lines = [] + color_list = [] + for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: + if col != "white" and len(x_lines) != 0: + for 
ldx in range(len(x_lines)): + try: + lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) + color_list.append(col) + except: + print("Proceeding after error with line collection at index %d (data=%s) in Selfdotplot" % (ldx, str(x_lines[ldx]))) + color_list = np.array(color_list) + # plotting with matplotlib ################################# @@ -2052,48 +2179,31 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # plotting subplot with matplotlib ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) + # gff-based shading of annotated regions + for item in gff_shade_list: + feat_type, start, width, feat_color, strength, zoom = item + ax.add_patch(patches.Rectangle((start, start), # (x,y) + width, width, # width, height + edgecolor=None, linewidth=line_width+zoom, + fill=True, facecolor=feat_color, + alpha=strength)) # draw lines lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) ax.add_collection(lc) # format axes - # print P.xticks()[0], P.yticks()[0] + # 
print(P.xticks()[0], P.yticks()[0]) P.axis('scaled') # make images quadratic P.xlim(0, length_seq+1) if mirror_y_axis: P.ylim(0, length_seq+1) # rotate y axis (point upwards) else: P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) + P.xlabel("[%s]" % (aa_bp_unit), fontsize=label_size) + P.ylabel("[%s]" % (aa_bp_unit), fontsize=label_size) P.tick_params(axis='both', which='major', labelsize=label_size*.9) - + # # use same tick labels for x and y axis # tick_locs, tick_labels = P.yticks() # P.xticks(tick_locs) @@ -2105,21 +2215,26 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # save figure and reinitiate if page is full if counter == ncols * nrows: - # finalize layout - margins & spacing between plots + # finalize layout - margins & spacing between plots try: P.tight_layout(h_pad=.02, w_pad=.02) except: logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' % (prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') + # save figure + if filetype == "pdf": + P.suptitle("Page %.3d" % (page_counter), size=label_size*1.5, weight="bold", y=1.02) + pdf.savefig(bbox_inches='tight') + else: + # name and create output files (names derived from SEQNAME) + fig_name = "%s%s_ws%d%s-%.3d.%s" % (prefix, name_graph, wordsize, suffix, page_counter, filetype) + list_of_fig_names.append(fig_name) + P.savefig(fig_name, bbox_inches='tight') + P.close() P.cla() - list_of_png_names.append(fig_name) - counter = 0 page_counter += 1 @@ -2129,35 +2244,16 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, else: # not multi fig = P.figure(figsize=(plot_size, plot_size)) # figure size needs to be a square - ax = P.subplot(1, 1, 1) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) + ax = P.subplot(1, 1, 1) # rows, columns, plotnumber - # collect lines - lines = [] - number = 0 - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), 
(x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - - color_list = np.array(color_list) + # gff-based shading of annotated regions + for item in gff_shade_list: + feat_type, start, width, feat_color, strength, zoom = item + ax.add_patch(patches.Rectangle((start, start), # (x,y) + width, width, # width, height + edgecolor=None, linewidth=line_width+zoom, + fill=True, facecolor=feat_color, + alpha=strength)) # draw lines lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) @@ -2170,10 +2266,10 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, P.ylim(0, length_seq+1) # rotate y axis (point upwards) else: P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) + P.xlabel("[%s]" % (aa_bp_unit), fontsize=label_size) + P.ylabel("[%s]" % (aa_bp_unit), fontsize=label_size) P.tick_params(axis='both', which='major', labelsize=label_size*.9) - + # # use same tick labels for x and y axis # tick_locs, tick_labels = P.yticks() # P.xticks(tick_locs) @@ -2181,38 +2277,50 @@ def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size*1.3, fontweight='bold') - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_%s_wordsize%i%s.%s' %(prefix, name_graph, counter, shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos), wordsize, suffix, filetype) - P.savefig(fig_name, bbox_inches='tight') + # save figure + if filetype == "pdf": + P.suptitle("Page %.3d" % (counter), size=label_size*1.5, weight="bold", y=1.02) + pdf.savefig(bbox_inches='tight') + else: + # name and create output files (names derived from SEQNAME) + fig_name = "%s%s-%d_%s_ws%d%s.%s" % (prefix, name_graph, counter, shorten_name(name_seq, max_len=title_length, 
title_clip_pos=title_clip_pos), wordsize, suffix, filetype) + list_of_fig_names.append(fig_name) + P.savefig(fig_name, bbox_inches='tight') P.close() P.cla() # clear any prior graph - list_of_png_names.append(fig_name) - if multi and counter >= 1: - # finalize layout - margins & spacing between plots - try: + # finalize layout - margins & spacing between plots + try: P.tight_layout(h_pad=.02, w_pad=.02) except: logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') + # save figure + if filetype == "pdf": + P.suptitle("Page %.3d" % (page_counter), size=label_size*1.5, weight="bold", y=1.02) + pdf.savefig(bbox_inches='tight') + else: + # name and create output files (names derived from SEQNAME) + fig_name = "%s%s_ws%d%s-%.3d.%s" % (prefix, name_graph, wordsize, suffix, page_counter, filetype) + list_of_fig_names.append(fig_name) + P.savefig(fig_name, bbox_inches='tight') + P.close() P.cla() # clear any prior graph - list_of_png_names.append(fig_name) + if filetype == "pdf": + pdf.close() - print "\n\nDrawing selfdotplots done" + print("\n\nDrawing selfdotplots done") log_txt += "\n\nDrawing selfdotplots done" logprint(log_txt, start=False, printing=False) - return list_of_png_names + return list_of_fig_names -def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, x_label_pos_top=True, only_vs_first_seq=False, length_scaling=True, scale_delim_col="red"): +def pairdotplot(input_fasta, 
wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, rc_option=True, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, x_label_pos_top=True, only_vs_first_seq=False, length_scaling=True, scale_delim_col="red"): """ pairwise dotplot (all-against-all) """ @@ -2228,13 +2336,13 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # check if at least two input sequences if len(sequences) < 2: - text = "\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") + text = "\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (60*"=", len(sequences)*(len(sequences)-1)//2, 40*"-") text += " Please provide at least two sequences for pairdotplot!\n\nTerminating paired dotplot!" logprint(text, start=False, printing=True) return elif len(sequences) == 2 and multi: text = "\n\nCreating collage output for single pairdotplot!" 
- text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" + text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n" logprint(text, start=False, printing=True) if multi and (ncols == 0 or nrows == 0): @@ -2248,22 +2356,22 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, nrows = 1 text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences)*(len(sequences)-1): + elif multi and ncols*(nrows-1) > len(sequences)*(len(sequences)-1): nrows = ((len(sequences)-1) // ncols) + 1 text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) logprint(text, start=False, printing=True) if not only_vs_first_seq: - text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") + text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (60*"=", len(sequences)*(len(sequences)-1)//2, 40*"-") text += ", ".join(sequences) + "\n" else: - text = "\n%s\n\nCreating %d paired dotplot images against 1st sequence '%s':\n%s\n\n=>" % (50*"=", len(sequences)-1, sequences[0], 36*"-") + text = "\n%s\n\nCreating %d paired dotplot images against 1st sequence '%s':\n%s\n\n=>" % (60*"=", len(sequences)-1, sequences[0], 40*"-") text += ", ".join(sequences[1:]) + "\n" logprint(text, start=False, printing=True) - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: + if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size//2: label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size + text = "Reducing label size for better visualization to %d\n" % (label_size) logprint(text, start=False, printing=True) y_label_rotation = "vertical" @@ 
-2274,15 +2382,17 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # preparations for file name name_graph = "Pairdotplot" if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" + if prefix[-1] not in ["-", "_"]: + prefix = prefix + "_" else: prefix = "" suffix = "" + if substitution_count != 0: + suffix += "_S%d" % (substitution_count) if convert_wobbles: suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count + if rc_option: + suffix += "_rc" if length_scaling: suffix += "_scaled" if multi: @@ -2295,23 +2405,33 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) P.cla() # clear any prior graph - list_of_png_names = [] + list_of_fig_names = [] if multi: fig = P.figure(figsize=(figsize_x, figsize_y)) page_counter = 1 # prepare LCS data file - lcs_data_file = open("%sPairdotplot_wordsize%d_lcs_data_file%s.txt" % (prefix, wordsize, suffix.replace("_scaled", "").replace("_collage", "")), 'w') + lcs_data_file = open("%sPairdotplot_LCS_data_file_ws%d%s.txt" % (prefix, wordsize, suffix.replace("_scaled", "").replace("_collage", "")), 'w') lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") + # create multi-page pdf + if filetype == "pdf": + fig_name = "%s%s_ws%d%s.%s" % (prefix, name_graph, wordsize, suffix, filetype) + list_of_fig_names.append(fig_name) + pdf = PdfPages(fig_name) + if not mirror_y_axis: + page_num_pos = 1.10 + else: + page_num_pos = 1.02 + counter, seq_counter = 0, 0 - print "Drawing pairwise dotplot...", - log_txt = "Drawing pairwise dotplot..." + print("Drawing pairwise dotplot...", end=" ") + log_txt = "Drawing pairwise dotplot..." if verbose: seq_text = "" for idx in range(len(sequences)-1): if verbose: - print "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]), + print("\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), end=" ") seq_text += "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]) rec_two = seq_dict[sequences[idx]] name_two = rec_two.id @@ -2327,25 +2447,32 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, counter += 1 seq_counter += 1 if verbose: - print sequences[jdx], + print(sequences[jdx], end=" ") seq_text += " " + sequences[jdx] elif not seq_counter % 25: - print seq_counter, - log_txt += " " + str(seq_counter) + print(seq_counter, end=" ") + log_txt += " " + str(seq_counter) # get positions of matches if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) + # print("RE") + x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, rc_option=rc_option, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) + # print("DIAG") + x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, rc_option=rc_option, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - # write LCS data file - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") + # prevent ZeroDivisionError in case of empty sequences + 
try: + lcs_for_percentage = round(lcs_for*100./min(len_one, len_two), 3) + lcs_rev_percentage = round(lcs_rev*100./min(len_one, len_two), 3) + except: + lcs_for_percentage = "NA" + lcs_rev_percentage = "NA" + # write LCS data file + lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), + str(lcs_for), str(lcs_for_percentage), + str(lcs_rev), str(lcs_rev_percentage)]) + "\n") # plotting with matplotlib ################################# @@ -2371,10 +2498,13 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, lines = [] color_list = [] for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": + if col != "white" and len(x_lines) != 0: for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) + try: + lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) + color_list.append(col) + except: + print("Proceeding after error with line collection at index %d (data=%s) in Pairdotplot" % (ldx, str(x_lines[ldx]))) color_list = np.array(color_list) # draw lines @@ -2382,8 +2512,8 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, ax.add_collection(lc) # format axes - P.xlabel(unicode_name(shorten_name(name_one, max_len=title_length, title_clip_pos=title_clip_pos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.ylabel(unicode_name(shorten_name(name_two, max_len=title_length, title_clip_pos=title_clip_pos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) + P.xlabel(unicode_name("%s [%s]" % (shorten_name(name_one, max_len=title_length, title_clip_pos=title_clip_pos), aa_bp_unit)), fontsize=label_size, fontweight='bold', labelpad=4) + P.ylabel(unicode_name("%s [%s]" % (shorten_name(name_two, max_len=title_length, title_clip_pos=title_clip_pos), aa_bp_unit)), 
fontsize=label_size, fontweight='bold', labelpad=4) P.tick_params(axis='both', which='major', labelsize=label_size*.9) # P.axis('scaled') # make images scaled by size ### optional update ### @@ -2414,9 +2544,9 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # plot line deliminating shorter sequence if max_len != len_one: - ax.plot((len_one+1, len_one+1), (0, len_two), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") + ax.plot((len_one+1, len_one+1), (0, len_two), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") if max_len != len_two: - ax.plot((0, len_one), (len_two+1, len_two+1), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") + ax.plot((0, len_one), (len_two+1, len_two+1), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") # # use same tick labels for x and y axis # if P.xlim() == P.ylim(): @@ -2434,8 +2564,8 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # save figure and reinitiate if page is full if multi and counter == ncols * nrows: - # finalize layout - margins & spacing between plots - try: + # finalize layout - margins & spacing between plots + try: P.tight_layout(h_pad=.02, w_pad=.02) except: logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") @@ -2444,13 +2574,20 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, else: P.subplots_adjust(hspace=.5, wspace=.5, bottom=0.05) # space between rows - def 0.4 - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') + # save figure + if filetype == "pdf": + P.suptitle("Page %.3d" % (page_counter), size=label_size*1.5, weight="bold", y=page_num_pos) + pdf.savefig(bbox_inches='tight') + else: + # name and create output files + fig_name = "%s%s_ws%d%s-%.3d.%s" % (prefix, name_graph, wordsize, suffix, page_counter, filetype) + list_of_fig_names.append(fig_name) + P.savefig(fig_name, bbox_inches='tight') + P.close() P.cla() - list_of_png_names.append(fig_name) + list_of_fig_names.append(fig_name) counter = 0 page_counter += 1 @@ -2460,7 +2597,7 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # plotting separate figure files elif not multi: - # finalize layout - margins & spacing between plots + # finalize layout - margins & spacing between plots try: P.tight_layout(h_pad=.02, w_pad=.02) except: @@ -2473,14 +2610,22 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, else: P.subplots_adjust(hspace=0.02, wspace=0.02) # space between rows - def 0.4 - # name and create output files - fig_name = '%s%s-%d_wordsize%i%s.%s' % (prefix, name_graph, counter, wordsize, suffix, filetype) - P.savefig(fig_name) + # save figure + if filetype == "pdf": + P.suptitle("Page %.3d" % (counter), size=label_size*1.5, weight="bold", y=page_num_pos) + pdf.savefig(bbox_inches='tight') + else: + # name and create output files + fig_name = "%s%s-%d_ws%d%s.%s" % (prefix, name_graph, counter, wordsize, suffix, filetype) + list_of_fig_names.append(fig_name) + P.savefig(fig_name) + + # # check 
currently open figures + # print ([manager.canvas.figure for manager in matplotlib._pylab_helpers.Gcf.get_all_fig_managers()]) P.close() P.cla() - list_of_png_names.append(fig_name) - fig = P.figure() + list_of_fig_names.append(fig_name) if only_vs_first_seq: break @@ -2488,7 +2633,7 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # save figure if multi and counter >= 1: - # finalize layout - margins & spacing between plots + # finalize layout - margins & spacing between plots try: P.tight_layout(h_pad=.02, w_pad=.02) except: @@ -2498,29 +2643,39 @@ def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, else: P.subplots_adjust(hspace=0.5, wspace=0.5, bottom=0.05) # space between rows - def 0.4 - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') + # save figure + if filetype == "pdf": + P.suptitle("Page %.3d" % (page_counter), size=label_size*1.5, weight="bold", y=page_num_pos) + pdf.savefig(bbox_inches='tight') + else: + # name and create output files + fig_name = "%s%s_ws%d%s-%.3d.%s" % (prefix, name_graph, wordsize, suffix, page_counter, filetype) + list_of_fig_names.append(fig_name) + P.savefig(fig_name, bbox_inches='tight') + P.close() P.cla() - list_of_png_names.append(fig_name) + list_of_fig_names.append(fig_name) + + if filetype == "pdf": + pdf.close() if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" + print("%d done\n" % (seq_counter)) + log_txt += "%d done\n" % (seq_counter) else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter + print("\n%d done\n" % (seq_counter)) + log_txt += "\n%d done\n" % (seq_counter) logprint(log_txt, start=False, printing=False) if verbose: - print + print("") logprint(seq_text, start=False, printing=False) - return list_of_png_names + return 
list_of_fig_names -def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, x_label_pos_top=True, lcs_shading=True, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, lcs_shading_num=5, spacing=0.04, input_user_matrix_file="", user_matrix_print=True, rotate_labels=False): +def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, rc_option=True, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, x_label_pos_top=True, lcs_shading=True, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, lcs_shading_num=5, spacing=0.04, input_user_matrix_file="", user_matrix_print=True, rotate_labels=False): """ all-against-all dotplot derived from dotplot function @@ -2544,26 +2699,33 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, sequences = sorted(sequences) if len(sequences) == 0: - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") + text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (60*"=", len(sequences), len(sequences), 40*"-") text += " No sequences provided for polydotplot!\n\nTerminating polydotplot!" logprint(text, start=False, printing=True) return elif len(sequences) == 1: text = "\n\nCreating polydotplot for single sequence!" 
- text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n\n" + text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n" logprint(text, start=False, printing=True) - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") + text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (60*"=", len(sequences), len(sequences), 40*"-") text += " " + " ".join(sequences) + "\n" logprint(text, start=False, printing=True) - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) + # prepare prefix and prefix for legend files, if required + if prefix != None: + if prefix[-1] not in ["-", "_"]: + prefix = prefix + "_" + else: + prefix = "" + + if gff_files != None and len(gff_files) != 0 or lcs_shading: + legend_prefix = prefix + "Poly" + + # read gff annotation data if provided for shading + if gff_files != None and len(gff_files) != 0: + text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (60*"=", len(gff_files), 40*"-", ", ".join(gff_files)) logprint(text, start=False, printing=True) - if prefix != None and prefix != "": - legend_prefix = prefix + "-Polydotplot" - else: legend_prefix = "Polydotplot" feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=legend_prefix, filetype=filetype, verbose=verbose) if lcs_shading and not type_nuc: @@ -2574,7 +2736,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # read custom shading matrix & match names of sequences to fasta if input_user_matrix_file != "" and input_user_matrix_file != None: - logprint("Reading user matrix file: %s" % input_user_matrix_file) + logprint("Reading user matrix file: %s" % (input_user_matrix_file)) # lcs_shading_ori = 2 custom_dict = read_matrix(input_user_matrix_file) if custom_dict 
!= {}: @@ -2587,15 +2749,15 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, number_key = [] # convert number into float - try: + try: value = float(custom_dict[key]) if not "." in custom_dict[key]: value = int(custom_dict[key]) custom_max = max(custom_max, value) custom_min = min(custom_min, value) - except: + except: value = custom_dict[key] - if value == "": + if value == "": value = None invalid_entries.append(key) # match matrix names with sequence names @@ -2619,8 +2781,8 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, rounding_factor = 5 multi_factor = 100 text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (multi_factor*custom_min // rounding_factor) * (1.*rounding_factor/multi_factor)) - custom_max = min((multi_factor*custom_max // rounding_factor) * (1.*rounding_factor/multi_factor), 1) + custom_min = max(0, (multi_factor*custom_min // rounding_factor) * (rounding_factor*1./multi_factor)) + custom_max = min((multi_factor*custom_max // rounding_factor) * (rounding_factor*1./multi_factor), 1) text += "new (%.2f, >%2f)\n" % (custom_min, custom_max) elif 0 <= custom_min < 100 and 0 < custom_max <= 100: @@ -2637,23 +2799,25 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, name_graph = "Polydotplot" suffix = "" - if convert_wobbles: - suffix += "_wobbles" if substitution_count != 0: - suffix += "_S%d" % substitution_count + suffix += "_S%d" % (substitution_count) + if convert_wobbles: + suffix += "_wobbles" + if rc_option: + suffix += "_rc" if custom_shading: - suffix += "_matrix" + suffix += "_matrix" if lcs_shading: suffix += "_%dshades_ref%d_ori%s" % (lcs_shading_num+1, lcs_shading_ref, lcs_shading_ori) if "ref2" in suffix and type_nuc: - suffix = suffix.replace("ref2", "%dbp" % lcs_shading_interval_len) + suffix = suffix.replace("ref2", "%dbp" % (lcs_shading_interval_len)) 
elif "ref2" in suffix: - suffix = suffix.replace("ref2", "%daa" % lcs_shading_interval_len) - + suffix = suffix.replace("ref2", "%daa" % (lcs_shading_interval_len)) # name and create output files (names derived from SEQNAME) if prefix != None and str(prefix) != "": - prefix = str(prefix) + "-" + if prefix[-1] not in ["_", "-"]: + prefix = prefix + "_" else: prefix = "" @@ -2667,20 +2831,23 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, text = "Custom Matrix Colors: " + ", ".join(colors_2) # write lcs lengths to file - lcs_data_file = open("%sPolydotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') + if substitution_count != 0: + lcs_data_file = open("%sPolydotplot_LCS_data_file_ws%d_S%d%s.txt" % (prefix, wordsize, substitution_count, suffix), 'w') + else: + lcs_data_file = open("%sPolydotplot_LCS_data_file_ws%d%s.txt" % (prefix, wordsize, suffix), 'w') lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") # compare sequences pairwise - save lcs and line information in dictionary for plotting - data_dict = {} # keys = tuple(idx, jdx), value = x1, y1, x2, y2 (line positions) - lcs_dict = {} # keys = tuple(idx, jdx), value = length of lcs: lcs_len or (lcs_for, lcs_rev) + data_dict = {} # keys = tuple([idx, jdx]), value = x1, y1, x2, y2 (line positions) + lcs_dict = {} # keys = tuple([idx, jdx]), value = length of lcs: lcs_len or (lcs_for, lcs_rev) for_lcs_set = set([]) # keep lengths to calculate max (excluding self comparisons) rev_lcs_set = set([]) # keep lengths to calculate max (all) - text = "\nTotal plot count: %d" % (len(sequences)*(len(sequences))) - text += "\nTotal calculations: %d" % (len(sequences)*(len(sequences)+1)/2) + text = "\nTotal calculations: %d" % (len(sequences)*(len(sequences)+1)//2) + text += "\nTotal plot count: %d" % (len(sequences)*(len(sequences))) logprint(text, 
start=False, printing=True) - print "\nCalculating shared regions and lengths of longest_common_substring...", + print("\nCalculating shared regions and lengths of longest_common_substring...", end=" ") log_txt = "\nCalculating shared regions and lengths of longest_common_substring..." # determine matches and length of lcs by comparing all sequence pairs if verbose: @@ -2688,7 +2855,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, counter = 0 for idx in range(len(sequences)): if verbose: - print "\n%d\t%s vs." % ((counter+1), sequences[idx]), + print("\n%d\t%s vs." % ((counter+1), sequences[idx]), end=" ") seq_text += "\n%d\t%s vs." % ((counter+1), sequences[idx]) rec_two = seq_dict[sequences[idx]] name_two = rec_two.id @@ -2703,23 +2870,23 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, counter += 1 if verbose: - print sequences[jdx], + print(sequences[jdx], end=" ") seq_text += " " + sequences[jdx] elif len(sequences) < 5: - print "\t%s (%d %s), %s (%d %s)" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) + print("\t%s (%d %s), %s (%d %s)" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit)) log_txt += "\t%s (%d %s), %s (%d %s)\n" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) else: if not counter % 25: - print counter, + print(counter, end=" ") log_txt += str(counter) # get positions of matches & length of longest common substring based on match lengths if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) + # print("RE") + x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, rc_option=rc_option, substitution_count=substitution_count, convert_wobbles=convert_wobbles, 
max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) + # print("DIAG") + x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, rc_option=rc_option, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) data_dict[(idx, jdx)] = x1[:], y1[:], x2[:], y2[:] lcs_dict[idx, jdx] = lcs_for, lcs_rev @@ -2727,31 +2894,40 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, for_lcs_set.add(lcs_for) rev_lcs_set.add(lcs_rev) + # prevent ZeroDivisionError in case of empty sequences + try: + lcs_for_percentage = round(lcs_for*100./min(len_one, len_two), 3) + lcs_rev_percentage = round(lcs_rev*100./min(len_one, len_two), 3) + except: + lcs_for_percentage = "NA" + lcs_rev_percentage = "NA" + + # write LCS data file lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") + str(lcs_for), str(lcs_for_percentage), + str(lcs_rev), str(lcs_rev_percentage)]) + "\n") - if not verbose: - print len(sequences)*(len(sequences)+1)/2, " done\n" - log_txt += str(len(sequences)*(len(sequences)+1)/2) + " done\n" + if verbose: + print("%d done\n" % (len(sequences)*(len(sequences)+1)//2)) + log_txt += "%d done\n" % (len(sequences)*(len(sequences)+1)//2) else: - print "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - log_txt += "\n%d done" % (len(sequences)*(len(sequences)+1)/2) + print("\n%d done\n" % (len(sequences)*(len(sequences)+1)//2)) + log_txt += "\n%d done\n" % (len(sequences)*(len(sequences)+1)//2) logprint(log_txt, start=False, 
printing=False) if verbose: - logprint ("\n\nlcs_dict\n" + str(lcs_dict)) + logprint("\n\nlcs_dict\n" + str(sorted(lcs_dict.items()))) if custom_shading: - logprint ("\ncustom_dict\n" + str(custom_dict)) - logprint ("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) + logprint("\ncustom_dict\n" + str(sorted(custom_dict.items()))) + logprint("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) if verbose: - print + print("") logprint(seq_text+"\n", start=False, printing=False) if lcs_shading_ref == 2: color_bins = [] - text = "\nLCS lengh bins: " + text = "\nLCS length bins: " for idx in range(lcs_shading_num): color_bins.append(lcs_shading_interval_len*(idx+1)) text += " " + str(lcs_shading_interval_len*(idx+1)) @@ -2782,7 +2958,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, text = "Maximum LCS: %d %s" % (max_lcs, aa_bp_unit) logprint(text, start=False, printing=True) if custom_shading: - text = "Maximum custom value: %d\n" % custom_max + text = "Maximum custom value: %d\n" % (custom_max) logprint(text, start=False, printing=True) # count sequences @@ -2791,7 +2967,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # get sequence lengths to scale plot widths and heights accordingly size_ratios = [] for item in sequences: - size_ratios.append(len(seq_dict[item].seq)) + size_ratios.append(len(seq_dict[item].seq)) P.cla() # clear any prior graph # use GridSpec to resize plots according to sequence length @@ -2811,9 +2987,9 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, x_label_pos_top = False # print y labels on the right, if upper right triangle is displayed - if (representation == 1 and not mirror_y_axis) or (representation == 2 and mirror_y_axis): + if (representation == 1 and not mirror_y_axis) or (representation == 2 and mirror_y_axis): y_label_pos = 0 # last column - else: # left y label + else: # left y label y_label_pos = 1 # first 
column # determine label orientations @@ -2841,15 +3017,15 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, elif representation != 0 and lcs_shading and lcs_shading_ori == 2: # both directions in triangle logprint("\nAttention: For triangular output LCS shading for both orientations is combined to max of both orientations!\n") - print "\nDrawing polydotplot...", + print("\nDrawing polydotplot...", end=" ") log_txt = "\nDrawing polydotplot..." # draw subplots if verbose: if lcs_shading and custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "Custom matrix value", "Matrix color index", "LCS color index"]) + "\n" + lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" % (aa_bp_unit), "LCS for [%s]" % (aa_bp_unit), "Custom matrix value", "Matrix color index", "LCS color index"]) + "\n" elif lcs_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "LCS color index for", "LCS color index rev"]) + "\n" + lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" % (aa_bp_unit), "LCS for [%s]" % (aa_bp_unit), "LCS color index for", "LCS color index rev"]) + "\n" elif custom_shading: lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "Custom matrix value", "Color index for", "Color index rev"]) + "\n" @@ -2858,7 +3034,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, counter, seq_counter = 0, 0 for idx in range(len(sequences)): if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), + print("\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), end=" ") seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) rec_two = seq_dict[sequences[idx]] len_two = len(rec_two.seq) @@ -2872,10 +3048,10 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, counter += 1 seq_counter += 1 if verbose: - print sequences[jdx], + print(sequences[jdx], end=" ") seq_text += " " + sequences[jdx] elif not seq_counter % 25: - print seq_counter, + print(seq_counter, end=" ") log_txt += str(seq_counter) # optional shade background according to length of LCS and/or user matrix @@ -2923,7 +3099,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, color_idx1 = max(color_idx0, color_idx1) # top right triangle = custom value (not colored if text matrix provided) - if type(custom_value) == int or type(custom_value) == float: + if type(custom_value) == int or type(custom_value) == float: color_idx0 = int((custom_value-custom_min)*lcs_shading_num // (custom_max-custom_min)) # no color if string is proviced else: @@ -2931,29 +3107,28 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # use best LCS of both orientations for coloring triangle with two-ori-LCS if representation != 0 and lcs_shading_ori == 2: # both directions in triangle - color_idx0, color_idx1 = max(color_idx0, color_idx1), max(color_idx0, color_idx1) + color_idx0, color_idx1 = max(color_idx0, color_idx1), max(color_idx0, color_idx1) # set colors dependent on lcs dependent on orientation if lcs_shading_bool and not custom_shading: - if idx != jdx: + if idx != jdx: if lcs_shading_ori == 0: color_idx1 = color_idx0 elif lcs_shading_ori == 1: color_idx0 = color_idx1 - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx1] + background_colors[0] = colors[color_idx0] + background_colors[1] = colors[color_idx1] # for selfcomparison, only color reverse complement elif lcs_shading_ori != 0 and not custom_shading: - background_colors[0] = colors[color_idx1] + background_colors[0] = 
colors[color_idx1] # set different colors for shading by LCS + user matrix elif lcs_shading_bool and custom_shading: - # print colors, background_colors, color_idx0, color_idx1 - background_colors[0] = colors_2[color_idx0] + background_colors[0] = colors_2[color_idx0] background_colors[1] = colors[color_idx1] # set grey color range for user matrix if no LCS shading - elif custom_shading: - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx0] + elif custom_shading: + background_colors[0] = colors[color_idx0] + background_colors[1] = colors[color_idx0] if verbose: if custom_shading and lcs_shading_bool: @@ -2981,7 +3156,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, if mirror_y_axis: col_pos = sequences.index(name_two)+1 row_pos = len(sequences) - (sequences.index(name_one)+1) - counter1 = row_pos * ncols + col_pos + counter1 = row_pos * ncols + col_pos counter2 = (ncols - col_pos) * ncols + ncols - row_pos else: counter1 = counter @@ -2993,7 +3168,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, if len(counters) == 2: seq_counter += 1 if not verbose and not seq_counter % 25: - print seq_counter, + print(seq_counter, end=" ") log_txt += str(seq_counter) x_lists, y_lists, x_lists_rc, y_lists_rc = data_dict[(idx, jdx)] @@ -3008,7 +3183,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, ax = P.subplot(gs[fig_pos-1]) # rows, columns, plotnumber # shade annotated regions if gff file(s) provided - if idx == jdx and gff_files != None and gff_files != []: + if idx == jdx and gff_files != None and len(gff_files) != 0: if name_one in feat_dict.keys(): features = feat_dict[name_one] if len_two != len_one: @@ -3023,15 +3198,15 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, ax.add_patch(patches.Rectangle((start, start), # (x,y) width, width, # width, height edgecolor=None, linewidth=line_width+zoom, - 
fill=True, facecolor=feat_color, + fill=True, facecolor=feat_color, alpha=strength)) # if custom matrix value printed into upper matrix triangle, skip data plotting # text print in top triangle - if user_matrix_print and custom_shading and kdx==0 and idx!=jdx: + if user_matrix_print and custom_shading and kdx==0 and idx!=jdx: data_plotting = False # dotplot in bottom triangle - else: + else: data_plotting = True # mirror plot, if plotting below diagonal @@ -3055,22 +3230,25 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, lines = [] color_list = [] for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": + if col != "white" and len(x_lines) != 0: for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) + try: + lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) + color_list.append(col) + except: + print("Proceeding after error with line collection at index %d (data=%s) in Polydotplot" % (ldx, str(x_lines[ldx]))) color_list = np.array(color_list) # draw lines lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) + ax.add_collection(lc) # plot value provided by customer instead of dotplot - else: + else: alignment = {'horizontalalignment': 'center', 'verticalalignment': 'center'} # P.text(0.5, 0.5, custom_value, size='medium', transform=ax.transAxes, **alignment) P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, **alignment) - # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, + # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, # horizontalalignment='center', verticalalignment='center', color="black") if custom_shading: @@ -3116,7 +3294,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, else: label_dist 
= 8 - # x axis labels dependent on plot position/number + # x axis labels dependent on plot position/number if x_label_bool: # x title and labels on top or bottom P.xlabel(unicode_name(shorten_name(n1, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, rotation=x_label_rotation, verticalalignment=xvalign, horizontalalignment=xhalign, fontweight='bold', labelpad=8) # axis naming if not x_label_rotation in ["horizontal", "vertical"]: @@ -3142,7 +3320,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, else: # no x ticks on internal rows ax.axes.get_xaxis().set_visible(False) - # y axis labels dependent on plot position/number + # y axis labels dependent on plot position/number if fig_pos % ncols == y_label_pos or (ncols == 1 and nrows == 1): # y title and labels in 1st column P.ylabel(unicode_name(shorten_name(n2, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, rotation=y_label_rotation, verticalalignment=yvalign, horizontalalignment=yhalign, fontweight='bold', labelpad=label_dist) P.setp(ax.get_yticklabels(), fontsize=label_size*.9) # axis naming @@ -3163,11 +3341,11 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, ax.axes.get_yaxis().set_visible(False) if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" + print("%d done\n" % (seq_counter)) + log_txt += "%d done\n" % (seq_counter) else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter + print("\n%d done\n" % (seq_counter)) + log_txt += "\n%d done\n" % (seq_counter) logprint(log_txt, start=False, printing=False) if verbose: @@ -3176,7 +3354,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, except: pass - # finalize layout - margins & spacing between plots + # finalize layout - margins & spacing between plots P.tick_params(axis='both', which='major', labelsize=label_size*.9) try: P.tight_layout(h_pad=.02, 
w_pad=.02) @@ -3192,24 +3370,23 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, P.subplots_adjust(hspace=spacing, wspace=spacing) # space between rows - def 0.4 # save figure and close instance - fig_name = '%s%s_wordsize%i%s.%s' % (prefix, name_graph, wordsize, suffix, filetype) + fig_name = "%s%s_ws%d%s.%s" % (prefix, name_graph, wordsize, suffix, filetype) P.savefig(fig_name) P.close() P.cla() - # create figure color legend if lcs_shading: if lcs_shading_ref == 1: # percentage of shorter sequence - legend_file_name = legend_figure(colors, lcs_shading_num, unit="%", filetype=filetype, prefix=prefix) + legend_file_name = legend_figure(colors, lcs_shading_num, unit="%", filetype=filetype, prefix=legend_prefix) elif lcs_shading_ref == 2: # interval sizes - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, bins=color_bins) + legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=legend_prefix, bins=color_bins) else: # relative of maximum lcs - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, max_len=max_lcs) + legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=legend_prefix, max_len=max_lcs) if custom_shading: - custom_prefix = "custom-matrix-" + prefix - legend_file_name_custom = legend_figure(colors_2, lcs_shading_num, unit="%", filetype=filetype, prefix=custom_prefix, max_len=custom_max, min_len=custom_min) + custom_prefix = "_custom-matrix_" + legend_file_name_custom = legend_figure(colors_2, lcs_shading_num, unit="%", filetype=filetype, prefix=legend_prefix+custom_prefix, max_len=custom_max, min_len=custom_min) if lcs_shading and custom_shading: return [fig_name, legend_file_name, legend_file_name_custom] @@ -3225,7 +3402,7 @@ def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, # 
Function Call # ############################### -def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_size=10, filetype="png", type_nuc=True, convert_wobbles=False, substitution_count=0, rc_option=True, alphabetic_sorting=False, only_vs_first_seq=False, gff=None, multi=True, ncols=1, nrows=1, lcs_shading=True, lcs_shading_num=5, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, gff_color_config_file="", input_user_matrix_file="", user_matrix_print=False, length_scaling=True, title_length=50, title_clip_pos="B", spacing=0.04, max_N_percentage=49, mirror_y_axis=False, verbose=False): +def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_size=10, filetype="png", type_nuc=True, convert_wobbles=False, substitution_count=0, rc_option=True, alphabetic_sorting=False, only_vs_first_seq=False, gff=None, multi=True, ncols=1, nrows=1, narrow_diagonal_interval=False, lcs_shading=True, lcs_shading_num=5, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, gff_color_config_file="", input_user_matrix_file="", user_matrix_print=False, length_scaling=True, title_length=50, title_clip_pos="B", spacing=0.04, max_N_percentage=49, mirror_y_axis=False, verbose=False): global t1, line_col_rev @@ -3240,23 +3417,23 @@ def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_s logprint(text, start=False, printing=True) if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given:%s\n" % filetype + text = "Provide valid file type - png, pdf, or svg - given:%s\n" % (filetype) logprint(text, start=False, printing=True) filetype = "png" # read gff color config file if provided - if len(input_gff_files) != 0 and input_gff_files != None: + if input_gff_files != None and len(input_gff_files) != 0: if gff_color_config_file not in ["", None]: - text = "\n%s\n\nReading GFF color configuration file\n%s\n\n=> %s\n" % (50*"=", 28*"-", gff_color_config_file) 
+ text = "\n%s\n\nReading GFF color configuration file\n%s\n\n=> %s\n" % (60*"=", 40*"-", gff_color_config_file) logprint(text, start=False, printing=True) gff_feat_colors = read_gff_color_config(gff_color_config_file) else: gff_feat_colors = {} if gff_color_config_file not in ["", None]: - text = "Please provide GFF annotation files to use configuration file", gff_color_config_file + text = "Please provide GFF annotation files to use configuration file", gff_color_config_file logprint(text, start=False, printing=True) - # if color is set to white, reverse complementary matches are skipped + # if color is set to white, reverse complementary matches are skipped if not rc_option: line_col_rev = "white" # reverse matches not calculated elif not type_nuc: @@ -3266,7 +3443,7 @@ def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_s mode_text = [] for item in modes: mode_text.append(str(item)) - text = "%s\n\nRunning plotting modes %s" % (50*"=", ", ".join(mode_text)) + text = "%s\n\nRunning plotting modes %s" % (60*"=", ", ".join(mode_text)) logprint(text, start=False, printing=True) @@ -3276,45 +3453,45 @@ def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_s # self dotplots t1 = time.time() if 0 in modes: - list_of_png_names = selfdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, gff_files=gff, gff_color_dict=gff_feat_colors, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) + list_of_fig_names = selfdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, 
convert_wobbles=convert_wobbles, rc_option=rc_option, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, gff_files=gff, gff_color_dict=gff_feat_colors, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose, narrow_diagonal_interval=narrow_diagonal_interval) t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) + if list_of_fig_names != None and len(list_of_fig_names) != 0: + text = "-> Image file(s): %s\n" % (", ".join(list_of_fig_names)) else: text = "No image files were created!\n" logprint(text, start=False, printing=True) - logprint(50*"=") + logprint(60*"=") # paired dotplots if 1 in modes: if multi: - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, multi=multi, ncols=ncols, nrows=nrows, length_scaling=length_scaling, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) + list_of_fig_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, rc_option=rc_option, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, multi=multi, ncols=ncols, nrows=nrows, length_scaling=length_scaling, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) t1 = time_track(t1) else: if not length_scaling: text = "\nPairwise dotplot with individual output files scaled by sequence length automatically!" 
logprint(text, start=False, printing=True) - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, multi=multi, ncols=ncols, nrows=nrows, length_scaling=True, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) + list_of_fig_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, rc_option=rc_option, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, multi=multi, ncols=ncols, nrows=nrows, length_scaling=True, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) + if list_of_fig_names != None and len(list_of_fig_names) != 0: + text = "-> Image file(s): %s\n" % (", ".join(list_of_fig_names)) else: text = "No image files were created!\n" logprint(text, start=False, printing=True) - logprint(50*"=") + logprint(60*"=") # all-against-all dotplot if 2 in modes: - list_of_png_names = polydotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, 
input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, spacing=spacing, gff_files=gff, gff_color_dict=gff_feat_colors, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) + list_of_fig_names = polydotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, rc_option=rc_option, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, spacing=spacing, gff_files=gff, gff_color_dict=gff_feat_colors, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) + if list_of_fig_names != None and len(list_of_fig_names) != 0: + text = "-> Image file(s): %s\n" % (", ".join(list_of_fig_names)) else: text = "No image files were created!\n" logprint(text, start=False, printing=True) - logprint(50*"=") + logprint(60*"=") - text = "\n" + 50 * "#" + "\n" + 50 * "#" + text = "\n" + 60*"#" + "\n" + 60*"#" text += "\n\nThank you for using FlexiDot!\n" logprint(text, start=False, printing=True) @@ -3329,51 +3506,49 @@ def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_s parameters = check_input(sys.argv, trial_mode=trial_mode) # read out parameters -commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype, type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, 
lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos_top, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose = parameters +commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype, type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, narrow_diagonal_interval, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos_top, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose = parameters # evtl. overwrite parameters for testing purposes in trial mode if trial_mode: - input_fasta = ["Inversionen_IDs_v2_test2.fas"] - # input_fasta = ["Inversionen_IDs_v2_test3.fas"] - # input_fasta = ["test-sequences-8.fas"] - # input_gff_files = ["Seq2_annotations.gff3"] - # input_user_matrix_file = "matrix.txt" + input_fasta = ["Test.fas"] + output_file_prefix = "#Test_" + # input_gff_files = ["Test.gff3"] + # input_user_matrix_file = "Test_matrix.txt" # user_matrix_print = True - output_file_prefix = "#Test" plot_size = 10 plotting_modes = [0,1,2] - plotting_modes = [2] - plotting_modes = [0] + + collage_output = True + length_scaling = True + + narrow_diagonal_interval = False + narrow_diagonal_interval = 20 lcs_shading = False lcs_shading = True lcs_shading_ref = 2 lcs_shading_num = 4 lcs_shading_ori = 0 lcs_shading_interval_len = 15 - wordsize = 10 wordsize = 7 - x_label_pos_top = True + wordsize = 10 filetype = "pdf" filetype = "png" - mirror_y_axis = False - mirror_y_axis = True - output_file_prefix = "#R-upper" - 
representation = 0 # both - representation = 1 # upper - representation = 2 # lower + representation = 2 # lower + representation = 1 # upper + representation = 0 # both + + x_label_pos_top = True + mirror_y_axis = False - wobble_conversion = False wobble_conversion = True - substitution_count = 0 substitution_count = 1 + substitution_count = 0 rc_option = True - rc_option = False label_size = 10 - verbose = False verbose = True if auto_fas: @@ -3398,6 +3573,5 @@ def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_s # FlexiDot Execution # ###################### -main(input_fasta, wordsize, modes=plotting_modes, prefix=output_file_prefix, plot_size=plot_size, label_size=label_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=wobble_conversion, substitution_count=substitution_count, rc_option=rc_option, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, gff=input_gff_files, multi=collage_output, ncols=m_col, nrows=n_row, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, gff_color_config_file=gff_color_config_file, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, length_scaling=length_scaling, title_length=title_length, title_clip_pos=title_clip_pos, spacing=spacing, max_N_percentage=max_N_percentage, mirror_y_axis=mirror_y_axis, verbose=verbose) - +main(input_fasta, wordsize, modes=plotting_modes, prefix=output_file_prefix, plot_size=plot_size, label_size=label_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=wobble_conversion, substitution_count=substitution_count, rc_option=rc_option, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, gff=input_gff_files, multi=collage_output, ncols=m_col, nrows=n_row, narrow_diagonal_interval=narrow_diagonal_interval, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, 
lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, gff_color_config_file=gff_color_config_file, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, length_scaling=length_scaling, title_length=title_length, title_clip_pos=title_clip_pos, spacing=spacing, max_N_percentage=max_N_percentage, mirror_y_axis=mirror_y_axis, verbose=verbose) diff --git a/code/flexidot_v1.00.py b/code/flexidot_v1.00.py deleted file mode 100644 index 299780e..0000000 --- a/code/flexidot_v1.00.py +++ /dev/null @@ -1,3129 +0,0 @@ -#!/usr/bin/python2.7 -#!/usr/bin/python2.7 -# -*- coding: utf-8 -*- - -""" -FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation - -Kathrin M. Seibt, Thomas Schmidt and Tony Heitkam -Institute of Botany, TU Dresden, Dresden, 01277, Germany - -(2018) - -version 1.0 -= version 0.121 -""" - - -############################### -# Requirements # -############################### - -# import system modules -import os, glob -import time, datetime -import sys -import shutil, getopt -import unicodedata - -def module_install_command(module_name, upgrade=False): - """ - create installation commands for Python modules and print information - """ - if upgrade: - load_command = "python -m pip install --upgrade %s" % module_name - else: - load_command = "python -m pip install %s" % module_name - - try: - logprint("Installing Python module: %s\n\t%s\n" % (module_name, load_command)) - except: - print "Installing Python module: %s\n\t%s\n" % (module_name, load_command) - - return load_command - -def load_modules(): - """ - load Python modules, if possible - otherwise try to install them - """ - - # make module names global - global cllct, gridspec, patches, rcParams, mplrc, P, Color, SeqIO, np, ccv, mcolors, rgb2hex, regex - - # matplotlib - try: - import matplotlib.collections as cllct - except: - command = module_install_command("matplotlib", 
upgrade=True) - try: - os.system(command) - print "\n" - import matplotlib.collections as cllct - except: - print "Please install module matplotlib manually" - from matplotlib.colors import colorConverter as ccv - import matplotlib.colors as mcolors - import matplotlib.gridspec as gridspec - import matplotlib.patches as patches - import pylab as P - - # specify matplotlib font settings - from matplotlib import rc as mplrc - mplrc('pdf', fonttype=42, compression=0) - from matplotlib import rcParams - rcParams['font.family'] = 'sans-serif' - rcParams['font.sans-serif'] = ['Helvetica', 'Verdana', 'Tahoma', ] - - # colour for color gradient palette - try: - from colour import Color - except: - command = module_install_command("colour") - try: - os.system(command) - print "\n" - from colour import Color - except: - print "Please install module colour manually" - - # color converter - try: - from colormap import rgb2hex - except: - command = module_install_command("colormap") - # additional module easydev.tools required by colormap - command2 = module_install_command("easydev") - try: - os.system(command) - os.system(command2) - print "\n" - from colormap import rgb2hex - except: - print "Please install module colormap manually" - - # biopython - try: - from Bio import SeqIO - except: - command = module_install_command("biopython") - try: - os.system(command) - print "\n" - from Bio import SeqIO - except: - print "Please install module biopython manually" - - # numpy - try: - import numpy as np - except: - command = module_install_command("numpy") - try: - os.system(command) - print "\n" - import numpy as np - except: - print "Please install module numpy manually" - - # regex for pattern matching - try: - import regex - except: - command = module_install_command("regex") - try: - os.system(command) - print "\n" - import regex - except: - print "Please install module regex manually" - -load_modules() - - -############################### -# Usage & Input # 
-############################### - -def usage(): - """ - usage and help - """ - - print """\n\n FLEXIDOT - ------------------------------------------------------------------- - - Version: - 1.00 - - Citation: - Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (in prep.) - "FlexiDot: Highly customizable, ambiguity-aware dotplots for visual sequence analyses" - - - General usage: - $ python flexidot.py -a [ARGUMENTS] - $ python flexidot.py -i [ARGUMENTS] - - - ARGUMENTS - ------------------------------------------------------------------- - - - INPUT/OUTPUT OPTIONS... required are [-a] OR [-i] - - -a, --auto_fas Imports all fasta files from current directory (*.fasta, *.fas, *.fa, *.fna) - Y or 1 = ON - N or 0 = OFF [default] - - -i, --in_file Input fasta file (fasta file name or comma-separated file list) - > Provide multiple files: Recall -i or provide comma-separated file names - - -o, --output_file_prefix File prefix to be added to the generated filenames [default = NONE] - - -c, --collage_output Multiple dotplots are combined in a collage - Y or 1 = ON [default] - N or 0 = OFF - - -m, --m_col Number of columns per page [default = 4] (only if --collage_output is ON) - - -n, --n_row Number of rows per page [default = 5] (only if --collage_output is ON) - - -f, --filetype Output file format - 0 = PNG [default] - 1 = PDF - 2 = SVG - - -s, --alphabetic_sorting Sort sequences alphabetically according to titles - Y or 1 = ON - N or 0 = OFF [default] - - - CALCULATION PARAMETERS... 
- - -k, --wordsize Wordsize (kmer length) for dotplot comparison [default = 7] - - -p, --plotting_mode Mode of FlexiDot dotplotting - 0 = self [default] - 1 = paired - 2 = poly (matrix with all-against-all dotplots) - > Run multiple plotting modes: Recall -p or provide comma-separated numbers - - -t, --type_nuc Type of residue is nucleotide - Y or 1 = nucleotide [default] - N or 0 = amino acid - - -w, --wobble_conversion Ambiguity handling for relaxed matching - Y or 1 = ON - N or 0 = OFF [default] - - -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching - [default = 0] - - -r, --rc_option Find reverse complementary matches (only if type_nuc=y) - Y or 1 = ON [default] - N or 0 = OFF - - - GRAPHIC FORMATTING... - - -A, --line_width Line width [default = 1] - - -B, --line_col_for Line color [default = black] - - -C, --line_col_rev Reverse line color [default = green] - - -D, --x_label_pos Position of the X-label - Y or 1 = top [default] - N or 0 = bottom - - -E, --label_size Font size [default = 10] - - -F, --spacing Spacing between all-against-all dotplots (only if --plotting_mode=2) - [default = 0.04] - - -P, --plot_size Plotsize [default = 10] - - -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) - Y or 1 = Scaling ON (axes scaled according to sequence length) - N or 0 = Scaling OFF (squared plots) [default] - - -T, --title_length Limit title length for self dotplot comparison (only if --plotting_mode=0) - [default = infinite] - - - GFF SHADING (for -p/--plotting_mode=0 only)... - - -g, --input_gff_files GFF3 file used for markup in self-dotplots - (provide multiple files: Recall -g or provide comma-separated file names) - - -G, --gff_color_config_file Tab-delimited config file for custom gff shading - column 1: feature type - column 2: color - column 3: alpha - column 4: zoom factor (for small regions) - - - LCS SHADING OPTIONS (for -p/--plotting_mode=2 only)... 
- - -x, --lcs_shading Shade subdotplot based on the length of the longest common substring (LCS) - Y or 1 = ON - N or 0 = OFF [default] - - -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) - [default = 5] - - -y, --lcs_shading_ref Reference for LCS shading - 0 = maximal LCS length [default] - 1 = maximally possible length (length of shorter sequence in pairwise comparison) - 2 = given interval sizes - DNA [default 100 bp] or proteins [default 10 aa] - see -Y - - -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) - [default for nucleotides = 50; default for amino acids = 10] - - -z, --lcs_shading_ori Shade subdotplots according to LCS on - 0 = forward [default], - 1 = reverse, or - 2 = both strands (forward shading above diagonal, reverse shading on diagonal and below; - if using --input_user_matrix_file, best LCS is used below diagonal) - - - CUSTOM USER MATRIX SHADING OPTIONS (for -p/--plotting_mode=2 only)... - - -u, --input_user_matrix_file Shading above diagonal according to values in matrix file specified by the user - (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n - e.g. identity matrix from multiple sequence alignment - strings are ignored) - - -U, --user_matrix_print Display provided matrix entries in the fields above diagonal of all-against-all dotplot - Y or 1 = ON - N or 0 = OFF [default] - - - OTHERS... 
- - -h, --help Help screen - - -v, --verbose Verbose - - - """ - -def check_input(argv, trial_mode=False): - """ - commandline argument parsing - """ - - global log_txt, aa_bp_unit - - # helpers for argument parsing - ###################################### - - arguments = ["-a", "--auto_fas", "a:", "auto_fas=", - "-i", "--input_fasta", "i:", "input_fasta=", - "-o", "--output_file_prefix", "o:", "output_file_prefix=", - "-c", "--collage_output", "c:", "collage_output=", - "-m", "--m_col", "m:", "m_col=", - "-n", "--n_row", "n:", "n_row=", - "-f", "--filetype", "f:", "filetype=", - "-t", "--type_nuc", "t:", "type_nuc=", - "-g", "--input_gff_files", "g:", "input_gff_files", - "-G", "--gff_color_config_file", "G:", "gff_color_config_file", - "-k", "--wordsize", "k:", "wordsize=", - "-p", "--plotting_mode", "p:", "plotting_mode=", - "-w", "--wobble_conversion", "w:", "wobble_conversion=", - "-S", "--substitution_count", "S:", "substitution_count=", - "-r", "--rc_option", "r:", "rc_option=", - "-s", "--alphabetic_sorting", "s:", "alphabetic_sorting", - "-x", "--lcs_shading", "x:", "lcs_shading=", - "-X", "--lcs_shading_num", "X:", "lcs_shading_num=", - "-y", "--lcs_shading_ref", "y:", "lcs_shading_ref=", - "-Y", "--lcs_shading_interval_len", "Y:", "lcs_shading_interval_len=", - "-z", "--lcs_shading_ori", "z:", "lcs_shading_ori=", - "-u", "--input_user_matrix_file", "u:", "input_user_matrix_file=", - "-U", "--user_matrix_print", "U:", "user_matrix_print=", - "-P", "--plot_size", "P:", "plot_size=", - "-A", "--line_width", "A:", "line_width=", - "-B", "--line_col_for", "B:", "line_col_for=", - "-C", "--line_col_rev", "C:", "line_col_rev=", - "-D", "--x_label_pos", "D:", "x_label_pos=", - "-E", "--label_size", "E:", "label_size=", - "-F", "--spacing", "F:", "spacing=", - "-L", "--length_scaling", "L:", "length_scaling=", - "-T", "--title_length", "T:", "title_length=", - "-h", "--help", "h", "help", - "-v", "--verbose", "v", "verbose"] - - arguments_sysargv = 
tuple(arguments[0::4] + arguments[1::4]) - arguments_opts = "".join(arguments[2::4]) - arguments_args = arguments[3::4] - - - # setting defaults - ###################################### - - auto_fas = False # 0 - input_fasta = [] - output_file_prefix = None - collage_output = True # 1 - m_col = 4 - n_row = 5 - filetype = 0 - type_nuc = True - input_gff_files = [] - gff_color_config_file = "" - - wordsize = 7 - plotting_modes = [0] - wobble_conversion = False # 0 - substitution_count = 0 - rc_option = True # 1 - alphabetic_sorting = False # 0 - - lcs_shading = False # 0 - lcs_shading_num = 4 - lcs_shading_ref = 0 - lcs_shading_interval_len = 50 # interval default changes to "10" for amino acids [type_nuc = n] - lcs_shading_ori = 0 - - input_user_matrix_file = "" - user_matrix_print = False - - plot_size = 10 - line_width = 1 - line_col_for = "black" - line_col_rev = "#009243" - x_label_pos = True # 0 - label_size = 10 - spacing = 0.04 - length_scaling = False # 0 - title_length = float("Inf") - - aa_bp_unit = "bp" - - verbose = False # 0 - - filetype_dict = {0: "png", 1: "pdf", 2: "svg"} - lcs_shading_ref_dict = {0: "maximal LCS length", 1: "maximally possible length", 2: "given interval sizes"} - plotting_mode_dict = {0: "self", 1: "paired", 2: "all-against-all"} - lcs_shading_ori_dict = {0: "forward", 1: "reverse complement", 2: "both"} - - # return default parameters for testing purposes - if trial_mode: - print "ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n" - - commandline = "trial_mode\n" - - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, 
label_size, spacing, length_scaling, title_length, verbose] - return parameters - - - # read arguments - ###################################### - - commandline = "" - for arg in sys.argv: - commandline += arg + " " - - log_txt = "\n...reading input arguments..." - print log_txt - - if len(sys.argv) < 2: - print "\nERROR: More arguments are needed. Exit..." - log_txt += "\nERROR: More arguments are needed. Exit..." - usage() - sys.exit() - - elif sys.argv[1] not in arguments_sysargv: - print "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - log_txt += "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - # usage() - sys.exit() - - try: - opts, args = getopt.getopt(sys.argv[1:], arguments_opts, arguments_args) - - except getopt.GetoptError: - print "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - log_txt += "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - # usage() - sys.exit() - - for opt, arg in opts: - - if opt in ("-h", "--help"): - print "...fetch help screen" - log_txt += "\n...fetch help screen" - usage(), sys.exit() - - if opt in ("-v", "--verbose"): - print "...verbose output" - log_txt += "\n...verbose output" - verbose = True - - elif opt in ("-i", "--input_fasta"): - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: fasta_file '%s' was not found!" % str(temp_file) - sys.exit(message) - else: - input_fasta.append(str(temp_file)) - print "fasta file #%i: %s" % (len(input_fasta), str(temp_file)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: fasta_file '%s' was not found!" 
% str(arg) - log_txt += message - sys.exit(message) - else: - input_fasta.append(str(arg)) - print "fasta file #%i: %s" % (len(input_fasta), str(arg)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(arg)) - - - elif opt in ("-a", "--auto_fas"): - auto_fas = check_bools(str(arg), default=auto_fas) - - - # multiple gff files: reads them into a list - elif opt in ("-g", "--input_gff_files"): - - # append gff file only if existing - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: gff_file '%s' was not found!" % str(temp_file) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - print "GFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - input_gff_files.append(str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: gff_file '%s' was not found!" % str(arg) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - input_gff_files.append(str(arg)) - print "GFF file #%i: %s" %(len(input_gff_files), str(arg)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(arg)) - - - elif opt in ("-G", "--gff_color_config_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: gff_color_config_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot with default gff coloring specification!" - log_txt += message + "\n -->Running FlexiDot with default gff coloring specification!" - else: - gff_color_config_file = str(arg) - - - elif opt in ("-u", "--input_user_matrix_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: input_user_matrix_file '%s' was not found!" 
% str(arg) - print message + "\n -->Running FlexiDot without input_user_matrix_file %s!" % arg - log_txt += message + "\n -->Running FlexiDot withdefault matrix shading file!" - else: - input_user_matrix_file = str(arg) - - elif opt in ("-U", "--user_matrix_print"): - user_matrix_print = check_bools(str(arg), default=user_matrix_print) - - elif opt in ("-o", "--output_file_prefix"): - output_file_prefix = arg - - elif opt in ("-c", "--collage_output"): - collage_output = check_bools(str(arg), default=collage_output) - - elif opt in ("-m", "--m_col"): - try: m_col = int(arg) - except: - print "m_col - invalid argument - using default value" - log_txt += "\nm_col - invalid argument - using default value" - - elif opt in ("-n", "--n_row"): - try: n_row = int(arg) - except: - print "n_row - invalid argument - using default value" - log_txt += "\nn_row - invalid argument - using default value" - - elif opt in ("-f", "--filetype"): - if 0 <= int(arg) <= 2: - filetype = int(arg) - else: - print "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - log_txt += "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - - elif opt in ("-t", "--type_nuc"): - type_nuc = check_bools(str(arg), default=type_nuc) - - if type_nuc == False: - # interval default changed for amino acids - lcs_shading_interval_len = 10 - aa_bp_unit = "aa" - - elif opt in ("-k", "--wordsize"): - try: wordsize = int(arg) - except: - print "wordsize - invalid argument - using default value" - log_txt += "\nwordsize - invalid argument - using default value" - - elif opt in ("-p", "--plotting_mode"): - if "," in arg: - temp_modes = arg.split(",") - for item in temp_modes: - if item in ["0","1","2"]: - plotting_modes.append(int(item)) - elif arg in ["0","1","2"]: - plotting_modes = [int(arg)] - else: - print "Please provide valid plotting_modes argument - e.g. 
1 or 0,1,2 - using default [0]" - log_txt += "\nPlease provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" - - elif opt in ("-w", "--wobble_conversion"): - wobble_conversion = check_bools(str(arg), default=wobble_conversion) - - elif opt in ("-S", "--substitution_count"): - try: substitution_count = int(arg) - except: - print "substitution_count - invalid argument - using default value" - log_txt += "\nsubstitution_count - invalid argument - using default value" - - elif opt in ("-r", "--rc_option"): - rc_option = check_bools(str(arg), default=rc_option) - - elif opt in ("-s", "--alphabetic_sorting"): - alphabetic_sorting = check_bools(str(arg), default=alphabetic_sorting) - - elif opt in ("-x", "--lcs_shading"): - lcs_shading = check_bools(str(arg), default=lcs_shading) - - elif opt in ("-X", "--lcs_shading_num"): - try: lcs_shading_num = int(arg) - 1 - except: - print "lcs_shading_num - invalid argument - using default value" - log_txt += "\nlcs_shading_num - invalid argument - using default value" - - elif opt in ("-y", "--lcs_shading_ref"): - try: - if 0 <= int(arg) <= 2: - lcs_shading_ref = int(arg) - else: - print "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - log_txt += "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - except: - print "lcs_shading_ref - invalid argument - using default value" - log_txt += "\nlcs_shading_ref - invalid argument - using default value" - - elif opt in ("-Y", "--lcs_shading_interval_len"): - try: lcs_shading_interval_len = int(arg) - except: - print "lcs_shading_interval_len - invalid argument - using default value" - log_txt += "\nlcs_shading_interval_len - invalid argument - using default value" - - elif opt in ("-z", "--lcs_shading_ori"): - if 0 <= int(arg) <= 2: - lcs_shading_ori = int(arg) - else: - print "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. 
It will be set to -z 0 [default]." %(lcs_shading_ori) - log_txt += "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." %(lcs_shading_ori) - - elif opt in ("-P", "--plot_size"): - try: plot_size = float(arg) - except: - print "plot_size - invalid argument - using default value" - log_txt += "\nplot_size - invalid argument - using default value" - - - elif opt in ("-A", "--line_width"): - try: line_width = float(arg) - except: - print "line_width - invalid argument - using default value" - log_txt += "\nline_width - invalid argument - using default value" - - elif opt in ("-B", "--line_col_for"): - if mcolors.is_color_like(arg): - line_col_for = arg - else: - print "line_col_for - invalid argument - using default value" - log_txt += "\nline_col_for - invalid argument - using default value" - - elif opt in ("-C", "--line_col_rev"): - if mcolors.is_color_like(arg): - line_col_rev = arg - else: - print "line_col_rev - invalid argument - using default value" - log_txt += "\nline_col_rev - invalid argument - using default value" - - elif opt in ("-D", "--x_label_pos"): - x_label_pos = check_bools(str(arg), default=x_label_pos) - - elif opt in ("-E", "--label_size"): - try: label_size = float(arg) - except: - print "label_size - invalid argument - using default value" - log_txt += "\nlabel_size - invalid argument - using default value" - - elif opt in ("-F", "--spacing"): - try: spacing = float(arg) - except: - print "spacing - invalid argument - using default value" - log_txt += "\nspacing - invalid argument - using default value" - - elif opt in ("-L", "--length_scaling"): - length_scaling = check_bools(str(arg), default=length_scaling) - - elif opt in ("-T", "--title_length"): - try: title_length = int(arg) - except: - print "title_length - invalid argument - using default value" - log_txt += "\ntitle_length - invalid argument - using default value" - - # start logging file - logprint(commandline, start=True, 
printing=False, prefix=output_file_prefix) - logprint(log_txt, start=False, printing=False) - - - # print chosen arguments - ###################################### - - text = "\n%s\n" % (70 * "-") - text += "\n" + "INPUT/OUTPUT OPTIONS...\n" - text += "\n" + "Input fasta file: " + ", ".join(input_fasta) - text += "\n" + "Automatic fasta collection from current directory: " + str(auto_fas) - text += "\n" + "Collage output: " + str(collage_output) - text += "\n" + "Number of columns per page: " + str(m_col) - text += "\n" + "Number of rows per page: " + str(n_row) - text += "\n" + "File format: " + filetype_dict[filetype] - text += "\n" + "Residue type is nucleotide: " + str(type_nuc) - - text += "\n" + "\n\nCALCULATION PARAMETERS...\n" - text += "\n" + "Wordsize: " + str(wordsize) - text += "\n" + "Plotting mode: " + str(plotting_modes).replace("[", "").replace("]", "") + "\n" + 51 * " " - for item in plotting_modes: - text += plotting_mode_dict[item] + " " - text += "\n" + "Ambiguity handling: " + str(wobble_conversion) - text += "\n" + "Reverse complement scanning: " + str(rc_option) - text += "\n" + "Alphabetic sorting: " + str(alphabetic_sorting) - - if 0 in plotting_modes and input_gff_files != []: - text += "\n" + "Input gff files: " + ", ".join(input_gff_files) - if gff_color_config_file != "": - text += "\n" + "GFF color config file: " + gff_color_config_file - text += "\n" + "Prefix for output files: " + str(output_file_prefix) - - if 2 in plotting_modes: - text += "\n" + "\n\nLCS SHADING OPTIONS (plotting_mode 'all-against-all' only)...\n" - text += "\n" + "LCS shading: " + str(lcs_shading) - text += "\n" + "LCS shading interval number: " + str(lcs_shading_num + 1) - text += "\n" + "LCS shading reference: " + lcs_shading_ref_dict[lcs_shading_ref] - if lcs_shading_ref == 2: - text += "\n" + "LCS shading interval size [%s]: " % (aa_bp_unit) + str(lcs_shading_interval_len) - text += "\n" + "LCS shading orientation: " + lcs_shading_ori_dict[lcs_shading_ori] - 
if input_user_matrix_file != "": - text += "\n" + "Custom user shading matrix file: " + input_user_matrix_file - text += "\n" + "Print user matrix values (instead of dotplot): " + str(user_matrix_print) - - text += "\n" + "\n\nGRAPHIC FORMATTING...\n" - text += "\n" + "Plot size: " + str(plot_size) - text += "\n" + "Line width: " + str(line_width) - text += "\n" + "Line color: " + line_col_for - text += "\n" + "Reverse line color: " + line_col_rev - text += "\n" + "X label position: " + str(x_label_pos) - text += "\n" + "Label size: " + str(label_size) - text += "\n" + "Spacing: " + str(spacing) - text += "\n" + "Title length (limit number of characters): " + str(title_length) - text += "\n" + "Length scaling: " + str(length_scaling) - text += "\n%s\n" % (70 * "-") - logprint(text) - - - # collect settings - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, verbose] - - return parameters - - -############################### -# Helper Functions # -############################### - -def alphabets(type_nuc=True): - """ - provide ambiguity code for sequences - """ - - nucleotide_alphabet = ["A", "C", "G", "T"] - - nucleotide_alphabet_full = ["A", "C", "G", "T", "N", "B", "D", "H", - "V", "Y", "R", "W", "S", "K", "M"] - - nucleotide_ambiguity_code = {"N": ["A", "C", "G", "T"], # any - "B": ["C", "G", "T"], # not A - "D": ["A", "G", "T"], # not C - "H": ["A", "C", "T"], # not G - "V": ["A", "C", "G"], # not T - "Y": ["C", "T"], # pyrimidine - "R": ["A", "G"], # purine - "W": ["A", "T"], # weak - "S": ["C", 
"G"], # strong - "K": ["G", "T"], # keto - "M": ["A", "C"]} # amino - - nucleotide_match_dict = {"N": "[ACGTNBDHVYRWSKM]", # any - "B": "[CGTNBDHVYRWSKM]", # not A - "D": "[AGTNBDHVYRWSKM]", # not C - "H": "[ACTNBDHVYRWSKM]", # not G - "V": "[ACGNBDHVYRWSKM]", # not T - "K": "[GTNBDHVYRWSK]", # keto - not A,C,M - "M": "[ACNBDHVYRWSM]", # amino - not G,T,K - "W": "[ATNBDHVYRWKM]", # weak - not C,G,S - "S": "[CGNBDHVYRSKM]", # strong - not A,G,W - "Y": "[CTNBDHVYWSKM]", # pyrimidine - not A,G,R - "R": "[AGNBDHVRWSKM]", # purine - not C,T,Y - "A": "[ANDHVRWM]", - "C": "[CNBHVYSM]", - "G": "[GNBDVRSK]", - "T": "[TNBDHYWK]"} - - # nucleotide_match_dict = {"N": ".", # any - # "B": "[^A]", # not A - # "D": "[^C]", # not C - # "H": "[^G]", # not G - # "V": "[^T]", # not T - # "K": "[^ACM]", # keto - not A,C,M - # "M": "[^GTK]", # amino - not G,T,K - # "W": "[^CGS]", # weak - not C,G,S - # "S": "[^AGW]", # strong - not A,G,W - # "Y": "[^AGR]", # pyrimidine - not A,G,R - # "R": "[^CTY]", # purine - not C,T,Y - # "A": "[ANDHVRWM]", - # "C": "[CNBHVYSM]", - # "G": "[GNBDVRSK]", - # "T": "[TNBDHYWK]"} - - aminoacid_alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"] - - aminoacid_alphabet_full = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*", "J", - "Z", "B", "X"] - - aminoacid_ambiguity_code = {"J": ["I", "L"], - "Z": ["Q", "E"], - "B": ["N", "D"], - "X": ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"]} # any - - aminoacid_match_dict = {"J": "[ILJ]", - "Z": "[QEZ]", - "B": "[NDB]", - # "X": ".", - "X": "[ARNDCEQGHILKMFPSTWYVUO*XBZJ]", - "A": "[AX]", - "R": "[RX]", - "N": "[NXB]", - "D": "[DXB]", - "C": "[CX]", - "E": "[EXZ]", - "Q": "[QXZ]", - "G": "[GX]", - "H": "[HX]", - "I": "[IXJ]", - "L": "[LXJ]", - "K": "[KX]", - "M": "[MX]", - "F": "[FX]", - "P": 
"[PX]", - "S": "[SX]", - "T": "[TX]", - "W": "[WX]", - "Y": "[YX]", - "V": "[VX]", - "U": "[UX]", - "O": "[OX]", - "*": "[*X]"} - - aa_only = set(['E', 'F', 'I', 'J', 'L', 'O', 'Q', 'P', 'U', 'X', 'Z', '*']) - # return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aa_only - - if type_nuc: - return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, nucleotide_match_dict - else: - return aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aminoacid_match_dict - -def logprint(text, start=False, printing=True, prefix=""): - """ - log output to log_file and optionally print - """ - - # define log file name and open file - global log_file_name - if start and trial_mode: - log_file_name = "log_file.txt" - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - elif start: - date = datetime.date.today() - time = str(datetime.datetime.now()).split(" ")[1].split(".")[0].replace(":", "-") - log_file_name = "%s_%s_log_file.txt" % (date, time) - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - else: - log_file = open(log_file_name, 'a') - - # write log (and print) - log_file.write(text + "\n") - if printing: - print text - log_file.close() - -def time_track(starting_time, show=True): - """ - calculate time passed since last time measurement - """ - now = time.time() - delta = now - starting_time - if show: - text = "\n\t %s seconds\n" % str(delta) - logprint(text, start=False, printing=True) - return now - -def calc_fig_ratio(ncols, nrows, plot_size, verbose=False): - """ - 
calculate size ratio for given number of columns (ncols) and rows (nrows) - with plot_size as maximum width and length - """ - ratio = ncols*1./nrows - if verbose: - text = " ".join([ncols, nrows, ratio]) - logprint(text, start=False, printing=True) - if ncols >= nrows: - figsize_x = plot_size - figsize_y = plot_size / ratio - else: - figsize_x = plot_size * ratio - figsize_y = plot_size - return figsize_x, figsize_y - -def shorten_name(seq_name, max_len=float("Inf"), delim="_"): - """ - shorten sequence names (for diagram titles) - """ - - if len(seq_name) <= max_len: - return seq_name - - # keep first and last part if multiple parts separated by delimiter (e.g. species_prefix + sequence_id) - if delim in seq_name: - if seq_name.count(delim) >= 2: - name = "%s..." % delim.join(seq_name.split(delim)[:1]) + seq_name.split(delim)[-1] # .replace("_000", "-") - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - if len(name) > max_len: - name = name[:((max_len-2)//2)] + "..." + name[((max_len-2)//2):] - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - return name - -def unicode_name(name): - """ - replace non-ascii characters in string (e.g. 
for use in matplotlib) - """ - unicode_string = eval('u"%s"' % name) - return unicodedata.normalize('NFKD', unicode_string).encode('ascii','ignore') - -def check_bools(arg, update_log_txt = True, default=None): - """ - converts commandline arguments into boolean - """ - - - # convert valid arguments - if str(arg).lower() == "y" or str(arg) == "1": - return True - elif str(arg).lower() == "n" or str(arg) == "0": - return False - - # use default in case of invalid argument - else: - if update_log_txt: - global log_txt - log_txt += "using default for " + str(arg) - else: - try: - logprint("using default for " + str(arg)) - except: - print "using default for " + str(arg) - return default - -def create_color_list(number, color_map=None, logging=False, max_grey="#595959"): - """ - create color list with given number of entries - grey by default, matplotlib color_map can be provided - """ - - try: - # create pylab colormap - cmap = eval("P.cm." + color_map) - # get descrete color list from pylab - cmaplist = [cmap(i) for i in range(cmap.N)] # extract colors from map - # determine positions for number of colors required - steps = len(cmaplist)/(number) - numbers = range(0, len(cmaplist), steps) - - # extract color and convert to hex code - colors = [] - for idx in numbers[:-1]: - rgb_color = cmaplist[idx] - col = rgb2hex(rgb_color[0]*255, rgb_color[1]*255, rgb_color[2]*255) - colors.append(col) - - # grey - except: - if not color_map == None: - logprint("Invalid color_map (%s) provided! - Examples: jet, Blues, OrRd, bwr,..." 
% color_map) - logprint("See https://matplotlib.org/users/colormaps.html\n") - old_max_grey = "#373737" - old_max_grey = "#444444" - colors = list(Color("#FFFFFF").range_to(Color(max_grey), number)) # grey - for idx in range(len(colors)): - colors[idx] = str(colors[idx]).replace("Color ", "") - if "#" in colors[idx] and len(colors[idx]) != 7: - # print colors[idx] - colors[idx] = colors[idx] + colors[idx][-(7-len(colors[idx])):] - - text = "%d Colors: %s" % (len(colors), ", ".join(colors)) - if logging: logprint(text, start=False, printing=True) - - return colors - - -############################### -# File Handling # -############################### - -def read_seq(input_fasta, verbose=False): - """ - read fasta sequences from (all) file(s) - """ - - # check if file provided - if input_fasta == [] or input_fasta == "": - text = "Attention: No valid file names provided: >%s<" % input_fasta - logprint(text, start=False, printing=True) - return {}, [] - - # combine sequence files, if required - if type(input_fasta) == list: - # concatenate fasta files - if len(input_fasta) > 1: - if verbose: - print "concatenating fastas...", - text = "concatenating fastas..." - input_fasta_combi = concatenate_files(input_fasta) - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - else: - input_fasta_combi = input_fasta[0] - else: - input_fasta_combi = input_fasta - - # read sequences - if verbose: - print "reading fasta...", - text = "reading fasta...", - try: - seq_dict = SeqIO.index(input_fasta_combi, "fasta") - except ValueError: - logprint("Error reading fasta sequences - please check input files, e.g. 
def read_gff_color_config(gff_color_config_file="", verbose=None):
    """
    define coloring options for gff-based color shading of self-dotplots

    Starts from a built-in table of feature types and, if a config file is
    given and exists, overrides / extends that table with the file's entries.
    Config lines are tab-separated with at least four columns
    (feature type, color, alpha, zoom); lines starting with "#" and lines
    with fewer columns are ignored.

    Parameters:
        gff_color_config_file: path of the custom configuration; "", None or
            a non-existing path returns the built-in table unchanged.
        verbose: log the resulting table if true. None (default) falls back
            to a module-level `verbose` flag when one is defined - the name
            this function consulted before the parameter existed - and to
            False otherwise.

    Returns:
        dict with lower-case feature type as key and
        tuple(color, transparency, zoom) as value
    """

    # default aesthetics for annotation shading (e.g. if no user config file is provided)
    # dictionary with feature_type as key and tuple(color, transparency, zoom) as value
    gff_feat_colors = {"orf": ("#b41a31", 0.2, 0),
                       "orf_rev": ("#ff773b", 0.3, 0),
                       "gene": ("#b41a31", 0.2, 0),
                       "cds": ("darkorange", 0.2, 0),
                       "exon": ("orange", 0.2, 0),
                       "intron": ("lightgrey", 0.2, 0),
                       "utr": ("lightblue", 0.2, 0),
                       "repeat_region": ("green", 0.3, 0),
                       "repeat": ("green", 0.3, 0),
                       "tandem_repeat": ("red", 0.3, 0),
                       "transposable_element": ("blue", 0.3, 0),
                       "ltr_retrotransposon": ("#cccccc", 0.5, 0),
                       "ltr-retro": ("#cccccc", 0.5, 0),
                       "long_terminal_repeat": ("#2dd0f0", 0.75, 2),
                       "ltr": ("#2dd0f0", 0.75, 2),
                       "pbs": ("purple", 0.75, 2),
                       "ppt": ("#17805a", 0.5, 2),
                       "target_site_duplication": ("red", 0.75, 2),
                       "misc_feature": ("grey", 0.3, 0),
                       "misc_feat": ("grey", 0.3, 0),
                       "misc": ("grey", 0.3, 0),
                       "others": ("grey", 0.5, 0)}
    if gff_color_config_file in ["", None] or not os.path.exists(str(gff_color_config_file)):
        return gff_feat_colors

    # keep the previous behaviour (module-level flag) unless the caller decides
    if verbose is None:
        verbose = globals().get("verbose", False)

    text = "Updating GFF color configuration with custom specifications\n"
    logprint(text, start=False, printing=True)

    # read custom gff_color_config_file
    # (text mode so that the "#" / tab comparisons also work on Python 3;
    #  the context manager closes the file even if a line fails to parse)
    overwritten = set()
    with open(gff_color_config_file, 'r') as in_file:
        for line in in_file:
            if line.startswith("#"):
                continue
            data = line.strip().split("\t")
            if len(data) < 4:
                continue
            feat = data[0].lower()
            color = data[1].lower()

            # check, if settings are valid - fall back to defaults otherwise
            if not mcolors.is_color_like(color):
                color = "grey"
                text = "Invalid color specified for %s: %s - default grey" % (data[0], data[1])
                logprint(text)
            try:
                alpha = float(data[2])
            except ValueError:
                alpha = 0.75
                text = "Invalid alpha specified for %s: %s - default 0.75" % (data[0], data[2])
                logprint(text)
            try:
                zoom = float(data[3])
            except ValueError:
                zoom = 0
                text = "Invalid zoom specified for %s: %s - default 0" % (data[0], data[3])
                logprint(text)

            # track changes of predefined settings
            if feat in gff_feat_colors:
                overwritten.add(feat)

            gff_feat_colors[feat] = (color, alpha, zoom)

    # default coloring for unknown annotations
    if "others" not in gff_feat_colors:
        gff_feat_colors["others"] = ("grey", 0.5, 0)

    if verbose:
        # print configuration
        text = "\n\nGFF color specification:\n%s\n" % (60 * ".")
        for item in sorted(gff_feat_colors.keys()):
            color, alpha, zoom = gff_feat_colors[item]
            text += "%-30s\t%-10s\t%-5s\t%s\n" % (item, str(color), str(alpha), str(zoom))
        logprint(text, printing=True)

    # print overwritten feature type specifications (sorted -> reproducible log)
    if len(overwritten) != 0:
        text = "%d feature type specifications overwritten:" % len(overwritten)
        text += "\n\t" + ", ".join(sorted(overwritten)) + "\n"
        logprint(text, start=False, printing=True)

    text = "GFF color specification updated acc. to %s\n\t%s\n\n" % (gff_color_config_file, ", ".join(sorted(gff_feat_colors)))
    logprint(text, start=False, printing=True)

    return gff_feat_colors
def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, verbose=False):
    """
    read a labelled matrix file into a dictionary

    Data rows are all lines that do not start with "#" or with the
    delimiter and that contain the delimiter; the first cell of a row is
    its name, and the n-th value of a row is paired with the n-th row name
    (values beyond the number of row names are ignored).

    Parameters:
        matrix_file_name: path of the matrix file
        delim: cell delimiter; if no data row is found, the file is re-read
            once with the alternative delimiter ("," <-> "\\t")
        symmetric: if true, keys are sorted name pairs, so both triangles
            map to the same key and disagreeing non-empty values are
            reported; the value read last wins
        recursion: internal flag marking the retry with the alternative
            delimiter
        verbose: additionally log the row names

    Returns:
        dict with tuple(name, name) as key and the cell text as value;
        empty if the file holds no data rows for either delimiter
    """
    # single pass over the file - keep the split cells of every data row
    rows = []
    with open(matrix_file_name, 'r') as input_file:
        for line in input_file:
            if not line.startswith("#") and not line.startswith(delim) and delim in line:
                rows.append(line.strip().split(delim))

    # sequence names from first column
    names = [row[0] for row in rows]
    logprint("Delimiter '%s': %d names - %s\n" % (delim, len(names), ", ".join(names)))

    # check if names were found - otherwise try another delimiter
    if not names:
        if recursion:
            logprint("Empty matrix file with alternative delimiter!")
            return {}
        if delim == "\t":
            new_delim = ","
        else:
            new_delim = "\t"
        logprint("\nMatrix file not containing data delimited by '%s' - trying to read matrix with delimiter '%s'" % (delim.replace("\t", "\\t"), new_delim))
        return read_matrix(matrix_file_name, delim=new_delim, symmetric=symmetric, recursion=True, verbose=verbose)

    # matrix entries as values in dictionary with tuple(names) as key
    info_dict = {}
    contradictory_entries = []
    for data in rows:
        row_name = data[0]
        for col_name, value in zip(names, data[1:]):
            if symmetric:
                key = tuple(sorted([col_name, row_name]))
            else:
                key = (col_name, row_name)
            if symmetric and key in info_dict:
                previous = info_dict[key]
                if previous != value and value not in ["", "-"] and previous not in ["", "-"]:
                    contradictory_entries.append(key)
            info_dict[key] = value

    if len(contradictory_entries) != 0:
        # keys are tuples, so they are rendered one by one
        log_txt = "\nContradictory entries in matrix file %s:\n\t" % (matrix_file_name)
        log_txt += ", ".join(str(item).replace("'", "") for item in contradictory_entries)
        logprint(log_txt)
        logprint("Using value from bottom left triangle!")
    if verbose:
        logprint("\nMatrix information for Sequences named: %s" % ", ".join(names))

    return info_dict

def concatenate_files(file_list, combi_filename="temp_combined.fasta", verbose=False):
    """
    concatenate content of all files in file_list into a combined file named combi_filename

    Every line is stripped of surrounding whitespace and written with a
    single trailing newline. With verbose set, the file names are echoed to
    stdout and logged.

    Returns:
        combi_filename
    """
    import sys  # local import: only needed for the verbose echo

    text = ""
    with open(combi_filename, 'w') as out_file:
        for item in file_list:
            if verbose:
                text += item + " "
                # portable replacement for the Python 2 statement `print item,`
                sys.stdout.write(item + " ")
            # read in_file linewise and write to out_file
            with open(item, 'r') as in_file:
                for line in in_file:
                    out_file.write(line.strip() + "\n")
    if verbose:
        logprint(text, start=False, printing=False)
    return combi_filename

def degap_fasta(input_fasta):
    """
    remove gaps from fasta - new degapped sequence file created

    For every input file a sibling file "<name without extension>_degapped.fas"
    is written in which "-" is removed from all sequence lines; header
    lines (starting with ">") are copied unchanged apart from whitespace
    stripping.

    Parameters:
        input_fasta: one file path or an iterable of file paths

    Returns:
        list of the degapped file names, in input order
    """
    try:
        string_types = basestring  # Python 2: str and unicode
    except NameError:
        string_types = str

    # a single path must be wrapped - list("a.fas") would split it into characters
    if isinstance(input_fasta, string_types):
        input_fasta = [input_fasta]

    # degap all sequence files
    output_fastas = []
    for input_fas in input_fasta:
        # splitext only strips a real extension of the file name itself
        output_fas = os.path.splitext(input_fas)[0] + "_degapped.fas"
        with open(input_fas, 'r') as in_file:
            with open(output_fas, 'w') as out_file:
                for line in in_file:
                    if line.startswith(">"):
                        out_file.write(line.strip() + "\n")
                    else:
                        out_file.write(line.strip().replace("-", "") + "\n")
        output_fastas.append(output_fas)
    return output_fastas
lcs_shading_num, type_nuc=True, unit="%", filetype="png", max_lcs_len=None, min_lcs_len=0, bins=[], alphas=[], gff_legend=False, prefix="", verbose=False): - """ - create figure color legend - """ - max_legend_length_row = 8 - max_legend_length_col = 4 - - # define output file - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg" - logprint(text, start=False, printing=True) - filetype="png" - - # check if length of information fit - if not gff_legend and ((bins != [] and len(colors) != lcs_shading_num+1) or (bins != [] and len(colors) != len(bins)+1)): - if bins != [] and len(colors) != lcs_shading_num+1: - text = "**Attention**\nlcs_shading_num (%d) does not match number of colors (%d)!\n"% (lcs_shading_num, len(bins)) - elif bins != [] and len(colors) != len(bins)+1: - text = "**Attention**\nnumber of LCS length bins (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - elif gff_legend and len(bins) != len(colors): - text = "**Attention**\nnumber of GFF Feature Types (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - - # set alpha values to opaque if none are provided - if alphas == []: - for item in colors: - alphas.append(1) - - # legend data points - data_points = range(len(colors)) - if not gff_legend: - - # specify intervals, if max_lcs_len provided - if max_lcs_len != None: - multi_factor = 100 # one digit - if max_lcs_len <= 1: - multi_factor = 1000 # two digits - # len_interval_size = (max_lcs_len-min_lcs_len) * multi_factor *1. // lcs_shading_num * (1./ multi_factor) - len_interval_size = (max_lcs_len-min_lcs_len) * 1. 
/ lcs_shading_num - len_pos = [float("%.2f" % (min_lcs_len))] - # calculate interval positions - for idx in range(lcs_shading_num): - len_pos.append(float("%.2f" % (len_pos[-1] + len_interval_size))) - - if prefix.startswith("custom-matrix") and (0 <= max_lcs_len <= 100 and 0 <= min_lcs_len <= 100): - unit = "%" - elif prefix.startswith("custom-matrix"): - unit = "" - - text = "\n%d Legend intervals from %.2f to %.2f: \n\t%s - number: %d, step: %.2f, unit: %s\n" % (lcs_shading_num+1, min_lcs_len, max_lcs_len, str(len_pos), len(len_pos), len_interval_size, unit) - logprint(text, start=False, printing=True) - pos = len_pos - interval_size = len_interval_size - else: - # generate legend labels acc. to standard interval notation - interval_size = 100 // lcs_shading_num - pos = range(interval_size, 101+interval_size, interval_size) - - if bins != []: # labels provided - legend_labels = bins[:] - legend_labels.append("max") - legend_labels_lengths = [] - for item in bins: - legend_labels_lengths.append("[%d %s, %d %s)" % (item - min(bins), unit, item, unit)) - if len(bins) == len(colors) - 1: - legend_labels_lengths.append("[%d %s, %s]" % (max(bins), unit, u"\u221E")) # infinite - - else: - legend_labels = [] - legend_labels_lengths = [] - for idx in range(len(pos)): - num = pos[idx] - legend_labels.append("[%d%%, %d%%)" % (num - interval_size, num)) - if max_lcs_len != None: - num = len_pos[idx] - # as int or float - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths.append("[%d %s, %d %s)" % (num, unit, num + len_interval_size, unit)) - else: - legend_labels_lengths.append("[%.2f %s, %.2f %s)" % (num, unit, num + len_interval_size, unit)) - legend_labels[-1] = "100" + unit - if max_lcs_len != None: - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths[-1] = "%d %s" % (max_lcs_len, unit) - else: - legend_labels_lengths[-1] = "%.2f %s" % (max_lcs_len, unit) - - # set labels and choose file 
name - if gff_legend: - label_text = bins[:] - edge_col = None - legend_file_name = "Selfdotplot_GFF_Shading_Legend_n%d." % lcs_shading_num + filetype - elif max_lcs_len != None: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_max%d%s_n%d." % (max_lcs_len, unit, lcs_shading_num) + filetype - elif bins != []: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%d%s_n%d." % (bins[0], unit, lcs_shading_num) + filetype - else: - label_text = legend_labels[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%%len_n%d." % lcs_shading_num + filetype - - if prefix != None and prefix != "": - if not prefix.endswith("-"): - prefix = prefix + "-" - legend_type = "LCS" - if prefix.startswith("custom-matrix"): - prefix = prefix.replace("custom-matrix", "")[1:] - legend_type = "CustomMatrix" - legend_file_name = prefix + legend_file_name.replace("LCS", legend_type) - - # plot legend figure - fig, ax = P.subplots(3, 1, figsize=(len(colors)*2, len(colors)*2)) - for idx in range(len(colors)): - ax[0].bar(data_points[idx]+1, data_points[idx]+1, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[2].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].set_ylim(0,1) - ax[2].set_ylim(0,1) - ax[1].legend(ncol=((len(colors)-1)//max_legend_length_row)+1, framealpha=1) # vertical legend - col_num = len(colors) - if len(colors) > max_legend_length_col: - remainder = 0 - if len(colors) % max_legend_length_col != 0: - remainder = 1 - row_num = len(colors) // max_legend_length_col + remainder - remainder = 0 - if len(colors) % row_num != 0: - remainder = 1 - col_num = len(colors) // row_num + remainder - 
def wobble_replacement(sequence, general_ambiguity_code, verbose=False):
    """
    get all degenerated sequences for sequence with ambiguous residues
    (only residues considered that are keys in general_ambiguity_code)

    Parameters:
        sequence: string (kmer) that may contain ambiguous residues
        general_ambiguity_code: dict with ambiguous residue as key and the
            collection of residues it stands for as value
        verbose: log the number of ambiguous positions and the number of
            variants after every expansion round

    Returns:
        set of all variants without ambiguous residues; a sequence without
        any ambiguous residue yields a set containing just that sequence
    """

    # get positions of ambiguous residues
    wobble_pos = [idx for idx, letter in enumerate(sequence)
                  if letter in general_ambiguity_code]

    if verbose:
        text = "\t%d wobbles" % len(wobble_pos)
        logprint(text, start=False, printing=True)

    # per round, replace the first remaining wobble of every pending kmer by
    # all possible residues; kmers without wobbles are final
    resolved = set()
    pending = set([sequence])
    seen = set([sequence])  # guards against re-expanding a kmer (guarantees termination)
    while pending:
        if verbose:
            text = "\t\t%d kmer variants" % (len(pending) + len(resolved))
            logprint(text, start=False, printing=True)
        next_round = set()
        for kmer in pending:
            for idx in wobble_pos:
                residue = kmer[idx]
                if residue in general_ambiguity_code:
                    for base in general_ambiguity_code[residue]:
                        variant = kmer[:idx] + base + kmer[idx+1:]
                        if variant not in seen:
                            seen.add(variant)
                            next_round.add(variant)
                    break
            else:
                # no ambiguous residue left at any wobble position
                resolved.add(kmer)
        pending = next_round

    return resolved

def split_diagonals(data, stepsize=1):
    """
    split array if point difference exceeds stepsize
    data = sorted list of numbers

    Returns a list of sub-arrays; a new sub-array starts wherever the
    difference between neighbouring values is not equal to stepsize
    (use stepsize=-1 for descending runs).
    """
    return np.split(data, np.where(np.diff(data) != stepsize)[0]+1)

def longest_common_substring(s1, s2):
    """
    calculate length of the longest common substring of s1 and s2

    Dynamic programming over all position pairs; only the previous and the
    current row of the table are kept, so memory grows with len(s2) only.
    """
    longest = 0
    previous = [0] * (1 + len(s2))
    for x in range(1, 1 + len(s1)):
        current = [0] * (1 + len(s2))
        for y in range(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                # extend the common run ending at the previous position pair
                current[y] = previous[y - 1] + 1
                if current[y] > longest:
                    longest = current[y]
        previous = current
    return longest

def lcs_from_x_values(x_values):
    """
    calculate length of longest common substring based on nested list of numbers

    x_values holds one sequence of coordinates per match line; the result
    is the length of the longest of them (0 if there is none).
    """
    if len(x_values) == 0:
        return 0
    return max(len(segment) for segment in x_values)
discard kmer, if too many Ns included - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - if not convert_wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - wobbles = False - for item in general_ambiguity_code.keys(): - if item in kmer: - wobbles = True - break - if not wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - kmer_variants = wobble_replacement(kmer, general_ambiguity_code) - for new_kmer in kmer_variants: - # print "\t", new_kmer - try: - kmer_pos_dict[new_kmer].append(i) - except KeyError: - kmer_pos_dict[new_kmer] = [i] - - # find kmers shared between both sequences - matches_for = set(kmer_pos_dict_one).intersection(kmer_pos_dict_two) # forward - matches_rc = set(kmer_pos_dict_three).intersection(kmer_pos_dict_four) # reverse complement - - if verbose: - text = "[matches: %i for; %.i rc]" % (len(matches_for), len(matches_rc)) - logprint(text, start=False, printing=True) - - # create lists of x and y co-ordinates for scatter plot - # keep all coordinates of all shared kmers (may match multiple times) - diag_dict_for = {} - diag_dict_rc = {} - for (match_list, pos_dict1, pos_dict2, diag_dict) in [(matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for), - (matches_rc, kmer_pos_dict_three, kmer_pos_dict_four, diag_dict_rc)]: - for kmer in match_list: - for i in pos_dict1[kmer]: - for j in pos_dict2[kmer]: - diag = i-j - points = set(range(i+1, i+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if 
def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False):
    """
    find all matching positions with matches >= wordsize via regular expression search
    fuzzy matching - allow up to substitution_count substitutions
    convert matching points into lines of the length of the match
    (+ optional handling of ambiguities)

    Parameters:
        seq1, seq2: query and target sequence; seq2 must offer
            reverse_complement() if rc_option is set
        wordsize: kmer length
        substitution_count: number of tolerated substitutions per kmer
        report_lcs: additionally return the longest forward / reverse match length
        rc_option: also search the reverse complement of seq2
        convert_wobbles: translate every residue via the ambiguity match
            table (alphabets(type_nuc)[3]) if any ambiguous residue occurs
        max_N_percentage: kmers (and hits) with a higher percentage of
            N (nucleotides) / X (amino acids) are skipped
        type_nuc: nucleotide (True) or amino acid (False) alphabet
        verbose: log match counts and update the module-level timer t1

    Returns:
        x1, y1, x2, y2 (arrays holding one coordinate run per match line,
        forward and reverse complement) - plus lcs_for, lcs_rev if report_lcs
    """
    global t1 # timer

    # read sequences
    seq_one = seq1.upper()
    seq_two = seq2.upper()
    len_two = len(seq_two)
    query_str = str(seq_one)
    target_str = str(seq_two)

    # set ambiguity code for wobble replacement
    alphabet_data = alphabets(type_nuc)
    general_ambiguity_code = alphabet_data[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code
    ambiguity_match_dict = alphabet_data[3]

    ambiq_residues = "[%s]" % "".join(general_ambiguity_code.keys())

    # look for Ns in DNA or Xs in proteins (minimum word size)
    if type_nuc == True:
        any_residue = "N"
    else:
        any_residue = "X"

    # check for wobble presence
    wobble_found = (regex.search(ambiq_residues, query_str) is not None or
                    regex.search(ambiq_residues, target_str) is not None)

    # dictionaries for matches: diagonal -> set of x positions
    diag_dict_for = {}
    diag_dict_rc = {}
    counter = [0, 0]

    # one-way matching: forward target and (optionally) its reverse complement
    searches = [(target_str, diag_dict_for, 0)]
    if rc_option:
        searches.append((str(seq_two.reverse_complement()), diag_dict_rc, 1))

    for seq_target, diag_dict, counter_pos in searches:
        target_len = len(seq_target)

        # split query sequence into kmers
        for idx in range(len(query_str)-wordsize+1):
            kmer = query_str[idx:idx+wordsize]

            # skip excessive N/X stretches (big black areas)
            if kmer.count(any_residue)*100./wordsize > max_N_percentage:
                continue

            if convert_wobbles and wobble_found:
                # replace each residue with matching residues or wobbles
                kmer_string = "".join(ambiguity_match_dict[residue] for residue in kmer)
            else:
                # literal search - residues such as "*" must not act as regex syntax
                kmer_string = regex.escape(kmer)

            # convert to regular expression tolerating substitution errors
            if isinstance(substitution_count, int) and substitution_count != 0:
                kmer_string = "(%s){s<=%d}" % (kmer_string, substitution_count)

            # compile once per kmer and walk through the target by start
            # offset instead of re-searching freshly sliced copies
            pattern = regex.compile(kmer_string)
            kdx = 0
            while kdx < target_len:
                result = pattern.search(seq_target, kdx)
                if result is None:
                    break
                counter[counter_pos] += 1
                hit_start = result.start()
                kmer2 = seq_target[hit_start:result.end()]

                # skip excessive N/X stretches (big black areas)
                if kmer2.count(any_residue)*100./wordsize <= max_N_percentage:
                    diag = idx - hit_start
                    diag_dict.setdefault(diag, set()).update(range(idx+1, idx+wordsize+1))

                # continue one position behind the hit start to catch overlapping hits
                kdx = hit_start + 1

    if verbose:
        text = "%5.i \tforward matches" % counter[0]
        text += "\n%5.i \treverse complementary matches" % counter[1]
        logprint(text, start=False, printing=True)

    # convert coordinate points to line start and stop positions
    x1 = [] # x values forward
    y1 = [] # y values forward
    for diag in diag_dict_for:
        x_values = np.array(sorted(diag_dict_for[diag]))
        x1.extend(split_diagonals(x_values))
        y1.extend(split_diagonals(x_values - diag))

    x2 = [] # x values rc
    y2 = [] # y values rc
    if rc_option:
        for diag in diag_dict_rc:
            factor = len_two + diag + 1
            x_values = np.array(sorted(diag_dict_rc[diag]))
            x2.extend(split_diagonals(x_values))
            y2.extend(split_diagonals(factor - x_values, -1))

    if verbose:
        t1 = time_track(t1)

    # match lines differ in length, so the containers have to be object
    # arrays; current NumPy refuses to build ragged arrays implicitly
    lines = tuple(np.array(values, dtype=object) for values in (x1, y1, x2, y2))

    if not report_lcs:
        return lines
    # get length of longest common substring based on match lengths
    lcs_for = lcs_from_x_values(x1)
    lcs_rev = lcs_from_x_values(x2)
    return lines + (lcs_for, lcs_rev)
- logprint(text, start=False, printing=True) - return - elif len(sequences) == 1 and multi: - text = "\n\nCreating collage output for single selfdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nSelfdotplot Collage: Invalid collage - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences): - ncols = len(sequences) - nrows = 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=prefix, filetype=filetype, verbose=verbose) - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >=50% Ns are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given:%s\n" % 
filetype - logprint(text, start=False, printing=True) - filetype = "png" - - global t1 - - print "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-"), - log_txt = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - - # preparations for file name - name_graph = "Selfdotplots" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - list_of_png_names = [] - - counter = 0 - for seq_name in sequences: - print seq_name, - log_txt += " " + seq_name - - counter += 1 - if not multi: - P.cla() # clear any prior graph - - # read sequence - seq_record = seq_dict[seq_name] - name_seq = seq_record.id - seq_one = seq_record.seq.upper() - length_seq = len(seq_one) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex(seq_one, seq_one, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG", - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag(seq_one, seq_one, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - 
if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlim(0, length_seq+1) - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - P.title(unicode_name(shorten_name(name_seq, max_len=title_length)), fontsize=label_size, fontweight='bold') - # P.title(unicode_name(name_seq), fontsize=label_size*1.3, fontweight='bold') - - # save figure and reinitiate if page is full - if counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' % (prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - else: # not multi - - fig = P.figure(figsize=(plot_size, plot_size)) # figure size needs to be a square - ax = P.subplot(1, 1, 1) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - number = 0 - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlim(0, length_seq+1) - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', 
labelsize=label_size*.9) - P.title(unicode_name(shorten_name(name_seq, max_len=title_length)), fontsize=label_size*1.3, fontweight='bold') - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_%s_wordsize%i%s.%s' %(prefix, name_graph, counter, name_seq, wordsize, suffix, filetype) - P.savefig(fig_name, bbox_inches='tight') - - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - if multi and counter >= 1: - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - print "\n\nDrawing selfdotplots done" - log_txt += "\n\nDrawing selfdotplots done" - logprint(log_txt, start=False, printing=False) - - return list_of_png_names - -def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, length_scaling=True, scale_delim_col="red"): - """ - pairwise dotplot (all-against-all) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least two input sequences - if len(sequences) < 2: - text = "\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += " Please provide at least two 
sequences for pairdotplot!\n\nTerminating paired dotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 2 and multi: - text = "\n\nCreating collage output for single pairdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nPairdotplot Collage: Invalid collage settings - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences)*(len(sequences)-1): - ncols = len(sequences) - nrows = 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences)*(len(sequences)-1): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - - text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += ", ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - y_label_rotation = "vertical" - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >50% are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given: %s\n" % 
filetype - logprint(text, start=False, printing=True) - filetype = "png" - - # preparations for file name - name_graph = "Pairdotplot" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if length_scaling: - suffix += "_scaled" - if multi: - suffix += "_collage" - - - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - list_of_png_names = [] - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - - # prepare LCS data file - lcs_data_file = open("%sPairdotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - counter, seq_counter = 0, 0 - print "Drawing pairwise dotplot...", - log_txt = "Drawing pairwise dotplot..." - if verbose: - seq_text = "" - for idx in range(len(sequences)-1): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx+1, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += " " + str(seq_counter) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - - # write LCS data file - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - else: - # calculate figure size for separate figures - if len_one >= len_two: - sizing = (plot_size, max(2, (plot_size)*len_two*1./len_one)) - # sizing = (plot_size, min(plot_size, max(2, (plot_size-2)*len_two*1./len_one+2))) - else: - sizing = (max(2, (plot_size)*len_one*1./len_two), plot_size) - # sizing = (min(plot_size, max(2, (plot_size-2)*len_one*1./len_two+2)), plot_size) - fig = P.figure(figsize=(plot_size, plot_size)) - - ax = P.subplot(1, 1, 1) - - # collect lines - 
lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlabel(unicode_name(shorten_name(name_one, max_len=title_length)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.ylabel(unicode_name(shorten_name(name_two, max_len=title_length)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - if not multi: - if length_scaling: - ax.set_aspect(aspect='equal', adjustable='box', anchor='NW') - P.xlim(0, len_one+1) - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - elif not length_scaling: - P.xlim(0, len_one+1) - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - else: - max_len = max(len_one, len_two) - P.xlim(0, max_len+1) - P.ylim(max_len+1, 0) # rotate y axis (point downwards) - - # plot line deliminating shorter sequence - if max_len != len_one: - ax.plot((len_one+1, len_one+1), (0, len_two), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - if max_len != len_two: - ax.plot((0, len_one), (len_two+1, len_two+1), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - - # evtl. 
switch x axis position - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - P.setp(ax.get_xticklabels(), fontsize=label_size*.9) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) - - # save figure and reinitiate if page is full - if multi and counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=.5, wspace=.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=.5, wspace=.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - elif not multi: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, bottom=0.05) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_wordsize%i%s.%s' % (prefix, name_graph, counter, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - list_of_png_names.append(fig_name) - fig = P.figure() - - # save figure - if multi and counter >= 1: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=0.5, wspace=0.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.5, wspace=0.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - print - logprint(seq_text, start=False, printing=False) - - return list_of_png_names - -def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, lcs_shading=True, lcs_shading_ref=0, 
lcs_shading_interval_len=100, lcs_shading_ori=0, lcs_shading_num=5, spacing=0.04, input_user_matrix_file="", user_matrix_print=True): - """ - all-against-all dotplot - derived from dotplot function - - lcs_shading_refs: - 0 color relative to maximum lcs observed in dataset [default] - 1 color by coverage of shorter sequence (e.g. lcs = 70% of seq1) - lcs_shading_ori - 0 forward only - 1 reverse only - 2 both orientations (in opposite plot) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - if len(sequences) == 0: - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " No sequences provided for polydotplot!\n\nTerminating polydotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 1: - text = "\n\nCreating polydotplot for single sequence!" 
- text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n\n" - logprint(text, start=False, printing=True) - - - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " " + " ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >50% are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given: %s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - if lcs_shading and not type_nuc: - if lcs_shading_ori != 0: - lcs_shading_ori = 0 - text = "Protein shading does not support reverse complementary matching!\n" - logprint(text, start=False, printing=True) - - # read custom shading matrix & match names of sequences to fasta - if input_user_matrix_file != "" and input_user_matrix_file != None: - logprint("Reading user matrix file: %s" % input_user_matrix_file) - # lcs_shading_ori = 2 - custom_dict = read_matrix(input_user_matrix_file) - if custom_dict != {}: - custom_shading = True - custom_similarity_dict = {} - invalid_entries = [] - custom_max = 0 - custom_min = float("Inf") - for key in custom_dict.keys(): - number_key = [] - - # convert number into float - try: - value = float(custom_dict[key]) - if not "." 
in custom_dict[key]: - value = int(custom_dict[key]) - custom_max = max(custom_max, value) - custom_min = min(custom_min, value) - except: - value = custom_dict[key] - if value == "": - value = None - invalid_entries.append(key) - # match matrix names with sequence names - for item in key: - if item in sequences: - number_key.append(sequences.index(item)) - else: - number_key.append(-1) - # dictionary with tuple of sorted sequence indices as key and number as value - custom_similarity_dict[tuple(sorted(number_key))] = value - if len(invalid_entries) != 0: - text = "No valid number in custom similarity matrix for %d entries: \n\t" % (len(invalid_entries)) - for key in invalid_entries: - text += str(key) + " - " + str(custom_dict[key]) + "; " - logprint(text[:-2]+"\n") - - text = "Custom user matrix given: min %.2f, max %.2f\n" % (custom_min, custom_max) - - # artificially rounding intervals if likely identity/divergence percentages - if 0 <= custom_min < 1 and 0 < custom_max <= 1: - rounding_factor = 5 - multi_factor = 100 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (multi_factor*custom_min // rounding_factor) * (1.*rounding_factor/multi_factor)) - custom_max = min((multi_factor*custom_max // rounding_factor) * (1.*rounding_factor/multi_factor), 1) - text += "new (%.2f, %2f)\n" % (custom_min, custom_max) - - elif 0 <= custom_min < 100 and 0 < custom_max <= 100: - rounding_factor = 5 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (custom_min // rounding_factor) * rounding_factor) - custom_max = min((custom_max // rounding_factor) * rounding_factor, 100) - text += "new (%d, %d)\n" % (custom_min, custom_max) - - logprint(text) - - else: - custom_shading = False - - name_graph = "Polydotplot" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % 
substitution_count - if custom_shading: - suffix += "_matrix" - if lcs_shading: - suffix += "_%dshades_ref%d_ori%s" % (lcs_shading_num+1, lcs_shading_ref, lcs_shading_ori) - if "ref2" in suffix and type_nuc: - suffix = suffix.replace("ref2", "%dbp" % lcs_shading_interval_len) - elif "ref2" in suffix: - suffix = suffix.replace("ref2", "%daa" % lcs_shading_interval_len) - - - # name and create output files (names derived from SEQNAME) - if prefix != None and str(prefix) != "": - prefix = str(prefix) + "-" - else: - prefix = "" - - # preparations for background shading - if lcs_shading or custom_shading: - # create color range white to grey - colors = create_color_list(lcs_shading_num+1, color_map=None, logging=True) - colors_2 = create_color_list(lcs_shading_num+1, color_map="OrRd", logging=True) - - if custom_shading: - text = "Custom Matrix Colors: " + ", ".join(colors_2) - - # write lcs lengths to file - lcs_data_file = open("%sPolydotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - # compare sequences pairwise - save lcs and line information in dictionary for plotting - data_dict = {} # keys = tuple(idx, jdx), value = x1, y1, x2, y2 (line positions) - lcs_dict = {} # keys = tuple(idx, jdx), value = length of lcs: lcs_len or (lcs_for, lcs_rev) - for_lcs_set = set([]) # keep lengths to calculate max (excluding self comparisons) - rev_lcs_set = set([]) # keep lengths to calculate max (all) - - text = "\nTotal plot count: %d" % (len(sequences)*(len(sequences))) - text += "\nTotal calculations: %d" % (len(sequences)*(len(sequences)+1)/2) - logprint(text, start=False, printing=True) - - print "\nCalculating shared regions and lengths of longest_common_substring...", - log_txt = "\nCalculating shared regions and lengths of longest_common_substring..." 
- # determine matches and length of lcs by comparing all sequence pairs - if verbose: - seq_text = "" - counter = 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif len(sequences) < 5: - print "\t%s (%d %s), %s (%d %s)" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - log_txt += "\t%s (%d %s), %s (%d %s)\n" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - else: - if not counter % 25: - print counter, - log_txt += str(counter) - - # get positions of matches & length of longest common substring based on match lengths - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - data_dict[(idx, jdx)] = x1[:], y1[:], x2[:], y2[:] - lcs_dict[idx, jdx] = lcs_for, lcs_rev - - if idx != jdx: - for_lcs_set.add(lcs_for) - rev_lcs_set.add(lcs_rev) - - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - if not verbose: - print 
len(sequences)*(len(sequences)+1)/2, " done\n" - log_txt += str(len(sequences)*(len(sequences)+1)/2) + " done\n" - else: - print "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - log_txt += "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - logprint(log_txt, start=False, printing=False) - - if verbose: - logprint ("\n\nlcs_dict\n" + str(lcs_dict)) - if custom_shading: - logprint ("\ncustom_dict\n" + str(custom_dict)) - logprint ("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) - - if verbose: - print - logprint(seq_text+"\n", start=False, printing=False) - - if lcs_shading_ref == 2: - color_bins = [] - text = "\nLCS lengh bins: " - for idx in range(lcs_shading_num): - color_bins.append(lcs_shading_interval_len*(idx+1)) - text += " " + str(lcs_shading_interval_len*(idx+1)) - logprint(text, start=False, printing=True) - - # calculate maximum lcs length - if lcs_shading_ori == 0: # forward only - if len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - elif lcs_shading_ori == 1: # reverse complement only - if len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - else: - max_lcs = None - else: # both orientations - if len(rev_lcs_set) != 0 and len(for_lcs_set) != 0: - max_lcs = max(max(rev_lcs_set), max(for_lcs_set)) - elif len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - elif len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - - if not max_lcs == None: - text = "Maximum LCS: %d %s" % (max_lcs, aa_bp_unit) - logprint(text, start=False, printing=True) - if custom_shading: - text = "Maximum custom value: %d\n" % custom_max - logprint(text, start=False, printing=True) - - # count sequences - ncols = len(sequences); nrows = len(sequences) - - # get sequence lengths to scale plot widths and heights accordingly - size_ratios = [] - for item in sequences: - size_ratios.append(len(seq_dict[item].seq)) - - P.cla() # clear any prior graph - # use GridSpec to resize plots according to sequence length - 
gs = gridspec.GridSpec(nrows, ncols, - width_ratios=size_ratios, - height_ratios=size_ratios) - fig = P.figure(figsize=(plot_size, plot_size)) - - # determine label orientations - if len(sequences) > 5: - x_label_rotation = 45 - y_label_rotation = "horizontal" - if x_label_pos_top: - xhalign = 'left' - xvalign = 'bottom' - else: - xhalign = 'right' - xvalign = 'top' - yhalign = "right" - else: - x_label_rotation = "horizontal" - y_label_rotation = "vertical" - xvalign = "center" - xhalign = "center" - yhalign = "center" - yvalign = 'center' - - print "\nDrawing polydotplot...", - log_txt = "\nDrawing polydotplot..." - - # draw subplots - if verbose: - if lcs_shading and custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "Custom matrix value", "Matrix color index", "LCS color index"]) + "\n" - elif lcs_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "LCS color index for", "LCS color index rev"]) + "\n" - elif custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "Custom matrix value", "Color index for", "Color index rev"]) + "\n" - - if verbose: - seq_text = "" - counter, seq_counter = 0, 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - len_two = len(rec_two.seq) - name_two = rec_two.id - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - len_one = len(rec_one.seq) - name_one = rec_one.id - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - # optional shade background according to length of LCS and/or user matrix - ######################################################################### - - # get interval based on LCS - background_colors = [None, None] - if lcs_shading and (lcs_shading_ref==1 or lcs_shading_ref==2 or max_lcs!=None): # self plot max_lcs_for == None - lcs_len = lcs_dict[(idx, jdx)] - l1 = lcs_len[0] # forward - l2 = lcs_len[1] # reverse complement - - lcs_shading_bool = True - - # calculate shading acc. to chosen option - if lcs_shading_ref == 1: # percentage of shorter sequence - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // min(len_one, len_two)) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // min(len_one, len_two)) - elif lcs_shading_ref == 2: # by given interval size - color_idx0 = min(len(colors)-1, l1 // lcs_shading_interval_len) - color_idx1 = min(len(colors)-1, l2 // lcs_shading_interval_len) - if color_idx0 >= len(colors): - color_idx0 = len(colors) - if color_idx1 >= len(colors): - color_idx1 = len(colors) - else: # percentage of maximum lcs length - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // max_lcs) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // max_lcs) - else: - lcs_shading_bool = False - - # get interval based on custom matrix - if custom_shading: - # matrix value - try: - custom_value = custom_similarity_dict[(idx, jdx)] - except: - custom_value = "" - - # bottom left triangle = LCS forward/reverse or best of both - if lcs_shading_bool: - if lcs_shading_ori == 0: # forward - color_idx1 = 
color_idx0 - elif lcs_shading_ori == 2: # both directions - color_idx1 = max(color_idx0, color_idx1) - - # top right triangle = custom value (not colored if text matrix provided) - if type(custom_value) == int or type(custom_value) == float: - color_idx0 = int((custom_value-custom_min)*lcs_shading_num // (custom_max-custom_min)) - # if string is proviced - else: - color_idx0 = 0 - - # set colors dependent on lcs dependent on orientation - if lcs_shading_bool and not custom_shading: - if idx != jdx: - if lcs_shading_ori == 0: - color_idx1 = color_idx0 - elif lcs_shading_ori == 1: - color_idx0 = color_idx1 - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx1] - # for selfcomparison, only color reverse complement - elif lcs_shading_ori != 0 and not custom_shading: - background_colors[0] = colors[color_idx1] - # set different colors for shading by LCS + user matrix - elif lcs_shading_bool and custom_shading: - # print colors, background_colors, color_idx0, color_idx1 - background_colors[0] = colors_2[color_idx0] - background_colors[1] = colors[color_idx1] - # set grey color range for user matrix if no LCS shading - elif custom_shading: - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx0] - - if verbose: - if custom_shading and lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - elif lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(color_idx0), str(color_idx1)]) + "\n" - elif custom_shading: - lcs_text += "\t".join([name_one, name_two, str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - - # diagonal (self-dotplots) - if idx == jdx: - # skip positions below diagonal - counter = counter + (counter - 1) // (nrows) # + row_pos - counters = [counter] - # draw both graphs at once (due to symmetry) - else: - col_pos = (counter - 1) % ncols - 
row_pos = (counter - 1) // (nrows) - counter2 = col_pos * ncols + row_pos + 1 - counters = [counter, counter2] - - if len(counters) == 2: - seq_counter += 1 - if not verbose and not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - x_lists, y_lists, x_lists_rc, y_lists_rc = data_dict[(idx, jdx)] - - # plot diagram(s) - for kdx in range(len(counters)): - # if custom matrix value printed into upper matrix triangle, skip data plotting - - # text print in top triangle - if user_matrix_print and custom_shading and kdx==0 and idx!=jdx: - data_plotting = False - # dotplot in bottom triangle - else: - data_plotting = True - - fig_pos = counters[kdx] - # plotting subplot with matplotlib - ax = P.subplot(gs[fig_pos-1]) # rows, columns, plotnumber - - # mirror plot, if plotting below diagonal - if kdx == 0: - l1, l2 = len_one, len_two - n1, n2 = name_one, name_two - x1, y1 = x_lists, y_lists - x2, y2 = x_lists_rc, y_lists_rc - else: - l2, l1 = len_one, len_two - n2, n1 = name_one, name_two - x1, y1 = y_lists, x_lists - x2, y2 = y_lists_rc, x_lists_rc - - if data_plotting: - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # plot value provided by customer instead of dotplot - else: - alignment = {'horizontalalignment': 'center', 'verticalalignment': 'center'} - # P.text(0.5, 0.5, custom_value, size='medium', transform=ax.transAxes, **alignment) - P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, **alignment) - # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, - # horizontalalignment='center', 
verticalalignment='center', color="black") - - if custom_shading: - # omit diagonal - if idx == jdx: - ax.set_facecolor("white") - # use white background for text fields (top right triangle only [kdx 0]) - elif type(custom_value) != int and type(custom_value) != float and kdx == 0: - ax.set_facecolor("white") - else: - ax.set_facecolor(background_colors[kdx]) - # set background color if lcs shading - elif lcs_shading_bool and background_colors[kdx] != None: - ax.set_facecolor(background_colors[kdx]) - - # set axis limits - P.xlim(0, l1+1) - P.ylim(l2+1, 0) # rotate y axis (point downwards) - - # determine axis positions - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - x_label_bool = fig_pos <= ncols - x_tick_bool = fig_pos > ncols*(ncols-1) - else: - x_label_bool = fig_pos > ncols*(ncols-1) - x_tick_bool = fig_pos <= ncols - - # x axis labels dependent on plot position/number - if x_label_bool: # x title and labels on top or bottom - P.xlabel(unicode_name(shorten_name(n1, max_len=title_length)), fontsize=label_size, rotation=x_label_rotation, verticalalignment=xvalign, horizontalalignment=xhalign, fontweight='bold', labelpad=8) # axis naming - if not x_label_rotation in ["horizontal", "vertical"]: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation="vertical") - else: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation=x_label_rotation) - elif x_tick_bool and x_label_pos_top: # x ticks on bottom row - ax.xaxis.tick_bottom() # ticks without labels on bottom - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) - elif x_tick_bool: # x ticks on top row - ax.xaxis.tick_top() # # ticks without labels on top - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) # inner diagrams without labelling - else: # no x ticks on internal rows - ax.axes.get_xaxis().set_visible(False) - - # y axis labels dependent on plot position/number - if 
fig_pos % ncols == 1 or (ncols == 1 and nrows == 1): # y title and labels in 1st column - P.ylabel(unicode_name(shorten_name(n2, max_len=title_length)), fontsize=label_size, rotation=y_label_rotation, verticalalignment=yvalign, horizontalalignment=yhalign, fontweight='bold', labelpad=8) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) # axis naming - elif fig_pos % ncols == 0: # y ticks in last column - ax.yaxis.tick_right() - P.setp(ax.get_yticklabels(), visible=False) # inner diagrams without labelling - else: - ax.axes.get_yaxis().set_visible(False) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - try: - logprint(lcs_text, start=False, printing=True) - except: - pass - - # finalize layout - margins & spacing between plots - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - # gs.tight_layout(fig, h_pad=.02, w_pad=.02) # less overlapping tick labels, but also disturbingly large spacing - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, top=0.87) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, bottom=0.13) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing) # space between rows - def 0.4 - - # save figure and close instance - fig_name = '%s%s_wordsize%i%s.%s' % (prefix, name_graph, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - - # create figure color legend - if lcs_shading: - if lcs_shading_ref == 1: # percentage of shorter sequence - legend_file_name = legend_figure(colors, lcs_shading_num, unit="%", filetype=filetype, prefix=prefix) - elif lcs_shading_ref == 2: # interval sizes - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, bins=color_bins) - else: # relative of maximum lcs - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, max_lcs_len=max_lcs) - - if custom_shading: - custom_prefix = "custom-matrix-" + prefix - legend_file_name_custom = legend_figure(colors_2, lcs_shading_num, unit="%", filetype=filetype, prefix=custom_prefix, max_lcs_len=custom_max, min_lcs_len=custom_min) - - if lcs_shading and custom_shading: - return [fig_name, legend_file_name, legend_file_name_custom] - elif lcs_shading: - return [fig_name, legend_file_name] - elif custom_shading: - return [fig_name, legend_file_name_custom] - else: - return [fig_name] - - -############################### -# Function Call # -############################### - -def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_size=10, filetype="png", type_nuc=True, convert_wobbles=False, 
substitution_count=0, rc_option=True, alphabetic_sorting=False, gff=None, multi=True, ncols=1, nrows=1, lcs_shading=True, lcs_shading_num=5, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, gff_color_config_file="", input_user_matrix_file="", user_matrix_print=False, length_scaling=True, title_length=50, spacing=0.04, verbose=False): - - global t1, line_col_rev - - # read gff color config file if provided - if len(input_gff_files) != 0 and input_gff_files != None: - if gff_color_config_file not in ["", None]: - text = "\n%s\n\nReading GFF color configuration file\n%s\n\n=> %s\n" % (50*"=", 28*"-", gff_color_config_file) - logprint(text, start=False, printing=True) - gff_feat_colors = read_gff_color_config(gff_color_config_file) - else: - gff_feat_colors = {} - if gff_color_config_file not in ["", None]: - text = "Please provide GFF annotation files to use configuration file", gff_color_config_file - logprint(text, start=False, printing=True) - - # if color is set to white, reverse complementary matches are skipped - if not rc_option: - line_col_rev = "white" # reverse matches not calculated - elif not type_nuc: - logprint("Reverse complement deactivated for proteins!") - line_col_rev = "white" # reverse matches not calculated - - mode_text = [] - for item in modes: - mode_text.append(str(item)) - text = "%s\n\nRunning plotting modes %s" % (50*"=", ", ".join(mode_text)) - logprint(text, start=False, printing=True) - - - # create dotplots - ########################################## - - # self dotplots - t1 = time.time() - if 0 in modes: - list_of_png_names = selfdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, gff_files=gff, gff_color_dict=gff_feat_colors, verbose=verbose) - t1 = time_track(t1) - if 
list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # paired dotplots - if 1 in modes: - if multi: - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=length_scaling, verbose=verbose) - t1 = time_track(t1) - else: - if not length_scaling: - text = "\nPairwise dotplot with individual output files scaled by sequence length automatically!" - logprint(text, start=False, printing=True) - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=True, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # all-against-all dotplot - if 2 in modes: - list_of_png_names = polydotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, input_user_matrix_file=input_user_matrix_file, 
user_matrix_print=user_matrix_print, spacing=spacing, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - text = "\n" + 50 * "#" + "\n" + 50 * "#" - text += "\n\nThank you for using FlexiDot!\n" - logprint(text, start=False, printing=True) - -# testing mode for debugging -trial_mode = False -# trial_mode = True - -# parameters = check_input(sys.argv) -parameters = check_input(sys.argv, trial_mode=trial_mode) - -# read out parameters -commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype, type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos_top, label_size, spacing, length_scaling, title_length, verbose = parameters - -# evtl. 
overwrite parameters for testing purposes in trial mode -if trial_mode: - # input_user_matrix_file = "AngioSINE-v18-alignment-identities.csv" - input_fasta = ["test-sequences-9-Ns.fas"] - input_fasta = ["Beta_SINEs__select_consensus.fas"] - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-01.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-comma-str.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-100+.txt" - # user_matrix_print = True - output_file_prefix = "SINEmatrix" - output_file_prefix = "SINEmatrix-NoShading" - plot_size = 10 - plotting_modes = [0,1,2] - plotting_modes = [2] - lcs_shading = False - lcs_shading = True - lcs_shading_ref = 2 - lcs_shading_num = 4 - lcs_shading_ori = 0 - lcs_shading_interval_len = 15 - wordsize = 10 - wordsize = 7 - x_label_pos_top = True - filetype = "pdf" - filetype = "png" - - wobble_conversion = False - wobble_conversion = True - - substitution_count = 0 - - rc_option = True - rc_option = False - label_size = 10 - - verbose = False - verbose = True - -if auto_fas: - path = os.path.dirname(os.path.abspath(__file__)) - files_long = glob.glob(path+"/*.fasta") - files_long.extend(glob.glob(path+"/*.fas")) - files_long.extend(glob.glob(path+"/*.fa")) - files_long.extend(glob.glob(path+"/*.fna")) - input_fasta = [] - for i in files_long: - if not "combined" in i: - filename = i[i.rfind('\\')+1:] - input_fasta.append(filename) - -if trial_mode: - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - -main(input_fasta, wordsize, modes=plotting_modes, prefix=output_file_prefix, plot_size=plot_size, label_size=label_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=wobble_conversion, substitution_count=substitution_count, rc_option=rc_option, gff=input_gff_files, multi=collage_output, ncols=m_col, nrows=n_row, alphabetic_sorting=alphabetic_sorting, 
lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, gff_color_config_file=gff_color_config_file, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, length_scaling=length_scaling, title_length=title_length, spacing=spacing, verbose=verbose) - - diff --git a/code/flexidot_v1.01.py b/code/flexidot_v1.01.py deleted file mode 100644 index 8e5443f..0000000 --- a/code/flexidot_v1.01.py +++ /dev/null @@ -1,3153 +0,0 @@ -#!/usr/bin/python2.7 -#!/usr/bin/python2.7 -# -*- coding: utf-8 -*- - -""" -FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation - -Kathrin M. Seibt, Thomas Schmidt and Tony Heitkam -Institute of Botany, TU Dresden, Dresden, 01277, Germany - -(Bioinformatics, 2018) -""" - - -############################### -# Requirements # -############################### - -# import system modules -import os, glob -import time, datetime -import sys -import shutil, getopt -import unicodedata - -def module_install_command(module_name, upgrade=False): - """ - create installation commands for Python modules and print information - """ - if upgrade: - load_command = "python -m pip install --upgrade %s" % module_name - else: - load_command = "python -m pip install %s" % module_name - - try: - logprint("Installing Python module: %s\n\t%s\n" % (module_name, load_command)) - except: - print "Installing Python module: %s\n\t%s\n" % (module_name, load_command) - - return load_command - -def load_modules(): - """ - load Python modules, if possible - otherwise try to install them - """ - - # make module names global - global cllct, gridspec, patches, rcParams, mplrc, P, Color, SeqIO, np, ccv, mcolors, rgb2hex, regex - - # matplotlib - try: - import matplotlib.collections as cllct - except: - command = module_install_command("matplotlib", upgrade=True) - try: - os.system(command) - print "\n" - 
import matplotlib.collections as cllct - except: - print "Please install module matplotlib manually" - from matplotlib.colors import colorConverter as ccv - import matplotlib.colors as mcolors - import matplotlib.gridspec as gridspec - import matplotlib.patches as patches - import pylab as P - - # specify matplotlib font settings - from matplotlib import rc as mplrc - mplrc('pdf', fonttype=42, compression=0) - from matplotlib import rcParams - rcParams['font.family'] = 'sans-serif' - rcParams['font.sans-serif'] = ['Helvetica', 'Verdana', 'Tahoma', ] - - # colour for color gradient palette - try: - from colour import Color - except: - command = module_install_command("colour") - try: - os.system(command) - print "\n" - from colour import Color - except: - print "Please install module colour manually" - - # color converter - try: - from colormap import rgb2hex - except: - command = module_install_command("colormap") - # additional module easydev.tools required by colormap - command2 = module_install_command("easydev") - try: - os.system(command) - os.system(command2) - print "\n" - from colormap import rgb2hex - except: - print "Please install module colormap manually" - - # biopython - try: - from Bio import SeqIO - except: - command = module_install_command("biopython") - try: - os.system(command) - print "\n" - from Bio import SeqIO - except: - print "Please install module biopython manually" - - # numpy - try: - import numpy as np - except: - command = module_install_command("numpy") - try: - os.system(command) - print "\n" - import numpy as np - except: - print "Please install module numpy manually" - - # regex for pattern matching - try: - import regex - except: - command = module_install_command("regex") - try: - os.system(command) - print "\n" - import regex - except: - print "Please install module regex manually" - -load_modules() - - -############################### -# Usage & Input # -############################### - -def usage(): - """ - usage and help - 
""" - - print """\n\n FLEXIDOT - ------------------------------------------------------------------- - - Version: - 1.01 - - Citation: - Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (in prep.) - "FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation" - - - General usage: - $ python flexidot.py -a [ARGUMENTS] - $ python flexidot.py -i [ARGUMENTS] - - - ARGUMENTS - ------------------------------------------------------------------- - - - INPUT/OUTPUT OPTIONS... required are [-a] OR [-i] - - -a, --auto_fas Imports all fasta files from current directory (*.fasta, *.fas, *.fa, *.fna) - -i is not needed, if -a is activated - [inactive by default] - - -i, --in_file Input fasta file (fasta file name or comma-separated file list) - > Provide multiple files: Recall -i or provide comma-separated file names - - -o, --output_file_prefix File prefix to be added to the generated filenames [default = NONE] - - -c, --collage_output Multiple dotplots are combined in a collage - Y or 1 = ON [default] - N or 0 = OFF - - -m, --m_col Number of columns per page [default = 4] (only if --collage_output is ON) - - -n, --n_row Number of rows per page [default = 5] (only if --collage_output is ON) - - -f, --filetype Output file format - 0 = PNG [default] - 1 = PDF - 2 = SVG - - -s, --alphabetic_sorting Sort sequences alphabetically according to titles - Y or 1 = ON - N or 0 = OFF [default] - - - CALCULATION PARAMETERS... 
- - -k, --wordsize Wordsize (kmer length) for dotplot comparison [default = 7] - - -p, --plotting_mode Mode of FlexiDot dotplotting - 0 = self [default] - 1 = paired - 2 = poly (matrix with all-against-all dotplots) - > Run multiple plotting modes: Recall -p or provide comma-separated numbers - - -t, --type_nuc Type of residue is nucleotide - Y or 1 = nucleotide [default] - N or 0 = amino acid - - -w, --wobble_conversion Ambiguity handling for relaxed matching - Y or 1 = ON - N or 0 = OFF [default] - - -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching - [default = 0] - - -r, --rc_option Find reverse complementary matches (only if type_nuc=y) - Y or 1 = ON [default] - N or 0 = OFF - - - GRAPHIC FORMATTING... - - -A, --line_width Line width [default = 1] - - -B, --line_col_for Line color [default = black] - - -C, --line_col_rev Reverse line color [default = green] - - -D, --x_label_pos Position of the X-label - Y or 1 = top [default] - N or 0 = bottom - - -E, --label_size Font size [default = 10] - - -F, --spacing Spacing between all-against-all dotplots (only if --plotting_mode=2) - [default = 0.04] - - -P, --plot_size Plotsize [default = 10] - - -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) - Y or 1 = Scaling ON (axes scaled according to sequence length) - N or 0 = Scaling OFF (squared plots) [default] - - -T, --title_length Limit title length for self dotplot comparison (only if --plotting_mode=0) - [default = infinite] - - - GFF SHADING (for -p/--plotting_mode=0 only)... - - -g, --input_gff_files GFF3 file used for markup in self-dotplots - (provide multiple files: Recall -g or provide comma-separated file names) - - -G, --gff_color_config_file Tab-delimited config file for custom gff shading - column 1: feature type - column 2: color - column 3: alpha - column 4: zoom factor (for small regions) - - - LCS SHADING OPTIONS (for -p/--plotting_mode=2 only)... 
- - -x, --lcs_shading Shade subdotplot based on the length of the longest common substring (LCS) - Y or 1 = ON - N or 0 = OFF [default] - - -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) - [default = 5] - - -y, --lcs_shading_ref Reference for LCS shading - 0 = maximal LCS length [default] - 1 = maximally possible length (length of shorter sequence in pairwise comparison) - 2 = given interval sizes - DNA [default 100 bp] or proteins [default 10 aa] - see -Y - - -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) - [default for nucleotides = 50; default for amino acids = 10] - - -z, --lcs_shading_ori Shade subdotplots according to LCS on - 0 = forward [default], - 1 = reverse, or - 2 = both strands (forward shading above diagonal, reverse shading on diagonal and below; - if using --input_user_matrix_file, best LCS is used below diagonal) - - - CUSTOM USER MATRIX SHADING OPTIONS (for -p/--plotting_mode=2 only)... - - -u, --input_user_matrix_file Shading above diagonal according to values in matrix file specified by the user - (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n - e.g. identity matrix from multiple sequence alignment - strings are ignored) - - -U, --user_matrix_print Display provided matrix entries in the fields above diagonal of all-against-all dotplot - Y or 1 = ON - N or 0 = OFF [default] - - - OTHERS... 
- - -h, --help Help screen - - -v, --verbose Verbose - - - - - """ - -def check_input(argv, trial_mode=False): - """ - commandline argument parsing - """ - - global log_txt, aa_bp_unit - - # helpers for argument parsing - ###################################### - - arguments = ["-a", "--auto_fas", "a", "auto_fas", - "-i", "--input_fasta", "i:", "input_fasta=", - "-o", "--output_file_prefix", "o:", "output_file_prefix=", - "-c", "--collage_output", "c:", "collage_output=", - "-m", "--m_col", "m:", "m_col=", - "-n", "--n_row", "n:", "n_row=", - "-f", "--filetype", "f:", "filetype=", - "-t", "--type_nuc", "t:", "type_nuc=", - "-g", "--input_gff_files", "g:", "input_gff_files", - "-G", "--gff_color_config_file", "G:", "gff_color_config_file", - "-k", "--wordsize", "k:", "wordsize=", - "-p", "--plotting_mode", "p:", "plotting_mode=", - "-w", "--wobble_conversion", "w:", "wobble_conversion=", - "-S", "--substitution_count", "S:", "substitution_count=", - "-r", "--rc_option", "r:", "rc_option=", - "-s", "--alphabetic_sorting", "s:", "alphabetic_sorting=", - "-x", "--lcs_shading", "x:", "lcs_shading=", - "-X", "--lcs_shading_num", "X:", "lcs_shading_num=", - "-y", "--lcs_shading_ref", "y:", "lcs_shading_ref=", - "-Y", "--lcs_shading_interval_len", "Y:", "lcs_shading_interval_len=", - "-z", "--lcs_shading_ori", "z:", "lcs_shading_ori=", - "-u", "--input_user_matrix_file", "u:", "input_user_matrix_file=", - "-U", "--user_matrix_print", "U:", "user_matrix_print=", - "-P", "--plot_size", "P:", "plot_size=", - "-A", "--line_width", "A:", "line_width=", - "-B", "--line_col_for", "B:", "line_col_for=", - "-C", "--line_col_rev", "C:", "line_col_rev=", - "-D", "--x_label_pos", "D:", "x_label_pos=", - "-E", "--label_size", "E:", "label_size=", - "-F", "--spacing", "F:", "spacing=", - "-L", "--length_scaling", "L:", "length_scaling=", - "-T", "--title_length", "T:", "title_length=", - "-h", "--help", "h", "help", - "-v", "--verbose", "v", "verbose"] - - arguments_sysargv = 
tuple(arguments[0::4] + arguments[1::4]) - arguments_opts = "".join(arguments[2::4]) - arguments_args = arguments[3::4] - - - # setting defaults - ###################################### - - auto_fas = False # 0 - input_fasta = [] - output_file_prefix = None - collage_output = True # 1 - m_col = 4 - n_row = 5 - filetype = 0 - type_nuc = True - input_gff_files = [] - gff_color_config_file = "" - - wordsize = 7 - plotting_modes = [0] - wobble_conversion = False # 0 - substitution_count = 0 - rc_option = True # 1 - alphabetic_sorting = False # 0 - - lcs_shading = False # 0 - lcs_shading_num = 4 - lcs_shading_ref = 0 - lcs_shading_interval_len = 50 # interval default changes to "10" for amino acids [type_nuc = n] - lcs_shading_ori = 0 - - input_user_matrix_file = "" - user_matrix_print = False - - plot_size = 10 - line_width = 1 - line_col_for = "black" - line_col_rev = "#009243" - x_label_pos = True # 0 - label_size = 10 - spacing = 0.04 - length_scaling = False # 0 - title_length = float("Inf") - - aa_bp_unit = "bp" - - verbose = False # 0 - - filetype_dict = {0: "png", 1: "pdf", 2: "svg"} - lcs_shading_ref_dict = {0: "maximal LCS length", 1: "maximally possible length", 2: "given interval sizes"} - plotting_mode_dict = {0: "self", 1: "paired", 2: "all-against-all"} - lcs_shading_ori_dict = {0: "forward", 1: "reverse complement", 2: "both"} - - # return default parameters for testing purposes - if trial_mode: - print "ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n" - - commandline = "trial_mode\n" - - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, 
label_size, spacing, length_scaling, title_length, verbose] - return parameters - - - # read arguments - ###################################### - - commandline = "" - for arg in sys.argv: - commandline += arg + " " - - log_txt = "\n...reading input arguments..." - print log_txt - - if len(sys.argv) < 2: - print "\nERROR: More arguments are needed. Exit..." - log_txt += "\nERROR: More arguments are needed. Exit..." - usage() - sys.exit() - - elif sys.argv[1] not in arguments_sysargv: - print "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - log_txt += "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - # usage() - sys.exit() - - try: - opts, args = getopt.getopt(sys.argv[1:], arguments_opts, arguments_args) - - except getopt.GetoptError: - print "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - log_txt += "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - # usage() - sys.exit() - - for opt, arg in opts: - - if opt in ("-h", "--help"): - print "...fetch help screen" - log_txt += "\n...fetch help screen" - usage(), sys.exit() - - if opt in ("-v", "--verbose"): - print "...verbose output" - log_txt += "\n...verbose output" - verbose = True - - elif opt in ("-i", "--input_fasta"): - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: fasta_file '%s' was not found!" % str(temp_file) - sys.exit(message) - else: - input_fasta.append(str(temp_file)) - print "fasta file #%i: %s" % (len(input_fasta), str(temp_file)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: fasta_file '%s' was not found!" 
% str(arg) - log_txt += message - sys.exit(message) - else: - input_fasta.append(str(arg)) - print "fasta file #%i: %s" % (len(input_fasta), str(arg)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(arg)) - - - elif opt in ("-a", "--auto_fas"): - auto_fas = True - - - # multiple gff files: reads them into a list - elif opt in ("-g", "--input_gff_files"): - - # append gff file only if existing - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: gff_file '%s' was not found!" % str(temp_file) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - print "GFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - input_gff_files.append(str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: gff_file '%s' was not found!" % str(arg) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - input_gff_files.append(str(arg)) - print "GFF file #%i: %s" %(len(input_gff_files), str(arg)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(arg)) - - - elif opt in ("-G", "--gff_color_config_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: gff_color_config_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot with default gff coloring specification!" - log_txt += message + "\n -->Running FlexiDot with default gff coloring specification!" - else: - gff_color_config_file = str(arg) - - - elif opt in ("-u", "--input_user_matrix_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: input_user_matrix_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot without input_user_matrix_file %s!" 
% arg - log_txt += message + "\n -->Running FlexiDot withdefault matrix shading file!" - else: - input_user_matrix_file = str(arg) - - elif opt in ("-U", "--user_matrix_print"): - user_matrix_print = check_bools(str(arg), default=user_matrix_print) - - elif opt in ("-o", "--output_file_prefix"): - output_file_prefix = arg - - elif opt in ("-c", "--collage_output"): - collage_output = check_bools(str(arg), default=collage_output) - - elif opt in ("-m", "--m_col"): - try: m_col = int(arg) - except: - print "m_col - invalid argument - using default value" - log_txt += "\nm_col - invalid argument - using default value" - - elif opt in ("-n", "--n_row"): - try: n_row = int(arg) - except: - print "n_row - invalid argument - using default value" - log_txt += "\nn_row - invalid argument - using default value" - - elif opt in ("-f", "--filetype"): - if 0 <= int(arg) <= 2: - filetype = int(arg) - else: - print "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - log_txt += "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - - elif opt in ("-t", "--type_nuc"): - type_nuc = check_bools(str(arg), default=type_nuc) - - if type_nuc == False: - # interval default changed for amino acids - lcs_shading_interval_len = 10 - aa_bp_unit = "aa" - - elif opt in ("-k", "--wordsize"): - try: wordsize = int(arg) - except: - print "wordsize - invalid argument - using default value" - log_txt += "\nwordsize - invalid argument - using default value" - - elif opt in ("-p", "--plotting_mode"): - if "," in arg: - temp_modes = arg.split(",") - for item in temp_modes: - if item in ["0","1","2"]: - plotting_modes.append(int(item)) - elif arg in ["0","1","2"]: - plotting_modes = [int(arg)] - else: - print "Please provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" - log_txt += "\nPlease provide valid plotting_modes argument - e.g. 
1 or 0,1,2 - using default [0]" - - elif opt in ("-w", "--wobble_conversion"): - wobble_conversion = check_bools(str(arg), default=wobble_conversion) - - elif opt in ("-S", "--substitution_count"): - try: substitution_count = int(arg) - except: - print "substitution_count - invalid argument - using default value" - log_txt += "\nsubstitution_count - invalid argument - using default value" - - elif opt in ("-r", "--rc_option"): - rc_option = check_bools(str(arg), default=rc_option) - - elif opt in ("-s", "--alphabetic_sorting"): - alphabetic_sorting = check_bools(str(arg), default=alphabetic_sorting) - - elif opt in ("-x", "--lcs_shading"): - lcs_shading = check_bools(str(arg), default=lcs_shading) - - elif opt in ("-X", "--lcs_shading_num"): - try: lcs_shading_num = int(arg) - 1 - except: - print "lcs_shading_num - invalid argument - using default value" - log_txt += "\nlcs_shading_num - invalid argument - using default value" - - elif opt in ("-y", "--lcs_shading_ref"): - try: - if 0 <= int(arg) <= 2: - lcs_shading_ref = int(arg) - else: - print "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - log_txt += "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - except: - print "lcs_shading_ref - invalid argument - using default value" - log_txt += "\nlcs_shading_ref - invalid argument - using default value" - - elif opt in ("-Y", "--lcs_shading_interval_len"): - try: lcs_shading_interval_len = int(arg) - except: - print "lcs_shading_interval_len - invalid argument - using default value" - log_txt += "\nlcs_shading_interval_len - invalid argument - using default value" - - elif opt in ("-z", "--lcs_shading_ori"): - if 0 <= int(arg) <= 2: - lcs_shading_ori = int(arg) - else: - print "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." 
%(lcs_shading_ori) - log_txt += "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." %(lcs_shading_ori) - - elif opt in ("-P", "--plot_size"): - try: plot_size = float(arg) - except: - print "plot_size - invalid argument - using default value" - log_txt += "\nplot_size - invalid argument - using default value" - - - elif opt in ("-A", "--line_width"): - try: line_width = float(arg) - except: - print "line_width - invalid argument - using default value" - log_txt += "\nline_width - invalid argument - using default value" - - elif opt in ("-B", "--line_col_for"): - if mcolors.is_color_like(arg): - line_col_for = arg - else: - print "line_col_for - invalid argument - using default value" - log_txt += "\nline_col_for - invalid argument - using default value" - - elif opt in ("-C", "--line_col_rev"): - if mcolors.is_color_like(arg): - line_col_rev = arg - else: - print "line_col_rev - invalid argument - using default value" - log_txt += "\nline_col_rev - invalid argument - using default value" - - elif opt in ("-D", "--x_label_pos"): - x_label_pos = check_bools(str(arg), default=x_label_pos) - - elif opt in ("-E", "--label_size"): - try: label_size = float(arg) - except: - print "label_size - invalid argument - using default value" - log_txt += "\nlabel_size - invalid argument - using default value" - - elif opt in ("-F", "--spacing"): - try: spacing = float(arg) - except: - print "spacing - invalid argument - using default value" - log_txt += "\nspacing - invalid argument - using default value" - - elif opt in ("-L", "--length_scaling"): - length_scaling = check_bools(str(arg), default=length_scaling) - - elif opt in ("-T", "--title_length"): - try: title_length = int(arg) - except: - print "title_length - invalid argument - using default value" - log_txt += "\ntitle_length - invalid argument - using default value" - - # start logging file - logprint(commandline, start=True, printing=False, 
prefix=output_file_prefix) - logprint(log_txt, start=False, printing=False) - - - # print chosen arguments - ###################################### - - text = "\n%s\n" % (70 * "-") - text += "\n" + "INPUT/OUTPUT OPTIONS...\n" - text += "\n" + "Input fasta file: " + ", ".join(input_fasta) - text += "\n" + "Automatic fasta collection from current directory: " + str(auto_fas) - text += "\n" + "Collage output: " + str(collage_output) - text += "\n" + "Number of columns per page: " + str(m_col) - text += "\n" + "Number of rows per page: " + str(n_row) - text += "\n" + "File format: " + filetype_dict[filetype] - text += "\n" + "Residue type is nucleotide: " + str(type_nuc) - - text += "\n" + "\n\nCALCULATION PARAMETERS...\n" - text += "\n" + "Wordsize: " + str(wordsize) - text += "\n" + "Plotting mode: " + str(plotting_modes).replace("[", "").replace("]", "") + "\n" + 51 * " " - for item in plotting_modes: - text += plotting_mode_dict[item] + " " - text += "\n" + "Ambiguity handling: " + str(wobble_conversion) - text += "\n" + "Reverse complement scanning: " + str(rc_option) - text += "\n" + "Alphabetic sorting: " + str(alphabetic_sorting) - - if 0 in plotting_modes and input_gff_files != []: - text += "\n" + "Input gff files: " + ", ".join(input_gff_files) - if gff_color_config_file != "": - text += "\n" + "GFF color config file: " + gff_color_config_file - text += "\n" + "Prefix for output files: " + str(output_file_prefix) - - if 2 in plotting_modes: - text += "\n" + "\n\nLCS SHADING OPTIONS (plotting_mode 'all-against-all' only)...\n" - text += "\n" + "LCS shading: " + str(lcs_shading) - text += "\n" + "LCS shading interval number: " + str(lcs_shading_num + 1) - text += "\n" + "LCS shading reference: " + lcs_shading_ref_dict[lcs_shading_ref] - if lcs_shading_ref == 2: - text += "\n" + "LCS shading interval size [%s]: " % (aa_bp_unit) + str(lcs_shading_interval_len) - text += "\n" + "LCS shading orientation: " + lcs_shading_ori_dict[lcs_shading_ori] - if 
input_user_matrix_file != "": - text += "\n" + "Custom user shading matrix file: " + input_user_matrix_file - text += "\n" + "Print user matrix values (instead of dotplot): " + str(user_matrix_print) - - text += "\n" + "\n\nGRAPHIC FORMATTING...\n" - text += "\n" + "Plot size: " + str(plot_size) - text += "\n" + "Line width: " + str(line_width) - text += "\n" + "Line color: " + line_col_for - text += "\n" + "Reverse line color: " + line_col_rev - text += "\n" + "X label position: " + str(x_label_pos) - text += "\n" + "Label size: " + str(label_size) - text += "\n" + "Spacing: " + str(spacing) - text += "\n" + "Title length (limit number of characters): " + str(title_length) - text += "\n" + "Length scaling: " + str(length_scaling) - text += "\n%s\n" % (70 * "-") - logprint(text) - - - # collect settings - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, verbose] - - return parameters - - -############################### -# Helper Functions # -############################### - -def alphabets(type_nuc=True): - """ - provide ambiguity code for sequences - """ - - nucleotide_alphabet = ["A", "C", "G", "T"] - - nucleotide_alphabet_full = ["A", "C", "G", "T", "N", "B", "D", "H", - "V", "Y", "R", "W", "S", "K", "M"] - - nucleotide_ambiguity_code = {"N": ["A", "C", "G", "T"], # any - "B": ["C", "G", "T"], # not A - "D": ["A", "G", "T"], # not C - "H": ["A", "C", "T"], # not G - "V": ["A", "C", "G"], # not T - "Y": ["C", "T"], # pyrimidine - "R": ["A", "G"], # purine - "W": ["A", "T"], # weak - "S": ["C", "G"], # 
strong - "K": ["G", "T"], # keto - "M": ["A", "C"]} # amino - - nucleotide_match_dict = {"N": "[ACGTNBDHVYRWSKM]", # any - "B": "[CGTNBDHVYRWSKM]", # not A - "D": "[AGTNBDHVYRWSKM]", # not C - "H": "[ACTNBDHVYRWSKM]", # not G - "V": "[ACGNBDHVYRWSKM]", # not T - "K": "[GTNBDHVYRWSK]", # keto - not A,C,M - "M": "[ACNBDHVYRWSM]", # amino - not G,T,K - "W": "[ATNBDHVYRWKM]", # weak - not C,G,S - "S": "[CGNBDHVYRSKM]", # strong - not A,G,W - "Y": "[CTNBDHVYWSKM]", # pyrimidine - not A,G,R - "R": "[AGNBDHVRWSKM]", # purine - not C,T,Y - "A": "[ANDHVRWM]", - "C": "[CNBHVYSM]", - "G": "[GNBDVRSK]", - "T": "[TNBDHYWK]"} - - # nucleotide_match_dict = {"N": ".", # any - # "B": "[^A]", # not A - # "D": "[^C]", # not C - # "H": "[^G]", # not G - # "V": "[^T]", # not T - # "K": "[^ACM]", # keto - not A,C,M - # "M": "[^GTK]", # amino - not G,T,K - # "W": "[^CGS]", # weak - not C,G,S - # "S": "[^AGW]", # strong - not A,G,W - # "Y": "[^AGR]", # pyrimidine - not A,G,R - # "R": "[^CTY]", # purine - not C,T,Y - # "A": "[ANDHVRWM]", - # "C": "[CNBHVYSM]", - # "G": "[GNBDVRSK]", - # "T": "[TNBDHYWK]"} - - aminoacid_alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"] - - aminoacid_alphabet_full = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*", "J", - "Z", "B", "X"] - - aminoacid_ambiguity_code = {"J": ["I", "L"], - "Z": ["Q", "E"], - "B": ["N", "D"], - "X": ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"]} # any - - aminoacid_match_dict = {"J": "[ILJ]", - "Z": "[QEZ]", - "B": "[NDB]", - # "X": ".", - "X": "[ARNDCEQGHILKMFPSTWYVUO*XBZJ]", - "A": "[AX]", - "R": "[RX]", - "N": "[NXB]", - "D": "[DXB]", - "C": "[CX]", - "E": "[EXZ]", - "Q": "[QXZ]", - "G": "[GX]", - "H": "[HX]", - "I": "[IXJ]", - "L": "[LXJ]", - "K": "[KX]", - "M": "[MX]", - "F": "[FX]", - "P": "[PX]", - 
"S": "[SX]", - "T": "[TX]", - "W": "[WX]", - "Y": "[YX]", - "V": "[VX]", - "U": "[UX]", - "O": "[OX]", - "*": "[*X]"} - - aa_only = set(['E', 'F', 'I', 'J', 'L', 'O', 'Q', 'P', 'U', 'X', 'Z', '*']) - # return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aa_only - - if type_nuc: - return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, nucleotide_match_dict - else: - return aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aminoacid_match_dict - -def logprint(text, start=False, printing=True, prefix=""): - """ - log output to log_file and optionally print - """ - - # define log file name and open file - global log_file_name - if start and trial_mode: - log_file_name = "log_file.txt" - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - elif start: - date = datetime.date.today() - time = str(datetime.datetime.now()).split(" ")[1].split(".")[0].replace(":", "-") - log_file_name = "%s_%s_log_file.txt" % (date, time) - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - else: - log_file = open(log_file_name, 'a') - - # write log (and print) - log_file.write(text + "\n") - if printing: - print text - log_file.close() - -def time_track(starting_time, show=True): - """ - calculate time passed since last time measurement - """ - now = time.time() - delta = now - starting_time - if show: - text = "\n\t %s seconds\n" % str(delta) - logprint(text, start=False, printing=True) - return now - -def calc_fig_ratio(ncols, nrows, plot_size, verbose=False): - """ - calculate size 
ratio for given number of columns (ncols) and rows (nrows) - with plot_size as maximum width and length - """ - ratio = ncols*1./nrows - if verbose: - text = " ".join([ncols, nrows, ratio]) - logprint(text, start=False, printing=True) - if ncols >= nrows: - figsize_x = plot_size - figsize_y = plot_size / ratio - else: - figsize_x = plot_size * ratio - figsize_y = plot_size - return figsize_x, figsize_y - -def shorten_name(seq_name, max_len=float("Inf"), delim="_"): - """ - shorten sequence names (for diagram titles) - """ - - if len(seq_name) <= max_len: - return seq_name - - # keep first and last part if multiple parts separated by delimiter (e.g. species_prefix + sequence_id) - if delim in seq_name: - if seq_name.count(delim) >= 2: - name = "%s..." % delim.join(seq_name.split(delim)[:1]) + seq_name.split(delim)[-1] # .replace("_000", "-") - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - if len(name) > max_len: - name = name[:((max_len-2)//2)] + "..." + name[((max_len-2)//2):] - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - return name - -def unicode_name(name): - """ - replace non-ascii characters in string (e.g. 
for use in matplotlib) - """ - unicode_string = eval('u"%s"' % name) - return unicodedata.normalize('NFKD', unicode_string).encode('ascii','ignore') - -def check_bools(arg, update_log_txt = True, default=None): - """ - converts commandline arguments into boolean - """ - - - # convert valid arguments - if str(arg).lower() == "y" or str(arg) == "1": - return True - elif str(arg).lower() == "n" or str(arg) == "0": - return False - - # use default in case of invalid argument - else: - if update_log_txt: - global log_txt - log_txt += "using default for " + str(arg) - else: - try: - logprint("using default for " + str(arg)) - except: - print "using default for " + str(arg) - return default - -def create_color_list(number, color_map=None, logging=False, max_grey="#595959"): - """ - create color list with given number of entries - grey by default, matplotlib color_map can be provided - """ - - try: - # create pylab colormap - cmap = eval("P.cm." + color_map) - # get descrete color list from pylab - cmaplist = [cmap(i) for i in range(cmap.N)] # extract colors from map - # determine positions for number of colors required - steps = (len(cmaplist)-1)/(number) - numbers = range(0, len(cmaplist), steps) - - # extract color and convert to hex code - colors = [] - for idx in numbers[:-1]: - rgb_color = cmaplist[idx] - col = rgb2hex(rgb_color[0]*255, rgb_color[1]*255, rgb_color[2]*255) - colors.append(col) - - # grey - except: - if not color_map == None: - logprint("Invalid color_map (%s) provided! - Examples: jet, Blues, OrRd, bwr,..." 
% color_map) - logprint("See https://matplotlib.org/users/colormaps.html\n") - old_max_grey = "#373737" - old_max_grey = "#444444" - colors = list(Color("#FFFFFF").range_to(Color(max_grey), number)) # grey - for idx in range(len(colors)): - colors[idx] = str(colors[idx]).replace("Color ", "") - if "#" in colors[idx] and len(colors[idx]) != 7: - # print colors[idx] - colors[idx] = colors[idx] + colors[idx][-(7-len(colors[idx])):] - - text = "%d Colors: %s" % (len(colors), ", ".join(colors)) - if logging: logprint(text, start=False, printing=True) - - if len(colors) < number: - logprint("\nError in color range definition! %d colors missing\n" % (number - len(colors))) - - return colors - - -############################### -# File Handling # -############################### - -def read_seq(input_fasta, verbose=False): - """ - read fasta sequences from (all) file(s) - """ - - # check if file provided - if input_fasta == [] or input_fasta == "": - text = "Attention: No valid file names provided: >%s<" % input_fasta - logprint(text, start=False, printing=True) - return {}, [] - - # combine sequence files, if required - if type(input_fasta) == list: - # concatenate fasta files - if len(input_fasta) > 1: - if verbose: - print "concatenating fastas...", - text = "concatenating fastas..." - input_fasta_combi = concatenate_files(input_fasta) - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - else: - input_fasta_combi = input_fasta[0] - else: - input_fasta_combi = input_fasta - - # read sequences - if verbose: - print "reading fasta...", - text = "reading fasta...", - try: - seq_dict = SeqIO.index(input_fasta_combi, "fasta") - except ValueError: - logprint("Error reading fasta sequences - please check input files, e.g. 
for duplicate names!") - return {}, [] - except: - logprint("Error reading fasta sequences - please check input files!") - return {}, [] - - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - - for seq in seq_dict: - if "-" in seq_dict[seq].seq: - # ungapped = seq_dict[seq].seq.ungap("-") # cannot be assigned back to sequence record - text = "\nSequences degapped prior Analysis!!!" - logprint(text, start=False, printing=True) - return read_seq(degap_fasta(input_fasta), verbose=verbose) - - # get ordered sequence names - sequences = [] - for item in SeqIO.parse(input_fasta_combi, "fasta"): - sequences.append(item.id) - return seq_dict, sequences - -def read_gff_color_config(gff_color_config_file=""): - """ - define coloring options for gff-based color shading of self-dotplots - """ - - # default aestetics for annotation shading (e.g. if no user config file is provided) - # dictionary with feature_type as key and tuple(color, transparency, zoom) as value - gff_feat_colors = {"orf": ("#b41a31", 0.2, 0), - "orf_rev": ("#ff773b", 0.3, 0), - "gene": ("#b41a31", 0.2, 0), - "cds": ("darkorange", 0.2, 0), - "exon": ("orange", 0.2, 0), - "intron": ("lightgrey", 0.2, 0), - "utr": ("lightblue", 0.2, 0), - "repeat_region": ("green", 0.3, 0), - "repeat": ("green", 0.3, 0), - "tandem_repeat": ("red", 0.3, 0), - "transposable_element": ("blue", 0.3, 0), - "ltr_retrotransposon": ("#cccccc", 0.5, 0), - "ltr-retro": ("#cccccc", 0.5, 0), - "long_terminal_repeat": ("#2dd0f0", 0.75, 2), - "ltr": ("#2dd0f0", 0.75, 2), - "pbs": ("purple", 0.75, 2), - "ppt": ("#17805a", 0.5, 2), - "target_site_duplication": ("red", 0.75, 2), - "misc_feature": ("grey", 0.3, 0), - "misc_feat": ("grey", 0.3, 0), - "misc": ("grey", 0.3, 0), - "others": ("grey", 0.5, 0)} - if gff_color_config_file in ["", None] or not os.path.exists(str(gff_color_config_file)): - return gff_feat_colors - - text = "Updating GFF color configuration with custom specifications\n" - 
logprint(text, start=False, printing=True) - - # read custom gff_color_config_file - in_file = open(gff_color_config_file, 'rb') - overwritten = set([]) - for line in in_file: - if not line.startswith("#") and len(line.strip().split("\t")) >= 4: - data = line.strip().split("\t") - feat = data[0].lower() - color = data[1].lower() - - # check, if settings are valid - if not mcolors.is_color_like(color): - color = "grey" - text = "Invalid color specified for %s: %s - default grey" % (data[0], data[1]) - logprint(text) - try: - alpha = float(data[2]) - except: - alpha = 0.75 - text = "Invalid alpha specified for %s: %s - default 0.75" % (data[0], data[2]) - logprint(text) - try: - zoom = float(data[3]) - except: - zoom = 0 - text = "Invalid zoom specified for %s: %s - default 0" % (data[0], data[3]) - logprint(text) - - # track changes of predefined settings - if feat in gff_feat_colors.keys(): - overwritten.add(data[0].lower()) - - gff_feat_colors[feat] = (color, alpha, zoom) - in_file.close() - - # default coloring for unknown annotations - if not "others" in gff_feat_colors.keys(): - gff_feat_colors["others"] = ("grey", 0.5, 0) - - if verbose: - # print configuration - text = "\n\nGFF color specification:\n%s\n" % (60 * ".") - for item in sorted(gff_feat_colors.keys()): - text += "%-30s\t%-10s\t%-5s\t%s\n" % (item, str(gff_feat_colors[item][0]), str(gff_feat_colors[item][1]), str(gff_feat_colors[item][2])) - logprint (text, printing=True) - - # print overwritting feature type specifications - if len(overwritten) != 0: - text = "%d feature type specifications overwritten:" % len(overwritten) - text += "\n\t"+ ", ".join(overwritten) + "\n" - logprint(text, start=False, printing=True) - - text = "GFF color specification updated acc. 
to %s\n\t%s\n\n" % (gff_color_config_file, ", ".join(gff_feat_colors)) - logprint(text, start=False, printing=True) - - return gff_feat_colors - -def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=True, prefix="", filetype='png', verbose=False): - """ - create feature dictionary from input_gff - sequence name as key and (feature type, start, stop) as value - """ - if type(input_gff_files) != list: - input_gff_files = [input_gff_files] - - # create dictionary with seq_name as key and (type, start and stop) as value - unknown_feats = set([]) - used_feats = set([]) - feat_dict = {} - for input_gff in input_gff_files: - text = "...reading " + input_gff - logprint(text, start=False, printing=True) - - in_file = open(input_gff, 'rb') - for line in in_file: - if not line.startswith("#") and line.strip() != "": - data = line.strip().split("\t") - feat_type = data[2].lower() - if data[6] == "-": - feat_type += "_rev" - if not feat_type.lower() in color_dict.keys(): - if feat_type.lower().replace("_rev", "") in color_dict.keys(): - feat_type = feat_type.replace("_rev", "") - else: - unknown_feats.add(feat_type) - feat_type = "others" - used_feats.add(feat_type) - if not data[0] in feat_dict.keys(): - feat_dict[data[0]] = [(feat_type, int(data[3]), int(data[4]))] # feature type, start, stop - else: - feat_dict[data[0]].append((feat_type, int(data[3]), int(data[4]))) # feature type, start, stop - if verbose: - text = "\nAnnotations for: %s\n" % ", ".join(feat_dict.keys()[:10]) - if len(feat_dict.keys()) > 10: - text = text[:-1] + ", ...\n" - logprint(text, start=False, printing=True) - in_file.close() - - # print feature types without specific shading settings - if len(unknown_feats) != 0: - text = "Missing shading specification for %d feature type(s):\n\t%s\n" % (len(unknown_feats), ", ".join(sorted(unknown_feats))) - logprint(text, start=False, printing=True) - - # create color legend - colors, alphas = [], [] - for item in sorted(used_feats): - 
colors.append(color_dict[item][0]) - alphas.append(color_dict[item][1]) - legend_figure(colors=colors, lcs_shading_num=len(used_feats), type_nuc=type_nuc, bins=sorted(used_feats), alphas=alphas, gff_legend=True, prefix=prefix, filetype=filetype) - - # print settings - text = "GFF Feature Types: %s\nGFF Colors: %s" % (", ".join(sorted(used_feats)), ", ".join(sorted(colors))) - logprint(text, start=False, printing=True) - - return feat_dict - -def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, verbose=False): - input_file = open(matrix_file_name, 'rb') - - # read sequence names from first column - names = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - names.append(line.strip().split(delim)[0]) - logprint("Delimiter '%s': %d names - %s\n" % (delim, len(names), ", ".join(names))) - - # check if names were found - otherwise try another delimiter - if names == [] and not recursion: - if delim == "\t": - new_delim = "," - else: - new_delim = "\t" - logprint("\nMatrix file not containing data delimited by '%s' - trying to read matrix with delimiter '%s'" % (delim.replace("\t", "\\t"), new_delim)) - info_dict = read_matrix(matrix_file_name, delim=new_delim, symmetric=symmetric, recursion=True, verbose=verbose) - return info_dict - elif names == []: - logprint("Empty matrix file with alternative delimiter!") - return info_dict - input_file.close() - - input_file = open(matrix_file_name, 'rb') - # read matrix entries as values in dictionary with tuple(names) as key - info_dict = {} - contradictory_entries = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - data = line.strip().split(delim) - for idx in range(len(data[1:])): - # print tuple(sorted([data[0], names[idx]])), data[idx+1] - if symmetric: - key = tuple(sorted([names[idx], data[0]])) - else: - key = tuple(names[idx], data[0]) - if key in info_dict.keys(): - if 
symmetric and info_dict[key] != data[idx+1] and data[idx+1] not in ["", "-"] and info_dict[key] not in ["", "-"]: - contradictory_entries.append(key) - info_dict[key] = data[idx+1] - input_file.close() - - if len(contradictory_entries) != 0: - try: - logprint("\nContradictory entries in matrix file %s:\n\t%s" % (matrix_file_name, ", ".join(contradictory_entries))) - except: - log_txt = "\nContradictory entries in matrix file %s:\n\t" % (matrix_file_name) - for item in contradictory_entries: - log_txt += str(item).replace("'", "") + ", " - log_txt = log_txt[:-2] - logprint(log_txt) - logprint("Using value from bottom left triangle!") - if verbose: - logprint("\nMatrix information for Sequences named: " % ", ".join(names)) - - return info_dict - -def concatenate_files(file_list, combi_filename="temp_combined.fasta", verbose=False): - """ - concatenate content of all files in file_list into a combined file named combi_filename - """ - out_file = open(combi_filename, 'w') - text = "" - for item in file_list: - if verbose: - text += item + " " - print item, - # read in_file linewise and write to out_file - in_file = open(item, 'rb') - for line in in_file: - out_file.write(line.strip()+"\n") - in_file.close() - out_file.close() - if verbose: - logprint(text, start=False, printing=False) - return combi_filename - -def degap_fasta(input_fasta): - """ - remove gaps from fasta - new degapped sequence file created - """ - - # degap all sequence files - output_fastas = [] - if type(input_fasta) != list: - input_fasta = list(input_fasta) - for input_fas in input_fasta: - output_fas = input_fas[:input_fas.rfind(".")] + "_degapped.fas" - in_file = open(input_fas, 'rb') - out_file = open(output_fas, 'w') - for line in in_file: - if line.startswith(">"): - out_file.write(line.strip()+"\n") - else: - out_file.write(line.strip().replace("-", "")+"\n") - out_file.close() - in_file.close() - output_fastas.append(output_fas) - return output_fastas - -def legend_figure(colors, 
lcs_shading_num, type_nuc=True, unit="%", filetype="png", max_lcs_len=None, min_lcs_len=0, bins=[], alphas=[], gff_legend=False, prefix="", verbose=False): - """ - create figure color legend - """ - max_legend_length_row = 8 - max_legend_length_col = 4 - - # define output file - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg" - logprint(text, start=False, printing=True) - filetype="png" - - # check if length of information fit - if not gff_legend and ((bins != [] and len(colors) != lcs_shading_num+1) or (bins != [] and len(colors) != len(bins)+1)): - if bins != [] and len(colors) != lcs_shading_num+1: - text = "**Attention**\nlcs_shading_num (%d) does not match number of colors (%d)!\n"% (lcs_shading_num, len(bins)) - elif bins != [] and len(colors) != len(bins)+1: - text = "**Attention**\nnumber of LCS length bins (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - elif gff_legend and len(bins) != len(colors): - text = "**Attention**\nnumber of GFF Feature Types (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - - # set alpha values to opaque if none are provided - if alphas == []: - for item in colors: - alphas.append(1) - - # legend data points - data_points = range(len(colors)) - if not gff_legend: - - # specify intervals, if max_lcs_len provided - if max_lcs_len != None: - multi_factor = 100 # one digit - if max_lcs_len <= 1: - multi_factor = 1000 # two digits - # len_interval_size = (max_lcs_len-min_lcs_len) * multi_factor *1. // lcs_shading_num * (1./ multi_factor) - len_interval_size = (max_lcs_len-min_lcs_len) * 1. 
/ lcs_shading_num - len_pos = [float("%.2f" % (min_lcs_len))] - # calculate interval positions - for idx in range(lcs_shading_num): - len_pos.append(float("%.2f" % (len_pos[-1] + len_interval_size))) - - if prefix.startswith("custom-matrix") and (0 <= max_lcs_len <= 100 and 0 <= min_lcs_len <= 100): - unit = "%" - elif prefix.startswith("custom-matrix"): - unit = "" - - text = "\n%d Legend intervals from %.2f to %.2f: \n\t%s - number: %d, step: %.2f, unit: %s\n" % (lcs_shading_num+1, min_lcs_len, max_lcs_len, str(len_pos), len(len_pos), len_interval_size, unit) - logprint(text, start=False, printing=True) - pos = len_pos - interval_size = len_interval_size - else: - # generate legend labels acc. to standard interval notation - interval_size = 100 // lcs_shading_num - pos = range(interval_size, 101+interval_size, interval_size) - - if bins != []: # labels provided - legend_labels = bins[:] - legend_labels.append("max") - legend_labels_lengths = [] - for item in bins: - legend_labels_lengths.append("[%d %s, %d %s)" % (item - min(bins), unit, item, unit)) - if len(bins) == len(colors) - 1: - legend_labels_lengths.append("[%d %s, %s]" % (max(bins), unit, u"\u221E")) # infinite - - else: - legend_labels = [] - legend_labels_lengths = [] - for idx in range(len(pos)): - num = pos[idx] - legend_labels.append("[%d%%, %d%%)" % (num - interval_size, num)) - if max_lcs_len != None: - num = len_pos[idx] - # as int or float - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths.append("[%d %s, %d %s)" % (num, unit, num + len_interval_size, unit)) - else: - legend_labels_lengths.append("[%.2f %s, %.2f %s)" % (num, unit, num + len_interval_size, unit)) - legend_labels[-1] = "100" + unit - if max_lcs_len != None: - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths[-1] = "%d %s" % (max_lcs_len, unit) - else: - legend_labels_lengths[-1] = "%.2f %s" % (max_lcs_len, unit) - - # set labels and choose file 
name - if gff_legend: - label_text = bins[:] - edge_col = None - legend_file_name = "Selfdotplot_GFF_Shading_Legend_n%d." % lcs_shading_num + filetype - elif max_lcs_len != None: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_max%d%s_n%d." % (max_lcs_len, unit, lcs_shading_num) + filetype - elif bins != []: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%d%s_n%d." % (bins[0], unit, lcs_shading_num) + filetype - else: - label_text = legend_labels[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%%len_n%d." % lcs_shading_num + filetype - - if prefix != None and prefix != "": - if not prefix.endswith("-"): - prefix = prefix + "-" - legend_type = "LCS" - if prefix.startswith("custom-matrix"): - prefix = prefix.replace("custom-matrix", "")[1:] - legend_type = "CustomMatrix" - legend_file_name = prefix + legend_file_name.replace("LCS", legend_type) - - # plot legend figure - fig, ax = P.subplots(3, 1, figsize=(len(colors)*2, len(colors)*2)) - for idx in range(len(colors)): - ax[0].bar(data_points[idx]+1, data_points[idx]+1, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[2].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].set_ylim(0,1) - ax[2].set_ylim(0,1) - ax[1].legend(ncol=((len(colors)-1)//max_legend_length_row)+1, framealpha=1) # vertical legend - col_num = len(colors) - if len(colors) > max_legend_length_col: - remainder = 0 - if len(colors) % max_legend_length_col != 0: - remainder = 1 - row_num = len(colors) // max_legend_length_col + remainder - remainder = 0 - if len(colors) % row_num != 0: - remainder = 1 - col_num = len(colors) // row_num + remainder - 
ax[2].legend(ncol=col_num, framealpha=1) # horizontal legend - - P.savefig(legend_file_name) - - return legend_file_name - - -############################### -# Analysis Functions # -############################### - -def wobble_replacement(sequence, general_ambiguity_code, verbose=False): - """ - get all degenerated sequences for sequence with ambiguous residues - (only residues considered that are keys in wobble_dictionary) - """ - - # get positions of ambiguous residues - wobble_pos = [] - for idx in range(len(sequence)): - letter = sequence[idx] - if letter in general_ambiguity_code.keys(): - wobble_pos.append(idx) - - if verbose: - text = "\t%d wobbles" % len(wobble_pos) - logprint(text, start=False, printing=True) - - # replace one wobble through each iteration by all possible residues - # repeat if still wobbles in new kmers - kmer_variants = [sequence] - while True: - if verbose: - text = "\t\t%d kmer variants" % len(kmer_variants) - logprint(text, start=False, printing=True) - temp_kmers = set([]) - for kmer in kmer_variants: - for idx in wobble_pos: - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - for base in general_ambiguity_code[kmer[idx]]: - newkmer = kmer[:idx] + base + kmer[idx+1:] - temp_kmers.add(newkmer) - wobble = False - for kmer in temp_kmers: - for idx in range(len(kmer)): - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - wobble = True - break - if wobble: - break - kmer_variants = set(list(temp_kmers)[:]) - if not wobble: - break - - return kmer_variants - -def split_diagonals(data, stepsize=1): - """ - split array if point difference exceeds stepsize - data = sorted list of numbers - """ - return np.split(data, np.where(np.diff(data) != stepsize)[0]+1) - -def longest_common_substring(s1, s2): - m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))] - longest, x_longest = 0, 0 - for x in xrange(1, 1 + len(s1)): - for y in xrange(1, 1 + len(s2)): - if s1[x - 1] == s2[y - 1]: - m[x][y] = m[x - 1][y - 1] + 
1 - if m[x][y] > longest: - longest = m[x][y] - x_longest = x - else: - m[x][y] = 0 - return longest - -def lcs_from_x_values(x_values): - """ - calculate length of longest common substring based on nested list of numbers - """ - if len(x_values) == 0: - return 0 - # get lengths of each subarray data - lengths = np.array([len(i) for i in x_values]) - return max(lengths) - - -############################### -# Matching Functions # -############################### - -def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - - # forward - ################################# - kmer_pos_dict_one = {}; kmer_pos_dict_two = {} # dictionaries for both sequences - - # reverse complement - ################################# - kmer_pos_dict_three = {}; kmer_pos_dict_four = {} # dictionaries for both sequences - - # create dictionaries with kmers (wordsize) and there position(s) in the sequence - if rc_option: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two), - (str(seq_one), kmer_pos_dict_three), - (str(seq_two.reverse_complement()), kmer_pos_dict_four)] - else: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two)] - for (seq, kmer_pos_dict) in data_list: - for i in range(len(seq)-wordsize+1): - kmer = seq[i:i+wordsize] - # 
discard kmer, if too many Ns included - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - if not convert_wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - wobbles = False - for item in general_ambiguity_code.keys(): - if item in kmer: - wobbles = True - break - if not wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - kmer_variants = wobble_replacement(kmer, general_ambiguity_code) - for new_kmer in kmer_variants: - # print "\t", new_kmer - try: - kmer_pos_dict[new_kmer].append(i) - except KeyError: - kmer_pos_dict[new_kmer] = [i] - - # find kmers shared between both sequences - matches_for = set(kmer_pos_dict_one).intersection(kmer_pos_dict_two) # forward - matches_rc = set(kmer_pos_dict_three).intersection(kmer_pos_dict_four) # reverse complement - - if verbose: - text = "[matches: %i for; %.i rc]" % (len(matches_for), len(matches_rc)) - logprint(text, start=False, printing=True) - - # create lists of x and y co-ordinates for scatter plot - # keep all coordinates of all shared kmers (may match multiple times) - diag_dict_for = {} - diag_dict_rc = {} - for (match_list, pos_dict1, pos_dict2, diag_dict) in [(matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for), - (matches_rc, kmer_pos_dict_three, kmer_pos_dict_four, diag_dict_rc)]: - for kmer in match_list: - for i in pos_dict1[kmer]: - for j in pos_dict2[kmer]: - diag = i-j - points = set(range(i+1, i+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if 
rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - -def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize via regular expression search - fuzzy matching - allow up to substitution_count substitutions - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - ambiguity_match_dict = alphabets(type_nuc)[3] - - ambiq_residues = "[%s]" % "".join(general_ambiguity_code.keys()) - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # check for wobble presence - if not (regex.search(ambiq_residues, str(seq_one)) == None and regex.search(ambiq_residues, str(seq_two)) == None): - wobble_found = True - else: - wobble_found = False - - # dictionary for matches - diag_dict_for = {} - diag_dict_rc = {} - counter = [0, 0] - - # one-way matching - if rc_option: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0), - (str(seq_one), 
str(seq_two.reverse_complement()), diag_dict_rc, 1)] - else: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0)] - - for seq_query, seq_target, diag_dict, counter_pos in data_list: - # split query sequence into kmers - if not rc_option and counter_pos == 1: - break - - for idx in range(len(str(seq_query))-wordsize+1): - kmer = str(seq_query)[idx:idx+wordsize] - - # skip excessive N/X stretches (big black areas) - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - # convert kmer to regular expression for wobble_matching - if convert_wobbles and wobble_found: - kmer_string = "" - # replace each residue with matching residues or wobbles - for jdx in range(len(kmer)): - kmer_string += ambiguity_match_dict[kmer[jdx]] - else: - kmer_string = kmer - - # convert to regular expression tolerating substitution errors - if type(substitution_count) == int and substitution_count != 0: - kmer_string = "(%s){s<=%d}" % (kmer_string, substitution_count) - - # search for regular expression in target sequence - kdx = 0 - start = True - if regex.search(kmer_string, seq_target[kdx:]) != None: - counter[counter_pos] += 1 - while regex.search(kmer_string, seq_target[kdx:]) != None: - # search for regular expression pattern in target sequence - result = regex.search(kmer_string, seq_target[kdx:]) - - kmer2 = seq_target[kdx:][result.start():result.end()] - - # skip excessive N/X stretches (big black areas) - if kmer2.count(any_residue)*100./wordsize <= max_N_percentage: - diag = idx-(kdx+result.start()) - points = set(range(idx+1, idx+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - kdx += result.start() + 1 - if kdx >= len(seq_target): - break - elif regex.search(kmer_string, seq_target[kdx:]) != None: - counter[counter_pos] += 1 - - if verbose: - text = "%5.i \tforward matches" % counter[0] - text += "\n%5.i \treverse complementary matches" % counter[1] - logprint(text, start=False, 
printing=True) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - - -############################### -# Dot Plot Functions # -############################### - -def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, title_length=float("Inf")): - """ - self-against-self dotplot - partially from biopython cookbook - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least one input sequence - if len(sequences) == 0: - text = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - text += " No sequences provided for selfdotplot!\n\nTerminating polydotplot!" 
- logprint(text, start=False, printing=True) - return - elif len(sequences) == 1 and multi: - text = "\n\nCreating collage output for single selfdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nSelfdotplot Collage: Invalid collage - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences): - ncols = len(sequences) - nrows = 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=prefix, filetype=filetype, verbose=verbose) - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >=50% Ns are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide 
valid file type - png, pdf, or svg - given:%s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - global t1 - - print "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-"), - log_txt = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - - # preparations for file name - name_graph = "Selfdotplots" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - list_of_png_names = [] - - counter = 0 - for seq_name in sequences: - print seq_name, - log_txt += " " + seq_name - - counter += 1 - if not multi: - P.cla() # clear any prior graph - - # read sequence - seq_record = seq_dict[seq_name] - name_seq = seq_record.id - seq_one = seq_record.seq.upper() - length_seq = len(seq_one) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex(seq_one, seq_one, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG", - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag(seq_one, seq_one, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - - # shade annotated 
regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlim(0, length_seq+1) - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - P.title(unicode_name(shorten_name(name_seq, max_len=title_length)), fontsize=label_size, fontweight='bold') - # P.title(unicode_name(name_seq), fontsize=label_size*1.3, fontweight='bold') - - # save figure and reinitiate if page is full - if counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' % (prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - else: # not multi - - fig = P.figure(figsize=(plot_size, plot_size)) # figure size needs to be a square - ax = P.subplot(1, 1, 1) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - number = 0 - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlim(0, length_seq+1) - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', 
labelsize=label_size*.9) - P.title(unicode_name(shorten_name(name_seq, max_len=title_length)), fontsize=label_size*1.3, fontweight='bold') - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_%s_wordsize%i%s.%s' %(prefix, name_graph, counter, name_seq, wordsize, suffix, filetype) - P.savefig(fig_name, bbox_inches='tight') - - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - if multi and counter >= 1: - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - print "\n\nDrawing selfdotplots done" - log_txt += "\n\nDrawing selfdotplots done" - logprint(log_txt, start=False, printing=False) - - return list_of_png_names - -def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, length_scaling=True, scale_delim_col="red", title_length=float("Inf")): - """ - pairwise dotplot (all-against-all) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least two input sequences - if len(sequences) < 2: - text = "\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += " 
Please provide at least two sequences for pairdotplot!\n\nTerminating paired dotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 2 and multi: - text = "\n\nCreating collage output for single pairdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nPairdotplot Collage: Invalid collage settings - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences)*(len(sequences)-1): - ncols = len(sequences) - nrows = 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences)*(len(sequences)-1): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - - text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += ", ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - y_label_rotation = "vertical" - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >50% are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, 
or svg - given: %s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - # preparations for file name - name_graph = "Pairdotplot" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if length_scaling: - suffix += "_scaled" - if multi: - suffix += "_collage" - - - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - list_of_png_names = [] - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - - # prepare LCS data file - lcs_data_file = open("%sPairdotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - counter, seq_counter = 0, 0 - print "Drawing pairwise dotplot...", - log_txt = "Drawing pairwise dotplot..." - if verbose: - seq_text = "" - for idx in range(len(sequences)-1): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx+1, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += " " + str(seq_counter) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - - # write LCS data file - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - else: - # calculate figure size for separate figures - if len_one >= len_two: - sizing = (plot_size, max(2, (plot_size)*len_two*1./len_one)) - # sizing = (plot_size, min(plot_size, max(2, (plot_size-2)*len_two*1./len_one+2))) - else: - sizing = (max(2, (plot_size)*len_one*1./len_two), plot_size) - # sizing = (min(plot_size, max(2, (plot_size-2)*len_one*1./len_two+2)), plot_size) - fig = P.figure(figsize=(plot_size, plot_size)) - - ax = P.subplot(1, 1, 1) - - # collect lines - 
lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlabel(unicode_name(shorten_name(name_one, max_len=title_length)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.ylabel(unicode_name(shorten_name(name_two, max_len=title_length)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - if not multi: - if length_scaling: - ax.set_aspect(aspect='equal', adjustable='box', anchor='NW') - P.xlim(0, len_one+1) - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - elif not length_scaling: - P.xlim(0, len_one+1) - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - else: - max_len = max(len_one, len_two) - P.xlim(0, max_len+1) - P.ylim(max_len+1, 0) # rotate y axis (point downwards) - - # plot line deliminating shorter sequence - if max_len != len_one: - ax.plot((len_one+1, len_one+1), (0, len_two), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - if max_len != len_two: - ax.plot((0, len_one), (len_two+1, len_two+1), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - - # evtl. 
switch x axis position - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - P.setp(ax.get_xticklabels(), fontsize=label_size*.9) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) - - # save figure and reinitiate if page is full - if multi and counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=.5, wspace=.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=.5, wspace=.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - elif not multi: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, bottom=0.05) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_wordsize%i%s.%s' % (prefix, name_graph, counter, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - list_of_png_names.append(fig_name) - fig = P.figure() - - # save figure - if multi and counter >= 1: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=0.5, wspace=0.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.5, wspace=0.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - print - logprint(seq_text, start=False, printing=False) - - return list_of_png_names - -def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, lcs_shading=True, lcs_shading_ref=0, 
lcs_shading_interval_len=100, lcs_shading_ori=0, lcs_shading_num=5, spacing=0.04, input_user_matrix_file="", user_matrix_print=True, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, title_length=float("Inf"), rotate_labels=False): - """ - all-against-all dotplot - derived from dotplot function - - lcs_shading_refs: - 0 color relative to maximum lcs observed in dataset [default] - 1 color by coverage of shorter sequence (e.g. lcs = 70% of seq1) - lcs_shading_ori - 0 forward only - 1 reverse only - 2 both orientations (in opposite plot) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - if len(sequences) == 0: - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " No sequences provided for polydotplot!\n\nTerminating polydotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 1: - text = "\n\nCreating polydotplot for single sequence!" 
- text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n\n" - logprint(text, start=False, printing=True) - - - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " " + " ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >50% are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given: %s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - if lcs_shading and not type_nuc: - if lcs_shading_ori != 0: - lcs_shading_ori = 0 - text = "Protein shading does not support reverse complementary matching!\n" - logprint(text, start=False, printing=True) - - # read custom shading matrix & match names of sequences to fasta - if input_user_matrix_file != "" and input_user_matrix_file != None: - logprint("Reading user matrix file: %s" % input_user_matrix_file) - # lcs_shading_ori = 2 - custom_dict = read_matrix(input_user_matrix_file) - if custom_dict != {}: - custom_shading = True - custom_similarity_dict = {} - invalid_entries = [] - custom_max = 0 - custom_min = float("Inf") - for key in custom_dict.keys(): - number_key = [] - - # convert number into float - try: - value = float(custom_dict[key]) - if not "." 
in custom_dict[key]: - value = int(custom_dict[key]) - custom_max = max(custom_max, value) - custom_min = min(custom_min, value) - except: - value = custom_dict[key] - if value == "": - value = None - invalid_entries.append(key) - # match matrix names with sequence names - for item in key: - if item in sequences: - number_key.append(sequences.index(item)) - else: - number_key.append(-1) - # dictionary with tuple of sorted sequence indices as key and number as value - custom_similarity_dict[tuple(sorted(number_key))] = value - if len(invalid_entries) != 0: - text = "No valid number in custom similarity matrix for %d entries: \n\t" % (len(invalid_entries)) - for key in invalid_entries: - text += str(key) + " - " + str(custom_dict[key]) + "; " - logprint(text[:-2]+"\n") - - text = "Custom user matrix given: min %.2f, max %.2f\n" % (custom_min, custom_max) - - # artificially rounding intervals if likely identity/divergence percentages - if 0 <= custom_min < 1 and 0 < custom_max <= 1: - rounding_factor = 5 - multi_factor = 100 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (multi_factor*custom_min // rounding_factor) * (1.*rounding_factor/multi_factor)) - custom_max = min((multi_factor*custom_max // rounding_factor) * (1.*rounding_factor/multi_factor), 1) - text += "new (%.2f, %2f)\n" % (custom_min, custom_max) - - elif 0 <= custom_min < 100 and 0 < custom_max <= 100: - rounding_factor = 5 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (custom_min // rounding_factor) * rounding_factor) - custom_max = min((custom_max // rounding_factor) * rounding_factor, 100) - text += "new (%d, %d)\n" % (custom_min, custom_max) - - logprint(text) - - else: - custom_shading = False - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF 
annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=prefix, filetype=filetype, verbose=verbose) - - name_graph = "Polydotplot" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if custom_shading: - suffix += "_matrix" - if lcs_shading: - suffix += "_%dshades_ref%d_ori%s" % (lcs_shading_num+1, lcs_shading_ref, lcs_shading_ori) - if "ref2" in suffix and type_nuc: - suffix = suffix.replace("ref2", "%dbp" % lcs_shading_interval_len) - elif "ref2" in suffix: - suffix = suffix.replace("ref2", "%daa" % lcs_shading_interval_len) - - - # name and create output files (names derived from SEQNAME) - if prefix != None and str(prefix) != "": - prefix = str(prefix) + "-" - else: - prefix = "" - - # preparations for background shading - if lcs_shading or custom_shading: - # create color range white to grey - colors = create_color_list(lcs_shading_num+1, color_map=None, logging=True) - colors_2 = create_color_list(lcs_shading_num+1, color_map="OrRd", logging=True) - - if custom_shading: - text = "Custom Matrix Colors: " + ", ".join(colors_2) - - # write lcs lengths to file - lcs_data_file = open("%sPolydotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - # compare sequences pairwise - save lcs and line information in dictionary for plotting - data_dict = {} # keys = tuple(idx, jdx), value = x1, y1, x2, y2 (line positions) - lcs_dict = {} # keys = tuple(idx, jdx), value = length of lcs: lcs_len or (lcs_for, lcs_rev) - for_lcs_set = set([]) # keep lengths to calculate max (excluding self comparisons) - rev_lcs_set = set([]) # keep lengths 
to calculate max (all) - - text = "\nTotal plot count: %d" % (len(sequences)*(len(sequences))) - text += "\nTotal calculations: %d" % (len(sequences)*(len(sequences)+1)/2) - logprint(text, start=False, printing=True) - - print "\nCalculating shared regions and lengths of longest_common_substring...", - log_txt = "\nCalculating shared regions and lengths of longest_common_substring..." - # determine matches and length of lcs by comparing all sequence pairs - if verbose: - seq_text = "" - counter = 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif len(sequences) < 5: - print "\t%s (%d %s), %s (%d %s)" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - log_txt += "\t%s (%d %s), %s (%d %s)\n" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - else: - if not counter % 25: - print counter, - log_txt += str(counter) - - # get positions of matches & length of longest common substring based on match lengths - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - data_dict[(idx, jdx)] = x1[:], y1[:], x2[:], y2[:] - 
lcs_dict[idx, jdx] = lcs_for, lcs_rev - - if idx != jdx: - for_lcs_set.add(lcs_for) - rev_lcs_set.add(lcs_rev) - - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - if not verbose: - print len(sequences)*(len(sequences)+1)/2, " done\n" - log_txt += str(len(sequences)*(len(sequences)+1)/2) + " done\n" - else: - print "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - log_txt += "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - logprint(log_txt, start=False, printing=False) - - if verbose: - logprint ("\n\nlcs_dict\n" + str(lcs_dict)) - if custom_shading: - logprint ("\ncustom_dict\n" + str(custom_dict)) - logprint ("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) - - if verbose: - print - logprint(seq_text+"\n", start=False, printing=False) - - if lcs_shading_ref == 2: - color_bins = [] - text = "\nLCS lengh bins: " - for idx in range(lcs_shading_num): - color_bins.append(lcs_shading_interval_len*(idx+1)) - text += " " + str(lcs_shading_interval_len*(idx+1)) - logprint(text, start=False, printing=True) - - # calculate maximum lcs length - if lcs_shading_ori == 0: # forward only - if len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - elif lcs_shading_ori == 1: # reverse complement only - if len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - else: - max_lcs = None - else: # both orientations - if len(rev_lcs_set) != 0 and len(for_lcs_set) != 0: - max_lcs = max(max(rev_lcs_set), max(for_lcs_set)) - elif len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - elif len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - - if not max_lcs == None: - text = "Maximum LCS: %d %s" % (max_lcs, aa_bp_unit) - logprint(text, start=False, printing=True) - if custom_shading: - text = "Maximum custom value: %d\n" % custom_max - 
logprint(text, start=False, printing=True) - - # count sequences - ncols = len(sequences); nrows = len(sequences) - - # get sequence lengths to scale plot widths and heights accordingly - size_ratios = [] - for item in sequences: - size_ratios.append(len(seq_dict[item].seq)) - - P.cla() # clear any prior graph - # use GridSpec to resize plots according to sequence length - gs = gridspec.GridSpec(nrows, ncols, - width_ratios=size_ratios, - height_ratios=size_ratios) - fig = P.figure(figsize=(plot_size, plot_size)) - - # determine label orientations - if len(sequences) > 5 or rotate_labels: - x_label_rotation = 45 - y_label_rotation = "horizontal" - if x_label_pos_top: - xhalign = 'left' - xvalign = 'bottom' - else: - xhalign = 'right' - xvalign = 'top' - yhalign = "right" - else: - x_label_rotation = "horizontal" - y_label_rotation = "vertical" - xvalign = "center" - xhalign = "center" - yhalign = "center" - yvalign = 'center' - - print "\nDrawing polydotplot...", - log_txt = "\nDrawing polydotplot..." - - # draw subplots - if verbose: - if lcs_shading and custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "Custom matrix value", "Matrix color index", "LCS color index"]) + "\n" - elif lcs_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "LCS color index for", "LCS color index rev"]) + "\n" - elif custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "Custom matrix value", "Color index for", "Color index rev"]) + "\n" - - if verbose: - seq_text = "" - counter, seq_counter = 0, 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - len_two = len(rec_two.seq) - name_two = rec_two.id - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - len_one = len(rec_one.seq) - name_one = rec_one.id - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - # optional shade background according to length of LCS and/or user matrix - ######################################################################### - - # get interval based on LCS - background_colors = [None, None] - if lcs_shading and (lcs_shading_ref==1 or lcs_shading_ref==2 or max_lcs!=None): # self plot max_lcs_for == None - lcs_len = lcs_dict[(idx, jdx)] - l1 = lcs_len[0] # forward - l2 = lcs_len[1] # reverse complement - - lcs_shading_bool = True - - # calculate shading acc. to chosen option - if lcs_shading_ref == 1: # percentage of shorter sequence - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // min(len_one, len_two)) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // min(len_one, len_two)) - elif lcs_shading_ref == 2: # by given interval size - color_idx0 = min(len(colors)-1, l1 // lcs_shading_interval_len) - color_idx1 = min(len(colors)-1, l2 // lcs_shading_interval_len) - if color_idx0 >= len(colors): - color_idx0 = len(colors) - if color_idx1 >= len(colors): - color_idx1 = len(colors) - else: # percentage of maximum lcs length - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // max_lcs) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // max_lcs) - else: - lcs_shading_bool = False - - # get interval based on custom matrix - if custom_shading: - # matrix value - try: - custom_value = custom_similarity_dict[(idx, jdx)] - except: - custom_value = "" - - # bottom left triangle = LCS forward/reverse or best of both - if lcs_shading_bool: - if lcs_shading_ori == 0: # forward - color_idx1 = 
color_idx0 - elif lcs_shading_ori == 2: # both directions - color_idx1 = max(color_idx0, color_idx1) - - # top right triangle = custom value (not colored if text matrix provided) - if type(custom_value) == int or type(custom_value) == float: - color_idx0 = int((custom_value-custom_min)*lcs_shading_num // (custom_max-custom_min)) - # if string is proviced - else: - color_idx0 = 0 - - # set colors dependent on lcs dependent on orientation - if lcs_shading_bool and not custom_shading: - if idx != jdx: - if lcs_shading_ori == 0: - color_idx1 = color_idx0 - elif lcs_shading_ori == 1: - color_idx0 = color_idx1 - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx1] - # for selfcomparison, only color reverse complement - elif lcs_shading_ori != 0 and not custom_shading: - background_colors[0] = colors[color_idx1] - # set different colors for shading by LCS + user matrix - elif lcs_shading_bool and custom_shading: - # print colors, background_colors, color_idx0, color_idx1 - background_colors[0] = colors_2[color_idx0] - background_colors[1] = colors[color_idx1] - # set grey color range for user matrix if no LCS shading - elif custom_shading: - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx0] - - if verbose: - if custom_shading and lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - elif lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(color_idx0), str(color_idx1)]) + "\n" - elif custom_shading: - lcs_text += "\t".join([name_one, name_two, str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - - # diagonal (self-dotplots) - if idx == jdx: - # skip positions below diagonal - counter = counter + (counter - 1) // (nrows) # + row_pos - counters = [counter] - # draw both graphs at once (due to symmetry) - else: - col_pos = (counter - 1) % ncols - 
row_pos = (counter - 1) // (nrows) - counter2 = col_pos * ncols + row_pos + 1 - counters = [counter, counter2] - - if len(counters) == 2: - seq_counter += 1 - if not verbose and not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - x_lists, y_lists, x_lists_rc, y_lists_rc = data_dict[(idx, jdx)] - - # plot diagram(s) - for kdx in range(len(counters)): - - # shade annotated regions if gff file(s) provided - if idx == jdx and gff_files != None and gff_files != []: - if name_one in feat_dict.keys(): - features = feat_dict[name_one] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # if custom matrix value printed into upper matrix triangle, skip data plotting - # text print in top triangle - if user_matrix_print and custom_shading and kdx==0 and idx!=jdx: - data_plotting = False - # dotplot in bottom triangle - else: - data_plotting = True - - fig_pos = counters[kdx] - # plotting subplot with matplotlib - ax = P.subplot(gs[fig_pos-1]) # rows, columns, plotnumber - - # mirror plot, if plotting below diagonal - if kdx == 0: - l1, l2 = len_one, len_two - n1, n2 = name_one, name_two - x1, y1 = x_lists, y_lists - x2, y2 = x_lists_rc, y_lists_rc - else: - l2, l1 = len_one, len_two - n2, n1 = name_one, name_two - x1, y1 = y_lists, x_lists - x2, y2 = y_lists_rc, x_lists_rc - - if data_plotting: - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) 
- color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # plot value provided by customer instead of dotplot - else: - alignment = {'horizontalalignment': 'center', 'verticalalignment': 'center'} - # P.text(0.5, 0.5, custom_value, size='medium', transform=ax.transAxes, **alignment) - P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, **alignment) - # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, - # horizontalalignment='center', verticalalignment='center', color="black") - - if custom_shading: - # omit diagonal - if idx == jdx: - ax.set_facecolor("white") - # use white background for text fields (top right triangle only [kdx 0]) - elif type(custom_value) != int and type(custom_value) != float and kdx == 0: - ax.set_facecolor("white") - else: - ax.set_facecolor(background_colors[kdx]) - # set background color if lcs shading - elif lcs_shading_bool and background_colors[kdx] != None: - ax.set_facecolor(background_colors[kdx]) - - # set axis limits - P.xlim(0, l1+1) - P.ylim(l2+1, 0) # rotate y axis (point downwards) - - # determine axis positions - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - x_label_bool = fig_pos <= ncols - x_tick_bool = fig_pos > ncols*(ncols-1) - else: - x_label_bool = fig_pos > ncols*(ncols-1) - x_tick_bool = fig_pos <= ncols - - # x axis labels dependent on plot position/number - if x_label_bool: # x title and labels on top or bottom - P.xlabel(unicode_name(shorten_name(n1, max_len=title_length)), fontsize=label_size, rotation=x_label_rotation, verticalalignment=xvalign, horizontalalignment=xhalign, fontweight='bold', labelpad=8) # axis naming - if not x_label_rotation in ["horizontal", "vertical"]: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation="vertical") - else: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, 
rotation=x_label_rotation) - elif x_tick_bool and x_label_pos_top: # x ticks on bottom row - ax.xaxis.tick_bottom() # ticks without labels on bottom - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) - elif x_tick_bool: # x ticks on top row - ax.xaxis.tick_top() # # ticks without labels on top - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) # inner diagrams without labelling - else: # no x ticks on internal rows - ax.axes.get_xaxis().set_visible(False) - - # y axis labels dependent on plot position/number - if fig_pos % ncols == 1 or (ncols == 1 and nrows == 1): # y title and labels in 1st column - P.ylabel(unicode_name(shorten_name(n2, max_len=title_length)), fontsize=label_size, rotation=y_label_rotation, verticalalignment=yvalign, horizontalalignment=yhalign, fontweight='bold', labelpad=8) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) # axis naming - elif fig_pos % ncols == 0: # y ticks in last column - ax.yaxis.tick_right() - P.setp(ax.get_yticklabels(), visible=False) # inner diagrams without labelling - else: - ax.axes.get_yaxis().set_visible(False) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - try: - logprint(lcs_text, start=False, printing=True) - except: - pass - - # finalize layout - margins & spacing between plots - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - # gs.tight_layout(fig, h_pad=.02, w_pad=.02) # less overlapping tick labels, but also disturbingly large spacing - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, top=0.87) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, bottom=0.13) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing) # space between rows - def 0.4 - - # save figure and close instance - fig_name = '%s%s_wordsize%i%s.%s' % (prefix, name_graph, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - - # create figure color legend - if lcs_shading: - if lcs_shading_ref == 1: # percentage of shorter sequence - legend_file_name = legend_figure(colors, lcs_shading_num, unit="%", filetype=filetype, prefix=prefix) - elif lcs_shading_ref == 2: # interval sizes - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, bins=color_bins) - else: # relative of maximum lcs - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, max_lcs_len=max_lcs) - - if custom_shading: - custom_prefix = "custom-matrix-" + prefix - legend_file_name_custom = legend_figure(colors_2, lcs_shading_num, unit="%", filetype=filetype, prefix=custom_prefix, max_lcs_len=custom_max, min_lcs_len=custom_min) - - if lcs_shading and custom_shading: - return [fig_name, legend_file_name, legend_file_name_custom] - elif lcs_shading: - return [fig_name, legend_file_name] - elif custom_shading: - return [fig_name, legend_file_name_custom] - else: - return [fig_name] - - -############################### -# Function Call # -############################### - -def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_size=10, filetype="png", type_nuc=True, convert_wobbles=False, 
substitution_count=0, rc_option=True, alphabetic_sorting=False, gff=None, multi=True, ncols=1, nrows=1, lcs_shading=True, lcs_shading_num=5, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, gff_color_config_file="", input_user_matrix_file="", user_matrix_print=False, length_scaling=True, title_length=50, spacing=0.04, verbose=False): - - global t1, line_col_rev - - # read gff color config file if provided - if len(input_gff_files) != 0 and input_gff_files != None: - if gff_color_config_file not in ["", None]: - text = "\n%s\n\nReading GFF color configuration file\n%s\n\n=> %s\n" % (50*"=", 28*"-", gff_color_config_file) - logprint(text, start=False, printing=True) - gff_feat_colors = read_gff_color_config(gff_color_config_file) - else: - gff_feat_colors = {} - if gff_color_config_file not in ["", None]: - text = "Please provide GFF annotation files to use configuration file", gff_color_config_file - logprint(text, start=False, printing=True) - - # if color is set to white, reverse complementary matches are skipped - if not rc_option: - line_col_rev = "white" # reverse matches not calculated - elif not type_nuc: - logprint("Reverse complement deactivated for proteins!") - line_col_rev = "white" # reverse matches not calculated - - mode_text = [] - for item in modes: - mode_text.append(str(item)) - text = "%s\n\nRunning plotting modes %s" % (50*"=", ", ".join(mode_text)) - logprint(text, start=False, printing=True) - - - # create dotplots - ########################################## - - # self dotplots - t1 = time.time() - if 0 in modes: - list_of_png_names = selfdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, gff_files=gff, gff_color_dict=gff_feat_colors, verbose=verbose) - t1 = time_track(t1) - if 
list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # paired dotplots - if 1 in modes: - if multi: - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=length_scaling, verbose=verbose) - t1 = time_track(t1) - else: - if not length_scaling: - text = "\nPairwise dotplot with individual output files scaled by sequence length automatically!" - logprint(text, start=False, printing=True) - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=True, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # all-against-all dotplot - if 2 in modes: - list_of_png_names = polydotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, 
input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, spacing=spacing, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - text = "\n" + 50 * "#" + "\n" + 50 * "#" - text += "\n\nThank you for using FlexiDot!\n" - logprint(text, start=False, printing=True) - -# testing mode for debugging -trial_mode = False -# trial_mode = True - -# parameters = check_input(sys.argv) -parameters = check_input(sys.argv, trial_mode=trial_mode) - -# read out parameters -commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype, type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos_top, label_size, spacing, length_scaling, title_length, verbose = parameters - -# evtl. 
overwrite parameters for testing purposes in trial mode -if trial_mode: - # input_user_matrix_file = "AngioSINE-v18-alignment-identities.csv" - input_fasta = ["test-sequences-9-Ns.fas"] - input_fasta = ["Beta_SINEs__select_consensus.fas"] - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-01.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-comma-str.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-100+.txt" - # user_matrix_print = True - output_file_prefix = "SINEmatrix" - output_file_prefix = "SINEmatrix-NoShading" - plot_size = 10 - plotting_modes = [0,1,2] - plotting_modes = [2] - lcs_shading = False - lcs_shading = True - lcs_shading_ref = 2 - lcs_shading_num = 4 - lcs_shading_ori = 0 - lcs_shading_interval_len = 15 - wordsize = 10 - wordsize = 7 - x_label_pos_top = True - filetype = "pdf" - filetype = "png" - - wobble_conversion = False - wobble_conversion = True - - substitution_count = 0 - - rc_option = True - rc_option = False - label_size = 10 - - verbose = False - verbose = True - -if auto_fas: - path = os.path.dirname(os.path.abspath(__file__)) - files_long = glob.glob(path+"/*.fasta") - files_long.extend(glob.glob(path+"/*.fas")) - files_long.extend(glob.glob(path+"/*.fa")) - files_long.extend(glob.glob(path+"/*.fna")) - input_fasta = [] - for i in files_long: - if not "combined" in i: - filename = i[i.rfind('\\')+1:] - input_fasta.append(filename) - -if trial_mode: - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - -main(input_fasta, wordsize, modes=plotting_modes, prefix=output_file_prefix, plot_size=plot_size, label_size=label_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=wobble_conversion, substitution_count=substitution_count, rc_option=rc_option, gff=input_gff_files, multi=collage_output, ncols=m_col, nrows=n_row, alphabetic_sorting=alphabetic_sorting, 
lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, gff_color_config_file=gff_color_config_file, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, length_scaling=length_scaling, title_length=title_length, spacing=spacing, verbose=verbose) - - diff --git a/code/flexidot_v1.02.py b/code/flexidot_v1.02.py deleted file mode 100644 index 21cb5ae..0000000 --- a/code/flexidot_v1.02.py +++ /dev/null @@ -1,3177 +0,0 @@ -#!/usr/bin/python2.7 -#!/usr/bin/python2.7 -# -*- coding: utf-8 -*- - -""" -FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation - -Kathrin M. Seibt, Thomas Schmidt and Tony Heitkam -Institute of Botany, TU Dresden, Dresden, 01277, Germany - -(Bioinformatics, 2018, doi 10.1093/bioinformatics/bty395) -""" - - -############################### -# Requirements # -############################### - -# import system modules -import os, glob -import time, datetime -import sys -import shutil, getopt -import unicodedata - -def module_install_command(module_name, upgrade=False): - """ - create installation commands for Python modules and print information - """ - if upgrade: - load_command = "python -m pip install --upgrade %s" % module_name - else: - load_command = "python -m pip install %s" % module_name - - try: - logprint("Installing Python module: %s\n\t%s\n" % (module_name, load_command)) - except: - print "Installing Python module: %s\n\t%s\n" % (module_name, load_command) - - return load_command - -def load_modules(): - """ - load Python modules, if possible - otherwise try to install them - """ - - # make module names global - global cllct, gridspec, patches, rcParams, mplrc, P, Color, SeqIO, np, ccv, mcolors, rgb2hex, regex - - # matplotlib - try: - import matplotlib.collections as cllct - except: - command = module_install_command("matplotlib", upgrade=True) - try: - 
os.system(command) - print "\n" - import matplotlib.collections as cllct - except: - print "Please install module matplotlib manually" - from matplotlib.colors import colorConverter as ccv - import matplotlib.colors as mcolors - import matplotlib.gridspec as gridspec - import matplotlib.patches as patches - import pylab as P - - # specify matplotlib font settings - from matplotlib import rc as mplrc - mplrc('pdf', fonttype=42, compression=0) - from matplotlib import rcParams - rcParams['font.family'] = 'sans-serif' - rcParams['font.sans-serif'] = ['Helvetica', 'Verdana', 'Tahoma', ] - - # colour for color gradient palette - try: - from colour import Color - except: - command = module_install_command("colour") - try: - os.system(command) - print "\n" - from colour import Color - except: - print "Please install module colour manually" - - # color converter - try: - from colormap import rgb2hex - except: - command = module_install_command("colormap") - # additional module easydev.tools required by colormap - command2 = module_install_command("easydev") - try: - os.system(command) - os.system(command2) - print "\n" - from colormap import rgb2hex - except: - print "Please install module colormap manually" - - # biopython - try: - from Bio import SeqIO - except: - command = module_install_command("biopython") - try: - os.system(command) - print "\n" - from Bio import SeqIO - except: - print "Please install module biopython manually" - - # numpy - try: - import numpy as np - except: - command = module_install_command("numpy") - try: - os.system(command) - print "\n" - import numpy as np - except: - print "Please install module numpy manually" - - # regex for pattern matching - try: - import regex - except: - command = module_install_command("regex") - try: - os.system(command) - print "\n" - import regex - except: - print "Please install module regex manually" - -load_modules() - - -############################### -# Usage & Input # -############################### - -def 
usage(): - """ - usage and help - """ - - print """\n\n FLEXIDOT - ------------------------------------------------------------------- - - Version: - 1.02 - - Citation: - Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (2018) - "FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation" - Bioinformatics, doi: 10.1093/bioinformatics/bty395 - - - General usage: - $ python flexidot.py -a [ARGUMENTS] - $ python flexidot.py -i [ARGUMENTS] - - - ARGUMENTS - ------------------------------------------------------------------- - - - INPUT/OUTPUT OPTIONS... required are [-a] OR [-i] - - -a, --auto_fas Imports all fasta files from current directory (*.fasta, *.fas, *.fa, *.fna) - -i is not needed, if -a is activated - [inactive by default] - - -i, --in_file Input fasta file (fasta file name or comma-separated file list) - > Provide multiple files: Recall -i or provide comma-separated file names - - -o, --output_file_prefix File prefix to be added to the generated filenames [default = NONE] - - -c, --collage_output Multiple dotplots are combined in a collage - Y or 1 = ON [default] - N or 0 = OFF - - -m, --m_col Number of columns per page [default = 4] (only if --collage_output is ON) - - -n, --n_row Number of rows per page [default = 5] (only if --collage_output is ON) - - -f, --filetype Output file format - 0 = PNG [default] - 1 = PDF - 2 = SVG - - -s, --alphabetic_sorting Sort sequences alphabetically according to titles - Y or 1 = ON - N or 0 = OFF [default] - - - CALCULATION PARAMETERS... 
- - -k, --wordsize Wordsize (kmer length) for dotplot comparison [default = 7] - - -p, --plotting_mode Mode of FlexiDot dotplotting - 0 = self [default] - 1 = paired - 2 = poly (matrix with all-against-all dotplots) - > Run multiple plotting modes: Recall -p or provide comma-separated numbers - - -t, --type_nuc Type of residue is nucleotide - Y or 1 = nucleotide [default] - N or 0 = amino acid - - -w, --wobble_conversion Ambiguity handling for relaxed matching - Y or 1 = ON - N or 0 = OFF [default] - - -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching - [default = 0] - - -r, --rc_option Find reverse complementary matches (only if type_nuc=y) - Y or 1 = ON [default] - N or 0 = OFF - - - GRAPHIC FORMATTING... - - -A, --line_width Line width [default = 1] - - -B, --line_col_for Line color [default = black] - - -C, --line_col_rev Reverse line color [default = green] - - -D, --x_label_pos Position of the X-label - Y or 1 = top [default] - N or 0 = bottom - - -E, --label_size Font size [default = 10] - - -F, --spacing Spacing between all-against-all dotplots (only if --plotting_mode=2) - [default = 0.04] - - -P, --plot_size Plotsize [default = 10] - - -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) - Y or 1 = Scaling ON (axes scaled according to sequence length) - N or 0 = Scaling OFF (squared plots) [default] - - -T, --title_length Limit title length for self dotplot comparison - Use last characters of the title name (instead of the first): add an "E" (end), e.g. -T 20E - [default = 20, the first 20 characters] - - - GFF SHADING (for -p/--plotting_mode=0 only)... 
- - -g, --input_gff_files GFF3 file used for markup in self-dotplots - (provide multiple files: Recall -g or provide comma-separated file names) - - -G, --gff_color_config_file Tab-delimited config file for custom gff shading - column 1: feature type - column 2: color - column 3: alpha - column 4: zoom factor (for small regions) - - - LCS SHADING OPTIONS (for -p/--plotting_mode=2 only)... - - -x, --lcs_shading Shade subdotplot based on the length of the longest common substring (LCS) - Y or 1 = ON - N or 0 = OFF [default] - - -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) - [default = 5] - - -y, --lcs_shading_ref Reference for LCS shading - 0 = maximal LCS length [default] - 1 = maximally possible length (length of shorter sequence in pairwise comparison) - 2 = given interval sizes - DNA [default 100 bp] or proteins [default 10 aa] - see -Y - - -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) - [default for nucleotides = 50; default for amino acids = 10] - - -z, --lcs_shading_ori Shade subdotplots according to LCS on - 0 = forward [default], - 1 = reverse, or - 2 = both strands (forward shading above diagonal, reverse shading on diagonal and below; - if using --input_user_matrix_file, best LCS is used below diagonal) - - - CUSTOM USER MATRIX SHADING OPTIONS (for -p/--plotting_mode=2 only)... - - -u, --input_user_matrix_file Shading above diagonal according to values in matrix file specified by the user - (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n - e.g. identity matrix from multiple sequence alignment - strings are ignored) - - -U, --user_matrix_print Display provided matrix entries in the fields above diagonal of all-against-all dotplot - Y or 1 = ON - N or 0 = OFF [default] - - - OTHERS... 
- - -h, --help Help screen - - -v, --verbose Verbose - - - - - """ - -def check_input(argv, trial_mode=False): - """ - commandline argument parsing - """ - - global log_txt, aa_bp_unit - - # helpers for argument parsing - ###################################### - - arguments = ["-a", "--auto_fas", "a", "auto_fas", - "-i", "--input_fasta", "i:", "input_fasta=", - "-o", "--output_file_prefix", "o:", "output_file_prefix=", - "-c", "--collage_output", "c:", "collage_output=", - "-m", "--m_col", "m:", "m_col=", - "-n", "--n_row", "n:", "n_row=", - "-f", "--filetype", "f:", "filetype=", - "-t", "--type_nuc", "t:", "type_nuc=", - "-g", "--input_gff_files", "g:", "input_gff_files", - "-G", "--gff_color_config_file", "G:", "gff_color_config_file", - "-k", "--wordsize", "k:", "wordsize=", - "-p", "--plotting_mode", "p:", "plotting_mode=", - "-w", "--wobble_conversion", "w:", "wobble_conversion=", - "-S", "--substitution_count", "S:", "substitution_count=", - "-r", "--rc_option", "r:", "rc_option=", - "-s", "--alphabetic_sorting", "s:", "alphabetic_sorting=", - "-x", "--lcs_shading", "x:", "lcs_shading=", - "-X", "--lcs_shading_num", "X:", "lcs_shading_num=", - "-y", "--lcs_shading_ref", "y:", "lcs_shading_ref=", - "-Y", "--lcs_shading_interval_len", "Y:", "lcs_shading_interval_len=", - "-z", "--lcs_shading_ori", "z:", "lcs_shading_ori=", - "-u", "--input_user_matrix_file", "u:", "input_user_matrix_file=", - "-U", "--user_matrix_print", "U:", "user_matrix_print=", - "-P", "--plot_size", "P:", "plot_size=", - "-A", "--line_width", "A:", "line_width=", - "-B", "--line_col_for", "B:", "line_col_for=", - "-C", "--line_col_rev", "C:", "line_col_rev=", - "-D", "--x_label_pos", "D:", "x_label_pos=", - "-E", "--label_size", "E:", "label_size=", - "-F", "--spacing", "F:", "spacing=", - "-L", "--length_scaling", "L:", "length_scaling=", - "-T", "--title_length", "T:", "title_length=", - "-h", "--help", "h", "help", - "-v", "--verbose", "v", "verbose"] - - arguments_sysargv = 
tuple(arguments[0::4] + arguments[1::4]) - arguments_opts = "".join(arguments[2::4]) - arguments_args = arguments[3::4] - - - # setting defaults - ###################################### - - auto_fas = False # 0 - input_fasta = [] - output_file_prefix = None - collage_output = True # 1 - m_col = 4 - n_row = 5 - filetype = 0 - type_nuc = True - input_gff_files = [] - gff_color_config_file = "" - - wordsize = 7 - plotting_modes = [0] - wobble_conversion = False # 0 - substitution_count = 0 - rc_option = True # 1 - alphabetic_sorting = False # 0 - - lcs_shading = False # 0 - lcs_shading_num = 4 - lcs_shading_ref = 0 - lcs_shading_interval_len = 50 # interval default changes to "10" for amino acids [type_nuc = n] - lcs_shading_ori = 0 - - input_user_matrix_file = "" - user_matrix_print = False - - plot_size = 10 - line_width = 1 - line_col_for = "black" - line_col_rev = "#009243" - x_label_pos = True # 0 - label_size = 10 - spacing = 0.04 - length_scaling = False # 0 - title_length = 20 #float("Inf") - title_selectpos = "B" # B (begin), E (end) - - aa_bp_unit = "bp" - - verbose = False # 0 - - filetype_dict = {0: "png", 1: "pdf", 2: "svg"} - lcs_shading_ref_dict = {0: "maximal LCS length", 1: "maximally possible length", 2: "given interval sizes"} - plotting_mode_dict = {0: "self", 1: "paired", 2: "all-against-all"} - lcs_shading_ori_dict = {0: "forward", 1: "reverse complement", 2: "both"} - - # return default parameters for testing purposes - if trial_mode: - print "ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n" - - commandline = "trial_mode\n" - - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, 
line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, verbose] - return parameters - - - # read arguments - ###################################### - - commandline = "" - for arg in sys.argv: - commandline += arg + " " - - log_txt = "\n...reading input arguments..." - print log_txt - - if len(sys.argv) < 2: - print "\nERROR: More arguments are needed. Exit..." - log_txt += "\nERROR: More arguments are needed. Exit..." - usage() - sys.exit() - - elif sys.argv[1] not in arguments_sysargv: - print "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - log_txt += "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - # usage() - sys.exit() - - try: - opts, args = getopt.getopt(sys.argv[1:], arguments_opts, arguments_args) - - except getopt.GetoptError: - print "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - log_txt += "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - # usage() - sys.exit() - - for opt, arg in opts: - - if opt in ("-h", "--help"): - print "...fetch help screen" - log_txt += "\n...fetch help screen" - usage(), sys.exit() - - if opt in ("-v", "--verbose"): - print "...verbose output" - log_txt += "\n...verbose output" - verbose = True - - elif opt in ("-i", "--input_fasta"): - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: fasta_file '%s' was not found!" % str(temp_file) - sys.exit(message) - else: - input_fasta.append(str(temp_file)) - print "fasta file #%i: %s" % (len(input_fasta), str(temp_file)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: fasta_file '%s' was not found!" 
% str(arg) - log_txt += message - sys.exit(message) - else: - input_fasta.append(str(arg)) - print "fasta file #%i: %s" % (len(input_fasta), str(arg)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(arg)) - - - elif opt in ("-a", "--auto_fas"): - auto_fas = True - - - # multiple gff files: reads them into a list - elif opt in ("-g", "--input_gff_files"): - - # append gff file only if existing - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: gff_file '%s' was not found!" % str(temp_file) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - print "GFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - input_gff_files.append(str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: gff_file '%s' was not found!" % str(arg) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - input_gff_files.append(str(arg)) - print "GFF file #%i: %s" %(len(input_gff_files), str(arg)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(arg)) - - - elif opt in ("-G", "--gff_color_config_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: gff_color_config_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot with default gff coloring specification!" - log_txt += message + "\n -->Running FlexiDot with default gff coloring specification!" - else: - gff_color_config_file = str(arg) - - - elif opt in ("-u", "--input_user_matrix_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: input_user_matrix_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot without input_user_matrix_file %s!" 
% arg - log_txt += message + "\n -->Running FlexiDot withdefault matrix shading file!" - else: - input_user_matrix_file = str(arg) - - elif opt in ("-U", "--user_matrix_print"): - user_matrix_print = check_bools(str(arg), default=user_matrix_print) - - elif opt in ("-o", "--output_file_prefix"): - output_file_prefix = arg - - elif opt in ("-c", "--collage_output"): - collage_output = check_bools(str(arg), default=collage_output) - - elif opt in ("-m", "--m_col"): - try: m_col = int(arg) - except: - print "m_col - invalid argument - using default value" - log_txt += "\nm_col - invalid argument - using default value" - - elif opt in ("-n", "--n_row"): - try: n_row = int(arg) - except: - print "n_row - invalid argument - using default value" - log_txt += "\nn_row - invalid argument - using default value" - - elif opt in ("-f", "--filetype"): - if 0 <= int(arg) <= 2: - filetype = int(arg) - else: - print "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - log_txt += "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - - elif opt in ("-t", "--type_nuc"): - type_nuc = check_bools(str(arg), default=type_nuc) - - if type_nuc == False: - # interval default changed for amino acids - lcs_shading_interval_len = 10 - aa_bp_unit = "aa" - - elif opt in ("-k", "--wordsize"): - try: wordsize = int(arg) - except: - print "wordsize - invalid argument - using default value" - log_txt += "\nwordsize - invalid argument - using default value" - - elif opt in ("-p", "--plotting_mode"): - if "," in arg: - temp_modes = arg.split(",") - for item in temp_modes: - if item in ["0","1","2"]: - plotting_modes.append(int(item)) - elif arg in ["0","1","2"]: - plotting_modes = [int(arg)] - else: - print "Please provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" - log_txt += "\nPlease provide valid plotting_modes argument - e.g. 
1 or 0,1,2 - using default [0]" - - elif opt in ("-w", "--wobble_conversion"): - wobble_conversion = check_bools(str(arg), default=wobble_conversion) - - elif opt in ("-S", "--substitution_count"): - try: substitution_count = int(arg) - except: - print "substitution_count - invalid argument - using default value" - log_txt += "\nsubstitution_count - invalid argument - using default value" - - elif opt in ("-r", "--rc_option"): - rc_option = check_bools(str(arg), default=rc_option) - - elif opt in ("-s", "--alphabetic_sorting"): - alphabetic_sorting = check_bools(str(arg), default=alphabetic_sorting) - - elif opt in ("-x", "--lcs_shading"): - lcs_shading = check_bools(str(arg), default=lcs_shading) - - elif opt in ("-X", "--lcs_shading_num"): - try: lcs_shading_num = int(arg) - 1 - except: - print "lcs_shading_num - invalid argument - using default value" - log_txt += "\nlcs_shading_num - invalid argument - using default value" - - elif opt in ("-y", "--lcs_shading_ref"): - try: - if 0 <= int(arg) <= 2: - lcs_shading_ref = int(arg) - else: - print "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - log_txt += "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - except: - print "lcs_shading_ref - invalid argument - using default value" - log_txt += "\nlcs_shading_ref - invalid argument - using default value" - - elif opt in ("-Y", "--lcs_shading_interval_len"): - try: lcs_shading_interval_len = int(arg) - except: - print "lcs_shading_interval_len - invalid argument - using default value" - log_txt += "\nlcs_shading_interval_len - invalid argument - using default value" - - elif opt in ("-z", "--lcs_shading_ori"): - if 0 <= int(arg) <= 2: - lcs_shading_ori = int(arg) - else: - print "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." 
%(lcs_shading_ori) - log_txt += "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." %(lcs_shading_ori) - - elif opt in ("-P", "--plot_size"): - try: plot_size = float(arg) - except: - print "plot_size - invalid argument - using default value" - log_txt += "\nplot_size - invalid argument - using default value" - - - elif opt in ("-A", "--line_width"): - try: line_width = float(arg) - except: - print "line_width - invalid argument - using default value" - log_txt += "\nline_width - invalid argument - using default value" - - elif opt in ("-B", "--line_col_for"): - if mcolors.is_color_like(arg): - line_col_for = arg - else: - print "line_col_for - invalid argument - using default value" - log_txt += "\nline_col_for - invalid argument - using default value" - - elif opt in ("-C", "--line_col_rev"): - if mcolors.is_color_like(arg): - line_col_rev = arg - else: - print "line_col_rev - invalid argument - using default value" - log_txt += "\nline_col_rev - invalid argument - using default value" - - elif opt in ("-D", "--x_label_pos"): - x_label_pos = check_bools(str(arg), default=x_label_pos) - - elif opt in ("-E", "--label_size"): - try: label_size = float(arg) - except: - print "label_size - invalid argument - using default value" - log_txt += "\nlabel_size - invalid argument - using default value" - - elif opt in ("-F", "--spacing"): - try: spacing = float(arg) - except: - print "spacing - invalid argument - using default value" - log_txt += "\nspacing - invalid argument - using default value" - - elif opt in ("-L", "--length_scaling"): - length_scaling = check_bools(str(arg), default=length_scaling) - - elif opt in ("-T", "--title_length"): - try: title_length = int(arg) - except: - try: - title_length = int(str(arg)[:-1]) - title_selectpos = arg[-1].upper() # B (beginning), E (end) - except: - print "title_length - invalid argument - using default value" - log_txt += "\ntitle_length - invalid argument - 
using default value" - - print "test", title_length, title_selectpos - - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - logprint(log_txt, start=False, printing=False) - - - # print chosen arguments - ###################################### - - text = "\n%s\n" % (70 * "-") - text += "\n" + "INPUT/OUTPUT OPTIONS...\n" - text += "\n" + "Input fasta file: " + ", ".join(input_fasta) - text += "\n" + "Automatic fasta collection from current directory: " + str(auto_fas) - text += "\n" + "Collage output: " + str(collage_output) - text += "\n" + "Number of columns per page: " + str(m_col) - text += "\n" + "Number of rows per page: " + str(n_row) - text += "\n" + "File format: " + filetype_dict[filetype] - text += "\n" + "Residue type is nucleotide: " + str(type_nuc) - - text += "\n" + "\n\nCALCULATION PARAMETERS...\n" - text += "\n" + "Wordsize: " + str(wordsize) - text += "\n" + "Plotting mode: " + str(plotting_modes).replace("[", "").replace("]", "") + "\n" + 51 * " " - for item in plotting_modes: - text += plotting_mode_dict[item] + " " - text += "\n" + "Ambiguity handling: " + str(wobble_conversion) - text += "\n" + "Reverse complement scanning: " + str(rc_option) - text += "\n" + "Alphabetic sorting: " + str(alphabetic_sorting) - - if 0 in plotting_modes and input_gff_files != []: - text += "\n" + "Input gff files: " + ", ".join(input_gff_files) - if gff_color_config_file != "": - text += "\n" + "GFF color config file: " + gff_color_config_file - text += "\n" + "Prefix for output files: " + str(output_file_prefix) - - if 2 in plotting_modes: - text += "\n" + "\n\nLCS SHADING OPTIONS (plotting_mode 'all-against-all' only)...\n" - text += "\n" + "LCS shading: " + str(lcs_shading) - text += "\n" + "LCS shading interval number: " + str(lcs_shading_num + 1) - text += "\n" + "LCS shading reference: " + lcs_shading_ref_dict[lcs_shading_ref] - if lcs_shading_ref == 2: - text += "\n" + "LCS shading interval size [%s]: " % 
(aa_bp_unit) + str(lcs_shading_interval_len) - text += "\n" + "LCS shading orientation: " + lcs_shading_ori_dict[lcs_shading_ori] - if input_user_matrix_file != "": - text += "\n" + "Custom user shading matrix file: " + input_user_matrix_file - text += "\n" + "Print user matrix values (instead of dotplot): " + str(user_matrix_print) - - text += "\n" + "\n\nGRAPHIC FORMATTING...\n" - text += "\n" + "Plot size: " + str(plot_size) - text += "\n" + "Line width: " + str(line_width) - text += "\n" + "Line color: " + line_col_for - text += "\n" + "Reverse line color: " + line_col_rev - text += "\n" + "X label position: " + str(x_label_pos) - text += "\n" + "Label size: " + str(label_size) - text += "\n" + "Spacing: " + str(spacing) - text += "\n" + "Title length (limit number of characters): " + str(title_length) - if title_selectpos == "E": - text += "the last %i characters are printed." %int(title_length) - else: - text += "the first %i characters are printed." %int(title_length) - - text += "\n" + "Length scaling: " + str(length_scaling) - text += "\n%s\n" % (70 * "-") - logprint(text) - - - # collect settings - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_selectpos, verbose] - - return parameters - - -############################### -# Helper Functions # -############################### - -def alphabets(type_nuc=True): - """ - provide ambiguity code for sequences - """ - - nucleotide_alphabet = ["A", "C", "G", "T"] - - nucleotide_alphabet_full = ["A", "C", "G", "T", "N", "B", "D", "H", - "V", "Y", 
"R", "W", "S", "K", "M"] - - nucleotide_ambiguity_code = {"N": ["A", "C", "G", "T"], # any - "B": ["C", "G", "T"], # not A - "D": ["A", "G", "T"], # not C - "H": ["A", "C", "T"], # not G - "V": ["A", "C", "G"], # not T - "Y": ["C", "T"], # pyrimidine - "R": ["A", "G"], # purine - "W": ["A", "T"], # weak - "S": ["C", "G"], # strong - "K": ["G", "T"], # keto - "M": ["A", "C"]} # amino - - nucleotide_match_dict = {"N": "[ACGTNBDHVYRWSKM]", # any - "B": "[CGTNBDHVYRWSKM]", # not A - "D": "[AGTNBDHVYRWSKM]", # not C - "H": "[ACTNBDHVYRWSKM]", # not G - "V": "[ACGNBDHVYRWSKM]", # not T - "K": "[GTNBDHVYRWSK]", # keto - not A,C,M - "M": "[ACNBDHVYRWSM]", # amino - not G,T,K - "W": "[ATNBDHVYRWKM]", # weak - not C,G,S - "S": "[CGNBDHVYRSKM]", # strong - not A,G,W - "Y": "[CTNBDHVYWSKM]", # pyrimidine - not A,G,R - "R": "[AGNBDHVRWSKM]", # purine - not C,T,Y - "A": "[ANDHVRWM]", - "C": "[CNBHVYSM]", - "G": "[GNBDVRSK]", - "T": "[TNBDHYWK]"} - - # nucleotide_match_dict = {"N": ".", # any - # "B": "[^A]", # not A - # "D": "[^C]", # not C - # "H": "[^G]", # not G - # "V": "[^T]", # not T - # "K": "[^ACM]", # keto - not A,C,M - # "M": "[^GTK]", # amino - not G,T,K - # "W": "[^CGS]", # weak - not C,G,S - # "S": "[^AGW]", # strong - not A,G,W - # "Y": "[^AGR]", # pyrimidine - not A,G,R - # "R": "[^CTY]", # purine - not C,T,Y - # "A": "[ANDHVRWM]", - # "C": "[CNBHVYSM]", - # "G": "[GNBDVRSK]", - # "T": "[TNBDHYWK]"} - - aminoacid_alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"] - - aminoacid_alphabet_full = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*", "J", - "Z", "B", "X"] - - aminoacid_ambiguity_code = {"J": ["I", "L"], - "Z": ["Q", "E"], - "B": ["N", "D"], - "X": ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"]} # any - - aminoacid_match_dict = {"J": 
"[ILJ]", - "Z": "[QEZ]", - "B": "[NDB]", - # "X": ".", - "X": "[ARNDCEQGHILKMFPSTWYVUO*XBZJ]", - "A": "[AX]", - "R": "[RX]", - "N": "[NXB]", - "D": "[DXB]", - "C": "[CX]", - "E": "[EXZ]", - "Q": "[QXZ]", - "G": "[GX]", - "H": "[HX]", - "I": "[IXJ]", - "L": "[LXJ]", - "K": "[KX]", - "M": "[MX]", - "F": "[FX]", - "P": "[PX]", - "S": "[SX]", - "T": "[TX]", - "W": "[WX]", - "Y": "[YX]", - "V": "[VX]", - "U": "[UX]", - "O": "[OX]", - "*": "[*X]"} - - aa_only = set(['E', 'F', 'I', 'J', 'L', 'O', 'Q', 'P', 'U', 'X', 'Z', '*']) - # return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aa_only - - if type_nuc: - return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, nucleotide_match_dict - else: - return aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aminoacid_match_dict - -def logprint(text, start=False, printing=True, prefix=""): - """ - log output to log_file and optionally print - """ - - # define log file name and open file - global log_file_name - if start and trial_mode: - log_file_name = "log_file.txt" - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - elif start: - date = datetime.date.today() - time = str(datetime.datetime.now()).split(" ")[1].split(".")[0].replace(":", "-") - log_file_name = "%s_%s_log_file.txt" % (date, time) - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - else: - log_file = open(log_file_name, 'a') - - # write log (and print) - log_file.write(text + "\n") - if printing: - print text - log_file.close() - -def 
time_track(starting_time, show=True): - """ - calculate time passed since last time measurement - """ - now = time.time() - delta = now - starting_time - if show: - text = "\n\t %s seconds\n" % str(delta) - logprint(text, start=False, printing=True) - return now - -def calc_fig_ratio(ncols, nrows, plot_size, verbose=False): - """ - calculate size ratio for given number of columns (ncols) and rows (nrows) - with plot_size as maximum width and length - """ - ratio = ncols*1./nrows - if verbose: - text = " ".join([ncols, nrows, ratio]) - logprint(text, start=False, printing=True) - if ncols >= nrows: - figsize_x = plot_size - figsize_y = plot_size / ratio - else: - figsize_x = plot_size * ratio - figsize_y = plot_size - return figsize_x, figsize_y - -def shorten_name(seq_name, max_len=20, title_selectpos="B"): #, delim="_"): - """ - shorten sequence names (for diagram titles) - """ - - if len(seq_name) <= max_len: - return seq_name - - # take last characters - if title_selectpos == "E": - name = seq_name[len(seq_name)-max_len:] - - # takefirst characters - else: - name = seq_name[:max_len] - - - """# keep first and last part if multiple parts separated by delimiter (e.g. species_prefix + sequence_id) - if delim in seq_name: - if seq_name.count(delim) >= 2: - name = "%s..." % delim.join(seq_name.split(delim)[:1]) + seq_name.split(delim)[-1] # .replace("_000", "-") - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - if len(name) > max_len: - name = name[:((max_len-2)//2)] + "..." + name[((max_len-2)//2):] - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - """ - - return name - -def unicode_name(name): - """ - replace non-ascii characters in string (e.g. 
for use in matplotlib) - """ - unicode_string = eval('u"%s"' % name) - return unicodedata.normalize('NFKD', unicode_string).encode('ascii','ignore') - -def check_bools(arg, update_log_txt = True, default=None): - """ - converts commandline arguments into boolean - """ - - - # convert valid arguments - if str(arg).lower() == "y" or str(arg) == "1": - return True - elif str(arg).lower() == "n" or str(arg) == "0": - return False - - # use default in case of invalid argument - else: - if update_log_txt: - global log_txt - log_txt += "using default for " + str(arg) - else: - try: - logprint("using default for " + str(arg)) - except: - print "using default for " + str(arg) - return default - -def create_color_list(number, color_map=None, logging=False, max_grey="#595959"): - """ - create color list with given number of entries - grey by default, matplotlib color_map can be provided - """ - - try: - # create pylab colormap - cmap = eval("P.cm." + color_map) - # get descrete color list from pylab - cmaplist = [cmap(i) for i in range(cmap.N)] # extract colors from map - # determine positions for number of colors required - steps = (len(cmaplist)-1)/(number) - numbers = range(0, len(cmaplist), steps) - - # extract color and convert to hex code - colors = [] - for idx in numbers[:-1]: - rgb_color = cmaplist[idx] - col = rgb2hex(rgb_color[0]*255, rgb_color[1]*255, rgb_color[2]*255) - colors.append(col) - - # grey - except: - if not color_map == None: - logprint("Invalid color_map (%s) provided! - Examples: jet, Blues, OrRd, bwr,..." 
% color_map) - logprint("See https://matplotlib.org/users/colormaps.html\n") - old_max_grey = "#373737" - old_max_grey = "#444444" - colors = list(Color("#FFFFFF").range_to(Color(max_grey), number)) # grey - for idx in range(len(colors)): - colors[idx] = str(colors[idx]).replace("Color ", "") - if "#" in colors[idx] and len(colors[idx]) != 7: - # print colors[idx] - colors[idx] = colors[idx] + colors[idx][-(7-len(colors[idx])):] - - text = "%d Colors: %s" % (len(colors), ", ".join(colors)) - if logging: logprint(text, start=False, printing=True) - - if len(colors) < number: - logprint("\nError in color range definition! %d colors missing\n" % (number - len(colors))) - - return colors - - -############################### -# File Handling # -############################### - -def read_seq(input_fasta, verbose=False): - """ - read fasta sequences from (all) file(s) - """ - - # check if file provided - if input_fasta == [] or input_fasta == "": - text = "Attention: No valid file names provided: >%s<" % input_fasta - logprint(text, start=False, printing=True) - return {}, [] - - # combine sequence files, if required - if type(input_fasta) == list: - # concatenate fasta files - if len(input_fasta) > 1: - if verbose: - print "concatenating fastas...", - text = "concatenating fastas..." - input_fasta_combi = concatenate_files(input_fasta) - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - else: - input_fasta_combi = input_fasta[0] - else: - input_fasta_combi = input_fasta - - # read sequences - if verbose: - print "reading fasta...", - text = "reading fasta...", - try: - seq_dict = SeqIO.index(input_fasta_combi, "fasta") - except ValueError: - logprint("Error reading fasta sequences - please check input files, e.g. 
for duplicate names!") - return {}, [] - except: - logprint("Error reading fasta sequences - please check input files!") - return {}, [] - - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - - for seq in seq_dict: - if "-" in seq_dict[seq].seq: - # ungapped = seq_dict[seq].seq.ungap("-") # cannot be assigned back to sequence record - text = "\nSequences degapped prior Analysis!!!" - logprint(text, start=False, printing=True) - return read_seq(degap_fasta(input_fasta), verbose=verbose) - - # get ordered sequence names - sequences = [] - for item in SeqIO.parse(input_fasta_combi, "fasta"): - sequences.append(item.id) - return seq_dict, sequences - -def read_gff_color_config(gff_color_config_file=""): - """ - define coloring options for gff-based color shading of self-dotplots - """ - - # default aestetics for annotation shading (e.g. if no user config file is provided) - # dictionary with feature_type as key and tuple(color, transparency, zoom) as value - gff_feat_colors = {"orf": ("#b41a31", 0.2, 0), - "orf_rev": ("#ff773b", 0.3, 0), - "gene": ("#b41a31", 0.2, 0), - "cds": ("darkorange", 0.2, 0), - "exon": ("orange", 0.2, 0), - "intron": ("lightgrey", 0.2, 0), - "utr": ("lightblue", 0.2, 0), - "repeat_region": ("green", 0.3, 0), - "repeat": ("green", 0.3, 0), - "tandem_repeat": ("red", 0.3, 0), - "transposable_element": ("blue", 0.3, 0), - "ltr_retrotransposon": ("#cccccc", 0.5, 0), - "ltr-retro": ("#cccccc", 0.5, 0), - "long_terminal_repeat": ("#2dd0f0", 0.75, 2), - "ltr": ("#2dd0f0", 0.75, 2), - "pbs": ("purple", 0.75, 2), - "ppt": ("#17805a", 0.5, 2), - "target_site_duplication": ("red", 0.75, 2), - "misc_feature": ("grey", 0.3, 0), - "misc_feat": ("grey", 0.3, 0), - "misc": ("grey", 0.3, 0), - "others": ("grey", 0.5, 0)} - if gff_color_config_file in ["", None] or not os.path.exists(str(gff_color_config_file)): - return gff_feat_colors - - text = "Updating GFF color configuration with custom specifications\n" - 
logprint(text, start=False, printing=True) - - # read custom gff_color_config_file - in_file = open(gff_color_config_file, 'rb') - overwritten = set([]) - for line in in_file: - if not line.startswith("#") and len(line.strip().split("\t")) >= 4: - data = line.strip().split("\t") - feat = data[0].lower() - color = data[1].lower() - - # check, if settings are valid - if not mcolors.is_color_like(color): - color = "grey" - text = "Invalid color specified for %s: %s - default grey" % (data[0], data[1]) - logprint(text) - try: - alpha = float(data[2]) - except: - alpha = 0.75 - text = "Invalid alpha specified for %s: %s - default 0.75" % (data[0], data[2]) - logprint(text) - try: - zoom = float(data[3]) - except: - zoom = 0 - text = "Invalid zoom specified for %s: %s - default 0" % (data[0], data[3]) - logprint(text) - - # track changes of predefined settings - if feat in gff_feat_colors.keys(): - overwritten.add(data[0].lower()) - - gff_feat_colors[feat] = (color, alpha, zoom) - in_file.close() - - # default coloring for unknown annotations - if not "others" in gff_feat_colors.keys(): - gff_feat_colors["others"] = ("grey", 0.5, 0) - - if verbose: - # print configuration - text = "\n\nGFF color specification:\n%s\n" % (60 * ".") - for item in sorted(gff_feat_colors.keys()): - text += "%-30s\t%-10s\t%-5s\t%s\n" % (item, str(gff_feat_colors[item][0]), str(gff_feat_colors[item][1]), str(gff_feat_colors[item][2])) - logprint (text, printing=True) - - # print overwritting feature type specifications - if len(overwritten) != 0: - text = "%d feature type specifications overwritten:" % len(overwritten) - text += "\n\t"+ ", ".join(overwritten) + "\n" - logprint(text, start=False, printing=True) - - text = "GFF color specification updated acc. 
to %s\n\t%s\n\n" % (gff_color_config_file, ", ".join(gff_feat_colors)) - logprint(text, start=False, printing=True) - - return gff_feat_colors - -def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=True, prefix="", filetype='png', verbose=False): - """ - create feature dictionary from input_gff - sequence name as key and (feature type, start, stop) as value - """ - if type(input_gff_files) != list: - input_gff_files = [input_gff_files] - - # create dictionary with seq_name as key and (type, start and stop) as value - unknown_feats = set([]) - used_feats = set([]) - feat_dict = {} - for input_gff in input_gff_files: - text = "...reading " + input_gff - logprint(text, start=False, printing=True) - - in_file = open(input_gff, 'rb') - for line in in_file: - if not line.startswith("#") and line.strip() != "": - data = line.strip().split("\t") - feat_type = data[2].lower() - if data[6] == "-": - feat_type += "_rev" - if not feat_type.lower() in color_dict.keys(): - if feat_type.lower().replace("_rev", "") in color_dict.keys(): - feat_type = feat_type.replace("_rev", "") - else: - unknown_feats.add(feat_type) - feat_type = "others" - used_feats.add(feat_type) - if not data[0] in feat_dict.keys(): - feat_dict[data[0]] = [(feat_type, int(data[3]), int(data[4]))] # feature type, start, stop - else: - feat_dict[data[0]].append((feat_type, int(data[3]), int(data[4]))) # feature type, start, stop - if verbose: - text = "\nAnnotations for: %s\n" % ", ".join(feat_dict.keys()[:10]) - if len(feat_dict.keys()) > 10: - text = text[:-1] + ", ...\n" - logprint(text, start=False, printing=True) - in_file.close() - - # print feature types without specific shading settings - if len(unknown_feats) != 0: - text = "Missing shading specification for %d feature type(s):\n\t%s\n" % (len(unknown_feats), ", ".join(sorted(unknown_feats))) - logprint(text, start=False, printing=True) - - # create color legend - colors, alphas = [], [] - for item in sorted(used_feats): - 
colors.append(color_dict[item][0]) - alphas.append(color_dict[item][1]) - legend_figure(colors=colors, lcs_shading_num=len(used_feats), type_nuc=type_nuc, bins=sorted(used_feats), alphas=alphas, gff_legend=True, prefix=prefix, filetype=filetype) - - # print settings - text = "GFF Feature Types: %s\nGFF Colors: %s" % (", ".join(sorted(used_feats)), ", ".join(sorted(colors))) - logprint(text, start=False, printing=True) - - return feat_dict - -def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, verbose=False): - input_file = open(matrix_file_name, 'rb') - - # read sequence names from first column - names = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - names.append(line.strip().split(delim)[0]) - logprint("Delimiter '%s': %d names - %s\n" % (delim, len(names), ", ".join(names))) - - # check if names were found - otherwise try another delimiter - if names == [] and not recursion: - if delim == "\t": - new_delim = "," - else: - new_delim = "\t" - logprint("\nMatrix file not containing data delimited by '%s' - trying to read matrix with delimiter '%s'" % (delim.replace("\t", "\\t"), new_delim)) - info_dict = read_matrix(matrix_file_name, delim=new_delim, symmetric=symmetric, recursion=True, verbose=verbose) - return info_dict - elif names == []: - logprint("Empty matrix file with alternative delimiter!") - return info_dict - input_file.close() - - input_file = open(matrix_file_name, 'rb') - # read matrix entries as values in dictionary with tuple(names) as key - info_dict = {} - contradictory_entries = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - data = line.strip().split(delim) - for idx in range(len(data[1:])): - # print tuple(sorted([data[0], names[idx]])), data[idx+1] - if symmetric: - key = tuple(sorted([names[idx], data[0]])) - else: - key = tuple(names[idx], data[0]) - if key in info_dict.keys(): - if 
symmetric and info_dict[key] != data[idx+1] and data[idx+1] not in ["", "-"] and info_dict[key] not in ["", "-"]: - contradictory_entries.append(key) - info_dict[key] = data[idx+1] - input_file.close() - - if len(contradictory_entries) != 0: - try: - logprint("\nContradictory entries in matrix file %s:\n\t%s" % (matrix_file_name, ", ".join(contradictory_entries))) - except: - log_txt = "\nContradictory entries in matrix file %s:\n\t" % (matrix_file_name) - for item in contradictory_entries: - log_txt += str(item).replace("'", "") + ", " - log_txt = log_txt[:-2] - logprint(log_txt) - logprint("Using value from bottom left triangle!") - if verbose: - logprint("\nMatrix information for Sequences named: " % ", ".join(names)) - - return info_dict - -def concatenate_files(file_list, combi_filename="temp_combined.fasta", verbose=False): - """ - concatenate content of all files in file_list into a combined file named combi_filename - """ - out_file = open(combi_filename, 'w') - text = "" - for item in file_list: - if verbose: - text += item + " " - print item, - # read in_file linewise and write to out_file - in_file = open(item, 'rb') - for line in in_file: - out_file.write(line.strip()+"\n") - in_file.close() - out_file.close() - if verbose: - logprint(text, start=False, printing=False) - return combi_filename - -def degap_fasta(input_fasta): - """ - remove gaps from fasta - new degapped sequence file created - """ - - # degap all sequence files - output_fastas = [] - if type(input_fasta) != list: - input_fasta = list(input_fasta) - for input_fas in input_fasta: - output_fas = input_fas[:input_fas.rfind(".")] + "_degapped.fas" - in_file = open(input_fas, 'rb') - out_file = open(output_fas, 'w') - for line in in_file: - if line.startswith(">"): - out_file.write(line.strip()+"\n") - else: - out_file.write(line.strip().replace("-", "")+"\n") - out_file.close() - in_file.close() - output_fastas.append(output_fas) - return output_fastas - -def legend_figure(colors, 
lcs_shading_num, type_nuc=True, unit="%", filetype="png", max_lcs_len=None, min_lcs_len=0, bins=[], alphas=[], gff_legend=False, prefix="", verbose=False): - """ - create figure color legend - """ - max_legend_length_row = 8 - max_legend_length_col = 4 - - # define output file - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg" - logprint(text, start=False, printing=True) - filetype="png" - - # check if length of information fit - if not gff_legend and ((bins != [] and len(colors) != lcs_shading_num+1) or (bins != [] and len(colors) != len(bins)+1)): - if bins != [] and len(colors) != lcs_shading_num+1: - text = "**Attention**\nlcs_shading_num (%d) does not match number of colors (%d)!\n"% (lcs_shading_num, len(bins)) - elif bins != [] and len(colors) != len(bins)+1: - text = "**Attention**\nnumber of LCS length bins (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - elif gff_legend and len(bins) != len(colors): - text = "**Attention**\nnumber of GFF Feature Types (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - - # set alpha values to opaque if none are provided - if alphas == []: - for item in colors: - alphas.append(1) - - # legend data points - data_points = range(len(colors)) - if not gff_legend: - - # specify intervals, if max_lcs_len provided - if max_lcs_len != None: - multi_factor = 100 # one digit - if max_lcs_len <= 1: - multi_factor = 1000 # two digits - # len_interval_size = (max_lcs_len-min_lcs_len) * multi_factor *1. // lcs_shading_num * (1./ multi_factor) - len_interval_size = (max_lcs_len-min_lcs_len) * 1. 
/ lcs_shading_num - len_pos = [float("%.2f" % (min_lcs_len))] - # calculate interval positions - for idx in range(lcs_shading_num): - len_pos.append(float("%.2f" % (len_pos[-1] + len_interval_size))) - - if prefix.startswith("custom-matrix") and (0 <= max_lcs_len <= 100 and 0 <= min_lcs_len <= 100): - unit = "%" - elif prefix.startswith("custom-matrix"): - unit = "" - - text = "\n%d Legend intervals from %.2f to %.2f: \n\t%s - number: %d, step: %.2f, unit: %s\n" % (lcs_shading_num+1, min_lcs_len, max_lcs_len, str(len_pos), len(len_pos), len_interval_size, unit) - logprint(text, start=False, printing=True) - pos = len_pos - interval_size = len_interval_size - else: - # generate legend labels acc. to standard interval notation - interval_size = 100 // lcs_shading_num - pos = range(interval_size, 101+interval_size, interval_size) - - if bins != []: # labels provided - legend_labels = bins[:] - legend_labels.append("max") - legend_labels_lengths = [] - for item in bins: - legend_labels_lengths.append("[%d %s, %d %s)" % (item - min(bins), unit, item, unit)) - if len(bins) == len(colors) - 1: - legend_labels_lengths.append("[%d %s, %s]" % (max(bins), unit, u"\u221E")) # infinite - - else: - legend_labels = [] - legend_labels_lengths = [] - for idx in range(len(pos)): - num = pos[idx] - legend_labels.append("[%d%%, %d%%)" % (num - interval_size, num)) - if max_lcs_len != None: - num = len_pos[idx] - # as int or float - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths.append("[%d %s, %d %s)" % (num, unit, num + len_interval_size, unit)) - else: - legend_labels_lengths.append("[%.2f %s, %.2f %s)" % (num, unit, num + len_interval_size, unit)) - legend_labels[-1] = "100" + unit - if max_lcs_len != None: - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths[-1] = "%d %s" % (max_lcs_len, unit) - else: - legend_labels_lengths[-1] = "%.2f %s" % (max_lcs_len, unit) - - # set labels and choose file 
name - if gff_legend: - label_text = bins[:] - edge_col = None - legend_file_name = "Selfdotplot_GFF_Shading_Legend_n%d." % lcs_shading_num + filetype - elif max_lcs_len != None: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_max%d%s_n%d." % (max_lcs_len, unit, lcs_shading_num) + filetype - elif bins != []: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%d%s_n%d." % (bins[0], unit, lcs_shading_num) + filetype - else: - label_text = legend_labels[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%%len_n%d." % lcs_shading_num + filetype - - if prefix != None and prefix != "": - if not prefix.endswith("-"): - prefix = prefix + "-" - legend_type = "LCS" - if prefix.startswith("custom-matrix"): - prefix = prefix.replace("custom-matrix", "")[1:] - legend_type = "CustomMatrix" - legend_file_name = prefix + legend_file_name.replace("LCS", legend_type) - - # plot legend figure - fig, ax = P.subplots(3, 1, figsize=(len(colors)*2, len(colors)*2)) - for idx in range(len(colors)): - ax[0].bar(data_points[idx]+1, data_points[idx]+1, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[2].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].set_ylim(0,1) - ax[2].set_ylim(0,1) - ax[1].legend(ncol=((len(colors)-1)//max_legend_length_row)+1, framealpha=1) # vertical legend - col_num = len(colors) - if len(colors) > max_legend_length_col: - remainder = 0 - if len(colors) % max_legend_length_col != 0: - remainder = 1 - row_num = len(colors) // max_legend_length_col + remainder - remainder = 0 - if len(colors) % row_num != 0: - remainder = 1 - col_num = len(colors) // row_num + remainder - 
ax[2].legend(ncol=col_num, framealpha=1) # horizontal legend - - P.savefig(legend_file_name) - - return legend_file_name - - -############################### -# Analysis Functions # -############################### - -def wobble_replacement(sequence, general_ambiguity_code, verbose=False): - """ - get all degenerated sequences for sequence with ambiguous residues - (only residues considered that are keys in wobble_dictionary) - """ - - # get positions of ambiguous residues - wobble_pos = [] - for idx in range(len(sequence)): - letter = sequence[idx] - if letter in general_ambiguity_code.keys(): - wobble_pos.append(idx) - - if verbose: - text = "\t%d wobbles" % len(wobble_pos) - logprint(text, start=False, printing=True) - - # replace one wobble through each iteration by all possible residues - # repeat if still wobbles in new kmers - kmer_variants = [sequence] - while True: - if verbose: - text = "\t\t%d kmer variants" % len(kmer_variants) - logprint(text, start=False, printing=True) - temp_kmers = set([]) - for kmer in kmer_variants: - for idx in wobble_pos: - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - for base in general_ambiguity_code[kmer[idx]]: - newkmer = kmer[:idx] + base + kmer[idx+1:] - temp_kmers.add(newkmer) - wobble = False - for kmer in temp_kmers: - for idx in range(len(kmer)): - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - wobble = True - break - if wobble: - break - kmer_variants = set(list(temp_kmers)[:]) - if not wobble: - break - - return kmer_variants - -def split_diagonals(data, stepsize=1): - """ - split array if point difference exceeds stepsize - data = sorted list of numbers - """ - return np.split(data, np.where(np.diff(data) != stepsize)[0]+1) - -def longest_common_substring(s1, s2): - m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))] - longest, x_longest = 0, 0 - for x in xrange(1, 1 + len(s1)): - for y in xrange(1, 1 + len(s2)): - if s1[x - 1] == s2[y - 1]: - m[x][y] = m[x - 1][y - 1] + 
1 - if m[x][y] > longest: - longest = m[x][y] - x_longest = x - else: - m[x][y] = 0 - return longest - -def lcs_from_x_values(x_values): - """ - calculate length of longest common substring based on nested list of numbers - """ - if len(x_values) == 0: - return 0 - # get lengths of each subarray data - lengths = np.array([len(i) for i in x_values]) - return max(lengths) - - -############################### -# Matching Functions # -############################### - -def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - - # forward - ################################# - kmer_pos_dict_one = {}; kmer_pos_dict_two = {} # dictionaries for both sequences - - # reverse complement - ################################# - kmer_pos_dict_three = {}; kmer_pos_dict_four = {} # dictionaries for both sequences - - # create dictionaries with kmers (wordsize) and there position(s) in the sequence - if rc_option: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two), - (str(seq_one), kmer_pos_dict_three), - (str(seq_two.reverse_complement()), kmer_pos_dict_four)] - else: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two)] - for (seq, kmer_pos_dict) in data_list: - for i in range(len(seq)-wordsize+1): - kmer = seq[i:i+wordsize] - # 
discard kmer, if too many Ns included - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - if not convert_wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - wobbles = False - for item in general_ambiguity_code.keys(): - if item in kmer: - wobbles = True - break - if not wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - kmer_variants = wobble_replacement(kmer, general_ambiguity_code) - for new_kmer in kmer_variants: - # print "\t", new_kmer - try: - kmer_pos_dict[new_kmer].append(i) - except KeyError: - kmer_pos_dict[new_kmer] = [i] - - # find kmers shared between both sequences - matches_for = set(kmer_pos_dict_one).intersection(kmer_pos_dict_two) # forward - matches_rc = set(kmer_pos_dict_three).intersection(kmer_pos_dict_four) # reverse complement - - if verbose: - text = "[matches: %i for; %.i rc]" % (len(matches_for), len(matches_rc)) - logprint(text, start=False, printing=True) - - # create lists of x and y co-ordinates for scatter plot - # keep all coordinates of all shared kmers (may match multiple times) - diag_dict_for = {} - diag_dict_rc = {} - for (match_list, pos_dict1, pos_dict2, diag_dict) in [(matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for), - (matches_rc, kmer_pos_dict_three, kmer_pos_dict_four, diag_dict_rc)]: - for kmer in match_list: - for i in pos_dict1[kmer]: - for j in pos_dict2[kmer]: - diag = i-j - points = set(range(i+1, i+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if 
rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - -def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize via regular expression search - fuzzy matching - allow up to substitution_count substitutions - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - ambiguity_match_dict = alphabets(type_nuc)[3] - - ambiq_residues = "[%s]" % "".join(general_ambiguity_code.keys()) - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # check for wobble presence - if not (regex.search(ambiq_residues, str(seq_one)) == None and regex.search(ambiq_residues, str(seq_two)) == None): - wobble_found = True - else: - wobble_found = False - - # dictionary for matches - diag_dict_for = {} - diag_dict_rc = {} - counter = [0, 0] - - # one-way matching - if rc_option: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0), - (str(seq_one), 
str(seq_two.reverse_complement()), diag_dict_rc, 1)] - else: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0)] - - for seq_query, seq_target, diag_dict, counter_pos in data_list: - # split query sequence into kmers - if not rc_option and counter_pos == 1: - break - - for idx in range(len(str(seq_query))-wordsize+1): - kmer = str(seq_query)[idx:idx+wordsize] - - # skip excessive N/X stretches (big black areas) - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - # convert kmer to regular expression for wobble_matching - if convert_wobbles and wobble_found: - kmer_string = "" - # replace each residue with matching residues or wobbles - for jdx in range(len(kmer)): - kmer_string += ambiguity_match_dict[kmer[jdx]] - else: - kmer_string = kmer - - # convert to regular expression tolerating substitution errors - if type(substitution_count) == int and substitution_count != 0: - kmer_string = "(%s){s<=%d}" % (kmer_string, substitution_count) - - # search for regular expression in target sequence - kdx = 0 - start = True - if regex.search(kmer_string, seq_target[kdx:]) != None: - counter[counter_pos] += 1 - while regex.search(kmer_string, seq_target[kdx:]) != None: - # search for regular expression pattern in target sequence - result = regex.search(kmer_string, seq_target[kdx:]) - - kmer2 = seq_target[kdx:][result.start():result.end()] - - # skip excessive N/X stretches (big black areas) - if kmer2.count(any_residue)*100./wordsize <= max_N_percentage: - diag = idx-(kdx+result.start()) - points = set(range(idx+1, idx+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - kdx += result.start() + 1 - if kdx >= len(seq_target): - break - elif regex.search(kmer_string, seq_target[kdx:]) != None: - counter[counter_pos] += 1 - - if verbose: - text = "%5.i \tforward matches" % counter[0] - text += "\n%5.i \treverse complementary matches" % counter[1] - logprint(text, start=False, 
printing=True) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - - -############################### -# Dot Plot Functions # -############################### - -def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, title_length=float("Inf"), title_selectpos="B"): - """ - self-against-self dotplot - partially from biopython cookbook - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least one input sequence - if len(sequences) == 0: - text = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - text += " No sequences provided for selfdotplot!\n\nTerminating polydotplot!" 
- logprint(text, start=False, printing=True) - return - elif len(sequences) == 1 and multi: - text = "\n\nCreating collage output for single selfdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nSelfdotplot Collage: Invalid collage - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences): - ncols = len(sequences) - nrows = 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=prefix, filetype=filetype, verbose=verbose) - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >=50% Ns are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide 
valid file type - png, pdf, or svg - given:%s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - global t1 - - print "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-"), - log_txt = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - - # preparations for file name - name_graph = "Selfdotplots" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - list_of_png_names = [] - - counter = 0 - for seq_name in sequences: - print seq_name, - log_txt += " " + seq_name - - counter += 1 - if not multi: - P.cla() # clear any prior graph - - # read sequence - seq_record = seq_dict[seq_name] - name_seq = seq_record.id - seq_one = seq_record.seq.upper() - length_seq = len(seq_one) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex(seq_one, seq_one, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG", - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag(seq_one, seq_one, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - - # shade annotated 
regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlim(0, length_seq+1) - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_selectpos=title_selectpos)), fontsize=label_size, fontweight='bold') - # P.title(unicode_name(name_seq), fontsize=label_size*1.3, fontweight='bold') - - # save figure and reinitiate if page is full - if counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' % (prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - else: # not multi - - fig = P.figure(figsize=(plot_size, plot_size)) # figure size needs to be a square - ax = P.subplot(1, 1, 1) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - number = 0 - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlim(0, length_seq+1) - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', 
labelsize=label_size*.9) - P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_selectpos=title_selectpos)), fontsize=label_size*1.3, fontweight='bold') - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_%s_wordsize%i%s.%s' %(prefix, name_graph, counter, name_seq, wordsize, suffix, filetype) - P.savefig(fig_name, bbox_inches='tight') - - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - if multi and counter >= 1: - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - print "\n\nDrawing selfdotplots done" - log_txt += "\n\nDrawing selfdotplots done" - logprint(log_txt, start=False, printing=False) - - return list_of_png_names - -def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, length_scaling=True, scale_delim_col="red", title_length=float("Inf"), title_selectpos="B"): - """ - pairwise dotplot (all-against-all) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least two input sequences - if len(sequences) < 2: - text = "\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (50*"=", 
len(sequences)*(len(sequences)-1)/2, 36*"-") - text += " Please provide at least two sequences for pairdotplot!\n\nTerminating paired dotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 2 and multi: - text = "\n\nCreating collage output for single pairdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nPairdotplot Collage: Invalid collage settings - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences)*(len(sequences)-1): - ncols = len(sequences) - nrows = 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences)*(len(sequences)-1): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - - text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += ", ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - y_label_rotation = "vertical" - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >50% are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", 
"pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given: %s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - # preparations for file name - name_graph = "Pairdotplot" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if length_scaling: - suffix += "_scaled" - if multi: - suffix += "_collage" - - - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - list_of_png_names = [] - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - - # prepare LCS data file - lcs_data_file = open("%sPairdotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - counter, seq_counter = 0, 0 - print "Drawing pairwise dotplot...", - log_txt = "Drawing pairwise dotplot..." - if verbose: - seq_text = "" - for idx in range(len(sequences)-1): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx+1, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += " " + str(seq_counter) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - - # write LCS data file - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - else: - # calculate figure size for separate figures - if len_one >= len_two: - sizing = (plot_size, max(2, (plot_size)*len_two*1./len_one)) - # sizing = (plot_size, min(plot_size, max(2, (plot_size-2)*len_two*1./len_one+2))) - else: - sizing = (max(2, (plot_size)*len_one*1./len_two), plot_size) - # sizing = (min(plot_size, max(2, (plot_size-2)*len_one*1./len_two+2)), plot_size) - fig = P.figure(figsize=(plot_size, plot_size)) - - ax = P.subplot(1, 1, 1) - - # collect lines - 
lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlabel(unicode_name(shorten_name(name_one, max_len=title_length, title_selectpos=title_selectpos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.ylabel(unicode_name(shorten_name(name_two, max_len=title_length, title_selectpos=title_selectpos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - if not multi: - if length_scaling: - ax.set_aspect(aspect='equal', adjustable='box', anchor='NW') - P.xlim(0, len_one+1) - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - elif not length_scaling: - P.xlim(0, len_one+1) - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - else: - max_len = max(len_one, len_two) - P.xlim(0, max_len+1) - P.ylim(max_len+1, 0) # rotate y axis (point downwards) - - # plot line deliminating shorter sequence - if max_len != len_one: - ax.plot((len_one+1, len_one+1), (0, len_two), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - if max_len != len_two: - ax.plot((0, len_one), (len_two+1, len_two+1), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - - # evtl. 
switch x axis position - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - P.setp(ax.get_xticklabels(), fontsize=label_size*.9) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) - - # save figure and reinitiate if page is full - if multi and counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=.5, wspace=.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=.5, wspace=.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - elif not multi: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, bottom=0.05) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_wordsize%i%s.%s' % (prefix, name_graph, counter, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - list_of_png_names.append(fig_name) - fig = P.figure() - - # save figure - if multi and counter >= 1: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=0.5, wspace=0.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.5, wspace=0.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - print - logprint(seq_text, start=False, printing=False) - - return list_of_png_names - -def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, lcs_shading=True, lcs_shading_ref=0, 
lcs_shading_interval_len=100, lcs_shading_ori=0, lcs_shading_num=5, spacing=0.04, input_user_matrix_file="", user_matrix_print=True, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, title_length=float("Inf"), title_selectpos="B", rotate_labels=False): - """ - all-against-all dotplot - derived from dotplot function - - lcs_shading_refs: - 0 color relative to maximum lcs observed in dataset [default] - 1 color by coverage of shorter sequence (e.g. lcs = 70% of seq1) - lcs_shading_ori - 0 forward only - 1 reverse only - 2 both orientations (in opposite plot) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - if len(sequences) == 0: - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " No sequences provided for polydotplot!\n\nTerminating polydotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 1: - text = "\n\nCreating polydotplot for single sequence!" 
- text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n\n" - logprint(text, start=False, printing=True) - - - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " " + " ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage=49 - text = "Provide valid max_N_percentage, kmers with >50% are ignored\n" - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given: %s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - if lcs_shading and not type_nuc: - if lcs_shading_ori != 0: - lcs_shading_ori = 0 - text = "Protein shading does not support reverse complementary matching!\n" - logprint(text, start=False, printing=True) - - # read custom shading matrix & match names of sequences to fasta - if input_user_matrix_file != "" and input_user_matrix_file != None: - logprint("Reading user matrix file: %s" % input_user_matrix_file) - # lcs_shading_ori = 2 - custom_dict = read_matrix(input_user_matrix_file) - if custom_dict != {}: - custom_shading = True - custom_similarity_dict = {} - invalid_entries = [] - custom_max = 0 - custom_min = float("Inf") - for key in custom_dict.keys(): - number_key = [] - - # convert number into float - try: - value = float(custom_dict[key]) - if not "." 
in custom_dict[key]: - value = int(custom_dict[key]) - custom_max = max(custom_max, value) - custom_min = min(custom_min, value) - except: - value = custom_dict[key] - if value == "": - value = None - invalid_entries.append(key) - # match matrix names with sequence names - for item in key: - if item in sequences: - number_key.append(sequences.index(item)) - else: - number_key.append(-1) - # dictionary with tuple of sorted sequence indices as key and number as value - custom_similarity_dict[tuple(sorted(number_key))] = value - if len(invalid_entries) != 0: - text = "No valid number in custom similarity matrix for %d entries: \n\t" % (len(invalid_entries)) - for key in invalid_entries: - text += str(key) + " - " + str(custom_dict[key]) + "; " - logprint(text[:-2]+"\n") - - text = "Custom user matrix given: min %.2f, max %.2f\n" % (custom_min, custom_max) - - # artificially rounding intervals if likely identity/divergence percentages - if 0 <= custom_min < 1 and 0 < custom_max <= 1: - rounding_factor = 5 - multi_factor = 100 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (multi_factor*custom_min // rounding_factor) * (1.*rounding_factor/multi_factor)) - custom_max = min((multi_factor*custom_max // rounding_factor) * (1.*rounding_factor/multi_factor), 1) - text += "new (%.2f, %2f)\n" % (custom_min, custom_max) - - elif 0 <= custom_min < 100 and 0 < custom_max <= 100: - rounding_factor = 5 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (custom_min // rounding_factor) * rounding_factor) - custom_max = min((custom_max // rounding_factor) * rounding_factor, 100) - text += "new (%d, %d)\n" % (custom_min, custom_max) - - logprint(text) - - else: - custom_shading = False - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF 
annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=prefix, filetype=filetype, verbose=verbose) - - name_graph = "Polydotplot" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if custom_shading: - suffix += "_matrix" - if lcs_shading: - suffix += "_%dshades_ref%d_ori%s" % (lcs_shading_num+1, lcs_shading_ref, lcs_shading_ori) - if "ref2" in suffix and type_nuc: - suffix = suffix.replace("ref2", "%dbp" % lcs_shading_interval_len) - elif "ref2" in suffix: - suffix = suffix.replace("ref2", "%daa" % lcs_shading_interval_len) - - - # name and create output files (names derived from SEQNAME) - if prefix != None and str(prefix) != "": - prefix = str(prefix) + "-" - else: - prefix = "" - - # preparations for background shading - if lcs_shading or custom_shading: - # create color range white to grey - colors = create_color_list(lcs_shading_num+1, color_map=None, logging=True) - colors_2 = create_color_list(lcs_shading_num+1, color_map="OrRd", logging=True) - - if custom_shading: - text = "Custom Matrix Colors: " + ", ".join(colors_2) - - # write lcs lengths to file - lcs_data_file = open("%sPolydotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - # compare sequences pairwise - save lcs and line information in dictionary for plotting - data_dict = {} # keys = tuple(idx, jdx), value = x1, y1, x2, y2 (line positions) - lcs_dict = {} # keys = tuple(idx, jdx), value = length of lcs: lcs_len or (lcs_for, lcs_rev) - for_lcs_set = set([]) # keep lengths to calculate max (excluding self comparisons) - rev_lcs_set = set([]) # keep lengths 
to calculate max (all) - - text = "\nTotal plot count: %d" % (len(sequences)*(len(sequences))) - text += "\nTotal calculations: %d" % (len(sequences)*(len(sequences)+1)/2) - logprint(text, start=False, printing=True) - - print "\nCalculating shared regions and lengths of longest_common_substring...", - log_txt = "\nCalculating shared regions and lengths of longest_common_substring..." - # determine matches and length of lcs by comparing all sequence pairs - if verbose: - seq_text = "" - counter = 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif len(sequences) < 5: - print "\t%s (%d %s), %s (%d %s)" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - log_txt += "\t%s (%d %s), %s (%d %s)\n" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - else: - if not counter % 25: - print counter, - log_txt += str(counter) - - # get positions of matches & length of longest common substring based on match lengths - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - data_dict[(idx, jdx)] = x1[:], y1[:], x2[:], y2[:] - 
lcs_dict[idx, jdx] = lcs_for, lcs_rev - - if idx != jdx: - for_lcs_set.add(lcs_for) - rev_lcs_set.add(lcs_rev) - - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - if not verbose: - print len(sequences)*(len(sequences)+1)/2, " done\n" - log_txt += str(len(sequences)*(len(sequences)+1)/2) + " done\n" - else: - print "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - log_txt += "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - logprint(log_txt, start=False, printing=False) - - if verbose: - logprint ("\n\nlcs_dict\n" + str(lcs_dict)) - if custom_shading: - logprint ("\ncustom_dict\n" + str(custom_dict)) - logprint ("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) - - if verbose: - print - logprint(seq_text+"\n", start=False, printing=False) - - if lcs_shading_ref == 2: - color_bins = [] - text = "\nLCS lengh bins: " - for idx in range(lcs_shading_num): - color_bins.append(lcs_shading_interval_len*(idx+1)) - text += " " + str(lcs_shading_interval_len*(idx+1)) - logprint(text, start=False, printing=True) - - # calculate maximum lcs length - if lcs_shading_ori == 0: # forward only - if len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - elif lcs_shading_ori == 1: # reverse complement only - if len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - else: - max_lcs = None - else: # both orientations - if len(rev_lcs_set) != 0 and len(for_lcs_set) != 0: - max_lcs = max(max(rev_lcs_set), max(for_lcs_set)) - elif len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - elif len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - - if not max_lcs == None: - text = "Maximum LCS: %d %s" % (max_lcs, aa_bp_unit) - logprint(text, start=False, printing=True) - if custom_shading: - text = "Maximum custom value: %d\n" % custom_max - 
logprint(text, start=False, printing=True) - - # count sequences - ncols = len(sequences); nrows = len(sequences) - - # get sequence lengths to scale plot widths and heights accordingly - size_ratios = [] - for item in sequences: - size_ratios.append(len(seq_dict[item].seq)) - - P.cla() # clear any prior graph - # use GridSpec to resize plots according to sequence length - gs = gridspec.GridSpec(nrows, ncols, - width_ratios=size_ratios, - height_ratios=size_ratios) - fig = P.figure(figsize=(plot_size, plot_size)) - - # determine label orientations - if len(sequences) > 5 or rotate_labels: - x_label_rotation = 45 - y_label_rotation = "horizontal" - if x_label_pos_top: - xhalign = 'left' - xvalign = 'bottom' - else: - xhalign = 'right' - xvalign = 'top' - yhalign = "right" - else: - x_label_rotation = "horizontal" - y_label_rotation = "vertical" - xvalign = "center" - xhalign = "center" - yhalign = "center" - yvalign = 'center' - - print "\nDrawing polydotplot...", - log_txt = "\nDrawing polydotplot..." - - # draw subplots - if verbose: - if lcs_shading and custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "Custom matrix value", "Matrix color index", "LCS color index"]) + "\n" - elif lcs_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "LCS color index for", "LCS color index rev"]) + "\n" - elif custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "Custom matrix value", "Color index for", "Color index rev"]) + "\n" - - if verbose: - seq_text = "" - counter, seq_counter = 0, 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - len_two = len(rec_two.seq) - name_two = rec_two.id - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - len_one = len(rec_one.seq) - name_one = rec_one.id - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - # optional shade background according to length of LCS and/or user matrix - ######################################################################### - - # get interval based on LCS - background_colors = [None, None] - if lcs_shading and (lcs_shading_ref==1 or lcs_shading_ref==2 or max_lcs!=None): # self plot max_lcs_for == None - lcs_len = lcs_dict[(idx, jdx)] - l1 = lcs_len[0] # forward - l2 = lcs_len[1] # reverse complement - - lcs_shading_bool = True - - # calculate shading acc. to chosen option - if lcs_shading_ref == 1: # percentage of shorter sequence - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // min(len_one, len_two)) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // min(len_one, len_two)) - elif lcs_shading_ref == 2: # by given interval size - color_idx0 = min(len(colors)-1, l1 // lcs_shading_interval_len) - color_idx1 = min(len(colors)-1, l2 // lcs_shading_interval_len) - if color_idx0 >= len(colors): - color_idx0 = len(colors) - if color_idx1 >= len(colors): - color_idx1 = len(colors) - else: # percentage of maximum lcs length - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // max_lcs) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // max_lcs) - else: - lcs_shading_bool = False - - # get interval based on custom matrix - if custom_shading: - # matrix value - try: - custom_value = custom_similarity_dict[(idx, jdx)] - except: - custom_value = "" - - # bottom left triangle = LCS forward/reverse or best of both - if lcs_shading_bool: - if lcs_shading_ori == 0: # forward - color_idx1 = 
color_idx0 - elif lcs_shading_ori == 2: # both directions - color_idx1 = max(color_idx0, color_idx1) - - # top right triangle = custom value (not colored if text matrix provided) - if type(custom_value) == int or type(custom_value) == float: - color_idx0 = int((custom_value-custom_min)*lcs_shading_num // (custom_max-custom_min)) - # if string is proviced - else: - color_idx0 = 0 - - # set colors dependent on lcs dependent on orientation - if lcs_shading_bool and not custom_shading: - if idx != jdx: - if lcs_shading_ori == 0: - color_idx1 = color_idx0 - elif lcs_shading_ori == 1: - color_idx0 = color_idx1 - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx1] - # for selfcomparison, only color reverse complement - elif lcs_shading_ori != 0 and not custom_shading: - background_colors[0] = colors[color_idx1] - # set different colors for shading by LCS + user matrix - elif lcs_shading_bool and custom_shading: - # print colors, background_colors, color_idx0, color_idx1 - background_colors[0] = colors_2[color_idx0] - background_colors[1] = colors[color_idx1] - # set grey color range for user matrix if no LCS shading - elif custom_shading: - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx0] - - if verbose: - if custom_shading and lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - elif lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(color_idx0), str(color_idx1)]) + "\n" - elif custom_shading: - lcs_text += "\t".join([name_one, name_two, str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - - # diagonal (self-dotplots) - if idx == jdx: - # skip positions below diagonal - counter = counter + (counter - 1) // (nrows) # + row_pos - counters = [counter] - # draw both graphs at once (due to symmetry) - else: - col_pos = (counter - 1) % ncols - 
row_pos = (counter - 1) // (nrows) - counter2 = col_pos * ncols + row_pos + 1 - counters = [counter, counter2] - - if len(counters) == 2: - seq_counter += 1 - if not verbose and not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - x_lists, y_lists, x_lists_rc, y_lists_rc = data_dict[(idx, jdx)] - - # plot diagram(s) - for kdx in range(len(counters)): - - # shade annotated regions if gff file(s) provided - if idx == jdx and gff_files != None and gff_files != []: - if name_one in feat_dict.keys(): - features = feat_dict[name_one] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # if custom matrix value printed into upper matrix triangle, skip data plotting - # text print in top triangle - if user_matrix_print and custom_shading and kdx==0 and idx!=jdx: - data_plotting = False - # dotplot in bottom triangle - else: - data_plotting = True - - fig_pos = counters[kdx] - # plotting subplot with matplotlib - ax = P.subplot(gs[fig_pos-1]) # rows, columns, plotnumber - - # mirror plot, if plotting below diagonal - if kdx == 0: - l1, l2 = len_one, len_two - n1, n2 = name_one, name_two - x1, y1 = x_lists, y_lists - x2, y2 = x_lists_rc, y_lists_rc - else: - l2, l1 = len_one, len_two - n2, n1 = name_one, name_two - x1, y1 = y_lists, x_lists - x2, y2 = y_lists_rc, x_lists_rc - - if data_plotting: - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) 
- color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # plot value provided by customer instead of dotplot - else: - alignment = {'horizontalalignment': 'center', 'verticalalignment': 'center'} - # P.text(0.5, 0.5, custom_value, size='medium', transform=ax.transAxes, **alignment) - P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, **alignment) - # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, - # horizontalalignment='center', verticalalignment='center', color="black") - - if custom_shading: - # omit diagonal - if idx == jdx: - ax.set_facecolor("white") - # use white background for text fields (top right triangle only [kdx 0]) - elif type(custom_value) != int and type(custom_value) != float and kdx == 0: - ax.set_facecolor("white") - else: - ax.set_facecolor(background_colors[kdx]) - # set background color if lcs shading - elif lcs_shading_bool and background_colors[kdx] != None: - ax.set_facecolor(background_colors[kdx]) - - # set axis limits - P.xlim(0, l1+1) - P.ylim(l2+1, 0) # rotate y axis (point downwards) - - # determine axis positions - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - x_label_bool = fig_pos <= ncols - x_tick_bool = fig_pos > ncols*(ncols-1) - else: - x_label_bool = fig_pos > ncols*(ncols-1) - x_tick_bool = fig_pos <= ncols - - # x axis labels dependent on plot position/number - if x_label_bool: # x title and labels on top or bottom - P.xlabel(unicode_name(shorten_name(n1, max_len=title_length, title_selectpos=title_selectpos)), fontsize=label_size, rotation=x_label_rotation, verticalalignment=xvalign, horizontalalignment=xhalign, fontweight='bold', labelpad=8) # axis naming - if not x_label_rotation in ["horizontal", "vertical"]: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation="vertical") - else: - P.setp(ax.get_xticklabels(), 
fontsize=label_size*.9, rotation=x_label_rotation) - elif x_tick_bool and x_label_pos_top: # x ticks on bottom row - ax.xaxis.tick_bottom() # ticks without labels on bottom - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) - elif x_tick_bool: # x ticks on top row - ax.xaxis.tick_top() # # ticks without labels on top - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) # inner diagrams without labelling - else: # no x ticks on internal rows - ax.axes.get_xaxis().set_visible(False) - - # y axis labels dependent on plot position/number - if fig_pos % ncols == 1 or (ncols == 1 and nrows == 1): # y title and labels in 1st column - P.ylabel(unicode_name(shorten_name(n2, max_len=title_length, title_selectpos=title_selectpos)), fontsize=label_size, rotation=y_label_rotation, verticalalignment=yvalign, horizontalalignment=yhalign, fontweight='bold', labelpad=8) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) # axis naming - elif fig_pos % ncols == 0: # y ticks in last column - ax.yaxis.tick_right() - P.setp(ax.get_yticklabels(), visible=False) # inner diagrams without labelling - else: - ax.axes.get_yaxis().set_visible(False) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - try: - logprint(lcs_text, start=False, printing=True) - except: - pass - - # finalize layout - margins & spacing between plots - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - # gs.tight_layout(fig, h_pad=.02, w_pad=.02) # less overlapping tick labels, but also disturbingly large spacing - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, top=0.87) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, bottom=0.13) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing) # space between rows - def 0.4 - - # save figure and close instance - fig_name = '%s%s_wordsize%i%s.%s' % (prefix, name_graph, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - - # create figure color legend - if lcs_shading: - if lcs_shading_ref == 1: # percentage of shorter sequence - legend_file_name = legend_figure(colors, lcs_shading_num, unit="%", filetype=filetype, prefix=prefix) - elif lcs_shading_ref == 2: # interval sizes - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, bins=color_bins) - else: # relative of maximum lcs - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, max_lcs_len=max_lcs) - - if custom_shading: - custom_prefix = "custom-matrix-" + prefix - legend_file_name_custom = legend_figure(colors_2, lcs_shading_num, unit="%", filetype=filetype, prefix=custom_prefix, max_lcs_len=custom_max, min_lcs_len=custom_min) - - if lcs_shading and custom_shading: - return [fig_name, legend_file_name, legend_file_name_custom] - elif lcs_shading: - return [fig_name, legend_file_name] - elif custom_shading: - return [fig_name, legend_file_name_custom] - else: - return [fig_name] - - -############################### -# Function Call # -############################### - -def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_size=10, filetype="png", type_nuc=True, convert_wobbles=False, 
substitution_count=0, rc_option=True, alphabetic_sorting=False, gff=None, multi=True, ncols=1, nrows=1, lcs_shading=True, lcs_shading_num=5, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, gff_color_config_file="", input_user_matrix_file="", user_matrix_print=False, length_scaling=True, title_length=50, title_selectpos="B", spacing=0.04, verbose=False): - - global t1, line_col_rev - - # read gff color config file if provided - if len(input_gff_files) != 0 and input_gff_files != None: - if gff_color_config_file not in ["", None]: - text = "\n%s\n\nReading GFF color configuration file\n%s\n\n=> %s\n" % (50*"=", 28*"-", gff_color_config_file) - logprint(text, start=False, printing=True) - gff_feat_colors = read_gff_color_config(gff_color_config_file) - else: - gff_feat_colors = {} - if gff_color_config_file not in ["", None]: - text = "Please provide GFF annotation files to use configuration file", gff_color_config_file - logprint(text, start=False, printing=True) - - # if color is set to white, reverse complementary matches are skipped - if not rc_option: - line_col_rev = "white" # reverse matches not calculated - elif not type_nuc: - logprint("Reverse complement deactivated for proteins!") - line_col_rev = "white" # reverse matches not calculated - - mode_text = [] - for item in modes: - mode_text.append(str(item)) - text = "%s\n\nRunning plotting modes %s" % (50*"=", ", ".join(mode_text)) - logprint(text, start=False, printing=True) - - - # create dotplots - ########################################## - - # self dotplots - t1 = time.time() - if 0 in modes: - list_of_png_names = selfdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_selectpos=title_selectpos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, gff_files=gff, 
gff_color_dict=gff_feat_colors, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # paired dotplots - if 1 in modes: - if multi: - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_selectpos=title_selectpos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=length_scaling, verbose=verbose) - t1 = time_track(t1) - else: - if not length_scaling: - text = "\nPairwise dotplot with individual output files scaled by sequence length automatically!" - logprint(text, start=False, printing=True) - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_selectpos=title_selectpos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=True, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # all-against-all dotplot - if 2 in modes: - list_of_png_names = polydotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_selectpos=title_selectpos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, 
lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, spacing=spacing, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - text = "\n" + 50 * "#" + "\n" + 50 * "#" - text += "\n\nThank you for using FlexiDot!\n" - logprint(text, start=False, printing=True) - -# testing mode for debugging -trial_mode = False -# trial_mode = True - -# parameters = check_input(sys.argv) -parameters = check_input(sys.argv, trial_mode=trial_mode) - -# read out parameters -commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype, type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos_top, label_size, spacing, length_scaling, title_length, title_selectpos, verbose = parameters - -# evtl. 
overwrite parameters for testing purposes in trial mode -if trial_mode: - # input_user_matrix_file = "AngioSINE-v18-alignment-identities.csv" - input_fasta = ["test-sequences-9-Ns.fas"] - input_fasta = ["Beta_SINEs__select_consensus.fas"] - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-01.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-comma-str.txt" - # input_user_matrix_file = "Beta_SINEs__select_consensus_matrix-100+.txt" - # user_matrix_print = True - output_file_prefix = "SINEmatrix" - output_file_prefix = "SINEmatrix-NoShading" - plot_size = 10 - plotting_modes = [0,1,2] - plotting_modes = [2] - lcs_shading = False - lcs_shading = True - lcs_shading_ref = 2 - lcs_shading_num = 4 - lcs_shading_ori = 0 - lcs_shading_interval_len = 15 - wordsize = 10 - wordsize = 7 - x_label_pos_top = True - filetype = "pdf" - filetype = "png" - - wobble_conversion = False - wobble_conversion = True - - substitution_count = 0 - - rc_option = True - rc_option = False - label_size = 10 - - verbose = False - verbose = True - -if auto_fas: - path = os.path.dirname(os.path.abspath(__file__)) - files_long = glob.glob(path+"/*.fasta") - files_long.extend(glob.glob(path+"/*.fas")) - files_long.extend(glob.glob(path+"/*.fa")) - files_long.extend(glob.glob(path+"/*.fna")) - input_fasta = [] - for i in files_long: - if not "combined" in i: - filename = i[i.rfind('\\')+1:] - input_fasta.append(filename) - -if trial_mode: - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - -main(input_fasta, wordsize, modes=plotting_modes, prefix=output_file_prefix, plot_size=plot_size, label_size=label_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=wobble_conversion, substitution_count=substitution_count, rc_option=rc_option, gff=input_gff_files, multi=collage_output, ncols=m_col, nrows=n_row, alphabetic_sorting=alphabetic_sorting, 
lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, gff_color_config_file=gff_color_config_file, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, length_scaling=length_scaling, title_length=title_length, title_selectpos=title_selectpos, spacing=spacing, verbose=verbose) - - diff --git a/code/flexidot_v1.03.py b/code/flexidot_v1.03.py deleted file mode 100644 index 867ce99..0000000 --- a/code/flexidot_v1.03.py +++ /dev/null @@ -1,3161 +0,0 @@ -#!/usr/bin/python2.7 -# -*- coding: utf-8 -*- - -""" -FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation - -Kathrin M. Seibt, Thomas Schmidt and Tony Heitkam -Institute of Botany, TU Dresden, Dresden, 01277, Germany - -(Bioinformatics, 2018, doi 10.1093/bioinformatics/bty395) -""" - - -############################### -# Requirements # -############################### - -# import system modules -import os, glob -import time, datetime -import sys -import shutil, getopt -import unicodedata - -def module_install_command(module_name, upgrade=False): - """ - create installation commands for Python modules and print information - """ - if upgrade: - load_command = "python -m pip install --upgrade %s" % module_name - else: - load_command = "python -m pip install %s" % module_name - - try: - logprint("Installing Python module: %s\n\t%s\n" % (module_name, load_command)) - except: - print "Installing Python module: %s\n\t%s\n" % (module_name, load_command) - - return load_command - -def load_modules(): - """ - load Python modules, if possible - otherwise try to install them - """ - - # make module names global - global cllct, gridspec, patches, rcParams, mplrc, P, Color, SeqIO, np, ccv, mcolors, rgb2hex, regex - - # matplotlib - try: - import matplotlib.collections as cllct - except: - command = module_install_command("matplotlib", upgrade=True) 
- try: - os.system(command) - print "\n" - import matplotlib.collections as cllct - except: - print "Please install module matplotlib manually" - from matplotlib.colors import colorConverter as ccv - import matplotlib.colors as mcolors - import matplotlib.gridspec as gridspec - import matplotlib.patches as patches - import pylab as P - - # specify matplotlib font settings - from matplotlib import rc as mplrc - mplrc('pdf', fonttype=42, compression=0) - from matplotlib import rcParams - rcParams['font.family'] = 'sans-serif' - rcParams['font.sans-serif'] = ['Helvetica', 'Verdana', 'Tahoma', ] - - # colour for color gradient palette - try: - from colour import Color - except: - command = module_install_command("colour") - try: - os.system(command) - print "\n" - from colour import Color - except: - print "Please install module colour manually" - - # color converter - try: - from colormap import rgb2hex - except: - command = module_install_command("colormap") - # additional module easydev.tools required by colormap - command2 = module_install_command("easydev") - try: - os.system(command) - os.system(command2) - print "\n" - from colormap import rgb2hex - except: - print "Please install module colormap manually" - - # biopython - try: - from Bio import SeqIO - except: - command = module_install_command("biopython") - try: - os.system(command) - print "\n" - from Bio import SeqIO - except: - print "Please install module biopython manually" - - # numpy - try: - import numpy as np - except: - command = module_install_command("numpy") - try: - os.system(command) - print "\n" - import numpy as np - except: - print "Please install module numpy manually" - - # regex for pattern matching - try: - import regex - except: - command = module_install_command("regex") - try: - os.system(command) - print "\n" - import regex - except: - print "Please install module regex manually" - -load_modules() - - -############################### -# Usage & Input # 
-############################### - -def usage(): - """ - usage and help - """ - - print """\n\n FLEXIDOT - ------------------------------------------------------------------- - - Version: - 1.03 - - Citation: - Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (2018) - "FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation" - Bioinformatics, doi: 10.1093/bioinformatics/bty395 - - - General usage: - $ python flexidot.py -a [ARGUMENTS] - $ python flexidot.py -i [ARGUMENTS] - - - ARGUMENTS - ------------------------------------------------------------------- - - - INPUT/OUTPUT OPTIONS... required are [-a] OR [-i] - - -a, --auto_fas Imports all fasta files from current directory (*.fasta, *.fas, *.fa, *.fna) - -i is not needed, if -a is activated - [inactive by default] - - -i, --in_file Input fasta file (fasta file name or comma-separated file list) - > Provide multiple files: Recall -i or provide comma-separated file names - - -o, --output_file_prefix File prefix to be added to the generated filenames [default = NONE] - - -c, --collage_output Multiple dotplots are combined in a collage - Y or 1 = ON [default] - N or 0 = OFF - - -m, --m_col Number of columns per page [default = 4] (only if --collage_output is ON) - - -n, --n_row Number of rows per page [default = 5] (only if --collage_output is ON) - - -f, --filetype Output file format - 0 = PNG [default] - 1 = PDF - 2 = SVG - - -s, --alphabetic_sorting Sort sequences alphabetically according to titles - Y or 1 = ON - N or 0 = OFF [default] - - - CALCULATION PARAMETERS... 
- - -k, --wordsize Wordsize (kmer length) for dotplot comparison [default = 7] - - -p, --plotting_mode Mode of FlexiDot dotplotting - 0 = self [default] - 1 = paired - 2 = poly (matrix with all-against-all dotplots) - > Run multiple plotting modes: Recall -p or provide comma-separated numbers - - -t, --type_nuc Type of residue is nucleotide - Y or 1 = nucleotide [default] - N or 0 = amino acid - - -w, --wobble_conversion Ambiguity handling for relaxed matching - Y or 1 = ON - N or 0 = OFF [default] - - -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching - [default = 0] - - -r, --rc_option Find reverse complementary matches (only if type_nuc=y) - Y or 1 = ON [default] - N or 0 = OFF - - - GRAPHIC FORMATTING... - - -A, --line_width Line width [default = 1] - - -B, --line_col_for Line color [default = black] - - -C, --line_col_rev Reverse line color [default = green] - - -D, --x_label_pos Position of the X-label - Y or 1 = top [default] - N or 0 = bottom - - -E, --label_size Font size [default = 10] - - -F, --spacing Spacing between all-against-all dotplots (only if --plotting_mode=2) - [default = 0.04] - - -P, --plot_size Plotsize [default = 10] - - -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) - Y or 1 = Scaling ON (axes scaled according to sequence length) - N or 0 = Scaling OFF (squared plots) [default] - - -T, --title_length Limit title length for dotplot comparisons - [default = 20] - Position of selection can be specified by appending a letter (e.g. -T 20E) - B = beginning [default] - E = end - - - GFF SHADING (for -p/--plotting_mode=0,2 only)... 
- - -g, --input_gff_files GFF3 file used for markup in self-dotplots - (provide multiple files: Recall -g or provide comma-separated file names) - - -G, --gff_color_config_file Tab-delimited config file for custom gff shading - column 1: feature type - column 2: color - column 3: alpha - column 4: zoom factor (for small regions) - - - LCS SHADING OPTIONS (for -p/--plotting_mode=2 only)... - - -x, --lcs_shading Shade subdotplot based on the length of the longest common substring (LCS) - Y or 1 = ON - N or 0 = OFF [default] - - -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) - [default = 5] - - -y, --lcs_shading_ref Reference for LCS shading - 0 = maximal LCS length [default] - 1 = maximally possible length (length of shorter sequence in pairwise comparison) - 2 = given interval sizes - DNA [default 100 bp] or proteins [default 10 aa] - see -Y - - -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) - [default for nucleotides = 50; default for amino acids = 10] - - -z, --lcs_shading_ori Shade subdotplots according to LCS on - 0 = forward [default], - 1 = reverse, or - 2 = both strands (forward shading above diagonal, reverse shading on diagonal and below; - if using --input_user_matrix_file, best LCS is used below diagonal) - - - CUSTOM USER MATRIX SHADING OPTIONS (for -p/--plotting_mode=2 only)... - - -u, --input_user_matrix_file Shading above diagonal according to values in matrix file specified by the user - (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n - e.g. identity matrix from multiple sequence alignment - strings are ignored) - - -U, --user_matrix_print Display provided matrix entries in the fields above diagonal of all-against-all dotplot - Y or 1 = ON - N or 0 = OFF [default] - - - OTHERS... 
- - -h, --help Help screen - - -v, --verbose Verbose - - - - - """ - -def check_input(argv, trial_mode=False): - """ - commandline argument parsing - """ - - global log_txt, aa_bp_unit - - # helpers for argument parsing - ###################################### - - arguments = ["-a", "--auto_fas", "a", "auto_fas", - "-i", "--input_fasta", "i:", "input_fasta=", - "-o", "--output_file_prefix", "o:", "output_file_prefix=", - "-c", "--collage_output", "c:", "collage_output=", - "-m", "--m_col", "m:", "m_col=", - "-n", "--n_row", "n:", "n_row=", - "-f", "--filetype", "f:", "filetype=", - "-t", "--type_nuc", "t:", "type_nuc=", - "-g", "--input_gff_files", "g:", "input_gff_files", - "-G", "--gff_color_config_file", "G:", "gff_color_config_file", - "-k", "--wordsize", "k:", "wordsize=", - "-p", "--plotting_mode", "p:", "plotting_mode=", - "-w", "--wobble_conversion", "w:", "wobble_conversion=", - "-S", "--substitution_count", "S:", "substitution_count=", - "-r", "--rc_option", "r:", "rc_option=", - "-s", "--alphabetic_sorting", "s:", "alphabetic_sorting=", - "-x", "--lcs_shading", "x:", "lcs_shading=", - "-X", "--lcs_shading_num", "X:", "lcs_shading_num=", - "-y", "--lcs_shading_ref", "y:", "lcs_shading_ref=", - "-Y", "--lcs_shading_interval_len", "Y:", "lcs_shading_interval_len=", - "-z", "--lcs_shading_ori", "z:", "lcs_shading_ori=", - "-u", "--input_user_matrix_file", "u:", "input_user_matrix_file=", - "-U", "--user_matrix_print", "U:", "user_matrix_print=", - "-P", "--plot_size", "P:", "plot_size=", - "-A", "--line_width", "A:", "line_width=", - "-B", "--line_col_for", "B:", "line_col_for=", - "-C", "--line_col_rev", "C:", "line_col_rev=", - "-D", "--x_label_pos", "D:", "x_label_pos=", - "-E", "--label_size", "E:", "label_size=", - "-F", "--spacing", "F:", "spacing=", - "-L", "--length_scaling", "L:", "length_scaling=", - "-T", "--title_length", "T:", "title_length=", - "-h", "--help", "h", "help", - "-v", "--verbose", "v", "verbose"] - - arguments_sysargv = 
tuple(arguments[0::4] + arguments[1::4]) - arguments_opts = "".join(arguments[2::4]) - arguments_args = arguments[3::4] - - - # setting defaults - ###################################### - - auto_fas = False # 0 - input_fasta = [] - output_file_prefix = None - collage_output = True # 1 - m_col = 4 - n_row = 5 - filetype = 0 - type_nuc = True - input_gff_files = [] - gff_color_config_file = "" - - wordsize = 7 - plotting_modes = [0] - wobble_conversion = False # 0 - substitution_count = 0 - rc_option = True # 1 - alphabetic_sorting = False # 0 - - lcs_shading = False # 0 - lcs_shading_num = 4 - lcs_shading_ref = 0 - lcs_shading_interval_len = 50 # interval default changes to "10" for amino acids [type_nuc = n] - lcs_shading_ori = 0 - - input_user_matrix_file = "" - user_matrix_print = False - - plot_size = 10 - line_width = 1 - line_col_for = "black" - line_col_rev = "#009243" - x_label_pos = True # 0 - label_size = 10 - spacing = 0.04 - length_scaling = False # 0 - title_length = 20 # float("Inf") - title_clip_pos = "B" # B (begin), E (end) - max_N_percentage = 49 # fixed value, no user input - - aa_bp_unit = "bp" - - verbose = False # 0 - - filetype_dict = {0: "png", 1: "pdf", 2: "svg"} - lcs_shading_ref_dict = {0: "maximal LCS length", 1: "maximally possible length", 2: "given interval sizes"} - plotting_mode_dict = {0: "self", 1: "paired", 2: "all-against-all"} - lcs_shading_ori_dict = {0: "forward", 1: "reverse complement", 2: "both"} - - # return default parameters for testing purposes - if trial_mode: - print "ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n" - - commandline = "trial_mode\n" - - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, 
input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, verbose] - return parameters - - - # read arguments - ###################################### - - commandline = "" - for arg in sys.argv: - commandline += arg + " " - - log_txt = "\n...reading input arguments..." - print log_txt - - if len(sys.argv) < 2: - print "\nERROR: More arguments are needed. Exit..." - log_txt += "\nERROR: More arguments are needed. Exit..." - usage() - sys.exit() - - elif sys.argv[1] not in arguments_sysargv: - print "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - log_txt += "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - # usage() - sys.exit() - - try: - opts, args = getopt.getopt(sys.argv[1:], arguments_opts, arguments_args) - - except getopt.GetoptError: - print "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - log_txt += "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - # usage() - sys.exit() - - for opt, arg in opts: - - if opt in ("-h", "--help"): - print "...fetch help screen" - log_txt += "\n...fetch help screen" - usage(), sys.exit() - - if opt in ("-v", "--verbose"): - print "...verbose output" - log_txt += "\n...verbose output" - verbose = True - - elif opt in ("-i", "--input_fasta"): - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: fasta_file '%s' was not found!" % str(temp_file) - sys.exit(message) - else: - input_fasta.append(str(temp_file)) - print "fasta file #%i: %s" % (len(input_fasta), str(temp_file)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: fasta_file '%s' was not found!" 
% str(arg) - log_txt += message - sys.exit(message) - else: - input_fasta.append(str(arg)) - print "fasta file #%i: %s" % (len(input_fasta), str(arg)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(arg)) - - - elif opt in ("-a", "--auto_fas"): - auto_fas = True - - - # multiple gff files: reads them into a list - elif opt in ("-g", "--input_gff_files"): - - # append gff file only if existing - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: gff_file '%s' was not found!" % str(temp_file) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - print "GFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - input_gff_files.append(str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: gff_file '%s' was not found!" % str(arg) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - input_gff_files.append(str(arg)) - print "GFF file #%i: %s" %(len(input_gff_files), str(arg)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(arg)) - - - elif opt in ("-G", "--gff_color_config_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: gff_color_config_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot with default gff coloring specification!" - log_txt += message + "\n -->Running FlexiDot with default gff coloring specification!" - else: - gff_color_config_file = str(arg) - - - elif opt in ("-u", "--input_user_matrix_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: input_user_matrix_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot without input_user_matrix_file %s!" 
% arg - log_txt += message + "\n -->Running FlexiDot withdefault matrix shading file!" - else: - input_user_matrix_file = str(arg) - - elif opt in ("-U", "--user_matrix_print"): - user_matrix_print = check_bools(str(arg), default=user_matrix_print) - - elif opt in ("-o", "--output_file_prefix"): - output_file_prefix = arg - - elif opt in ("-c", "--collage_output"): - collage_output = check_bools(str(arg), default=collage_output) - - elif opt in ("-m", "--m_col"): - try: m_col = int(arg) - except: - print "m_col - invalid argument - using default value" - log_txt += "\nm_col - invalid argument - using default value" - - elif opt in ("-n", "--n_row"): - try: n_row = int(arg) - except: - print "n_row - invalid argument - using default value" - log_txt += "\nn_row - invalid argument - using default value" - - elif opt in ("-f", "--filetype"): - if 0 <= int(arg) <= 2: - filetype = int(arg) - else: - print "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - log_txt += "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - - elif opt in ("-t", "--type_nuc"): - type_nuc = check_bools(str(arg), default=type_nuc) - - if type_nuc == False: - # interval default changed for amino acids - lcs_shading_interval_len = 10 - aa_bp_unit = "aa" - - elif opt in ("-k", "--wordsize"): - try: wordsize = int(arg) - except: - print "wordsize - invalid argument - using default value" - log_txt += "\nwordsize - invalid argument - using default value" - - elif opt in ("-p", "--plotting_mode"): - if "," in arg: - temp_modes = arg.split(",") - for item in temp_modes: - if item in ["0","1","2"]: - plotting_modes.append(int(item)) - elif arg in ["0","1","2"]: - plotting_modes = [int(arg)] - else: - print "Please provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" - log_txt += "\nPlease provide valid plotting_modes argument - e.g. 
1 or 0,1,2 - using default [0]" - - elif opt in ("-w", "--wobble_conversion"): - wobble_conversion = check_bools(str(arg), default=wobble_conversion) - - elif opt in ("-S", "--substitution_count"): - try: substitution_count = int(arg) - except: - print "substitution_count - invalid argument - using default value" - log_txt += "\nsubstitution_count - invalid argument - using default value" - - elif opt in ("-r", "--rc_option"): - rc_option = check_bools(str(arg), default=rc_option) - - elif opt in ("-s", "--alphabetic_sorting"): - alphabetic_sorting = check_bools(str(arg), default=alphabetic_sorting) - - elif opt in ("-x", "--lcs_shading"): - lcs_shading = check_bools(str(arg), default=lcs_shading) - - elif opt in ("-X", "--lcs_shading_num"): - try: lcs_shading_num = int(arg) - 1 - except: - print "lcs_shading_num - invalid argument - using default value" - log_txt += "\nlcs_shading_num - invalid argument - using default value" - - elif opt in ("-y", "--lcs_shading_ref"): - try: - if 0 <= int(arg) <= 2: - lcs_shading_ref = int(arg) - else: - print "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - log_txt += "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - except: - print "lcs_shading_ref - invalid argument - using default value" - log_txt += "\nlcs_shading_ref - invalid argument - using default value" - - elif opt in ("-Y", "--lcs_shading_interval_len"): - try: lcs_shading_interval_len = int(arg) - except: - print "lcs_shading_interval_len - invalid argument - using default value" - log_txt += "\nlcs_shading_interval_len - invalid argument - using default value" - - elif opt in ("-z", "--lcs_shading_ori"): - if 0 <= int(arg) <= 2: - lcs_shading_ori = int(arg) - else: - print "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." 
%(lcs_shading_ori) - log_txt += "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." %(lcs_shading_ori) - - elif opt in ("-P", "--plot_size"): - try: plot_size = float(arg) - except: - print "plot_size - invalid argument - using default value" - log_txt += "\nplot_size - invalid argument - using default value" - - - elif opt in ("-A", "--line_width"): - try: line_width = float(arg) - except: - print "line_width - invalid argument - using default value" - log_txt += "\nline_width - invalid argument - using default value" - - elif opt in ("-B", "--line_col_for"): - if mcolors.is_color_like(arg): - line_col_for = arg - else: - print "line_col_for - invalid argument - using default value" - log_txt += "\nline_col_for - invalid argument - using default value" - - elif opt in ("-C", "--line_col_rev"): - if mcolors.is_color_like(arg): - line_col_rev = arg - else: - print "line_col_rev - invalid argument - using default value" - log_txt += "\nline_col_rev - invalid argument - using default value" - - elif opt in ("-D", "--x_label_pos"): - x_label_pos = check_bools(str(arg), default=x_label_pos) - - elif opt in ("-E", "--label_size"): - try: label_size = float(arg) - except: - print "label_size - invalid argument - using default value" - log_txt += "\nlabel_size - invalid argument - using default value" - - elif opt in ("-F", "--spacing"): - try: spacing = float(arg) - except: - print "spacing - invalid argument - using default value" - log_txt += "\nspacing - invalid argument - using default value" - - elif opt in ("-L", "--length_scaling"): - length_scaling = check_bools(str(arg), default=length_scaling) - - elif opt in ("-T", "--title_length"): - try: title_length = int(arg) - except: - try: - title_length = int(str(arg)[:-1]) - if arg[-1].upper() in ["B", "E"]: # B (beginning), E (end) - title_clip_pos = arg[-1].upper() - else: - print "title_length position information invalid - using default value" - log_txt 
+= "\ntitle_length position information invalid - using default value" - except: - print "title_length - invalid argument - using default value" - log_txt += "\ntitle_length - invalid argument - using default value" - - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - logprint(log_txt, start=False, printing=False) - - - # print chosen arguments - ###################################### - - text = "\n%s\n" % (70 * "-") - text += "\n" + "INPUT/OUTPUT OPTIONS...\n" - text += "\n" + "Input fasta file: " + ", ".join(input_fasta) - text += "\n" + "Automatic fasta collection from current directory: " + str(auto_fas) - text += "\n" + "Collage output: " + str(collage_output) - text += "\n" + "Number of columns per page: " + str(m_col) - text += "\n" + "Number of rows per page: " + str(n_row) - text += "\n" + "File format: " + filetype_dict[filetype] - text += "\n" + "Residue type is nucleotide: " + str(type_nuc) - - text += "\n" + "\n\nCALCULATION PARAMETERS...\n" - text += "\n" + "Wordsize: " + str(wordsize) - text += "\n" + "Plotting mode: " + str(plotting_modes).replace("[", "").replace("]", "") + "\n" + 51 * " " - for item in plotting_modes: - text += plotting_mode_dict[item] + " " - text += "\n" + "Ambiguity handling: " + str(wobble_conversion) - text += "\n" + "Reverse complement scanning: " + str(rc_option) - text += "\n" + "Alphabetic sorting: " + str(alphabetic_sorting) - - if 0 in plotting_modes and input_gff_files != []: - text += "\n" + "Input gff files: " + ", ".join(input_gff_files) - if gff_color_config_file != "": - text += "\n" + "GFF color config file: " + gff_color_config_file - text += "\n" + "Prefix for output files: " + str(output_file_prefix) - - if 2 in plotting_modes: - text += "\n" + "\n\nLCS SHADING OPTIONS (plotting_mode 'all-against-all' only)...\n" - text += "\n" + "LCS shading: " + str(lcs_shading) - text += "\n" + "LCS shading interval number: " + str(lcs_shading_num + 1) - text += "\n" + 
"LCS shading reference: " + lcs_shading_ref_dict[lcs_shading_ref] - if lcs_shading_ref == 2: - text += "\n" + "LCS shading interval size [%s]: " % (aa_bp_unit) + str(lcs_shading_interval_len) - text += "\n" + "LCS shading orientation: " + lcs_shading_ori_dict[lcs_shading_ori] - if input_user_matrix_file != "": - text += "\n" + "Custom user shading matrix file: " + input_user_matrix_file - text += "\n" + "Print user matrix values (instead of dotplot): " + str(user_matrix_print) - - text += "\n" + "\n\nGRAPHIC FORMATTING...\n" - text += "\n" + "Plot size: " + str(plot_size) - text += "\n" + "Line width: " + str(line_width) - text += "\n" + "Line color: " + line_col_for - text += "\n" + "Reverse line color: " + line_col_rev - text += "\n" + "X label position: " + str(x_label_pos) - text += "\n" + "Label size: " + str(label_size) - text += "\n" + "Spacing: " + str(spacing) - if title_clip_pos == "E": - text += "\n" + "Title length (limit number of characters): " + "last" + str(title_length) + "characters" - else: - text += "\n" + "Title length (limit number of characters): " + "first" + str(title_length) + "characters" - text += "\n" + "Length scaling: " + str(length_scaling) - text += "\n%s\n" % (70 * "-") - logprint(text) - - - # collect settings - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, verbose] - - return parameters - - -############################### -# Helper Functions # -############################### - -def alphabets(type_nuc=True): - """ - provide 
ambiguity code for sequences - """ - - nucleotide_alphabet = ["A", "C", "G", "T"] - - nucleotide_alphabet_full = ["A", "C", "G", "T", "N", "B", "D", "H", - "V", "Y", "R", "W", "S", "K", "M"] - - nucleotide_ambiguity_code = {"N": ["A", "C", "G", "T"], # any - "B": ["C", "G", "T"], # not A - "D": ["A", "G", "T"], # not C - "H": ["A", "C", "T"], # not G - "V": ["A", "C", "G"], # not T - "Y": ["C", "T"], # pyrimidine - "R": ["A", "G"], # purine - "W": ["A", "T"], # weak - "S": ["C", "G"], # strong - "K": ["G", "T"], # keto - "M": ["A", "C"]} # amino - - nucleotide_match_dict = {"N": "[ACGTNBDHVYRWSKM]", # any - "B": "[CGTNBDHVYRWSKM]", # not A - "D": "[AGTNBDHVYRWSKM]", # not C - "H": "[ACTNBDHVYRWSKM]", # not G - "V": "[ACGNBDHVYRWSKM]", # not T - "K": "[GTNBDHVYRWSK]", # keto - not A,C,M - "M": "[ACNBDHVYRWSM]", # amino - not G,T,K - "W": "[ATNBDHVYRWKM]", # weak - not C,G,S - "S": "[CGNBDHVYRSKM]", # strong - not A,G,W - "Y": "[CTNBDHVYWSKM]", # pyrimidine - not A,G,R - "R": "[AGNBDHVRWSKM]", # purine - not C,T,Y - "A": "[ANDHVRWM]", - "C": "[CNBHVYSM]", - "G": "[GNBDVRSK]", - "T": "[TNBDHYWK]"} - - # nucleotide_match_dict = {"N": ".", # any - # "B": "[^A]", # not A - # "D": "[^C]", # not C - # "H": "[^G]", # not G - # "V": "[^T]", # not T - # "K": "[^ACM]", # keto - not A,C,M - # "M": "[^GTK]", # amino - not G,T,K - # "W": "[^CGS]", # weak - not C,G,S - # "S": "[^AGW]", # strong - not A,G,W - # "Y": "[^AGR]", # pyrimidine - not A,G,R - # "R": "[^CTY]", # purine - not C,T,Y - # "A": "[ANDHVRWM]", - # "C": "[CNBHVYSM]", - # "G": "[GNBDVRSK]", - # "T": "[TNBDHYWK]"} - - aminoacid_alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"] - - aminoacid_alphabet_full = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*", "J", - "Z", "B", "X"] - - aminoacid_ambiguity_code = {"J": ["I", "L"], - "Z": ["Q", "E"], - "B": ["N", "D"], - 
"X": ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"]} # any - - aminoacid_match_dict = {"J": "[ILJ]", - "Z": "[QEZ]", - "B": "[NDB]", - # "X": ".", - "X": "[ARNDCEQGHILKMFPSTWYVUO*XBZJ]", - "A": "[AX]", - "R": "[RX]", - "N": "[NXB]", - "D": "[DXB]", - "C": "[CX]", - "E": "[EXZ]", - "Q": "[QXZ]", - "G": "[GX]", - "H": "[HX]", - "I": "[IXJ]", - "L": "[LXJ]", - "K": "[KX]", - "M": "[MX]", - "F": "[FX]", - "P": "[PX]", - "S": "[SX]", - "T": "[TX]", - "W": "[WX]", - "Y": "[YX]", - "V": "[VX]", - "U": "[UX]", - "O": "[OX]", - "*": "[*X]"} - - aa_only = set(['E', 'F', 'I', 'J', 'L', 'O', 'Q', 'P', 'U', 'X', 'Z', '*']) - # return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aa_only - - if type_nuc: - return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, nucleotide_match_dict - else: - return aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aminoacid_match_dict - -def logprint(text, start=False, printing=True, prefix=""): - """ - log output to log_file and optionally print - """ - - # define log file name and open file - global log_file_name - if start and trial_mode: - log_file_name = "log_file.txt" - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - elif start: - date = datetime.date.today() - time = str(datetime.datetime.now()).split(" ")[1].split(".")[0].replace(":", "-") - log_file_name = "%s_%s_log_file.txt" % (date, time) - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - else: - 
log_file = open(log_file_name, 'a') - - # write log (and print) - log_file.write(text + "\n") - if printing: - print text - log_file.close() - -def time_track(starting_time, show=True): - """ - calculate time passed since last time measurement - """ - now = time.time() - delta = now - starting_time - if show: - text = "\n\t %s seconds\n" % str(delta) - logprint(text, start=False, printing=True) - return now - -def calc_fig_ratio(ncols, nrows, plot_size, verbose=False): - """ - calculate size ratio for given number of columns (ncols) and rows (nrows) - with plot_size as maximum width and length - """ - ratio = ncols*1./nrows - if verbose: - text = " ".join([ncols, nrows, ratio]) - logprint(text, start=False, printing=True) - if ncols >= nrows: - figsize_x = plot_size - figsize_y = plot_size / ratio - else: - figsize_x = plot_size * ratio - figsize_y = plot_size - return figsize_x, figsize_y - -def shorten_name(seq_name, max_len=20, title_clip_pos="B"): #, delim="_"): - """ - shorten sequence names (for diagram titles) - """ - - if len(seq_name) <= max_len: - return seq_name - - # take last characters - if title_clip_pos == "E": - name = seq_name[len(seq_name)-max_len:] - - # take first characters - else: - name = seq_name[:max_len] - - """# keep first and last part if multiple parts separated by delimiter (e.g. species_prefix + sequence_id) - if delim in seq_name: - if seq_name.count(delim) >= 2: - name = "%s..." % delim.join(seq_name.split(delim)[:1]) + seq_name.split(delim)[-1] # .replace("_000", "-") - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - if len(name) > max_len: - name = name[:((max_len-2)//2)] + "..." + name[((max_len-2)//2):] - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - """ - - return name - -def unicode_name(name): - """ - replace non-ascii characters in string (e.g. 
for use in matplotlib) - """ - unicode_string = eval('u"%s"' % name) - return unicodedata.normalize('NFKD', unicode_string).encode('ascii','ignore') - -def check_bools(arg, update_log_txt = True, default=None): - """ - converts commandline arguments into boolean - """ - - - # convert valid arguments - if str(arg).lower() == "y" or str(arg) == "1": - return True - elif str(arg).lower() == "n" or str(arg) == "0": - return False - - # use default in case of invalid argument - else: - if update_log_txt: - global log_txt - log_txt += "using default for " + str(arg) - else: - try: - logprint("using default for " + str(arg)) - except: - print "using default for " + str(arg) - return default - -def create_color_list(number, color_map=None, logging=False, max_grey="#595959"): - """ - create color list with given number of entries - grey by default, matplotlib color_map can be provided - """ - - try: - # create pylab colormap - cmap = eval("P.cm." + color_map) - # get descrete color list from pylab - cmaplist = [cmap(i) for i in range(cmap.N)] # extract colors from map - # determine positions for number of colors required - steps = (len(cmaplist)-1)/(number) - numbers = range(0, len(cmaplist), steps) - - # extract color and convert to hex code - colors = [] - for idx in numbers[:-1]: - rgb_color = cmaplist[idx] - col = rgb2hex(rgb_color[0]*255, rgb_color[1]*255, rgb_color[2]*255) - colors.append(col) - - # grey - except: - if not color_map == None: - logprint("Invalid color_map (%s) provided! - Examples: jet, Blues, OrRd, bwr,..." 
% color_map) - logprint("See https://matplotlib.org/users/colormaps.html\n") - old_max_grey = "#373737" - old_max_grey = "#444444" - colors = list(Color("#FFFFFF").range_to(Color(max_grey), number)) # grey - for idx in range(len(colors)): - colors[idx] = str(colors[idx]).replace("Color ", "") - if "#" in colors[idx] and len(colors[idx]) != 7: - # print colors[idx] - colors[idx] = colors[idx] + colors[idx][-(7-len(colors[idx])):] - - text = "%d Colors: %s" % (len(colors), ", ".join(colors)) - if logging: logprint(text, start=False, printing=True) - - if len(colors) < number: - logprint("\nError in color range definition! %d colors missing\n" % (number - len(colors))) - - return colors - - -############################### -# File Handling # -############################### - -def read_seq(input_fasta, verbose=False): - """ - read fasta sequences from (all) file(s) - """ - - # check if file provided - if input_fasta == [] or input_fasta == "": - text = "Attention: No valid file names provided: >%s<" % input_fasta - logprint(text, start=False, printing=True) - return {}, [] - - # combine sequence files, if required - if type(input_fasta) == list: - # concatenate fasta files - if len(input_fasta) > 1: - if verbose: - print "concatenating fastas...", - text = "concatenating fastas..." - input_fasta_combi = concatenate_files(input_fasta) - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - else: - input_fasta_combi = input_fasta[0] - else: - input_fasta_combi = input_fasta - - # read sequences - if verbose: - print "reading fasta...", - text = "reading fasta...", - try: - seq_dict = SeqIO.index(input_fasta_combi, "fasta") - except ValueError: - logprint("Error reading fasta sequences - please check input files, e.g. 
for duplicate names!") - return {}, [] - except: - logprint("Error reading fasta sequences - please check input files!") - return {}, [] - - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - - for seq in seq_dict: - if "-" in seq_dict[seq].seq: - # ungapped = seq_dict[seq].seq.ungap("-") # cannot be assigned back to sequence record - text = "\nSequences degapped prior Analysis!!!" - logprint(text, start=False, printing=True) - return read_seq(degap_fasta(input_fasta), verbose=verbose) - - # get ordered sequence names - sequences = [] - for item in SeqIO.parse(input_fasta_combi, "fasta"): - sequences.append(item.id) - return seq_dict, sequences - -def read_gff_color_config(gff_color_config_file=""): - """ - define coloring options for gff-based color shading of self-dotplots - """ - - # default aestetics for annotation shading (e.g. if no user config file is provided) - # dictionary with feature_type as key and tuple(color, transparency, zoom) as value - gff_feat_colors = {"orf": ("#b41a31", 0.2, 0), - "orf_rev": ("#ff773b", 0.3, 0), - "gene": ("#b41a31", 0.2, 0), - "cds": ("darkorange", 0.2, 0), - "exon": ("orange", 0.2, 0), - "intron": ("lightgrey", 0.2, 0), - "utr": ("lightblue", 0.2, 0), - "repeat_region": ("green", 0.3, 0), - "repeat": ("green", 0.3, 0), - "tandem_repeat": ("red", 0.3, 0), - "transposable_element": ("blue", 0.3, 0), - "ltr_retrotransposon": ("#cccccc", 0.5, 0), - "ltr-retro": ("#cccccc", 0.5, 0), - "long_terminal_repeat": ("#2dd0f0", 0.75, 2), - "ltr": ("#2dd0f0", 0.75, 2), - "pbs": ("purple", 0.75, 2), - "ppt": ("#17805a", 0.5, 2), - "target_site_duplication": ("red", 0.75, 2), - "misc_feature": ("grey", 0.3, 0), - "misc_feat": ("grey", 0.3, 0), - "misc": ("grey", 0.3, 0), - "others": ("grey", 0.5, 0)} - if gff_color_config_file in ["", None] or not os.path.exists(str(gff_color_config_file)): - return gff_feat_colors - - text = "Updating GFF color configuration with custom specifications\n" - 
logprint(text, start=False, printing=True) - - # read custom gff_color_config_file - in_file = open(gff_color_config_file, 'rb') - overwritten = set([]) - for line in in_file: - if not line.startswith("#") and len(line.strip().split("\t")) >= 4: - data = line.strip().split("\t") - feat = data[0].lower() - color = data[1].lower() - - # check, if settings are valid - if not mcolors.is_color_like(color): - color = "grey" - text = "Invalid color specified for %s: %s - default grey" % (data[0], data[1]) - logprint(text) - try: - alpha = float(data[2]) - except: - alpha = 0.75 - text = "Invalid alpha specified for %s: %s - default 0.75" % (data[0], data[2]) - logprint(text) - try: - zoom = float(data[3]) - except: - zoom = 0 - text = "Invalid zoom specified for %s: %s - default 0" % (data[0], data[3]) - logprint(text) - - # track changes of predefined settings - if feat in gff_feat_colors.keys(): - overwritten.add(data[0].lower()) - - gff_feat_colors[feat] = (color, alpha, zoom) - in_file.close() - - # default coloring for unknown annotations - if not "others" in gff_feat_colors.keys(): - gff_feat_colors["others"] = ("grey", 0.5, 0) - - if verbose: - # print configuration - text = "\n\nGFF color specification:\n%s\n" % (60 * ".") - for item in sorted(gff_feat_colors.keys()): - text += "%-30s\t%-10s\t%-5s\t%s\n" % (item, str(gff_feat_colors[item][0]), str(gff_feat_colors[item][1]), str(gff_feat_colors[item][2])) - logprint (text, printing=True) - - # print overwritting feature type specifications - if len(overwritten) != 0: - text = "%d feature type specifications overwritten:" % len(overwritten) - text += "\n\t"+ ", ".join(overwritten) + "\n" - logprint(text, start=False, printing=True) - - text = "GFF color specification updated acc. 
to %s\n\t%s\n\n" % (gff_color_config_file, ", ".join(gff_feat_colors)) - logprint(text, start=False, printing=True) - - return gff_feat_colors - -def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=True, prefix="", filetype='png', verbose=False): - """ - create feature dictionary from input_gff - sequence name as key and (feature type, start, stop) as value - """ - if type(input_gff_files) != list: - input_gff_files = [input_gff_files] - - # create dictionary with seq_name as key and (type, start and stop) as value - unknown_feats = set([]) - used_feats = set([]) - feat_dict = {} - for input_gff in input_gff_files: - text = "...reading " + input_gff - logprint(text, start=False, printing=True) - - in_file = open(input_gff, 'rb') - for line in in_file: - if not line.startswith("#") and line.strip() != "": - data = line.strip().split("\t") - feat_type = data[2].lower() - if data[6] == "-": - feat_type += "_rev" - if not feat_type.lower() in color_dict.keys(): - if feat_type.lower().replace("_rev", "") in color_dict.keys(): - feat_type = feat_type.replace("_rev", "") - else: - unknown_feats.add(feat_type) - feat_type = "others" - used_feats.add(feat_type) - if not data[0] in feat_dict.keys(): - feat_dict[data[0]] = [(feat_type, int(data[3]), int(data[4]))] # feature type, start, stop - else: - feat_dict[data[0]].append((feat_type, int(data[3]), int(data[4]))) # feature type, start, stop - if verbose: - text = "\nAnnotations for: %s\n" % ", ".join(feat_dict.keys()[:10]) - if len(feat_dict.keys()) > 10: - text = text[:-1] + ", ...\n" - logprint(text, start=False, printing=True) - in_file.close() - - # print feature types without specific shading settings - if len(unknown_feats) != 0: - text = "Missing shading specification for %d feature type(s):\n\t%s\n" % (len(unknown_feats), ", ".join(sorted(unknown_feats))) - logprint(text, start=False, printing=True) - - # create color legend - colors, alphas = [], [] - for item in sorted(used_feats): - 
colors.append(color_dict[item][0]) - alphas.append(color_dict[item][1]) - legend_figure(colors=colors, lcs_shading_num=len(used_feats), type_nuc=type_nuc, bins=sorted(used_feats), alphas=alphas, gff_legend=True, prefix=prefix, filetype=filetype) - - # print settings - text = "GFF Feature Types: %s\nGFF Colors: %s" % (", ".join(sorted(used_feats)), ", ".join(sorted(colors))) - logprint(text, start=False, printing=True) - - return feat_dict - -def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, verbose=False): - input_file = open(matrix_file_name, 'rb') - - # read sequence names from first column - names = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - names.append(line.strip().split(delim)[0]) - logprint("Delimiter '%s': %d names - %s\n" % (delim, len(names), ", ".join(names))) - - # check if names were found - otherwise try another delimiter - if names == [] and not recursion: - if delim == "\t": - new_delim = "," - else: - new_delim = "\t" - logprint("\nMatrix file not containing data delimited by '%s' - trying to read matrix with delimiter '%s'" % (delim.replace("\t", "\\t"), new_delim)) - info_dict = read_matrix(matrix_file_name, delim=new_delim, symmetric=symmetric, recursion=True, verbose=verbose) - return info_dict - elif names == []: - logprint("Empty matrix file with alternative delimiter!") - return info_dict - input_file.close() - - input_file = open(matrix_file_name, 'rb') - # read matrix entries as values in dictionary with tuple(names) as key - info_dict = {} - contradictory_entries = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - data = line.strip().split(delim) - for idx in range(len(data[1:])): - # print tuple(sorted([data[0], names[idx]])), data[idx+1] - if symmetric: - key = tuple(sorted([names[idx], data[0]])) - else: - key = tuple(names[idx], data[0]) - if key in info_dict.keys(): - if 
symmetric and info_dict[key] != data[idx+1] and data[idx+1] not in ["", "-"] and info_dict[key] not in ["", "-"]: - contradictory_entries.append(key) - info_dict[key] = data[idx+1] - input_file.close() - - if len(contradictory_entries) != 0: - try: - logprint("\nContradictory entries in matrix file %s:\n\t%s" % (matrix_file_name, ", ".join(contradictory_entries))) - except: - log_txt = "\nContradictory entries in matrix file %s:\n\t" % (matrix_file_name) - for item in contradictory_entries: - log_txt += str(item).replace("'", "") + ", " - log_txt = log_txt[:-2] - logprint(log_txt) - logprint("Using value from bottom left triangle!") - if verbose: - logprint("\nMatrix information for Sequences named: " % ", ".join(names)) - - return info_dict - -def concatenate_files(file_list, combi_filename="temp_combined.fasta", verbose=False): - """ - concatenate content of all files in file_list into a combined file named combi_filename - """ - out_file = open(combi_filename, 'w') - text = "" - for item in file_list: - if verbose: - text += item + " " - print item, - # read in_file linewise and write to out_file - in_file = open(item, 'rb') - for line in in_file: - out_file.write(line.strip()+"\n") - in_file.close() - out_file.close() - if verbose: - logprint(text, start=False, printing=False) - return combi_filename - -def degap_fasta(input_fasta): - """ - remove gaps from fasta - new degapped sequence file created - """ - - # degap all sequence files - output_fastas = [] - if type(input_fasta) != list: - input_fasta = list(input_fasta) - for input_fas in input_fasta: - output_fas = input_fas[:input_fas.rfind(".")] + "_degapped.fas" - in_file = open(input_fas, 'rb') - out_file = open(output_fas, 'w') - for line in in_file: - if line.startswith(">"): - out_file.write(line.strip()+"\n") - else: - out_file.write(line.strip().replace("-", "")+"\n") - out_file.close() - in_file.close() - output_fastas.append(output_fas) - return output_fastas - -def legend_figure(colors, 
lcs_shading_num, type_nuc=True, unit="%", filetype="png", max_lcs_len=None, min_lcs_len=0, bins=[], alphas=[], gff_legend=False, prefix="", verbose=False): - """ - create figure color legend - """ - max_legend_length_row = 8 - max_legend_length_col = 4 - - # define output file - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg" - logprint(text, start=False, printing=True) - filetype="png" - - # check if length of information fit - if not gff_legend and ((bins != [] and len(colors) != lcs_shading_num+1) or (bins != [] and len(colors) != len(bins)+1)): - if bins != [] and len(colors) != lcs_shading_num+1: - text = "**Attention**\nlcs_shading_num (%d) does not match number of colors (%d)!\n"% (lcs_shading_num, len(bins)) - elif bins != [] and len(colors) != len(bins)+1: - text = "**Attention**\nnumber of LCS length bins (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - elif gff_legend and len(bins) != len(colors): - text = "**Attention**\nnumber of GFF Feature Types (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - - # set alpha values to opaque if none are provided - if alphas == []: - for item in colors: - alphas.append(1) - - # legend data points - data_points = range(len(colors)) - if not gff_legend: - - # specify intervals, if max_lcs_len provided - if max_lcs_len != None: - multi_factor = 100 # one digit - if max_lcs_len <= 1: - multi_factor = 1000 # two digits - # len_interval_size = (max_lcs_len-min_lcs_len) * multi_factor *1. // lcs_shading_num * (1./ multi_factor) - len_interval_size = (max_lcs_len-min_lcs_len) * 1. 
/ lcs_shading_num - len_pos = [float("%.2f" % (min_lcs_len))] - # calculate interval positions - for idx in range(lcs_shading_num): - len_pos.append(float("%.2f" % (len_pos[-1] + len_interval_size))) - - if prefix.startswith("custom-matrix") and (0 <= max_lcs_len <= 100 and 0 <= min_lcs_len <= 100): - unit = "%" - elif prefix.startswith("custom-matrix"): - unit = "" - - text = "\n%d Legend intervals from %.2f to %.2f: \n\t%s - number: %d, step: %.2f, unit: %s\n" % (lcs_shading_num+1, min_lcs_len, max_lcs_len, str(len_pos), len(len_pos), len_interval_size, unit) - logprint(text, start=False, printing=True) - pos = len_pos - interval_size = len_interval_size - else: - # generate legend labels acc. to standard interval notation - interval_size = 100 // lcs_shading_num - pos = range(interval_size, 101+interval_size, interval_size) - - if bins != []: # labels provided - legend_labels = bins[:] - legend_labels.append("max") - legend_labels_lengths = [] - for item in bins: - legend_labels_lengths.append("[%d %s, %d %s)" % (item - min(bins), unit, item, unit)) - if len(bins) == len(colors) - 1: - legend_labels_lengths.append("[%d %s, %s]" % (max(bins), unit, u"\u221E")) # infinite - - else: - legend_labels = [] - legend_labels_lengths = [] - for idx in range(len(pos)): - num = pos[idx] - legend_labels.append("[%d%%, %d%%)" % (num - interval_size, num)) - if max_lcs_len != None: - num = len_pos[idx] - # as int or float - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths.append("[%d %s, %d %s)" % (num, unit, num + len_interval_size, unit)) - else: - legend_labels_lengths.append("[%.2f %s, %.2f %s)" % (num, unit, num + len_interval_size, unit)) - legend_labels[-1] = "100" + unit - if max_lcs_len != None: - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths[-1] = "%d %s" % (max_lcs_len, unit) - else: - legend_labels_lengths[-1] = "%.2f %s" % (max_lcs_len, unit) - - # set labels and choose file 
name - if gff_legend: - label_text = bins[:] - edge_col = None - legend_file_name = "GFF_Shading_Legend_n%d." % lcs_shading_num + filetype - elif max_lcs_len != None: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_max%d%s_n%d." % (max_lcs_len, unit, lcs_shading_num) + filetype - elif bins != []: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%d%s_n%d." % (bins[0], unit, lcs_shading_num) + filetype - else: - label_text = legend_labels[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%%len_n%d." % lcs_shading_num + filetype - - if prefix != None and prefix != "": - if not prefix.endswith("-"): - prefix = prefix + "-" - legend_type = "LCS" - if prefix.startswith("custom-matrix"): - prefix = prefix.replace("custom-matrix", "")[1:] - legend_type = "CustomMatrix" - legend_file_name = prefix + legend_file_name.replace("LCS", legend_type) - - # plot legend figure - fig, ax = P.subplots(3, 1, figsize=(len(colors)*2, len(colors)*2)) - for idx in range(len(colors)): - ax[0].bar(data_points[idx]+1, data_points[idx]+1, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[2].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].set_ylim(0,1) - ax[2].set_ylim(0,1) - ax[1].legend(ncol=((len(colors)-1)//max_legend_length_row)+1, framealpha=1) # vertical legend - col_num = len(colors) - if len(colors) > max_legend_length_col: - remainder = 0 - if len(colors) % max_legend_length_col != 0: - remainder = 1 - row_num = len(colors) // max_legend_length_col + remainder - remainder = 0 - if len(colors) % row_num != 0: - remainder = 1 - col_num = len(colors) // row_num + remainder - ax[2].legend(ncol=col_num, 
framealpha=1) # horizontal legend - - P.savefig(legend_file_name) - - return legend_file_name - - -############################### -# Analysis Functions # -############################### - -def wobble_replacement(sequence, general_ambiguity_code, verbose=False): - """ - get all degenerated sequences for sequence with ambiguous residues - (only residues considered that are keys in wobble_dictionary) - """ - - # get positions of ambiguous residues - wobble_pos = [] - for idx in range(len(sequence)): - letter = sequence[idx] - if letter in general_ambiguity_code.keys(): - wobble_pos.append(idx) - - if verbose: - text = "\t%d wobbles" % len(wobble_pos) - logprint(text, start=False, printing=True) - - # replace one wobble through each iteration by all possible residues - # repeat if still wobbles in new kmers - kmer_variants = [sequence] - while True: - if verbose: - text = "\t\t%d kmer variants" % len(kmer_variants) - logprint(text, start=False, printing=True) - temp_kmers = set([]) - for kmer in kmer_variants: - for idx in wobble_pos: - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - for base in general_ambiguity_code[kmer[idx]]: - newkmer = kmer[:idx] + base + kmer[idx+1:] - temp_kmers.add(newkmer) - wobble = False - for kmer in temp_kmers: - for idx in range(len(kmer)): - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - wobble = True - break - if wobble: - break - kmer_variants = set(list(temp_kmers)[:]) - if not wobble: - break - - return kmer_variants - -def split_diagonals(data, stepsize=1): - """ - split array if point difference exceeds stepsize - data = sorted list of numbers - """ - return np.split(data, np.where(np.diff(data) != stepsize)[0]+1) - -def longest_common_substring(s1, s2): - m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))] - longest, x_longest = 0, 0 - for x in xrange(1, 1 + len(s1)): - for y in xrange(1, 1 + len(s2)): - if s1[x - 1] == s2[y - 1]: - m[x][y] = m[x - 1][y - 1] + 1 - if m[x][y] > longest: 
- longest = m[x][y] - x_longest = x - else: - m[x][y] = 0 - return longest - -def lcs_from_x_values(x_values): - """ - calculate length of longest common substring based on nested list of numbers - """ - if len(x_values) == 0: - return 0 - # get lengths of each subarray data - lengths = np.array([len(i) for i in x_values]) - return max(lengths) - - -############################### -# Matching Functions # -############################### - -def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - - # forward - ################################# - kmer_pos_dict_one = {}; kmer_pos_dict_two = {} # dictionaries for both sequences - - # reverse complement - ################################# - kmer_pos_dict_three = {}; kmer_pos_dict_four = {} # dictionaries for both sequences - - # create dictionaries with kmers (wordsize) and there position(s) in the sequence - if rc_option: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two), - (str(seq_one), kmer_pos_dict_three), - (str(seq_two.reverse_complement()), kmer_pos_dict_four)] - else: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two)] - for (seq, kmer_pos_dict) in data_list: - for i in range(len(seq)-wordsize+1): - kmer = seq[i:i+wordsize] - # discard kmer, if too many Ns 
included - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - if not convert_wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - wobbles = False - for item in general_ambiguity_code.keys(): - if item in kmer: - wobbles = True - break - if not wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - kmer_variants = wobble_replacement(kmer, general_ambiguity_code) - for new_kmer in kmer_variants: - # print "\t", new_kmer - try: - kmer_pos_dict[new_kmer].append(i) - except KeyError: - kmer_pos_dict[new_kmer] = [i] - - # find kmers shared between both sequences - matches_for = set(kmer_pos_dict_one).intersection(kmer_pos_dict_two) # forward - matches_rc = set(kmer_pos_dict_three).intersection(kmer_pos_dict_four) # reverse complement - - if verbose: - text = "[matches: %i for; %.i rc]" % (len(matches_for), len(matches_rc)) - logprint(text, start=False, printing=True) - - # create lists of x and y co-ordinates for scatter plot - # keep all coordinates of all shared kmers (may match multiple times) - diag_dict_for = {} - diag_dict_rc = {} - for (match_list, pos_dict1, pos_dict2, diag_dict) in [(matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for), - (matches_rc, kmer_pos_dict_three, kmer_pos_dict_four, diag_dict_rc)]: - for kmer in match_list: - for i in pos_dict1[kmer]: - for j in pos_dict2[kmer]: - diag = i-j - points = set(range(i+1, i+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if rc_option: - for diag in 
diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - -def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize via regular expression search - fuzzy matching - allow up to substitution_count substitutions - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - ambiguity_match_dict = alphabets(type_nuc)[3] - - ambiq_residues = "[%s]" % "".join(general_ambiguity_code.keys()) - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # check for wobble presence - if not (regex.search(ambiq_residues, str(seq_one)) == None and regex.search(ambiq_residues, str(seq_two)) == None): - wobble_found = True - else: - wobble_found = False - - # dictionary for matches - diag_dict_for = {} - diag_dict_rc = {} - counter = [0, 0] - - # one-way matching - if rc_option: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0), - (str(seq_one), str(seq_two.reverse_complement()), 
diag_dict_rc, 1)] - else: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0)] - - for seq_query, seq_target, diag_dict, counter_pos in data_list: - # split query sequence into kmers - if not rc_option and counter_pos == 1: - break - - for idx in range(len(str(seq_query))-wordsize+1): - kmer = str(seq_query)[idx:idx+wordsize] - - # skip excessive N/X stretches (big black areas) - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - # convert kmer to regular expression for wobble_matching - if convert_wobbles and wobble_found: - kmer_string = "" - # replace each residue with matching residues or wobbles - for jdx in range(len(kmer)): - kmer_string += ambiguity_match_dict[kmer[jdx]] - else: - kmer_string = kmer - - # convert to regular expression tolerating substitution errors - if type(substitution_count) == int and substitution_count != 0: - kmer_string = "(%s){s<=%d}" % (kmer_string, substitution_count) - - # search for regular expression in target sequence - kdx = 0 - start = True - if regex.search(kmer_string, seq_target[kdx:]) != None: - counter[counter_pos] += 1 - while regex.search(kmer_string, seq_target[kdx:]) != None: - # search for regular expression pattern in target sequence - result = regex.search(kmer_string, seq_target[kdx:]) - - kmer2 = seq_target[kdx:][result.start():result.end()] - - # skip excessive N/X stretches (big black areas) - if kmer2.count(any_residue)*100./wordsize <= max_N_percentage: - diag = idx-(kdx+result.start()) - points = set(range(idx+1, idx+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - kdx += result.start() + 1 - if kdx >= len(seq_target): - break - elif regex.search(kmer_string, seq_target[kdx:]) != None: - counter[counter_pos] += 1 - - if verbose: - text = "%5.i \tforward matches" % counter[0] - text += "\n%5.i \treverse complementary matches" % counter[1] - logprint(text, start=False, printing=True) - - # convert coordinate points 
to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - - -############################### -# Dot Plot Functions # -############################### - -def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, title_length=float("Inf"), title_clip_pos="B"): - """ - self-against-self dotplot - partially from biopython cookbook - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least one input sequence - if len(sequences) == 0: - text = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - text += " No sequences provided for selfdotplot!\n\nTerminating polydotplot!" 
- logprint(text, start=False, printing=True) - return - elif len(sequences) == 1 and multi: - text = "\n\nCreating collage output for single selfdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nSelfdotplot Collage: Invalid collage - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences): - ncols = len(sequences) - nrows = 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - if prefix != None and prefix != "": - legend_prefix = prefix + "-Selfdotplot" - else: legend_prefix = "Selfdotplot" - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=legend_prefix, filetype=filetype, verbose=verbose) - - global t1 - - print "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-"), - log_txt = "\n%s\n\nCreating %s selfdotplot 
images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - - # preparations for file name - name_graph = "Selfdotplots" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - list_of_png_names = [] - - counter = 0 - for seq_name in sequences: - print seq_name, - log_txt += " " + seq_name - - counter += 1 - if not multi: - P.cla() # clear any prior graph - - # read sequence - seq_record = seq_dict[seq_name] - name_seq = seq_record.id - seq_one = seq_record.seq.upper() - length_seq = len(seq_one) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex(seq_one, seq_one, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG", - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag(seq_one, seq_one, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - 
stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlim(0, length_seq+1) - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, fontweight='bold') - # P.title(unicode_name(name_seq), fontsize=label_size*1.3, fontweight='bold') - - # save figure and reinitiate if page is full - if counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' % (prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - else: # not multi - - fig = P.figure(figsize=(plot_size, plot_size)) # figure size needs to be a square - ax = P.subplot(1, 1, 1) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - number = 0 - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlim(0, length_seq+1) - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', 
labelsize=label_size*.9) - P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size*1.3, fontweight='bold') - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_%s_wordsize%i%s.%s' %(prefix, name_graph, counter, shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos), wordsize, suffix, filetype) - P.savefig(fig_name, bbox_inches='tight') - - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - if multi and counter >= 1: - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - print "\n\nDrawing selfdotplots done" - log_txt += "\n\nDrawing selfdotplots done" - logprint(log_txt, start=False, printing=False) - - return list_of_png_names - -def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, length_scaling=True, scale_delim_col="red", title_length=float("Inf"), title_clip_pos="B"): - """ - pairwise dotplot (all-against-all) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least two input sequences - if len(sequences) < 2: - text = 
"\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += " Please provide at least two sequences for pairdotplot!\n\nTerminating paired dotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 2 and multi: - text = "\n\nCreating collage output for single pairdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nPairdotplot Collage: Invalid collage settings - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences)*(len(sequences)-1): - ncols = len(sequences) - nrows = 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences)*(len(sequences)-1): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - - text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += ", ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - y_label_rotation = "vertical" - - # preparations for file name - name_graph = "Pairdotplot" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if 
convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if length_scaling: - suffix += "_scaled" - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - list_of_png_names = [] - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - - # prepare LCS data file - lcs_data_file = open("%sPairdotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - counter, seq_counter = 0, 0 - print "Drawing pairwise dotplot...", - log_txt = "Drawing pairwise dotplot..." - if verbose: - seq_text = "" - for idx in range(len(sequences)-1): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx+1, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += " " + str(seq_counter) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - - # write LCS data file - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - else: - # calculate figure size for separate figures - if len_one >= len_two: - sizing = (plot_size, max(2, (plot_size)*len_two*1./len_one)) - # sizing = (plot_size, min(plot_size, max(2, (plot_size-2)*len_two*1./len_one+2))) - else: - sizing = (max(2, (plot_size)*len_one*1./len_two), plot_size) - # sizing = (min(plot_size, max(2, (plot_size-2)*len_one*1./len_two+2)), plot_size) - fig = P.figure(figsize=(plot_size, plot_size)) - - ax = P.subplot(1, 1, 1) - - # collect lines - 
lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlabel(unicode_name(shorten_name(name_one, max_len=title_length, title_clip_pos=title_clip_pos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.ylabel(unicode_name(shorten_name(name_two, max_len=title_length, title_clip_pos=title_clip_pos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - if not multi: - if length_scaling: - ax.set_aspect(aspect='equal', adjustable='box', anchor='NW') - P.xlim(0, len_one+1) - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - elif not length_scaling: - P.xlim(0, len_one+1) - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - else: - max_len = max(len_one, len_two) - P.xlim(0, max_len+1) - P.ylim(max_len+1, 0) # rotate y axis (point downwards) - - # plot line deliminating shorter sequence - if max_len != len_one: - ax.plot((len_one+1, len_one+1), (0, len_two), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - if max_len != len_two: - ax.plot((0, len_one), (len_two+1, len_two+1), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - - # evtl. 
switch x axis position - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - P.setp(ax.get_xticklabels(), fontsize=label_size*.9) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) - - # save figure and reinitiate if page is full - if multi and counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=.5, wspace=.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=.5, wspace=.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - elif not multi: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, bottom=0.05) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02) # space between rows - def 0.4 - - # name and create output files - fig_name = '%s%s-%d_wordsize%i%s.%s' % (prefix, name_graph, counter, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - list_of_png_names.append(fig_name) - fig = P.figure() - - # save figure - if multi and counter >= 1: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=0.5, wspace=0.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.5, wspace=0.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - print - logprint(seq_text, start=False, printing=False) - - return list_of_png_names - -def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, max_N_percentage=49, verbose=False, lcs_shading=True, lcs_shading_ref=0, lcs_shading_interval_len=100, 
lcs_shading_ori=0, lcs_shading_num=5, spacing=0.04, input_user_matrix_file="", user_matrix_print=True, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, title_length=float("Inf"), title_clip_pos="B", rotate_labels=False): - """ - all-against-all dotplot - derived from dotplot function - - lcs_shading_refs: - 0 color relative to maximum lcs observed in dataset [default] - 1 color by coverage of shorter sequence (e.g. lcs = 70% of seq1) - lcs_shading_ori - 0 forward only - 1 reverse only - 2 both orientations (in opposite plot) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - if len(sequences) == 0: - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " No sequences provided for polydotplot!\n\nTerminating polydotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 1: - text = "\n\nCreating polydotplot for single sequence!" 
- text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n\n" - logprint(text, start=False, printing=True) - - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " " + " ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - if prefix != None and prefix != "": - legend_prefix = prefix + "-Polydotplot" - else: legend_prefix = "Polydotplot" - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=legend_prefix, filetype=filetype, verbose=verbose) - - if lcs_shading and not type_nuc: - if lcs_shading_ori != 0: - lcs_shading_ori = 0 - text = "Protein shading does not support reverse complementary matching!\n" - logprint(text, start=False, printing=True) - - # read custom shading matrix & match names of sequences to fasta - if input_user_matrix_file != "" and input_user_matrix_file != None: - logprint("Reading user matrix file: %s" % input_user_matrix_file) - # lcs_shading_ori = 2 - custom_dict = read_matrix(input_user_matrix_file) - if custom_dict != {}: - custom_shading = True - custom_similarity_dict = {} - invalid_entries = [] - custom_max = 0 - custom_min = float("Inf") - for key in custom_dict.keys(): - number_key = [] - - # convert number into float - try: - value = float(custom_dict[key]) - if not "." 
in custom_dict[key]: - value = int(custom_dict[key]) - custom_max = max(custom_max, value) - custom_min = min(custom_min, value) - except: - value = custom_dict[key] - if value == "": - value = None - invalid_entries.append(key) - # match matrix names with sequence names - for item in key: - if item in sequences: - number_key.append(sequences.index(item)) - else: - number_key.append(-1) - # dictionary with tuple of sorted sequence indices as key and number as value - custom_similarity_dict[tuple(sorted(number_key))] = value - if len(invalid_entries) != 0: - text = "No valid number in custom similarity matrix for %d entries: \n\t" % (len(invalid_entries)) - for key in invalid_entries: - text += str(key) + " - " + str(custom_dict[key]) + "; " - logprint(text[:-2]+"\n") - - text = "Custom user matrix given: min %.2f, max %.2f\n" % (custom_min, custom_max) - - # artificially rounding intervals if likely identity/divergence percentages - if 0 <= custom_min < 1 and 0 < custom_max <= 1: - rounding_factor = 5 - multi_factor = 100 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (multi_factor*custom_min // rounding_factor) * (1.*rounding_factor/multi_factor)) - custom_max = min((multi_factor*custom_max // rounding_factor) * (1.*rounding_factor/multi_factor), 1) - text += "new (%.2f, %2f)\n" % (custom_min, custom_max) - - elif 0 <= custom_min < 100 and 0 < custom_max <= 100: - rounding_factor = 5 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (custom_min // rounding_factor) * rounding_factor) - custom_max = min((custom_max // rounding_factor) * rounding_factor, 100) - text += "new (%d, %d)\n" % (custom_min, custom_max) - - logprint(text) - - else: - custom_shading = False - - name_graph = "Polydotplot" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % 
substitution_count - if custom_shading: - suffix += "_matrix" - if lcs_shading: - suffix += "_%dshades_ref%d_ori%s" % (lcs_shading_num+1, lcs_shading_ref, lcs_shading_ori) - if "ref2" in suffix and type_nuc: - suffix = suffix.replace("ref2", "%dbp" % lcs_shading_interval_len) - elif "ref2" in suffix: - suffix = suffix.replace("ref2", "%daa" % lcs_shading_interval_len) - - - # name and create output files (names derived from SEQNAME) - if prefix != None and str(prefix) != "": - prefix = str(prefix) + "-" - else: - prefix = "" - - # preparations for background shading - if lcs_shading or custom_shading: - # create color range white to grey - colors = create_color_list(lcs_shading_num+1, color_map=None, logging=True) - colors_2 = create_color_list(lcs_shading_num+1, color_map="OrRd", logging=True) - - if custom_shading: - text = "Custom Matrix Colors: " + ", ".join(colors_2) - - # write lcs lengths to file - lcs_data_file = open("%sPolydotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - # compare sequences pairwise - save lcs and line information in dictionary for plotting - data_dict = {} # keys = tuple(idx, jdx), value = x1, y1, x2, y2 (line positions) - lcs_dict = {} # keys = tuple(idx, jdx), value = length of lcs: lcs_len or (lcs_for, lcs_rev) - for_lcs_set = set([]) # keep lengths to calculate max (excluding self comparisons) - rev_lcs_set = set([]) # keep lengths to calculate max (all) - - text = "\nTotal plot count: %d" % (len(sequences)*(len(sequences))) - text += "\nTotal calculations: %d" % (len(sequences)*(len(sequences)+1)/2) - logprint(text, start=False, printing=True) - - print "\nCalculating shared regions and lengths of longest_common_substring...", - log_txt = "\nCalculating shared regions and lengths of longest_common_substring..." 
- # determine matches and length of lcs by comparing all sequence pairs - if verbose: - seq_text = "" - counter = 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif len(sequences) < 5: - print "\t%s (%d %s), %s (%d %s)" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - log_txt += "\t%s (%d %s), %s (%d %s)\n" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - else: - if not counter % 25: - print counter, - log_txt += str(counter) - - # get positions of matches & length of longest common substring based on match lengths - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - data_dict[(idx, jdx)] = x1[:], y1[:], x2[:], y2[:] - lcs_dict[idx, jdx] = lcs_for, lcs_rev - - if idx != jdx: - for_lcs_set.add(lcs_for) - rev_lcs_set.add(lcs_rev) - - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - if not verbose: - print 
len(sequences)*(len(sequences)+1)/2, " done\n" - log_txt += str(len(sequences)*(len(sequences)+1)/2) + " done\n" - else: - print "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - log_txt += "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - logprint(log_txt, start=False, printing=False) - - if verbose: - logprint ("\n\nlcs_dict\n" + str(lcs_dict)) - if custom_shading: - logprint ("\ncustom_dict\n" + str(custom_dict)) - logprint ("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) - - if verbose: - print - logprint(seq_text+"\n", start=False, printing=False) - - if lcs_shading_ref == 2: - color_bins = [] - text = "\nLCS lengh bins: " - for idx in range(lcs_shading_num): - color_bins.append(lcs_shading_interval_len*(idx+1)) - text += " " + str(lcs_shading_interval_len*(idx+1)) - logprint(text, start=False, printing=True) - - # calculate maximum lcs length - if lcs_shading_ori == 0: # forward only - if len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - elif lcs_shading_ori == 1: # reverse complement only - if len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - else: - max_lcs = None - else: # both orientations - if len(rev_lcs_set) != 0 and len(for_lcs_set) != 0: - max_lcs = max(max(rev_lcs_set), max(for_lcs_set)) - elif len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - elif len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - - if not max_lcs == None: - text = "Maximum LCS: %d %s" % (max_lcs, aa_bp_unit) - logprint(text, start=False, printing=True) - if custom_shading: - text = "Maximum custom value: %d\n" % custom_max - logprint(text, start=False, printing=True) - - # count sequences - ncols = len(sequences); nrows = len(sequences) - - # get sequence lengths to scale plot widths and heights accordingly - size_ratios = [] - for item in sequences: - size_ratios.append(len(seq_dict[item].seq)) - - P.cla() # clear any prior graph - # use GridSpec to resize plots according to sequence length - 
gs = gridspec.GridSpec(nrows, ncols, - width_ratios=size_ratios, - height_ratios=size_ratios) - fig = P.figure(figsize=(plot_size, plot_size)) - - # determine label orientations - if len(sequences) > 5 or rotate_labels: - x_label_rotation = 45 - y_label_rotation = "horizontal" - if x_label_pos_top: - xhalign = 'left' - xvalign = 'bottom' - else: - xhalign = 'right' - xvalign = 'top' - yhalign = "right" - else: - x_label_rotation = "horizontal" - y_label_rotation = "vertical" - xvalign = "center" - xhalign = "center" - yhalign = "center" - yvalign = 'center' - - print "\nDrawing polydotplot...", - log_txt = "\nDrawing polydotplot..." - - # draw subplots - if verbose: - if lcs_shading and custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "Custom matrix value", "Matrix color index", "LCS color index"]) + "\n" - elif lcs_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "LCS color index for", "LCS color index rev"]) + "\n" - elif custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "Custom matrix value", "Color index for", "Color index rev"]) + "\n" - - if verbose: - seq_text = "" - counter, seq_counter = 0, 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - len_two = len(rec_two.seq) - name_two = rec_two.id - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - len_one = len(rec_one.seq) - name_one = rec_one.id - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - # optional shade background according to length of LCS and/or user matrix - ######################################################################### - - # get interval based on LCS - background_colors = [None, None] - if lcs_shading and (lcs_shading_ref==1 or lcs_shading_ref==2 or max_lcs!=None): # self plot max_lcs_for == None - lcs_len = lcs_dict[(idx, jdx)] - l1 = lcs_len[0] # forward - l2 = lcs_len[1] # reverse complement - - lcs_shading_bool = True - - # calculate shading acc. to chosen option - if lcs_shading_ref == 1: # percentage of shorter sequence - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // min(len_one, len_two)) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // min(len_one, len_two)) - elif lcs_shading_ref == 2: # by given interval size - color_idx0 = min(len(colors)-1, l1 // lcs_shading_interval_len) - color_idx1 = min(len(colors)-1, l2 // lcs_shading_interval_len) - if color_idx0 >= len(colors): - color_idx0 = len(colors) - if color_idx1 >= len(colors): - color_idx1 = len(colors) - else: # percentage of maximum lcs length - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // max_lcs) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // max_lcs) - else: - lcs_shading_bool = False - - # get interval based on custom matrix - if custom_shading: - # matrix value - try: - custom_value = custom_similarity_dict[(idx, jdx)] - except: - custom_value = "" - - # bottom left triangle = LCS forward/reverse or best of both - if lcs_shading_bool: - if lcs_shading_ori == 0: # forward - color_idx1 = 
color_idx0 - elif lcs_shading_ori == 2: # both directions - color_idx1 = max(color_idx0, color_idx1) - - # top right triangle = custom value (not colored if text matrix provided) - if type(custom_value) == int or type(custom_value) == float: - color_idx0 = int((custom_value-custom_min)*lcs_shading_num // (custom_max-custom_min)) - # if string is proviced - else: - color_idx0 = 0 - - # set colors dependent on lcs dependent on orientation - if lcs_shading_bool and not custom_shading: - if idx != jdx: - if lcs_shading_ori == 0: - color_idx1 = color_idx0 - elif lcs_shading_ori == 1: - color_idx0 = color_idx1 - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx1] - # for selfcomparison, only color reverse complement - elif lcs_shading_ori != 0 and not custom_shading: - background_colors[0] = colors[color_idx1] - # set different colors for shading by LCS + user matrix - elif lcs_shading_bool and custom_shading: - # print colors, background_colors, color_idx0, color_idx1 - background_colors[0] = colors_2[color_idx0] - background_colors[1] = colors[color_idx1] - # set grey color range for user matrix if no LCS shading - elif custom_shading: - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx0] - - if verbose: - if custom_shading and lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - elif lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(color_idx0), str(color_idx1)]) + "\n" - elif custom_shading: - lcs_text += "\t".join([name_one, name_two, str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - - # diagonal (self-dotplots) - if idx == jdx: - # skip positions below diagonal - counter = counter + (counter - 1) // (nrows) # + row_pos - counters = [counter] - # draw both graphs at once (due to symmetry) - else: - col_pos = (counter - 1) % ncols - 
row_pos = (counter - 1) // (nrows) - counter2 = col_pos * ncols + row_pos + 1 - counters = [counter, counter2] - - if len(counters) == 2: - seq_counter += 1 - if not verbose and not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - x_lists, y_lists, x_lists_rc, y_lists_rc = data_dict[(idx, jdx)] - - # plot diagram(s) - for kdx in range(len(counters)): - - fig_pos = counters[kdx] - # plotting subplot with matplotlib - ax = P.subplot(gs[fig_pos-1]) # rows, columns, plotnumber - - # shade annotated regions if gff file(s) provided - if idx == jdx and gff_files != None and gff_files != []: - if name_one in feat_dict.keys(): - features = feat_dict[name_one] - if len_two != len_one: - logprint("Polydot GFF shading for diagonal fields - nequal length error!") - return - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(len_one+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # if custom matrix value printed into upper matrix triangle, skip data plotting - # text print in top triangle - if user_matrix_print and custom_shading and kdx==0 and idx!=jdx: - data_plotting = False - # dotplot in bottom triangle - else: - data_plotting = True - - # mirror plot, if plotting below diagonal - if kdx == 0: - l1, l2 = len_one, len_two - n1, n2 = name_one, name_two - x1, y1 = x_lists, y_lists - x2, y2 = x_lists_rc, y_lists_rc - else: - l2, l1 = len_one, len_two - n2, n1 = name_one, name_two - x1, y1 = y_lists, x_lists - x2, y2 = y_lists_rc, x_lists_rc - - if data_plotting: - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - 
lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # plot value provided by customer instead of dotplot - else: - alignment = {'horizontalalignment': 'center', 'verticalalignment': 'center'} - # P.text(0.5, 0.5, custom_value, size='medium', transform=ax.transAxes, **alignment) - P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, **alignment) - # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, - # horizontalalignment='center', verticalalignment='center', color="black") - - if custom_shading: - # omit diagonal - if idx == jdx: - ax.set_facecolor("white") - # use white background for text fields (top right triangle only [kdx 0]) - elif type(custom_value) != int and type(custom_value) != float and kdx == 0: - ax.set_facecolor("white") - else: - ax.set_facecolor(background_colors[kdx]) - # set background color if lcs shading - elif lcs_shading_bool and background_colors[kdx] != None: - ax.set_facecolor(background_colors[kdx]) - - # set axis limits - P.xlim(0, l1+1) - P.ylim(l2+1, 0) # rotate y axis (point downwards) - - # determine axis positions - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - x_label_bool = fig_pos <= ncols - x_tick_bool = fig_pos > ncols*(ncols-1) - else: - x_label_bool = fig_pos > ncols*(ncols-1) - x_tick_bool = fig_pos <= ncols - - # x axis labels dependent on plot position/number - if x_label_bool: # x title and labels on top or bottom - P.xlabel(unicode_name(shorten_name(n1, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, rotation=x_label_rotation, verticalalignment=xvalign, horizontalalignment=xhalign, fontweight='bold', labelpad=8) # axis naming - if not x_label_rotation in ["horizontal", "vertical"]: - 
P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation="vertical") - else: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation=x_label_rotation) - elif x_tick_bool and x_label_pos_top: # x ticks on bottom row - ax.xaxis.tick_bottom() # ticks without labels on bottom - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) - elif x_tick_bool: # x ticks on top row - ax.xaxis.tick_top() # # ticks without labels on top - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) # inner diagrams without labelling - else: # no x ticks on internal rows - ax.axes.get_xaxis().set_visible(False) - - # y axis labels dependent on plot position/number - if fig_pos % ncols == 1 or (ncols == 1 and nrows == 1): # y title and labels in 1st column - P.ylabel(unicode_name(shorten_name(n2, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, rotation=y_label_rotation, verticalalignment=yvalign, horizontalalignment=yhalign, fontweight='bold', labelpad=8) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) # axis naming - elif fig_pos % ncols == 0: # y ticks in last column - ax.yaxis.tick_right() - P.setp(ax.get_yticklabels(), visible=False) # inner diagrams without labelling - else: - ax.axes.get_yaxis().set_visible(False) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - try: - logprint(lcs_text, start=False, printing=True) - except: - pass - - # finalize layout - margins & spacing between plots - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - # gs.tight_layout(fig, h_pad=.02, w_pad=.02) # less overlapping tick labels, but also disturbingly large spacing - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, top=0.87) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, bottom=0.13) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing) # space between rows - def 0.4 - - # save figure and close instance - fig_name = '%s%s_wordsize%i%s.%s' % (prefix, name_graph, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - - # create figure color legend - if lcs_shading: - if lcs_shading_ref == 1: # percentage of shorter sequence - legend_file_name = legend_figure(colors, lcs_shading_num, unit="%", filetype=filetype, prefix=prefix) - elif lcs_shading_ref == 2: # interval sizes - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, bins=color_bins) - else: # relative of maximum lcs - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, max_lcs_len=max_lcs) - - if custom_shading: - custom_prefix = "custom-matrix-" + prefix - legend_file_name_custom = legend_figure(colors_2, lcs_shading_num, unit="%", filetype=filetype, prefix=custom_prefix, max_lcs_len=custom_max, min_lcs_len=custom_min) - - if lcs_shading and custom_shading: - return [fig_name, legend_file_name, legend_file_name_custom] - elif lcs_shading: - return [fig_name, legend_file_name] - elif custom_shading: - return [fig_name, legend_file_name_custom] - else: - return [fig_name] - - -############################### -# Function Call # -############################### - -def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_size=10, filetype="png", type_nuc=True, convert_wobbles=False, 
substitution_count=0, rc_option=True, alphabetic_sorting=False, gff=None, multi=True, ncols=1, nrows=1, lcs_shading=True, lcs_shading_num=5, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, gff_color_config_file="", input_user_matrix_file="", user_matrix_print=False, length_scaling=True, title_length=50, title_clip_pos="B", spacing=0.04, max_N_percentage=49, verbose=False): - - global t1, line_col_rev - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage = 49 - if type_nuc: - ambiq_res = "N" - else: - ambiq_res = "X" - text = "Provide valid max_N_percentage, kmers with >50%% %ss are ignored\n" % (ambiq_res) - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given:%s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - # read gff color config file if provided - if len(input_gff_files) != 0 and input_gff_files != None: - if gff_color_config_file not in ["", None]: - text = "\n%s\n\nReading GFF color configuration file\n%s\n\n=> %s\n" % (50*"=", 28*"-", gff_color_config_file) - logprint(text, start=False, printing=True) - gff_feat_colors = read_gff_color_config(gff_color_config_file) - else: - gff_feat_colors = {} - if gff_color_config_file not in ["", None]: - text = "Please provide GFF annotation files to use configuration file", gff_color_config_file - logprint(text, start=False, printing=True) - - # if color is set to white, reverse complementary matches are skipped - if not rc_option: - line_col_rev = "white" # reverse matches not calculated - elif not type_nuc: - logprint("Reverse complement deactivated for proteins!") - line_col_rev = "white" # reverse matches not calculated - - mode_text = [] - for item in modes: - mode_text.append(str(item)) - text = "%s\n\nRunning plotting modes %s" % (50*"=", ", ".join(mode_text)) - logprint(text, start=False, printing=True) - - - # create 
dotplots - ########################################## - - # self dotplots - t1 = time.time() - if 0 in modes: - list_of_png_names = selfdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, gff_files=gff, gff_color_dict=gff_feat_colors, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # paired dotplots - if 1 in modes: - if multi: - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=length_scaling, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - else: - if not length_scaling: - text = "\nPairwise dotplot with individual output files scaled by sequence length automatically!" 
- logprint(text, start=False, printing=True) - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=True, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # all-against-all dotplot - if 2 in modes: - list_of_png_names = polydotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, spacing=spacing, gff_files=gff, gff_color_dict=gff_feat_colors, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - text = "\n" + 50 * "#" + "\n" + 50 * "#" - text += "\n\nThank you for using FlexiDot!\n" - logprint(text, start=False, printing=True) - -# testing mode for debugging -trial_mode = False -# trial_mode = True - -# parameters = check_input(sys.argv) -parameters = check_input(sys.argv, 
trial_mode=trial_mode) - -# read out parameters -commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype, type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos_top, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, verbose = parameters - -# evtl. overwrite parameters for testing purposes in trial mode -if trial_mode: - input_fasta = ["test-sequences-8.fas"] - input_gff_files = ["Seq2_annotations.gff3"] - # input_user_matrix_file = "matrix.txt" - # user_matrix_print = True - output_file_prefix = "#GFF_poly" - plot_size = 10 - plotting_modes = [0,1,2] - plotting_modes = [2] - lcs_shading = False - lcs_shading = True - lcs_shading_ref = 2 - lcs_shading_num = 4 - lcs_shading_ori = 0 - lcs_shading_interval_len = 15 - wordsize = 10 - wordsize = 7 - x_label_pos_top = True - filetype = "pdf" - filetype = "png" - - wobble_conversion = False - wobble_conversion = True - - substitution_count = 0 - - rc_option = True - rc_option = False - label_size = 10 - - verbose = False - verbose = True - -if auto_fas: - path = os.path.dirname(os.path.abspath(__file__)) - files_long = glob.glob(path+"/*.fasta") - files_long.extend(glob.glob(path+"/*.fas")) - files_long.extend(glob.glob(path+"/*.fa")) - files_long.extend(glob.glob(path+"/*.fna")) - input_fasta = [] - for i in files_long: - if not "combined" in i: - filename = i[i.rfind('\\')+1:] - input_fasta.append(filename) - -if trial_mode: - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - -main(input_fasta, wordsize, modes=plotting_modes, prefix=output_file_prefix, plot_size=plot_size, label_size=label_size, filetype=filetype, 
type_nuc=type_nuc, convert_wobbles=wobble_conversion, substitution_count=substitution_count, rc_option=rc_option, gff=input_gff_files, multi=collage_output, ncols=m_col, nrows=n_row, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, gff_color_config_file=gff_color_config_file, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, length_scaling=length_scaling, title_length=title_length, title_clip_pos=title_clip_pos, spacing=spacing, max_N_percentage=max_N_percentage, verbose=verbose) - - diff --git a/code/flexidot_v1.04.py b/code/flexidot_v1.04.py deleted file mode 100644 index f5b3481..0000000 --- a/code/flexidot_v1.04.py +++ /dev/null @@ -1,3325 +0,0 @@ -#!/usr/bin/python2.7 -# -*- coding: utf-8 -*- - -""" -FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation - -Kathrin M. 
Seibt, Thomas Schmidt and Tony Heitkam -Institute of Botany, TU Dresden, Dresden, 01277, Germany - -Bioinformatics, 2018, doi 10.1093/bioinformatics/bty395 - -FlexiDot version1.04 (aka lab version 137) - -""" - - -############################### -# Requirements # -############################### - -# import system modules -import os, glob -import time, datetime -import sys -import shutil, getopt -import unicodedata - -def module_install_command(module_name, upgrade=False): - """ - create installation commands for Python modules and print information - """ - if upgrade: - load_command = "python -m pip install --upgrade %s" % module_name - else: - load_command = "python -m pip install %s" % module_name - - try: - logprint("Installing Python module: %s\n\t%s\n" % (module_name, load_command)) - except: - print "Installing Python module: %s\n\t%s\n" % (module_name, load_command) - - return load_command - -def load_modules(): - """ - load Python modules, if possible - otherwise try to install them - """ - - # make module names global - global cllct, gridspec, patches, rcParams, mplrc, P, Color, SeqIO, np, ccv, mcolors, rgb2hex, regex - - # matplotlib - try: - import matplotlib.collections as cllct - except: - command = module_install_command("matplotlib", upgrade=True) - try: - os.system(command) - print "\n" - import matplotlib.collections as cllct - except: - print "Please install module matplotlib manually" - from matplotlib.colors import colorConverter as ccv - import matplotlib.colors as mcolors - import matplotlib.gridspec as gridspec - import matplotlib.patches as patches - import pylab as P - - # specify matplotlib font settings - from matplotlib import rc as mplrc - mplrc('pdf', fonttype=42, compression=0) - from matplotlib import rcParams - rcParams['font.family'] = 'sans-serif' - rcParams['font.sans-serif'] = ['Helvetica', 'Verdana', 'Tahoma', ] - - # colour for color gradient palette - try: - from colour import Color - except: - command = 
module_install_command("colour") - try: - os.system(command) - print "\n" - from colour import Color - except: - print "Please install module colour manually" - - # color converter - try: - from colormap import rgb2hex - except: - command = module_install_command("colormap") - # additional module easydev.tools required by colormap - command2 = module_install_command("easydev") - try: - os.system(command) - os.system(command2) - print "\n" - from colormap import rgb2hex - except: - print "Please install module colormap manually" - - # biopython - try: - from Bio import SeqIO - except: - command = module_install_command("biopython") - try: - os.system(command) - print "\n" - from Bio import SeqIO - except: - print "Please install module biopython manually" - - # numpy - try: - import numpy as np - except: - command = module_install_command("numpy") - try: - os.system(command) - print "\n" - import numpy as np - except: - print "Please install module numpy manually" - - # regex for pattern matching - try: - import regex - except: - command = module_install_command("regex") - try: - os.system(command) - print "\n" - import regex - except: - print "Please install module regex manually" - -load_modules() - - -############################### -# Usage & Input # -############################### - -def usage(): - """ - usage and help - """ - - print """\n\n FLEXIDOT - ------------------------------------------------------------------- - - Version: - 1.04 - - Citation: - Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (2018) - "FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation" - Bioinformatics, doi: 10.1093/bioinformatics/bty395 - - - General usage: - $ python flexidot.py -a [ARGUMENTS] - $ python flexidot.py -i [ARGUMENTS] - - - ARGUMENTS - ------------------------------------------------------------------- - - - INPUT/OUTPUT OPTIONS... 
required are [-a] OR [-i] - - -a, --auto_fas Imports all fasta files from current directory (*.fasta, *.fas, *.fa, *.fna) - -i is not needed, if -a is activated - [inactive by default] - - -i, --in_file Input fasta file (fasta file name or comma-separated file list) - > Provide multiple files: Recall -i or provide comma-separated file names - - -o, --output_file_prefix File prefix to be added to the generated filenames [default = NONE] - - -c, --collage_output Multiple dotplots are combined in a collage - Y or 1 = ON [default] - N or 0 = OFF - - -m, --m_col Number of columns per page [default = 4] (only if --collage_output is ON) - - -n, --n_row Number of rows per page [default = 5] (only if --collage_output is ON) - - -f, --filetype Output file format - 0 = PNG [default] - 1 = PDF - 2 = SVG - - -s, --alphabetic_sorting Sort sequences alphabetically according to titles - Y or 1 = ON - N or 0 = OFF [default] - - - CALCULATION PARAMETERS... - - -k, --wordsize Wordsize (kmer length) for dotplot comparison [default = 7] - - -p, --plotting_mode Mode of FlexiDot dotplotting - 0 = self [default] - 1 = paired - 2 = poly (matrix with all-against-all dotplots) - > Run multiple plotting modes: Recall -p or provide comma-separated numbers - - -t, --type_nuc Type of residue is nucleotide - Y or 1 = nucleotide [default] - N or 0 = amino acid - - -w, --wobble_conversion Ambiguity handling for relaxed matching - Y or 1 = ON - N or 0 = OFF [default] - - -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching - [default = 0] - - -r, --rc_option Find reverse complementary matches (only if type_nuc=y) - Y or 1 = ON [default] - N or 0 = OFF - - - GRAPHIC FORMATTING... 
- - -A, --line_width Line width [default = 1] - - -B, --line_col_for Line color [default = black] - - -C, --line_col_rev Reverse line color [default = green] - - -D, --x_label_pos Position of the X-label - Y or 1 = top [default] - N or 0 = bottom - - -E, --label_size Font size [default = 10] - - -F, --spacing Spacing between all-against-all dotplots (only if --plotting_mode=2) - [default = 0.04] - - -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) - Y or 1 = Scaling ON (axes scaled according to sequence length) - N or 0 = Scaling OFF (squared plots) [default] - - -M, --mirror_y_axis Flip y-axis bottom to top (cartesian coordinate system) - Y or 1 = y-axis bottom to top - N or 0 = y-axis top to bottom [default] - - -P, --plot_size Plotsize [default = 10] - - -R, --representation Region of plot to display (only if --plotting_mode=2) - 0 = full [default] - 1 = upper - 2 = lower - - -T, --title_length Limit title length for dotplot comparisons - [default = 20] - Position of selection can be specified by appending a letter (e.g. -T 20E) - B = beginning [default] - E = end - - - GFF SHADING (for -p/--plotting_mode=0 only)... - - -g, --input_gff_files GFF3 file used for markup in self-dotplots - (provide multiple files: Recall -g or provide comma-separated file names) - - -G, --gff_color_config_file Tab-delimited config file for custom gff shading - column 1: feature type - column 2: color - column 3: alpha - column 4: zoom factor (for small regions) - - - LCS SHADING OPTIONS (for -p/--plotting_mode=2 only)... 
- - -x, --lcs_shading Shade subdotplot based on the length of the longest common substring (LCS) - Y or 1 = ON - N or 0 = OFF [default] - - -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) - [default = 5] - - -y, --lcs_shading_ref Reference for LCS shading - 0 = maximal LCS length [default] - 1 = maximally possible length (length of shorter sequence in pairwise comparison) - 2 = given interval sizes - DNA [default 100 bp] or proteins [default 10 aa] - see -Y - - -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) - [default for nucleotides = 50; default for amino acids = 10] - - -z, --lcs_shading_ori Shade subdotplots according to LCS on - 0 = forward [default], - 1 = reverse, or - 2 = both strands (forward shading above diagonal, reverse shading on diagonal and below; - if using --input_user_matrix_file, best LCS is used below diagonal) - - - CUSTOM USER MATRIX SHADING OPTIONS (for -p/--plotting_mode=2 only)... - - -u, --input_user_matrix_file Shading above diagonal according to values in matrix file specified by the user - (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n - e.g. identity matrix from multiple sequence alignment - strings are ignored) - - -U, --user_matrix_print Display provided matrix entries in the fields above diagonal of all-against-all dotplot - Y or 1 = ON - N or 0 = OFF [default] - - - OTHERS... 
- - -h, --help Help screen - - -v, --verbose Verbose - - - - - """ - -def check_input(argv, trial_mode=False): - """ - commandline argument parsing - """ - - global log_txt, aa_bp_unit - - # helpers for argument parsing - ###################################### - - arguments = ["-a", "--auto_fas", "a", "auto_fas", - "-i", "--input_fasta", "i:", "input_fasta=", - "-o", "--output_file_prefix", "o:", "output_file_prefix=", - "-c", "--collage_output", "c:", "collage_output=", - "-m", "--m_col", "m:", "m_col=", - "-n", "--n_row", "n:", "n_row=", - "-f", "--filetype", "f:", "filetype=", - "-t", "--type_nuc", "t:", "type_nuc=", - "-g", "--input_gff_files", "g:", "input_gff_files", - "-G", "--gff_color_config_file", "G:", "gff_color_config_file", - "-k", "--wordsize", "k:", "wordsize=", - "-p", "--plotting_mode", "p:", "plotting_mode=", - "-w", "--wobble_conversion", "w:", "wobble_conversion=", - "-S", "--substitution_count", "S:", "substitution_count=", - "-r", "--rc_option", "r:", "rc_option=", - "-s", "--alphabetic_sorting", "s:", "alphabetic_sorting=", - "-x", "--lcs_shading", "x:", "lcs_shading=", - "-X", "--lcs_shading_num", "X:", "lcs_shading_num=", - "-y", "--lcs_shading_ref", "y:", "lcs_shading_ref=", - "-Y", "--lcs_shading_interval_len", "Y:", "lcs_shading_interval_len=", - "-z", "--lcs_shading_ori", "z:", "lcs_shading_ori=", - "-u", "--input_user_matrix_file", "u:", "input_user_matrix_file=", - "-U", "--user_matrix_print", "U:", "user_matrix_print=", - "-P", "--plot_size", "P:", "plot_size=", - "-A", "--line_width", "A:", "line_width=", - "-B", "--line_col_for", "B:", "line_col_for=", - "-C", "--line_col_rev", "C:", "line_col_rev=", - "-D", "--x_label_pos", "D:", "x_label_pos=", - "-E", "--label_size", "E:", "label_size=", - "-F", "--spacing", "F:", "spacing=", - "-L", "--length_scaling", "L:", "length_scaling=", - "-M", "--mirror_y_axis", "M:", "mirror_y_axis=", - "-R", "--representation", "R:", "representation=", - "-T", "--title_length", "T:", "title_length=", 
- "-h", "--help", "h", "help", - "-v", "--verbose", "v", "verbose"] - - arguments_sysargv = tuple(arguments[0::4] + arguments[1::4]) - arguments_opts = "".join(arguments[2::4]) - arguments_args = arguments[3::4] - - - # setting defaults - ###################################### - - auto_fas = False # 0 - input_fasta = [] - output_file_prefix = None - collage_output = True # 1 - m_col = 4 - n_row = 5 - filetype = 0 - type_nuc = True - input_gff_files = [] - gff_color_config_file = "" - - wordsize = 7 - plotting_modes = [0] - wobble_conversion = False # 0 - substitution_count = 0 - rc_option = True # 1 - alphabetic_sorting = False # 0 - - lcs_shading = False # 0 - lcs_shading_num = 4 - lcs_shading_ref = 0 - lcs_shading_interval_len = 50 # interval default changes to "10" for amino acids [type_nuc = n] - lcs_shading_ori = 0 - - input_user_matrix_file = "" - user_matrix_print = False - - plot_size = 10 - line_width = 1 - line_col_for = "black" - line_col_rev = "#009243" - x_label_pos = True # 0 - label_size = 10 - spacing = 0.04 - length_scaling = False # 0 - title_length = 20 # float("Inf") - title_clip_pos = "B" # B (begin), E (end) - max_N_percentage = 49 # fixed value, no user input - mirror_y_axis = False - representation = 0 - - aa_bp_unit = "bp" - - verbose = False # 0 - - filetype_dict = {0: "png", 1: "pdf", 2: "svg"} - lcs_shading_ref_dict = {0: "maximal LCS length", 1: "maximally possible length", 2: "given interval sizes"} - plotting_mode_dict = {0: "self", 1: "paired", 2: "all-against-all"} - lcs_shading_ori_dict = {0: "forward", 1: "reverse complement", 2: "both"} - representation_dict = {0: "full", 1: "upper", 2: "lower"} - - # return default parameters for testing purposes - if trial_mode: - print "ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n" - - commandline = "trial_mode\n" - - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, 
wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose] - return parameters - - - # read arguments - ###################################### - - commandline = "" - for arg in sys.argv: - commandline += arg + " " - - log_txt = "\n...reading input arguments..." - print log_txt - - if len(sys.argv) < 2: - print "\nERROR: More arguments are needed. Exit..." - log_txt += "\nERROR: More arguments are needed. Exit..." - usage() - sys.exit() - - elif sys.argv[1] not in arguments_sysargv: - print "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - log_txt += "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - # usage() - sys.exit() - - try: - opts, args = getopt.getopt(sys.argv[1:], arguments_opts, arguments_args) - - except getopt.GetoptError: - print "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - log_txt += "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - # usage() - sys.exit() - - for opt, arg in opts: - - if opt in ("-h", "--help"): - print "...fetch help screen" - log_txt += "\n...fetch help screen" - usage(), sys.exit() - - if opt in ("-v", "--verbose"): - print "...verbose output" - log_txt += "\n...verbose output" - verbose = True - - elif opt in ("-i", "--input_fasta"): - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: fasta_file '%s' was not found!" 
% str(temp_file) - sys.exit(message) - else: - input_fasta.append(str(temp_file)) - print "fasta file #%i: %s" % (len(input_fasta), str(temp_file)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: fasta_file '%s' was not found!" % str(arg) - log_txt += message - sys.exit(message) - else: - input_fasta.append(str(arg)) - print "fasta file #%i: %s" % (len(input_fasta), str(arg)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(arg)) - - - elif opt in ("-a", "--auto_fas"): - auto_fas = True - - - # multiple gff files: reads them into a list - elif opt in ("-g", "--input_gff_files"): - - # append gff file only if existing - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: gff_file '%s' was not found!" % str(temp_file) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - print "GFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - input_gff_files.append(str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: gff_file '%s' was not found!" % str(arg) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - input_gff_files.append(str(arg)) - print "GFF file #%i: %s" %(len(input_gff_files), str(arg)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(arg)) - - - elif opt in ("-G", "--gff_color_config_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: gff_color_config_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot with default gff coloring specification!" 
- log_txt += message + "\n -->Running FlexiDot with default gff coloring specification!" - else: - gff_color_config_file = str(arg) - - - elif opt in ("-u", "--input_user_matrix_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: input_user_matrix_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot without input_user_matrix_file %s!" % arg - log_txt += message + "\n -->Running FlexiDot withdefault matrix shading file!" - else: - input_user_matrix_file = str(arg) - - elif opt in ("-U", "--user_matrix_print"): - user_matrix_print = check_bools(str(arg), default=user_matrix_print) - - elif opt in ("-o", "--output_file_prefix"): - output_file_prefix = arg - - elif opt in ("-c", "--collage_output"): - collage_output = check_bools(str(arg), default=collage_output) - - elif opt in ("-m", "--m_col"): - try: m_col = int(arg) - except: - print "m_col - invalid argument - using default value" - log_txt += "\nm_col - invalid argument - using default value" - - elif opt in ("-n", "--n_row"): - try: n_row = int(arg) - except: - print "n_row - invalid argument - using default value" - log_txt += "\nn_row - invalid argument - using default value" - - elif opt in ("-f", "--filetype"): - if 0 <= int(arg) <= 2: - filetype = int(arg) - else: - print "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - log_txt += "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." 
%(filetype) - - elif opt in ("-t", "--type_nuc"): - type_nuc = check_bools(str(arg), default=type_nuc) - - if type_nuc == False: - # interval default changed for amino acids - lcs_shading_interval_len = 10 - aa_bp_unit = "aa" - - elif opt in ("-k", "--wordsize"): - try: wordsize = int(arg) - except: - print "wordsize - invalid argument - using default value" - log_txt += "\nwordsize - invalid argument - using default value" - - elif opt in ("-p", "--plotting_mode"): - if "," in arg: - temp_modes = arg.split(",") - for item in temp_modes: - if item in ["0","1","2"]: - plotting_modes.append(int(item)) - elif arg in ["0","1","2"]: - plotting_modes = [int(arg)] - else: - print "Please provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" - log_txt += "\nPlease provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" - - elif opt in ("-w", "--wobble_conversion"): - wobble_conversion = check_bools(str(arg), default=wobble_conversion) - - elif opt in ("-S", "--substitution_count"): - try: substitution_count = int(arg) - except: - print "substitution_count - invalid argument - using default value" - log_txt += "\nsubstitution_count - invalid argument - using default value" - - elif opt in ("-r", "--rc_option"): - rc_option = check_bools(str(arg), default=rc_option) - - elif opt in ("-s", "--alphabetic_sorting"): - alphabetic_sorting = check_bools(str(arg), default=alphabetic_sorting) - - elif opt in ("-x", "--lcs_shading"): - lcs_shading = check_bools(str(arg), default=lcs_shading) - - elif opt in ("-X", "--lcs_shading_num"): - try: lcs_shading_num = int(arg) - 1 - except: - print "lcs_shading_num - invalid argument - using default value" - log_txt += "\nlcs_shading_num - invalid argument - using default value" - - elif opt in ("-y", "--lcs_shading_ref"): - try: - if 0 <= int(arg) <= 2: - lcs_shading_ref = int(arg) - else: - print "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." 
%(lcs_shading_ref) - log_txt += "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - except: - print "lcs_shading_ref - invalid argument - using default value" - log_txt += "\nlcs_shading_ref - invalid argument - using default value" - - elif opt in ("-Y", "--lcs_shading_interval_len"): - try: lcs_shading_interval_len = int(arg) - except: - print "lcs_shading_interval_len - invalid argument - using default value" - log_txt += "\nlcs_shading_interval_len - invalid argument - using default value" - - elif opt in ("-z", "--lcs_shading_ori"): - if 0 <= int(arg) <= 2: - lcs_shading_ori = int(arg) - else: - print "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." %(lcs_shading_ori) - log_txt += "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." %(lcs_shading_ori) - - elif opt in ("-P", "--plot_size"): - try: plot_size = float(arg) - except: - print "plot_size - invalid argument - using default value" - log_txt += "\nplot_size - invalid argument - using default value" - - - elif opt in ("-A", "--line_width"): - try: line_width = float(arg) - except: - print "line_width - invalid argument - using default value" - log_txt += "\nline_width - invalid argument - using default value" - - elif opt in ("-B", "--line_col_for"): - if mcolors.is_color_like(arg): - line_col_for = arg - else: - print "line_col_for - invalid argument - using default value" - log_txt += "\nline_col_for - invalid argument - using default value" - - elif opt in ("-C", "--line_col_rev"): - if mcolors.is_color_like(arg): - line_col_rev = arg - else: - print "line_col_rev - invalid argument - using default value" - log_txt += "\nline_col_rev - invalid argument - using default value" - - elif opt in ("-D", "--x_label_pos"): - x_label_pos = check_bools(str(arg), default=x_label_pos) - - elif opt in ("-E", "--label_size"): - try: label_size = 
float(arg) - except: - print "label_size - invalid argument - using default value" - log_txt += "\nlabel_size - invalid argument - using default value" - - elif opt in ("-F", "--spacing"): - try: spacing = float(arg) - except: - print "spacing - invalid argument - using default value" - log_txt += "\nspacing - invalid argument - using default value" - - elif opt in ("-L", "--length_scaling"): - length_scaling = check_bools(str(arg), default=length_scaling) - - elif opt in ("-M", "--mirror_y_axis"): - mirror_y_axis = check_bools(str(arg), default=mirror_y_axis) - - elif opt in ("-R", "--representation"): - if 0 <= int(arg) <= 2: - representation = int(arg) - else: - print "\nERROR: Please provide valid representation argument. %s is out of range. It will be set to -R 0 [default]." %(representation) - log_txt += "\nERROR: Please provide valid representation argument. %s is out of range. It will be set to -R 0 [default]." %(representation) - - elif opt in ("-T", "--title_length"): - try: title_length = int(arg) - except: - try: - title_length = int(str(arg)[:-1]) - if arg[-1].upper() in ["B", "E"]: # B (beginning), E (end) - title_clip_pos = arg[-1].upper() - else: - print "title_length position information invalid - using default value" - log_txt += "\ntitle_length position information invalid - using default value" - except: - print "title_length - invalid argument - using default value" - log_txt += "\ntitle_length - invalid argument - using default value" - - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - logprint(log_txt, start=False, printing=False) - - - # print chosen arguments - ###################################### - - text = "\n%s\n" % (70 * "-") - text += "\n" + "INPUT/OUTPUT OPTIONS...\n" - text += "\n" + "Input fasta file: " + ", ".join(input_fasta) - text += "\n" + "Automatic fasta collection from current directory: " + str(auto_fas) - text += "\n" + "Collage output: " + str(collage_output) - text 
+= "\n" + "Number of columns per page: " + str(m_col) - text += "\n" + "Number of rows per page: " + str(n_row) - text += "\n" + "File format: " + filetype_dict[filetype] - text += "\n" + "Residue type is nucleotide: " + str(type_nuc) - - text += "\n" + "\n\nCALCULATION PARAMETERS...\n" - text += "\n" + "Wordsize: " + str(wordsize) - text += "\n" + "Plotting mode: " + str(plotting_modes).replace("[", "").replace("]", "") + "\n" + 51 * " " - for item in plotting_modes: - text += plotting_mode_dict[item] + " " - text += "\n" + "Ambiguity handling: " + str(wobble_conversion) - text += "\n" + "Reverse complement scanning: " + str(rc_option) - text += "\n" + "Alphabetic sorting: " + str(alphabetic_sorting) - - if 0 in plotting_modes and input_gff_files != []: - text += "\n" + "Input gff files: " + ", ".join(input_gff_files) - if gff_color_config_file != "": - text += "\n" + "GFF color config file: " + gff_color_config_file - text += "\n" + "Prefix for output files: " + str(output_file_prefix) - - if 2 in plotting_modes: - text += "\n" + "\n\nLCS SHADING OPTIONS (plotting_mode 'all-against-all' only)...\n" - text += "\n" + "LCS shading: " + str(lcs_shading) - text += "\n" + "LCS shading interval number: " + str(lcs_shading_num + 1) - text += "\n" + "LCS shading reference: " + lcs_shading_ref_dict[lcs_shading_ref] - if lcs_shading_ref == 2: - text += "\n" + "LCS shading interval size [%s]: " % (aa_bp_unit) + str(lcs_shading_interval_len) - text += "\n" + "LCS shading orientation: " + lcs_shading_ori_dict[lcs_shading_ori] - if input_user_matrix_file != "": - text += "\n" + "Custom user shading matrix file: " + input_user_matrix_file - text += "\n" + "Print user matrix values (instead of dotplot): " + str(user_matrix_print) - text += "\n" + "Displayed plot region: " + representation_dict[representation] - - text += "\n" + "\n\nGRAPHIC FORMATTING...\n" - text += "\n" + "Plot size: " + str(plot_size) - text += "\n" + "Line width: " + str(line_width) - text += "\n" + "Line 
color: " + line_col_for - text += "\n" + "Reverse line color: " + line_col_rev - text += "\n" + "X label position: " + str(x_label_pos) - text += "\n" + "Label size: " + str(label_size) - text += "\n" + "Spacing: " + str(spacing) - if mirror_y_axis: - text += "\n" + "Y-axis mirrored (bottom to top) " + str(mirror_y_axis) - if title_clip_pos == "E": - text += "\n" + "Title length (limit number of characters): " + "last" + str(title_length) + "characters" - else: - text += "\n" + "Title length (limit number of characters): " + "first" + str(title_length) + "characters" - text += "\n" + "Length scaling: " + str(length_scaling) - text += "\n%s\n" % (70 * "-") - logprint(text) - - - # collect settings - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose] - - return parameters - - -############################### -# Helper Functions # -############################### - -def alphabets(type_nuc=True): - """ - provide ambiguity code for sequences - """ - - nucleotide_alphabet = ["A", "C", "G", "T"] - - nucleotide_alphabet_full = ["A", "C", "G", "T", "N", "B", "D", "H", - "V", "Y", "R", "W", "S", "K", "M"] - - nucleotide_ambiguity_code = {"N": ["A", "C", "G", "T"], # any - "B": ["C", "G", "T"], # not A - "D": ["A", "G", "T"], # not C - "H": ["A", "C", "T"], # not G - "V": ["A", "C", "G"], # not T - "Y": ["C", "T"], # pyrimidine - "R": ["A", "G"], # purine - "W": ["A", "T"], # weak - "S": ["C", "G"], # strong - "K": ["G", "T"], # keto - "M": 
["A", "C"]} # amino - - nucleotide_match_dict = {"N": "[ACGTNBDHVYRWSKM]", # any - "B": "[CGTNBDHVYRWSKM]", # not A - "D": "[AGTNBDHVYRWSKM]", # not C - "H": "[ACTNBDHVYRWSKM]", # not G - "V": "[ACGNBDHVYRWSKM]", # not T - "K": "[GTNBDHVYRWSK]", # keto - not A,C,M - "M": "[ACNBDHVYRWSM]", # amino - not G,T,K - "W": "[ATNBDHVYRWKM]", # weak - not C,G,S - "S": "[CGNBDHVYRSKM]", # strong - not A,G,W - "Y": "[CTNBDHVYWSKM]", # pyrimidine - not A,G,R - "R": "[AGNBDHVRWSKM]", # purine - not C,T,Y - "A": "[ANDHVRWM]", - "C": "[CNBHVYSM]", - "G": "[GNBDVRSK]", - "T": "[TNBDHYWK]"} - - # nucleotide_match_dict = {"N": ".", # any - # "B": "[^A]", # not A - # "D": "[^C]", # not C - # "H": "[^G]", # not G - # "V": "[^T]", # not T - # "K": "[^ACM]", # keto - not A,C,M - # "M": "[^GTK]", # amino - not G,T,K - # "W": "[^CGS]", # weak - not C,G,S - # "S": "[^AGW]", # strong - not A,G,W - # "Y": "[^AGR]", # pyrimidine - not A,G,R - # "R": "[^CTY]", # purine - not C,T,Y - # "A": "[ANDHVRWM]", - # "C": "[CNBHVYSM]", - # "G": "[GNBDVRSK]", - # "T": "[TNBDHYWK]"} - - aminoacid_alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"] - - aminoacid_alphabet_full = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*", "J", - "Z", "B", "X"] - - aminoacid_ambiguity_code = {"J": ["I", "L"], - "Z": ["Q", "E"], - "B": ["N", "D"], - "X": ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"]} # any - - aminoacid_match_dict = {"J": "[ILJ]", - "Z": "[QEZ]", - "B": "[NDB]", - # "X": ".", - "X": "[ARNDCEQGHILKMFPSTWYVUO*XBZJ]", - "A": "[AX]", - "R": "[RX]", - "N": "[NXB]", - "D": "[DXB]", - "C": "[CX]", - "E": "[EXZ]", - "Q": "[QXZ]", - "G": "[GX]", - "H": "[HX]", - "I": "[IXJ]", - "L": "[LXJ]", - "K": "[KX]", - "M": "[MX]", - "F": "[FX]", - "P": "[PX]", - "S": "[SX]", - "T": "[TX]", - "W": "[WX]", 
- "Y": "[YX]", - "V": "[VX]", - "U": "[UX]", - "O": "[OX]", - "*": "[*X]"} - - aa_only = set(['E', 'F', 'I', 'J', 'L', 'O', 'Q', 'P', 'U', 'X', 'Z', '*']) - # return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aa_only - - if type_nuc: - return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, nucleotide_match_dict - else: - return aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aminoacid_match_dict - -def logprint(text, start=False, printing=True, prefix=""): - """ - log output to log_file and optionally print - """ - - # define log file name and open file - global log_file_name - if start and trial_mode: - log_file_name = "log_file.txt" - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - elif start: - date = datetime.date.today() - time = str(datetime.datetime.now()).split(" ")[1].split(".")[0].replace(":", "-") - log_file_name = "%s_%s_log_file.txt" % (date, time) - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - else: - log_file = open(log_file_name, 'a') - - # write log (and print) - log_file.write(text + "\n") - if printing: - print text - log_file.close() - -def time_track(starting_time, show=True): - """ - calculate time passed since last time measurement - """ - now = time.time() - delta = now - starting_time - if show: - text = "\n\t %s seconds\n" % str(delta) - logprint(text, start=False, printing=True) - return now - -def calc_fig_ratio(ncols, nrows, plot_size, verbose=False): - """ - calculate size ratio for given number of columns (ncols) 
and rows (nrows) - with plot_size as maximum width and length - """ - ratio = ncols*1./nrows - if verbose: - text = " ".join([ncols, nrows, ratio]) - logprint(text, start=False, printing=True) - if ncols >= nrows: - figsize_x = plot_size - figsize_y = plot_size / ratio - else: - figsize_x = plot_size * ratio - figsize_y = plot_size - return figsize_x, figsize_y - -def shorten_name(seq_name, max_len=20, title_clip_pos="B"): #, delim="_"): - """ - shorten sequence names (for diagram titles) - """ - - if len(seq_name) <= max_len: - return seq_name - - # take last characters - if title_clip_pos == "E": - name = seq_name[len(seq_name)-max_len:] - - # take first characters - else: - name = seq_name[:max_len] - - """# keep first and last part if multiple parts separated by delimiter (e.g. species_prefix + sequence_id) - if delim in seq_name: - if seq_name.count(delim) >= 2: - name = "%s..." % delim.join(seq_name.split(delim)[:1]) + seq_name.split(delim)[-1] # .replace("_000", "-") - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - if len(name) > max_len: - name = name[:((max_len-2)//2)] + "..." + name[((max_len-2)//2):] - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - """ - - return name - -def unicode_name(name): - """ - replace non-ascii characters in string (e.g. 
for use in matplotlib) - """ - unicode_string = eval('u"%s"' % name) - return unicodedata.normalize('NFKD', unicode_string).encode('ascii','ignore') - -def check_bools(arg, update_log_txt = True, default=None): - """ - converts commandline arguments into boolean - """ - - - # convert valid arguments - if str(arg).lower() == "y" or str(arg) == "1": - return True - elif str(arg).lower() == "n" or str(arg) == "0": - return False - - # use default in case of invalid argument - else: - if update_log_txt: - global log_txt - log_txt += "using default for " + str(arg) - else: - try: - logprint("using default for " + str(arg)) - except: - print "using default for " + str(arg) - return default - -def create_color_list(number, color_map=None, logging=False, max_grey="#595959"): - """ - create color list with given number of entries - grey by default, matplotlib color_map can be provided - """ - - try: - # create pylab colormap - cmap = eval("P.cm." + color_map) - # get descrete color list from pylab - cmaplist = [cmap(i) for i in range(cmap.N)] # extract colors from map - # determine positions for number of colors required - steps = (len(cmaplist)-1)/(number) - numbers = range(0, len(cmaplist), steps) - - # extract color and convert to hex code - colors = [] - for idx in numbers[:-1]: - rgb_color = cmaplist[idx] - col = rgb2hex(rgb_color[0]*255, rgb_color[1]*255, rgb_color[2]*255) - colors.append(col) - - # grey - except: - if not color_map == None: - logprint("Invalid color_map (%s) provided! - Examples: jet, Blues, OrRd, bwr,..." 
% color_map) - logprint("See https://matplotlib.org/users/colormaps.html\n") - old_max_grey = "#373737" - old_max_grey = "#444444" - colors = list(Color("#FFFFFF").range_to(Color(max_grey), number)) # grey - for idx in range(len(colors)): - colors[idx] = str(colors[idx]).replace("Color ", "") - if "#" in colors[idx] and len(colors[idx]) != 7: - # print colors[idx] - colors[idx] = colors[idx] + colors[idx][-(7-len(colors[idx])):] - - text = "%d Colors: %s" % (len(colors), ", ".join(colors)) - if logging: logprint(text, start=False, printing=True) - - if len(colors) < number: - logprint("\nError in color range definition! %d colors missing\n" % (number - len(colors))) - - return colors - - -############################### -# File Handling # -############################### - -def read_seq(input_fasta, verbose=False): - """ - read fasta sequences from (all) file(s) - """ - - # check if file provided - if input_fasta == [] or input_fasta == "": - text = "Attention: No valid file names provided: >%s<" % input_fasta - logprint(text, start=False, printing=True) - return {}, [] - - # combine sequence files, if required - if type(input_fasta) == list: - # concatenate fasta files - if len(input_fasta) > 1: - if verbose: - print "concatenating fastas...", - text = "concatenating fastas..." - input_fasta_combi = concatenate_files(input_fasta) - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - else: - input_fasta_combi = input_fasta[0] - else: - input_fasta_combi = input_fasta - - # read sequences - if verbose: - print "reading fasta...", - text = "reading fasta...", - try: - seq_dict = SeqIO.index(input_fasta_combi, "fasta") - except ValueError: - logprint("Error reading fasta sequences - please check input files, e.g. 
for duplicate names!") - return {}, [] - except: - logprint("Error reading fasta sequences - please check input files!") - return {}, [] - - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - - for seq in seq_dict: - if "-" in seq_dict[seq].seq: - # ungapped = seq_dict[seq].seq.ungap("-") # cannot be assigned back to sequence record - text = "\nSequences degapped prior Analysis!!!" - logprint(text, start=False, printing=True) - return read_seq(degap_fasta(input_fasta), verbose=verbose) - - # get ordered sequence names - sequences = [] - for item in SeqIO.parse(input_fasta_combi, "fasta"): - sequences.append(item.id) - return seq_dict, sequences - -def read_gff_color_config(gff_color_config_file=""): - """ - define coloring options for gff-based color shading of self-dotplots - """ - - # default aestetics for annotation shading (e.g. if no user config file is provided) - # dictionary with feature_type as key and tuple(color, transparency, zoom) as value - gff_feat_colors = {"orf": ("#b41a31", 0.2, 0), - "orf_rev": ("#ff773b", 0.3, 0), - "gene": ("#b41a31", 0.2, 0), - "cds": ("darkorange", 0.2, 0), - "exon": ("orange", 0.2, 0), - "intron": ("lightgrey", 0.2, 0), - "utr": ("lightblue", 0.2, 0), - "repeat_region": ("green", 0.3, 0), - "repeat": ("green", 0.3, 0), - "tandem_repeat": ("red", 0.3, 0), - "transposable_element": ("blue", 0.3, 0), - "ltr_retrotransposon": ("#cccccc", 0.5, 0), - "ltr-retro": ("#cccccc", 0.5, 0), - "long_terminal_repeat": ("#2dd0f0", 0.75, 2), - "ltr": ("#2dd0f0", 0.75, 2), - "pbs": ("purple", 0.75, 2), - "ppt": ("#17805a", 0.5, 2), - "target_site_duplication": ("red", 0.75, 2), - "misc_feature": ("grey", 0.3, 0), - "misc_feat": ("grey", 0.3, 0), - "misc": ("grey", 0.3, 0), - "others": ("grey", 0.5, 0)} - if gff_color_config_file in ["", None] or not os.path.exists(str(gff_color_config_file)): - return gff_feat_colors - - text = "Updating GFF color configuration with custom specifications\n" - 
logprint(text, start=False, printing=True) - - # read custom gff_color_config_file - in_file = open(gff_color_config_file, 'rb') - overwritten = set([]) - for line in in_file: - if not line.startswith("#") and len(line.strip().split("\t")) >= 4: - data = line.strip().split("\t") - feat = data[0].lower() - color = data[1].lower() - - # check, if settings are valid - if not mcolors.is_color_like(color): - color = "grey" - text = "Invalid color specified for %s: %s - default grey" % (data[0], data[1]) - logprint(text) - try: - alpha = float(data[2]) - except: - alpha = 0.75 - text = "Invalid alpha specified for %s: %s - default 0.75" % (data[0], data[2]) - logprint(text) - try: - zoom = float(data[3]) - except: - zoom = 0 - text = "Invalid zoom specified for %s: %s - default 0" % (data[0], data[3]) - logprint(text) - - # track changes of predefined settings - if feat in gff_feat_colors.keys(): - overwritten.add(data[0].lower()) - - gff_feat_colors[feat] = (color, alpha, zoom) - in_file.close() - - # default coloring for unknown annotations - if not "others" in gff_feat_colors.keys(): - gff_feat_colors["others"] = ("grey", 0.5, 0) - - if verbose: - # print configuration - text = "\n\nGFF color specification:\n%s\n" % (60 * ".") - for item in sorted(gff_feat_colors.keys()): - text += "%-30s\t%-10s\t%-5s\t%s\n" % (item, str(gff_feat_colors[item][0]), str(gff_feat_colors[item][1]), str(gff_feat_colors[item][2])) - logprint (text, printing=True) - - # print overwritting feature type specifications - if len(overwritten) != 0: - text = "%d feature type specifications overwritten:" % len(overwritten) - text += "\n\t"+ ", ".join(overwritten) + "\n" - logprint(text, start=False, printing=True) - - text = "GFF color specification updated acc. 
to %s\n\t%s\n\n" % (gff_color_config_file, ", ".join(gff_feat_colors)) - logprint(text, start=False, printing=True) - - return gff_feat_colors - -def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=True, prefix="", filetype='png', verbose=False): - """ - create feature dictionary from input_gff - sequence name as key and (feature type, start, stop) as value - """ - if type(input_gff_files) != list: - input_gff_files = [input_gff_files] - - # create dictionary with seq_name as key and (type, start and stop) as value - unknown_feats = set([]) - used_feats = set([]) - feat_dict = {} - for input_gff in input_gff_files: - text = "...reading " + input_gff - logprint(text, start=False, printing=True) - - in_file = open(input_gff, 'rb') - for line in in_file: - if not line.startswith("#") and line.strip() != "": - data = line.strip().split("\t") - feat_type = data[2].lower() - if data[6] == "-": - feat_type += "_rev" - if not feat_type.lower() in color_dict.keys(): - if feat_type.lower().replace("_rev", "") in color_dict.keys(): - feat_type = feat_type.replace("_rev", "") - else: - unknown_feats.add(feat_type) - feat_type = "others" - used_feats.add(feat_type) - if not data[0] in feat_dict.keys(): - feat_dict[data[0]] = [(feat_type, int(data[3]), int(data[4]))] # feature type, start, stop - else: - feat_dict[data[0]].append((feat_type, int(data[3]), int(data[4]))) # feature type, start, stop - if verbose: - text = "\nAnnotations for: %s\n" % ", ".join(feat_dict.keys()[:10]) - if len(feat_dict.keys()) > 10: - text = text[:-1] + ", ...\n" - logprint(text, start=False, printing=True) - in_file.close() - - # print feature types without specific shading settings - if len(unknown_feats) != 0: - text = "Missing shading specification for %d feature type(s):\n\t%s\n" % (len(unknown_feats), ", ".join(sorted(unknown_feats))) - logprint(text, start=False, printing=True) - - # create color legend - colors, alphas = [], [] - for item in sorted(used_feats): - 
colors.append(color_dict[item][0]) - alphas.append(color_dict[item][1]) - legend_figure(colors=colors, lcs_shading_num=len(used_feats), type_nuc=type_nuc, bins=sorted(used_feats), alphas=alphas, gff_legend=True, prefix=prefix, filetype=filetype) - - # print settings - text = "GFF Feature Types: %s\nGFF Colors: %s" % (", ".join(sorted(used_feats)), ", ".join(sorted(colors))) - logprint(text, start=False, printing=True) - - return feat_dict - -def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, verbose=False): - input_file = open(matrix_file_name, 'rb') - - # read sequence names from first column - names = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - names.append(line.strip().split(delim)[0]) - logprint("Delimiter '%s': %d names - %s\n" % (delim, len(names), ", ".join(names))) - - # check if names were found - otherwise try another delimiter - if names == [] and not recursion: - if delim == "\t": - new_delim = "," - else: - new_delim = "\t" - logprint("\nMatrix file not containing data delimited by '%s' - trying to read matrix with delimiter '%s'" % (delim.replace("\t", "\\t"), new_delim)) - info_dict = read_matrix(matrix_file_name, delim=new_delim, symmetric=symmetric, recursion=True, verbose=verbose) - return info_dict - elif names == []: - logprint("Empty matrix file with alternative delimiter!") - return info_dict - input_file.close() - - input_file = open(matrix_file_name, 'rb') - # read matrix entries as values in dictionary with tuple(names) as key - info_dict = {} - contradictory_entries = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - data = line.strip().split(delim) - for idx in range(len(data[1:])): - # print tuple(sorted([data[0], names[idx]])), data[idx+1] - if symmetric: - key = tuple(sorted([names[idx], data[0]])) - else: - key = tuple(names[idx], data[0]) - if key in info_dict.keys(): - if 
symmetric and info_dict[key] != data[idx+1] and data[idx+1] not in ["", "-"] and info_dict[key] not in ["", "-"]: - contradictory_entries.append(key) - info_dict[key] = data[idx+1] - input_file.close() - - if len(contradictory_entries) != 0: - try: - logprint("\nContradictory entries in matrix file %s:\n\t%s" % (matrix_file_name, ", ".join(contradictory_entries))) - except: - log_txt = "\nContradictory entries in matrix file %s:\n\t" % (matrix_file_name) - for item in contradictory_entries: - log_txt += str(item).replace("'", "") + ", " - log_txt = log_txt[:-2] - logprint(log_txt) - logprint("Using value from bottom left triangle!") - if verbose: - logprint("\nMatrix information for Sequences named: " % ", ".join(names)) - - return info_dict - -def concatenate_files(file_list, combi_filename="temp_combined.fasta", verbose=False): - """ - concatenate content of all files in file_list into a combined file named combi_filename - """ - out_file = open(combi_filename, 'w') - text = "" - for item in file_list: - if verbose: - text += item + " " - print item, - # read in_file linewise and write to out_file - in_file = open(item, 'rb') - for line in in_file: - out_file.write(line.strip()+"\n") - in_file.close() - out_file.close() - if verbose: - logprint(text, start=False, printing=False) - return combi_filename - -def degap_fasta(input_fasta): - """ - remove gaps from fasta - new degapped sequence file created - """ - - # degap all sequence files - output_fastas = [] - if type(input_fasta) != list: - input_fasta = list(input_fasta) - for input_fas in input_fasta: - output_fas = input_fas[:input_fas.rfind(".")] + "_degapped.fas" - in_file = open(input_fas, 'rb') - out_file = open(output_fas, 'w') - for line in in_file: - if line.startswith(">"): - out_file.write(line.strip()+"\n") - else: - out_file.write(line.strip().replace("-", "")+"\n") - out_file.close() - in_file.close() - output_fastas.append(output_fas) - return output_fastas - -def legend_figure(colors, 
lcs_shading_num, type_nuc=True, unit="%", filetype="png", max_lcs_len=None, min_lcs_len=0, bins=[], alphas=[], gff_legend=False, prefix="", verbose=False): - """ - create figure color legend - """ - max_legend_length_row = 8 - max_legend_length_col = 4 - - # define output file - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg" - logprint(text, start=False, printing=True) - filetype="png" - - # check if length of information fit - if not gff_legend and ((bins != [] and len(colors) != lcs_shading_num+1) or (bins != [] and len(colors) != len(bins)+1)): - if bins != [] and len(colors) != lcs_shading_num+1: - text = "**Attention**\nlcs_shading_num (%d) does not match number of colors (%d)!\n"% (lcs_shading_num, len(bins)) - elif bins != [] and len(colors) != len(bins)+1: - text = "**Attention**\nnumber of LCS length bins (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - elif gff_legend and len(bins) != len(colors): - text = "**Attention**\nnumber of GFF Feature Types (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - - # set alpha values to opaque if none are provided - if alphas == []: - for item in colors: - alphas.append(1) - - # legend data points - data_points = range(len(colors)) - if not gff_legend: - - # specify intervals, if max_lcs_len provided - if max_lcs_len != None: - multi_factor = 100 # one digit - if max_lcs_len <= 1: - multi_factor = 1000 # two digits - # len_interval_size = (max_lcs_len-min_lcs_len) * multi_factor *1. // lcs_shading_num * (1./ multi_factor) - len_interval_size = (max_lcs_len-min_lcs_len) * 1. 
/ lcs_shading_num - len_pos = [float("%.2f" % (min_lcs_len))] - # calculate interval positions - for idx in range(lcs_shading_num): - len_pos.append(float("%.2f" % (len_pos[-1] + len_interval_size))) - - if prefix.startswith("custom-matrix") and (0 <= max_lcs_len <= 100 and 0 <= min_lcs_len <= 100): - unit = "%" - elif prefix.startswith("custom-matrix"): - unit = "" - - text = "\n%d Legend intervals from %.2f to %.2f: \n\t%s - number: %d, step: %.2f, unit: %s\n" % (lcs_shading_num+1, min_lcs_len, max_lcs_len, str(len_pos), len(len_pos), len_interval_size, unit) - logprint(text, start=False, printing=True) - pos = len_pos - interval_size = len_interval_size - else: - # generate legend labels acc. to standard interval notation - interval_size = 100 // lcs_shading_num - pos = range(interval_size, 101+interval_size, interval_size) - - if bins != []: # labels provided - legend_labels = bins[:] - legend_labels.append("max") - legend_labels_lengths = [] - for item in bins: - legend_labels_lengths.append("[%d %s, %d %s)" % (item - min(bins), unit, item, unit)) - if len(bins) == len(colors) - 1: - legend_labels_lengths.append("[%d %s, %s]" % (max(bins), unit, u"\u221E")) # infinite - - else: - legend_labels = [] - legend_labels_lengths = [] - for idx in range(len(pos)): - num = pos[idx] - legend_labels.append("[%d%%, %d%%)" % (num - interval_size, num)) - if max_lcs_len != None: - num = len_pos[idx] - # as int or float - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths.append("[%d %s, %d %s)" % (num, unit, num + len_interval_size, unit)) - else: - legend_labels_lengths.append("[%.2f %s, %.2f %s)" % (num, unit, num + len_interval_size, unit)) - legend_labels[-1] = "100" + unit - if max_lcs_len != None: - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths[-1] = "%d %s" % (max_lcs_len, unit) - else: - legend_labels_lengths[-1] = "%.2f %s" % (max_lcs_len, unit) - - # set labels and choose file 
name - if gff_legend: - label_text = bins[:] - edge_col = None - legend_file_name = "GFF_Shading_Legend_n%d." % lcs_shading_num + filetype - elif max_lcs_len != None: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_max%d%s_n%d." % (max_lcs_len, unit, lcs_shading_num) + filetype - elif bins != []: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%d%s_n%d." % (bins[0], unit, lcs_shading_num) + filetype - else: - label_text = legend_labels[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%%len_n%d." % lcs_shading_num + filetype - - if prefix != None and prefix != "": - if not prefix.endswith("-"): - prefix = prefix + "-" - legend_type = "LCS" - if prefix.startswith("custom-matrix"): - prefix = prefix.replace("custom-matrix", "")[1:] - legend_type = "CustomMatrix" - legend_file_name = prefix + legend_file_name.replace("LCS", legend_type) - - # plot legend figure - fig, ax = P.subplots(3, 1, figsize=(len(colors)*2, len(colors)*2)) - for idx in range(len(colors)): - ax[0].bar(data_points[idx]+1, data_points[idx]+1, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[2].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].set_ylim(0,1) - ax[2].set_ylim(0,1) - ax[1].legend(ncol=((len(colors)-1)//max_legend_length_row)+1, framealpha=1) # vertical legend - col_num = len(colors) - if len(colors) > max_legend_length_col: - remainder = 0 - if len(colors) % max_legend_length_col != 0: - remainder = 1 - row_num = len(colors) // max_legend_length_col + remainder - remainder = 0 - if len(colors) % row_num != 0: - remainder = 1 - col_num = len(colors) // row_num + remainder - ax[2].legend(ncol=col_num, 
framealpha=1) # horizontal legend - - P.savefig(legend_file_name) - - return legend_file_name - - -############################### -# Analysis Functions # -############################### - -def wobble_replacement(sequence, general_ambiguity_code, verbose=False): - """ - get all degenerated sequences for sequence with ambiguous residues - (only residues considered that are keys in wobble_dictionary) - """ - - # get positions of ambiguous residues - wobble_pos = [] - for idx in range(len(sequence)): - letter = sequence[idx] - if letter in general_ambiguity_code.keys(): - wobble_pos.append(idx) - - if verbose: - text = "\t%d wobbles" % len(wobble_pos) - logprint(text, start=False, printing=True) - - # replace one wobble through each iteration by all possible residues - # repeat if still wobbles in new kmers - kmer_variants = [sequence] - while True: - if verbose: - text = "\t\t%d kmer variants" % len(kmer_variants) - logprint(text, start=False, printing=True) - temp_kmers = set([]) - for kmer in kmer_variants: - for idx in wobble_pos: - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - for base in general_ambiguity_code[kmer[idx]]: - newkmer = kmer[:idx] + base + kmer[idx+1:] - temp_kmers.add(newkmer) - wobble = False - for kmer in temp_kmers: - for idx in range(len(kmer)): - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - wobble = True - break - if wobble: - break - kmer_variants = set(list(temp_kmers)[:]) - if not wobble: - break - - return kmer_variants - -def split_diagonals(data, stepsize=1): - """ - split array if point difference exceeds stepsize - data = sorted list of numbers - """ - return np.split(data, np.where(np.diff(data) != stepsize)[0]+1) - -def longest_common_substring(s1, s2): - m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))] - longest, x_longest = 0, 0 - for x in xrange(1, 1 + len(s1)): - for y in xrange(1, 1 + len(s2)): - if s1[x - 1] == s2[y - 1]: - m[x][y] = m[x - 1][y - 1] + 1 - if m[x][y] > longest: 
- longest = m[x][y] - x_longest = x - else: - m[x][y] = 0 - return longest - -def lcs_from_x_values(x_values): - """ - calculate length of longest common substring based on nested list of numbers - """ - if len(x_values) == 0: - return 0 - # get lengths of each subarray data - lengths = np.array([len(i) for i in x_values]) - return max(lengths) - - -############################### -# Matching Functions # -############################### - -def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - - # forward - ################################# - kmer_pos_dict_one = {}; kmer_pos_dict_two = {} # dictionaries for both sequences - - # reverse complement - ################################# - kmer_pos_dict_three = {}; kmer_pos_dict_four = {} # dictionaries for both sequences - - # create dictionaries with kmers (wordsize) and there position(s) in the sequence - if rc_option: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two), - (str(seq_one), kmer_pos_dict_three), - (str(seq_two.reverse_complement()), kmer_pos_dict_four)] - else: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two)] - for (seq, kmer_pos_dict) in data_list: - for i in range(len(seq)-wordsize+1): - kmer = seq[i:i+wordsize] - # discard kmer, if too many Ns 
included - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - if not convert_wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - wobbles = False - for item in general_ambiguity_code.keys(): - if item in kmer: - wobbles = True - break - if not wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - kmer_variants = wobble_replacement(kmer, general_ambiguity_code) - for new_kmer in kmer_variants: - # print "\t", new_kmer - try: - kmer_pos_dict[new_kmer].append(i) - except KeyError: - kmer_pos_dict[new_kmer] = [i] - - # find kmers shared between both sequences - matches_for = set(kmer_pos_dict_one).intersection(kmer_pos_dict_two) # forward - matches_rc = set(kmer_pos_dict_three).intersection(kmer_pos_dict_four) # reverse complement - - if verbose: - text = "[matches: %i for; %.i rc]" % (len(matches_for), len(matches_rc)) - logprint(text, start=False, printing=True) - - # create lists of x and y co-ordinates for scatter plot - # keep all coordinates of all shared kmers (may match multiple times) - diag_dict_for = {} - diag_dict_rc = {} - for (match_list, pos_dict1, pos_dict2, diag_dict) in [(matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for), - (matches_rc, kmer_pos_dict_three, kmer_pos_dict_four, diag_dict_rc)]: - for kmer in match_list: - for i in pos_dict1[kmer]: - for j in pos_dict2[kmer]: - diag = i-j - points = set(range(i+1, i+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if rc_option: - for diag in 
diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - -def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize via regular expression search - fuzzy matching - allow up to substitution_count substitutions - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - ambiguity_match_dict = alphabets(type_nuc)[3] - - ambiq_residues = "[%s]" % "".join(general_ambiguity_code.keys()) - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # check for wobble presence - if not (regex.search(ambiq_residues, str(seq_one)) == None and regex.search(ambiq_residues, str(seq_two)) == None): - wobble_found = True - else: - wobble_found = False - - # dictionary for matches - diag_dict_for = {} - diag_dict_rc = {} - counter = [0, 0] - - # one-way matching - if rc_option: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0), - (str(seq_one), str(seq_two.reverse_complement()), 
diag_dict_rc, 1)] - else: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0)] - - for seq_query, seq_target, diag_dict, counter_pos in data_list: - # split query sequence into kmers - if not rc_option and counter_pos == 1: - break - - for idx in range(len(str(seq_query))-wordsize+1): - kmer = str(seq_query)[idx:idx+wordsize] - - # skip excessive N/X stretches (big black areas) - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - # convert kmer to regular expression for wobble_matching - if convert_wobbles and wobble_found: - kmer_string = "" - # replace each residue with matching residues or wobbles - for jdx in range(len(kmer)): - kmer_string += ambiguity_match_dict[kmer[jdx]] - else: - kmer_string = kmer - - # convert to regular expression tolerating substitution errors - if type(substitution_count) == int and substitution_count != 0: - kmer_string = "(%s){s<=%d}" % (kmer_string, substitution_count) - - # search for regular expression in target sequence - kdx = 0 - start = True - if regex.search(kmer_string, seq_target[kdx:]) != None: - counter[counter_pos] += 1 - while regex.search(kmer_string, seq_target[kdx:]) != None: - # search for regular expression pattern in target sequence - result = regex.search(kmer_string, seq_target[kdx:]) - - kmer2 = seq_target[kdx:][result.start():result.end()] - - # skip excessive N/X stretches (big black areas) - if kmer2.count(any_residue)*100./wordsize <= max_N_percentage: - diag = idx-(kdx+result.start()) - points = set(range(idx+1, idx+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - kdx += result.start() + 1 - if kdx >= len(seq_target): - break - elif regex.search(kmer_string, seq_target[kdx:]) != None: - counter[counter_pos] += 1 - - if verbose: - text = "%5.i \tforward matches" % counter[0] - text += "\n%5.i \treverse complementary matches" % counter[1] - logprint(text, start=False, printing=True) - - # convert coordinate points 
to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - - -############################### -# Dot Plot Functions # -############################### - -def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}): - """ - self-against-self dotplot - partially from biopython cookbook - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least one input sequence - if len(sequences) == 0: - text = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - text += " No sequences provided for selfdotplot!\n\nTerminating polydotplot!" 
- logprint(text, start=False, printing=True) - return - elif len(sequences) == 1 and multi: - text = "\n\nCreating collage output for single selfdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nSelfdotplot Collage: Invalid collage - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences): - ncols = len(sequences) - nrows = 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - if prefix != None and prefix != "": - legend_prefix = prefix + "-Selfdotplot" - else: legend_prefix = "Selfdotplot" - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=legend_prefix, filetype=filetype, verbose=verbose) - - global t1 - - print "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-"), - log_txt = "\n%s\n\nCreating %s selfdotplot 
images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - - # preparations for file name - name_graph = "Selfdotplots" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - list_of_png_names = [] - - counter = 0 - for seq_name in sequences: - print seq_name, - log_txt += " " + seq_name - - counter += 1 - if not multi: - P.cla() # clear any prior graph - - # read sequence - seq_record = seq_dict[seq_name] - name_seq = seq_record.id - seq_one = seq_record.seq.upper() - length_seq = len(seq_one) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex(seq_one, seq_one, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG", - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag(seq_one, seq_one, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - 
stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - # print P.xticks()[0], P.yticks()[0] - P.axis('scaled') # make images quadratic - P.xlim(0, length_seq+1) - if mirror_y_axis: - P.ylim(0, length_seq+1) # rotate y axis (point upwards) - else: - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - # # use same tick labels for x and y axis - # tick_locs, tick_labels = P.yticks() - # P.xticks(tick_locs) - # P.xlim(0, length_seq+1) - - P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, fontweight='bold') - # P.title(unicode_name(name_seq), fontsize=label_size*1.3, fontweight='bold') - - # save figure and reinitiate if page is full - if counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' % (prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - else: # not multi - - fig = P.figure(figsize=(plot_size, plot_size)) # figure size needs to be a square - ax = P.subplot(1, 1, 1) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - number = 0 - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.axis('scaled') # make images quadratic - P.xlim(0, length_seq+1) - if mirror_y_axis: - P.ylim(0, length_seq+1) # rotate y axis (point upwards) - else: - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % 
aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - # # use same tick labels for x and y axis - # tick_locs, tick_labels = P.yticks() - # P.xticks(tick_locs) - # P.xlim(0, length_seq+1) - - P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size*1.3, fontweight='bold') - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_%s_wordsize%i%s.%s' %(prefix, name_graph, counter, shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos), wordsize, suffix, filetype) - P.savefig(fig_name, bbox_inches='tight') - - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - if multi and counter >= 1: - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - print "\n\nDrawing selfdotplots done" - log_txt += "\n\nDrawing selfdotplots done" - logprint(log_txt, start=False, printing=False) - - return list_of_png_names - -def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, x_label_pos_top=True, length_scaling=True, scale_delim_col="red"): - """ - pairwise dotplot (all-against-all) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least two input sequences - if len(sequences) < 2: - text = "\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += " Please provide at least two sequences for pairdotplot!\n\nTerminating paired dotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 2 and multi: - text = "\n\nCreating collage output for single pairdotplot!" 
- text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nPairdotplot Collage: Invalid collage settings - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences)*(len(sequences)-1): - ncols = len(sequences) - nrows = 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences)*(len(sequences)-1): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - - text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += ", ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - y_label_rotation = "vertical" - # for cartesian coordinate system with mirrored y-axis: plot x labels below plot - if mirror_y_axis: - x_label_pos_top = False - - # preparations for file name - name_graph = "Pairdotplot" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if length_scaling: - suffix += "_scaled" - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - 
nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - list_of_png_names = [] - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - - # prepare LCS data file - lcs_data_file = open("%sPairdotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - counter, seq_counter = 0, 0 - print "Drawing pairwise dotplot...", - log_txt = "Drawing pairwise dotplot..." - if verbose: - seq_text = "" - for idx in range(len(sequences)-1): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx+1, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += " " + str(seq_counter) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - - # write LCS data file - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), 
str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - - else: - # calculate figure size for separate figures - if len_one >= len_two: - sizing = (plot_size, max(2, (plot_size)*len_two*1./len_one)) - # sizing = (plot_size, min(plot_size, max(2, (plot_size-2)*len_two*1./len_one+2))) - else: - sizing = (max(2, (plot_size)*len_one*1./len_two), plot_size) - # sizing = (min(plot_size, max(2, (plot_size-2)*len_one*1./len_two+2)), plot_size) - fig = P.figure(figsize=(plot_size, plot_size)) - - ax = P.subplot(1, 1, 1) - - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlabel(unicode_name(shorten_name(name_one, max_len=title_length, title_clip_pos=title_clip_pos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.ylabel(unicode_name(shorten_name(name_two, max_len=title_length, title_clip_pos=title_clip_pos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - # P.axis('scaled') # make images scaled by size ### optional update ### - if not multi: - if length_scaling: - ax.set_aspect(aspect='equal', adjustable='box', anchor='NW') - P.xlim(0, len_one+1) - # xlimit = [0, len_one+1] - if mirror_y_axis: - P.ylim(0, len_two+1) # rotate y axis (point 
upwards) - else: - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - elif not length_scaling: - P.xlim(0, len_one+1) - # xlimit = [0, len_one+1] - if mirror_y_axis: - P.ylim(0, len_two+1) # rotate y axis (point upwards) - else: - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - else: - max_len = max(len_one, len_two) - P.xlim(0, max_len+1) - # xlimit = [0, max_len+1] - if mirror_y_axis: - P.ylim(0, max_len+1) # rotate y axis (point upwards) - else: - P.ylim(max_len+1, 0) # rotate y axis (point downwards) - - # plot line deliminating shorter sequence - if max_len != len_one: - ax.plot((len_one+1, len_one+1), (0, len_two), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - if max_len != len_two: - ax.plot((0, len_one), (len_two+1, len_two+1), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - - # # use same tick labels for x and y axis - # if P.xlim() == P.ylim(): - # tick_locs, tick_labels = P.yticks() - # P.xticks(tick_locs) - # P.xlim(xlimit) - - # evtl. switch x axis position - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - P.setp(ax.get_xticklabels(), fontsize=label_size*.9) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) - - # save figure and reinitiate if page is full - if multi and counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=.5, wspace=.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=.5, wspace=.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - elif not multi: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, bottom=0.05) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02) # space between rows - def 0.4 - - # name and create output files - fig_name = '%s%s-%d_wordsize%i%s.%s' % (prefix, name_graph, counter, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - list_of_png_names.append(fig_name) - fig = P.figure() - - # save figure - if multi and counter >= 1: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=0.5, wspace=0.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.5, wspace=0.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - print - logprint(seq_text, start=False, printing=False) - - return list_of_png_names - -def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, x_label_pos_top=True, lcs_shading=True, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, lcs_shading_num=5, spacing=0.04, input_user_matrix_file="", user_matrix_print=True, rotate_labels=False): - """ - all-against-all dotplot - derived from dotplot function - - lcs_shading_refs: - 0 color relative to maximum lcs observed in dataset [default] - 1 color by coverage of shorter sequence (e.g. 
lcs = 70% of seq1) - lcs_shading_ori - 0 forward only - 1 reverse only - 2 both orientations (in opposite plot) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - if len(sequences) == 0: - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " No sequences provided for polydotplot!\n\nTerminating polydotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 1: - text = "\n\nCreating polydotplot for single sequence!" - text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n\n" - logprint(text, start=False, printing=True) - - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " " + " ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - if prefix != None and prefix != "": - legend_prefix = prefix + "-Polydotplot" - else: legend_prefix = "Polydotplot" - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=legend_prefix, filetype=filetype, verbose=verbose) - - if lcs_shading and not type_nuc: - if lcs_shading_ori != 0: - lcs_shading_ori = 0 - text = "Protein shading does not support reverse complementary matching!\n" - logprint(text, start=False, printing=True) - - # read custom shading matrix & match names of sequences to fasta - if input_user_matrix_file != "" and input_user_matrix_file != None: - logprint("Reading user matrix file: %s" % input_user_matrix_file) - # lcs_shading_ori = 2 - custom_dict = 
read_matrix(input_user_matrix_file) - if custom_dict != {}: - custom_shading = True - custom_similarity_dict = {} - invalid_entries = [] - custom_max = 0 - custom_min = float("Inf") - for key in custom_dict.keys(): - number_key = [] - - # convert number into float - try: - value = float(custom_dict[key]) - if not "." in custom_dict[key]: - value = int(custom_dict[key]) - custom_max = max(custom_max, value) - custom_min = min(custom_min, value) - except: - value = custom_dict[key] - if value == "": - value = None - invalid_entries.append(key) - # match matrix names with sequence names - for item in key: - if item in sequences: - number_key.append(sequences.index(item)) - else: - number_key.append(-1) - # dictionary with tuple of sorted sequence indices as key and number as value - custom_similarity_dict[tuple(sorted(number_key))] = value - if len(invalid_entries) != 0: - text = "No valid number in custom similarity matrix for %d entries: \n\t" % (len(invalid_entries)) - for key in invalid_entries: - text += str(key) + " - " + str(custom_dict[key]) + "; " - logprint(text[:-2]+"\n") - - text = "Custom user matrix given: min %.2f, max %.2f\n" % (custom_min, custom_max) - - # artificially rounding intervals if likely identity/divergence percentages - if 0 <= custom_min < 1 and 0 < custom_max <= 1: - rounding_factor = 5 - multi_factor = 100 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (multi_factor*custom_min // rounding_factor) * (1.*rounding_factor/multi_factor)) - custom_max = min((multi_factor*custom_max // rounding_factor) * (1.*rounding_factor/multi_factor), 1) - text += "new (%.2f, %2f)\n" % (custom_min, custom_max) - - elif 0 <= custom_min < 100 and 0 < custom_max <= 100: - rounding_factor = 5 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (custom_min // rounding_factor) * rounding_factor) - 
custom_max = min((custom_max // rounding_factor) * rounding_factor, 100) - text += "new (%d, %d)\n" % (custom_min, custom_max) - - logprint(text) - - else: - custom_shading = False - - name_graph = "Polydotplot" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if custom_shading: - suffix += "_matrix" - if lcs_shading: - suffix += "_%dshades_ref%d_ori%s" % (lcs_shading_num+1, lcs_shading_ref, lcs_shading_ori) - if "ref2" in suffix and type_nuc: - suffix = suffix.replace("ref2", "%dbp" % lcs_shading_interval_len) - elif "ref2" in suffix: - suffix = suffix.replace("ref2", "%daa" % lcs_shading_interval_len) - - - # name and create output files (names derived from SEQNAME) - if prefix != None and str(prefix) != "": - prefix = str(prefix) + "-" - else: - prefix = "" - - # preparations for background shading - if lcs_shading or custom_shading: - # create color range white to grey - colors = create_color_list(lcs_shading_num+1, color_map=None, logging=True) - colors_2 = create_color_list(lcs_shading_num+1, color_map="OrRd", logging=True) - - if custom_shading: - text = "Custom Matrix Colors: " + ", ".join(colors_2) - - # write lcs lengths to file - lcs_data_file = open("%sPolydotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - # compare sequences pairwise - save lcs and line information in dictionary for plotting - data_dict = {} # keys = tuple(idx, jdx), value = x1, y1, x2, y2 (line positions) - lcs_dict = {} # keys = tuple(idx, jdx), value = length of lcs: lcs_len or (lcs_for, lcs_rev) - for_lcs_set = set([]) # keep lengths to calculate max (excluding self comparisons) - rev_lcs_set = set([]) # keep lengths to calculate max (all) - - text = "\nTotal plot count: %d" % 
(len(sequences)*(len(sequences))) - text += "\nTotal calculations: %d" % (len(sequences)*(len(sequences)+1)/2) - logprint(text, start=False, printing=True) - - print "\nCalculating shared regions and lengths of longest_common_substring...", - log_txt = "\nCalculating shared regions and lengths of longest_common_substring..." - # determine matches and length of lcs by comparing all sequence pairs - if verbose: - seq_text = "" - counter = 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif len(sequences) < 5: - print "\t%s (%d %s), %s (%d %s)" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - log_txt += "\t%s (%d %s), %s (%d %s)\n" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - else: - if not counter % 25: - print counter, - log_txt += str(counter) - - # get positions of matches & length of longest common substring based on match lengths - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - data_dict[(idx, jdx)] = x1[:], y1[:], x2[:], y2[:] - lcs_dict[idx, jdx] = lcs_for, lcs_rev - - if idx != jdx: - 
for_lcs_set.add(lcs_for) - rev_lcs_set.add(lcs_rev) - - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - if not verbose: - print len(sequences)*(len(sequences)+1)/2, " done\n" - log_txt += str(len(sequences)*(len(sequences)+1)/2) + " done\n" - else: - print "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - log_txt += "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - logprint(log_txt, start=False, printing=False) - - if verbose: - logprint ("\n\nlcs_dict\n" + str(lcs_dict)) - if custom_shading: - logprint ("\ncustom_dict\n" + str(custom_dict)) - logprint ("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) - - if verbose: - print - logprint(seq_text+"\n", start=False, printing=False) - - if lcs_shading_ref == 2: - color_bins = [] - text = "\nLCS lengh bins: " - for idx in range(lcs_shading_num): - color_bins.append(lcs_shading_interval_len*(idx+1)) - text += " " + str(lcs_shading_interval_len*(idx+1)) - logprint(text, start=False, printing=True) - - # calculate maximum lcs length - if lcs_shading_ori == 0: # forward only - if len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - elif lcs_shading_ori == 1: # reverse complement only - if len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - else: - max_lcs = None - else: # both orientations - if len(rev_lcs_set) != 0 and len(for_lcs_set) != 0: - max_lcs = max(max(rev_lcs_set), max(for_lcs_set)) - elif len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - elif len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - - if not max_lcs == None: - text = "Maximum LCS: %d %s" % (max_lcs, aa_bp_unit) - logprint(text, start=False, printing=True) - if custom_shading: - text = "Maximum custom value: %d\n" % custom_max - logprint(text, start=False, printing=True) - - # count sequences - 
ncols = len(sequences); nrows = len(sequences) - - # get sequence lengths to scale plot widths and heights accordingly - size_ratios = [] - for item in sequences: - size_ratios.append(len(seq_dict[item].seq)) - - P.cla() # clear any prior graph - # use GridSpec to resize plots according to sequence length - if mirror_y_axis: - height_ratios = size_ratios[::-1] - else: - height_ratios = size_ratios[:] - gs = gridspec.GridSpec(nrows, ncols, - width_ratios=size_ratios, - height_ratios=height_ratios) - fig = P.figure(figsize=(plot_size, plot_size)) - - # for cartesian coordinate system with mirrored y-axis: plot x labels below plot - if mirror_y_axis and representation == 1: - x_label_pos_top = True - elif mirror_y_axis or representation == 2: - x_label_pos_top = False - - # print y labels on the right, if upper right triangle is displayed - if (representation == 1 and not mirror_y_axis) or (representation == 2 and mirror_y_axis): - y_label_pos = 0 # last column - else: # left y label - y_label_pos = 1 # first column - - # determine label orientations - if len(sequences) > 5 or rotate_labels: - x_label_rotation = 45 - y_label_rotation = "horizontal" - if x_label_pos_top: - xhalign = 'left' - xvalign = 'bottom' - else: - xhalign = 'right' - xvalign = 'top' - yhalign = "right" - else: - x_label_rotation = "horizontal" - y_label_rotation = "vertical" - xvalign = "center" - xhalign = "center" - yhalign = "center" - yvalign = 'center' - - # check combination of shading parameters for triangular output - if representation != 0 and lcs_shading and custom_shading: # both directions in triangle - logprint("\nAttention: For triangular output custom-shading and LCS shading cannot be combined!\n") - elif representation != 0 and lcs_shading and lcs_shading_ori == 2: # both directions in triangle - logprint("\nAttention: For triangular output LCS shading for both orientations is combined to max of both orientations!\n") - - print "\nDrawing polydotplot...", - log_txt = "\nDrawing 
polydotplot..." - - # draw subplots - if verbose: - if lcs_shading and custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "Custom matrix value", "Matrix color index", "LCS color index"]) + "\n" - elif lcs_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "LCS color index for", "LCS color index rev"]) + "\n" - elif custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "Custom matrix value", "Color index for", "Color index rev"]) + "\n" - - if verbose: - seq_text = "" - counter, seq_counter = 0, 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - len_two = len(rec_two.seq) - name_two = rec_two.id - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - len_one = len(rec_one.seq) - name_one = rec_one.id - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - # optional shade background according to length of LCS and/or user matrix - ######################################################################### - - # get interval based on LCS - background_colors = [None, None] - if lcs_shading and (lcs_shading_ref==1 or lcs_shading_ref==2 or max_lcs!=None): # self plot max_lcs_for == None - lcs_len = lcs_dict[(idx, jdx)] - l1 = lcs_len[0] # forward - l2 = lcs_len[1] # reverse complement - - lcs_shading_bool = True - - # calculate shading acc. 
to chosen option - if lcs_shading_ref == 1: # percentage of shorter sequence - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // min(len_one, len_two)) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // min(len_one, len_two)) - elif lcs_shading_ref == 2: # by given interval size - color_idx0 = min(len(colors)-1, l1 // lcs_shading_interval_len) - color_idx1 = min(len(colors)-1, l2 // lcs_shading_interval_len) - if color_idx0 >= len(colors): - color_idx0 = len(colors) - if color_idx1 >= len(colors): - color_idx1 = len(colors) - else: # percentage of maximum lcs length - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // max_lcs) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // max_lcs) - else: - lcs_shading_bool = False - - # get interval based on custom matrix - if custom_shading: - # matrix value - try: - custom_value = custom_similarity_dict[(idx, jdx)] - except: - custom_value = "" - - # bottom left triangle = LCS forward/reverse or best of both - if lcs_shading_bool: - if lcs_shading_ori == 0: # forward - color_idx1 = color_idx0 - elif lcs_shading_ori == 2: # both directions - color_idx1 = max(color_idx0, color_idx1) - - # top right triangle = custom value (not colored if text matrix provided) - if type(custom_value) == int or type(custom_value) == float: - color_idx0 = int((custom_value-custom_min)*lcs_shading_num // (custom_max-custom_min)) - # no color if string is proviced - else: - color_idx0 = 0 - - # use best LCS of both orientations for coloring triangle with two-ori-LCS - if representation != 0 and lcs_shading_ori == 2: # both directions in triangle - color_idx0, color_idx1 = max(color_idx0, color_idx1), max(color_idx0, color_idx1) - - # set colors dependent on lcs dependent on orientation - if lcs_shading_bool and not custom_shading: - if idx != jdx: - if lcs_shading_ori == 0: - color_idx1 = color_idx0 - elif lcs_shading_ori == 1: - color_idx0 = color_idx1 - background_colors[0] = colors[color_idx0] - background_colors[1] = 
colors[color_idx1] - # for selfcomparison, only color reverse complement - elif lcs_shading_ori != 0 and not custom_shading: - background_colors[0] = colors[color_idx1] - # set different colors for shading by LCS + user matrix - elif lcs_shading_bool and custom_shading: - # print colors, background_colors, color_idx0, color_idx1 - background_colors[0] = colors_2[color_idx0] - background_colors[1] = colors[color_idx1] - # set grey color range for user matrix if no LCS shading - elif custom_shading: - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx0] - - if verbose: - if custom_shading and lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - elif lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(color_idx0), str(color_idx1)]) + "\n" - elif custom_shading: - lcs_text += "\t".join([name_one, name_two, str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - - # calculate figure position in polyplot - # diagonal (self-dotplots) - if idx == jdx: - if mirror_y_axis: - seq_num = sequences.index(name_one)+1 - counter1 = seq_num + len(sequences) * (len(sequences)-seq_num) - counter = counter + (counter - 1) // (nrows) - else: - # skip positions below diagonal - counter1 = counter + (counter - 1) // (nrows) # + row_pos - counter = counter1 - counters = [counter1] - - # draw both graphs at once (due to symmetry) - else: - if mirror_y_axis: - col_pos = sequences.index(name_two)+1 - row_pos = len(sequences) - (sequences.index(name_one)+1) - counter1 = row_pos * ncols + col_pos - counter2 = (ncols - col_pos) * ncols + ncols - row_pos - else: - counter1 = counter - col_pos = (counter - 1) % ncols - row_pos = (counter - 1) // (nrows) - counter2 = col_pos * ncols + row_pos + 1 - counters = [counter1, counter2] # lower, upper - - if len(counters) == 2: - seq_counter += 1 - if not verbose 
and not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - x_lists, y_lists, x_lists_rc, y_lists_rc = data_dict[(idx, jdx)] - - # plot diagram(s) - for kdx in range(len(counters)): - - if representation == 0 or len(counters) == 1 or (representation == 1 and kdx == 0) or (representation == 2 and kdx == 1): - - fig_pos = counters[kdx] - # plotting subplot with matplotlib - ax = P.subplot(gs[fig_pos-1]) # rows, columns, plotnumber - - # shade annotated regions if gff file(s) provided - if idx == jdx and gff_files != None and gff_files != []: - if name_one in feat_dict.keys(): - features = feat_dict[name_one] - if len_two != len_one: - logprint("Polydot GFF shading for diagonal fields - nequal length error!") - return - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(len_one+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # if custom matrix value printed into upper matrix triangle, skip data plotting - # text print in top triangle - if user_matrix_print and custom_shading and kdx==0 and idx!=jdx: - data_plotting = False - # dotplot in bottom triangle - else: - data_plotting = True - - # mirror plot, if plotting below diagonal - if kdx == 0: - l1, l2 = len_one, len_two - n1, n2 = name_one, name_two - x1, y1 = x_lists, y_lists - x2, y2 = x_lists_rc, y_lists_rc - else: - l2, l1 = len_one, len_two - n2, n1 = name_one, name_two - x1, y1 = y_lists, x_lists - x2, y2 = y_lists_rc, x_lists_rc - - if mirror_y_axis: - x1, y1, x2, y2 = y1, x1, y2, x2 - n1, n2 = n2, n1 - - if data_plotting: - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in 
range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # plot value provided by customer instead of dotplot - else: - alignment = {'horizontalalignment': 'center', 'verticalalignment': 'center'} - # P.text(0.5, 0.5, custom_value, size='medium', transform=ax.transAxes, **alignment) - P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, **alignment) - # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, - # horizontalalignment='center', verticalalignment='center', color="black") - - if custom_shading: - # omit diagonal - if idx == jdx: - ax.set_facecolor("white") - # use white background for text fields (top right triangle only [kdx 0]) - elif type(custom_value) != int and type(custom_value) != float and kdx == 0: - ax.set_facecolor("white") - else: - ax.set_facecolor(background_colors[kdx]) - # set background color if lcs shading - elif lcs_shading_bool and background_colors[kdx] != None: - ax.set_facecolor(background_colors[kdx]) - - # set axis limits - # P.xlim(0, l1+1) - if mirror_y_axis: - P.xlim(0, l2+1) - P.ylim(0, l1+1) # rotate y axis (point upwards) - else: - P.xlim(0, l1+1) - P.ylim(l2+1, 0) # rotate y axis (point downwards) - - ## axis labelling - ################## - - # determine axis positions - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - x_label_bool = fig_pos <= ncols - x_tick_bool = fig_pos > ncols*(ncols-1) - else: - x_label_bool = fig_pos > ncols*(ncols-1) - x_tick_bool = fig_pos <= ncols - - # settings for y labels on right side - if y_label_pos == 0: # right label - ax.yaxis.tick_right() - ax.yaxis.set_label_position("right") - label_dist = 30 - else: - label_dist = 8 - - # x axis labels dependent on plot position/number - if 
x_label_bool: # x title and labels on top or bottom - P.xlabel(unicode_name(shorten_name(n1, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, rotation=x_label_rotation, verticalalignment=xvalign, horizontalalignment=xhalign, fontweight='bold', labelpad=8) # axis naming - if not x_label_rotation in ["horizontal", "vertical"]: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation="vertical") - else: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation=x_label_rotation) - elif x_tick_bool and x_label_pos_top: # x ticks on bottom row - ax.xaxis.tick_bottom() # ticks without labels on bottom - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) - elif x_tick_bool: # x ticks on top row - ax.xaxis.tick_top() # # ticks without labels on top - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) # inner diagrams without labelling - elif idx == jdx and representation != 0: - if not mirror_y_axis and representation == 1: # upper - ax.xaxis.tick_bottom() - elif mirror_y_axis and representation == 2: # lower - ax.xaxis.tick_top() - elif mirror_y_axis and representation == 1: # upper - ax.xaxis.tick_bottom() - elif not mirror_y_axis and representation == 2: # lower - ax.xaxis.tick_top() - P.setp(ax.get_xticklabels(), visible=False) # inner diagrams without labelling - else: # no x ticks on internal rows - ax.axes.get_xaxis().set_visible(False) - - # y axis labels dependent on plot position/number - if fig_pos % ncols == y_label_pos or (ncols == 1 and nrows == 1): # y title and labels in 1st column - P.ylabel(unicode_name(shorten_name(n2, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, rotation=y_label_rotation, verticalalignment=yvalign, horizontalalignment=yhalign, fontweight='bold', labelpad=label_dist) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) # axis naming - elif fig_pos % ncols == 0: # y ticks in last column 
- ax.yaxis.tick_right() - P.setp(ax.get_yticklabels(), visible=False) # inner diagrams without labelling - elif idx == jdx and representation != 0: - if not mirror_y_axis and representation == 1: # upper - ax.yaxis.tick_left() - elif mirror_y_axis and representation == 2: # lower - ax.yaxis.tick_left() - elif mirror_y_axis and representation == 1: # upper - ax.yaxis.tick_right() - elif not mirror_y_axis and representation == 2: # lower - ax.yaxis.tick_right() - P.setp(ax.get_yticklabels(), visible=False) # inner diagrams without labelling - else: - ax.axes.get_yaxis().set_visible(False) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - try: - logprint(lcs_text, start=False, printing=True) - except: - pass - - # finalize layout - margins & spacing between plots - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - # gs.tight_layout(fig, h_pad=.02, w_pad=.02) # less overlapping tick labels, but also disturbingly large spacing - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, top=0.87) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, bottom=0.13) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing) # space between rows - def 0.4 - - # save figure and close instance - fig_name = '%s%s_wordsize%i%s.%s' % (prefix, name_graph, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - - # create figure color legend - if lcs_shading: - if lcs_shading_ref == 1: # percentage of shorter sequence - legend_file_name = legend_figure(colors, lcs_shading_num, unit="%", filetype=filetype, prefix=prefix) - elif lcs_shading_ref == 2: # interval sizes - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, bins=color_bins) - else: # relative of maximum lcs - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, max_lcs_len=max_lcs) - - if custom_shading: - custom_prefix = "custom-matrix-" + prefix - legend_file_name_custom = legend_figure(colors_2, lcs_shading_num, unit="%", filetype=filetype, prefix=custom_prefix, max_lcs_len=custom_max, min_lcs_len=custom_min) - - if lcs_shading and custom_shading: - return [fig_name, legend_file_name, legend_file_name_custom] - elif lcs_shading: - return [fig_name, legend_file_name] - elif custom_shading: - return [fig_name, legend_file_name_custom] - else: - return [fig_name] - - -############################### -# Function Call # -############################### - -def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_size=10, filetype="png", type_nuc=True, convert_wobbles=False, 
substitution_count=0, rc_option=True, alphabetic_sorting=False, gff=None, multi=True, ncols=1, nrows=1, lcs_shading=True, lcs_shading_num=5, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, gff_color_config_file="", input_user_matrix_file="", user_matrix_print=False, length_scaling=True, title_length=50, title_clip_pos="B", spacing=0.04, max_N_percentage=49, mirror_y_axis=False, verbose=False): - - global t1, line_col_rev - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage = 49 - if type_nuc: - ambiq_res = "N" - else: - ambiq_res = "X" - text = "Provide valid max_N_percentage, kmers with >50%% %ss are ignored\n" % (ambiq_res) - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given:%s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - # read gff color config file if provided - if len(input_gff_files) != 0 and input_gff_files != None: - if gff_color_config_file not in ["", None]: - text = "\n%s\n\nReading GFF color configuration file\n%s\n\n=> %s\n" % (50*"=", 28*"-", gff_color_config_file) - logprint(text, start=False, printing=True) - gff_feat_colors = read_gff_color_config(gff_color_config_file) - else: - gff_feat_colors = {} - if gff_color_config_file not in ["", None]: - text = "Please provide GFF annotation files to use configuration file", gff_color_config_file - logprint(text, start=False, printing=True) - - # if color is set to white, reverse complementary matches are skipped - if not rc_option: - line_col_rev = "white" # reverse matches not calculated - elif not type_nuc: - logprint("Reverse complement deactivated for proteins!") - line_col_rev = "white" # reverse matches not calculated - - mode_text = [] - for item in modes: - mode_text.append(str(item)) - text = "%s\n\nRunning plotting modes %s" % (50*"=", ", ".join(mode_text)) - logprint(text, start=False, 
printing=True) - - - # create dotplots - ########################################## - - # self dotplots - t1 = time.time() - if 0 in modes: - list_of_png_names = selfdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, gff_files=gff, gff_color_dict=gff_feat_colors, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # paired dotplots - if 1 in modes: - if multi: - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=length_scaling, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - else: - if not length_scaling: - text = "\nPairwise dotplot with individual output files scaled by sequence length automatically!" 
- logprint(text, start=False, printing=True) - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, length_scaling=True, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # all-against-all dotplot - if 2 in modes: - list_of_png_names = polydotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, spacing=spacing, gff_files=gff, gff_color_dict=gff_feat_colors, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - text = "\n" + 50 * "#" + "\n" + 50 * "#" - text += "\n\nThank you for using FlexiDot!\n" - logprint(text, start=False, printing=True) - -# testing mode for debugging -trial_mode = False 
-#trial_mode = True - -# parameters = check_input(sys.argv) -parameters = check_input(sys.argv, trial_mode=trial_mode) - -# read out parameters -commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype, type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos_top, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose = parameters - -# evtl. overwrite parameters for testing purposes in trial mode -if trial_mode: - input_fasta = ["test-sequences-8.fas"] - input_gff_files = ["Seq2_annotations.gff3"] - # input_user_matrix_file = "matrix.txt" - # user_matrix_print = True - output_file_prefix = "#Test" - plot_size = 10 - plotting_modes = [0,1,2] - plotting_modes = [2] - lcs_shading = False - lcs_shading = True - lcs_shading_ref = 2 - lcs_shading_num = 4 - lcs_shading_ori = 0 - lcs_shading_interval_len = 15 - wordsize = 10 - wordsize = 7 - x_label_pos_top = True - filetype = "pdf" - filetype = "png" - mirror_y_axis = False - mirror_y_axis = True - - output_file_prefix = "#R-upper" - representation = 0 # both - representation = 1 # upper - representation = 2 # lower - - wobble_conversion = False - wobble_conversion = True - - substitution_count = 0 - - rc_option = True - rc_option = False - label_size = 10 - - verbose = False - verbose = True - -if auto_fas: - path = os.path.dirname(os.path.abspath(__file__)) - files_long = glob.glob(path+"/*.fasta") - files_long.extend(glob.glob(path+"/*.fas")) - files_long.extend(glob.glob(path+"/*.fa")) - files_long.extend(glob.glob(path+"/*.fna")) - input_fasta = [] - for i in files_long: - if not "combined" in i: - filename = i[i.rfind('\\')+1:] - 
input_fasta.append(filename) - -if trial_mode: - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - -main(input_fasta, wordsize, modes=plotting_modes, prefix=output_file_prefix, plot_size=plot_size, label_size=label_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=wobble_conversion, substitution_count=substitution_count, rc_option=rc_option, gff=input_gff_files, multi=collage_output, ncols=m_col, nrows=n_row, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, gff_color_config_file=gff_color_config_file, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, length_scaling=length_scaling, title_length=title_length, title_clip_pos=title_clip_pos, spacing=spacing, max_N_percentage=max_N_percentage, mirror_y_axis=mirror_y_axis, verbose=verbose) - - diff --git a/code/flexidot_v1.05.py b/code/flexidot_v1.05.py deleted file mode 100644 index f326726..0000000 --- a/code/flexidot_v1.05.py +++ /dev/null @@ -1,3390 +0,0 @@ -#!/usr/bin/python2.7 -# -*- coding: utf-8 -*- - -""" -FlexiDot Version 1.05 - -FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation - -Kathrin M. Seibt, Thomas Schmidt and Tony Heitkam -Institute of Botany, TU Dresden, Dresden, 01277, Germany - -Bioinformatics (2018) Vol. 
34 (20), 3575–3577, doi 10.1093/bioinformatics/bty395 -""" - - -############################### -# Requirements # -############################### - -# import system modules -import os, glob -import time, datetime -import sys -import shutil, getopt -import unicodedata - -def module_install_command(module_name, upgrade=False): - """ - create installation commands for Python modules and print information - """ - if upgrade: - load_command = "python -m pip install --upgrade %s" % module_name - else: - load_command = "python -m pip install %s" % module_name - - try: - logprint("Installing Python module: %s\n\t%s\n" % (module_name, load_command)) - except: - print "Installing Python module: %s\n\t%s\n" % (module_name, load_command) - - return load_command - -def load_modules(): - """ - load Python modules, if possible - otherwise try to install them - """ - # make module names global - global cllct, gridspec, patches, rcParams, mplrc, P, Color, SeqIO, np, mcolors, rgb2hex, regex - - # matplotlib - try: - import matplotlib.collections as cllct - except: - command = module_install_command("matplotlib", upgrade=True) - try: - os.system(command) - print "\n" - import matplotlib.collections as cllct - except: - print "Please install module matplotlib manually" - import matplotlib.colors as mcolors - import matplotlib.gridspec as gridspec - import matplotlib.patches as patches - import pylab as P - P.switch_backend('agg') # bugfix for _tkinter.TclError on CentOs 7 servers, see Github Issue #5 - - # specify matplotlib font settings - from matplotlib import rc as mplrc - mplrc('pdf', fonttype=42, compression=0) - from matplotlib import rcParams - rcParams['font.family'] = 'sans-serif' - rcParams['font.sans-serif'] = ['Helvetica', 'Verdana', 'Tahoma' , 'DejaVu Sans', 'Droid Sans Mono', 'Sans', 'Liberation', 'Ubuntu', 'Arial', ] - - # colour for color gradient palette - try: - from colour import Color - except: - command = module_install_command("colour") - try: - 
os.system(command) - print "\n" - from colour import Color - except: - print "Please install module colour manually" - - # color converter - try: - from colormap import rgb2hex - except: - command = module_install_command("colormap") - # additional module easydev.tools required by colormap - command2 = module_install_command("easydev") - try: - os.system(command) - os.system(command2) - print "\n" - from colormap import rgb2hex - except: - print "Please install module colormap manually" - - # biopython - try: - from Bio import SeqIO - except: - command = module_install_command("biopython") - try: - os.system(command) - print "\n" - from Bio import SeqIO - except: - print "Please install module biopython manually" - - # numpy - try: - import numpy as np - except: - command = module_install_command("numpy") - try: - os.system(command) - print "\n" - import numpy as np - except: - print "Please install module numpy manually" - - # regex for pattern matching - try: - import regex - except: - command = module_install_command("regex") - try: - os.system(command) - print "\n" - import regex - except: - print "Please install module regex manually" - - - -############################### -# Usage & Input # -############################### - -def usage(): - """ - usage and help - """ - - print """\n\n FLEXIDOT - ------------------------------------------------------------------- - - Version: - 1.05 - - Citation: - Kathrin M. Seibt, Thomas Schmidt, Tony Heitkam (2018) - "FlexiDot: Highly customizable ambiguity-aware dotplots for visual sequence investigation" - Bioinformatics 34 (20), 3575–3577, doi: 10.1093/bioinformatics/bty395 - - - General usage: - $ python flexidot.py -a [ARGUMENTS] - $ python flexidot.py -i [ARGUMENTS] - - - ARGUMENTS - ------------------------------------------------------------------- - - - INPUT/OUTPUT OPTIONS... 
required are [-a] OR [-i] - - -a, --auto_fas Imports all fasta files from current directory (*.fasta, *.fas, *.fa, *.fna) - -i is not needed, if -a is activated - [inactive by default] - - -i, --in_file Input fasta file (fasta file name or comma-separated file list) - > Provide multiple files: Recall -i or provide comma-separated file names - - -o, --output_file_prefix File prefix to be added to the generated filenames [default = NONE] - - -c, --collage_output Multiple dotplots are combined in a collage - Y or 1 = ON [default] - N or 0 = OFF - - -m, --m_col Number of columns per page [default = 4] (only if --collage_output is ON) - - -n, --n_row Number of rows per page [default = 5] (only if --collage_output is ON) - - -f, --filetype Output file format - 0 = PNG [default] - 1 = PDF - 2 = SVG - - -s, --alphabetic_sorting Sort sequences alphabetically according to titles - Y or 1 = ON - N or 0 = OFF [default] - - - CALCULATION PARAMETERS... - - -k, --wordsize Wordsize (kmer length) for dotplot comparison [default = 10] - - -p, --plotting_mode Mode of FlexiDot dotplotting - 0 = self [default] - 1 = paired - 2 = poly (matrix with all-against-all dotplots) - > Run multiple plotting modes: Recall -p or provide comma-separated numbers - - -t, --type_nuc Type of residue is nucleotide - Y or 1 = nucleotide [default] - N or 0 = amino acid - - -w, --wobble_conversion Ambiguity handling for relaxed matching - Y or 1 = ON - N or 0 = OFF [default] - - -S, --substitution_count Number of substitutions (mismatches) allowed per window for relaxed matching - [default = 0] - - -r, --rc_option Find reverse complementary matches (only if type_nuc=y) - Y or 1 = ON [default] - N or 0 = OFF - - -O, --only_vs_first_seq Limit pairwise comparisons to match all sequences to 1st sequence only - (only if --plotting_mode=1) - Y or 1 = ON - N or 0 = OFF [default] - - GRAPHIC FORMATTING... 
- - -A, --line_width Line width [default = 1] - - -B, --line_col_for Line color [default = black] - - -C, --line_col_rev Reverse line color [default = green] - - -D, --x_label_pos Position of the X-label - Y or 1 = top [default] - N or 0 = bottom - - -E, --label_size Font size [default = 10] - - -F, --spacing Spacing between all-against-all dotplots (only if --plotting_mode=2) - [default = 0.04] - - -L, --length_scaling Scale plot size for pairwise comparison (only if --plotting_mode=1) - Y or 1 = Scaling ON (axes scaled according to sequence length) - N or 0 = Scaling OFF (squared plots) [default] - - -M, --mirror_y_axis Flip y-axis bottom to top (cartesian coordinate system) - Y or 1 = y-axis bottom to top - N or 0 = y-axis top to bottom [default] - - -P, --plot_size Plotsize [default = 10] - - -R, --representation Region of plot to display (only if --plotting_mode=2) - 0 = full [default] - 1 = upper - 2 = lower - - -T, --title_length Limit title length for dotplot comparisons - [default = 20] - Position of selection can be specified by appending a letter (e.g. -T 20E) - B = beginning [default] - E = end - - - GFF SHADING (for -p/--plotting_mode=0 only)... - - -g, --input_gff_files GFF3 file used for markup in self-dotplots - (provide multiple files: Recall -g or provide comma-separated file names) - - -G, --gff_color_config_file Tab-delimited config file for custom gff shading - column 1: feature type - column 2: color - column 3: alpha - column 4: zoom factor (for small regions) - - - LCS SHADING OPTIONS (for -p/--plotting_mode=2 only)... 
- - -x, --lcs_shading Shade subdotplot based on the length of the longest common substring (LCS) - Y or 1 = ON - N or 0 = OFF [default] - - -X, --lcs_shading_num Number of shading intervals (hues) for LCS (-x) and user matrix shading (-u) - [default = 5] - - -y, --lcs_shading_ref Reference for LCS shading - 0 = maximal LCS length [default] - 1 = maximally possible length (length of shorter sequence in pairwise comparison) - 2 = given interval sizes - DNA [default 100 bp] or proteins [default 10 aa] - see -Y - - -Y, --lcs_shading_interval_len Length of intervals for LCS shading (only if --lcs_shading_ref=2) - [default for nucleotides = 50; default for amino acids = 10] - - -z, --lcs_shading_ori Shade subdotplots according to LCS on - 0 = forward [default], - 1 = reverse, or - 2 = both strands (forward shading above diagonal, reverse shading on diagonal and below; - if using --input_user_matrix_file, best LCS is used below diagonal) - - - CUSTOM USER MATRIX SHADING OPTIONS (for -p/--plotting_mode=2 only)... - - -u, --input_user_matrix_file Shading above diagonal according to values in matrix file specified by the user - (tab-delimited or comma-separated matrix with sequence name in column 1 and numbers in columns 2-n - e.g. identity matrix from multiple sequence alignment - strings are ignored) - - -U, --user_matrix_print Display provided matrix entries in the fields above diagonal of all-against-all dotplot - Y or 1 = ON - N or 0 = OFF [default] - - - OTHERS... 
- - -h, --help Help screen - - -v, --verbose Verbose - - - - - """ - -def check_input(argv, trial_mode=False): - """ - commandline argument parsing - """ - - global log_txt, aa_bp_unit - - # helpers for argument parsing - ###################################### - - arguments = ["-a", "--auto_fas", "a", "auto_fas", - "-i", "--input_fasta", "i:", "input_fasta=", - "-o", "--output_file_prefix", "o:", "output_file_prefix=", - "-c", "--collage_output", "c:", "collage_output=", - "-m", "--m_col", "m:", "m_col=", - "-n", "--n_row", "n:", "n_row=", - "-f", "--filetype", "f:", "filetype=", - "-t", "--type_nuc", "t:", "type_nuc=", - "-g", "--input_gff_files", "g:", "input_gff_files", - "-G", "--gff_color_config_file", "G:", "gff_color_config_file", - "-k", "--wordsize", "k:", "wordsize=", - "-p", "--plotting_mode", "p:", "plotting_mode=", - "-w", "--wobble_conversion", "w:", "wobble_conversion=", - "-S", "--substitution_count", "S:", "substitution_count=", - "-r", "--rc_option", "r:", "rc_option=", - "-O", "--only_vs_first_seq", "O:", "only_vs_first_seq=", - "-s", "--alphabetic_sorting", "s:", "alphabetic_sorting=", - "-x", "--lcs_shading", "x:", "lcs_shading=", - "-X", "--lcs_shading_num", "X:", "lcs_shading_num=", - "-y", "--lcs_shading_ref", "y:", "lcs_shading_ref=", - "-Y", "--lcs_shading_interval_len", "Y:", "lcs_shading_interval_len=", - "-z", "--lcs_shading_ori", "z:", "lcs_shading_ori=", - "-u", "--input_user_matrix_file", "u:", "input_user_matrix_file=", - "-U", "--user_matrix_print", "U:", "user_matrix_print=", - "-P", "--plot_size", "P:", "plot_size=", - "-A", "--line_width", "A:", "line_width=", - "-B", "--line_col_for", "B:", "line_col_for=", - "-C", "--line_col_rev", "C:", "line_col_rev=", - "-D", "--x_label_pos", "D:", "x_label_pos=", - "-E", "--label_size", "E:", "label_size=", - "-F", "--spacing", "F:", "spacing=", - "-L", "--length_scaling", "L:", "length_scaling=", - "-M", "--mirror_y_axis", "M:", "mirror_y_axis=", - "-R", "--representation", "R:", 
"representation=", - "-T", "--title_length", "T:", "title_length=", - "-h", "--help", "h", "help", - "-v", "--verbose", "v", "verbose"] - - arguments_sysargv = tuple(arguments[0::4] + arguments[1::4]) - arguments_opts = "".join(arguments[2::4]) - arguments_args = arguments[3::4] - - - # setting defaults - ###################################### - - auto_fas = False # 0 - input_fasta = [] - output_file_prefix = None - collage_output = True # 1 - m_col = 4 - n_row = 5 - filetype = 0 - type_nuc = True - input_gff_files = [] - gff_color_config_file = "" - - wordsize = 10 - plotting_modes = [0] - wobble_conversion = False # 0 - substitution_count = 0 - rc_option = True # 1 - alphabetic_sorting = False # 0 - only_vs_first_seq = False # 0 - - lcs_shading = False # 0 - lcs_shading_num = 4 - lcs_shading_ref = 0 - lcs_shading_interval_len = 50 # interval default changes to "10" for amino acids [type_nuc = n] - lcs_shading_ori = 0 - - input_user_matrix_file = "" - user_matrix_print = False - - plot_size = 10 - line_width = 1 - line_col_for = "black" - line_col_rev = "#009243" - x_label_pos = True # 0 - label_size = 10 - spacing = 0.04 - length_scaling = False # 0 - title_length = 20 # float("Inf") - title_clip_pos = "B" # B (begin), E (end) - max_N_percentage = 49 # fixed value, no user input - mirror_y_axis = False - representation = 0 - - aa_bp_unit = "bp" - - verbose = False # 0 - - filetype_dict = {0: "png", 1: "pdf", 2: "svg"} - lcs_shading_ref_dict = {0: "maximal LCS length", 1: "maximally possible length", 2: "given interval sizes"} - plotting_mode_dict = {0: "self", 1: "paired", 2: "all-against-all"} - lcs_shading_ori_dict = {0: "forward", 1: "reverse complement", 2: "both"} - representation_dict = {0: "full", 1: "upper", 2: "lower"} - - # return default parameters for testing purposes - if trial_mode: - print "ATTENTION: YOU ARE IN THE TRIAL MODE!!!\n\n" - - commandline = "trial_mode\n" - - parameters = [commandline, auto_fas, input_fasta, output_file_prefix, 
collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose] - return parameters - - - # read arguments - ###################################### - - commandline = "" - for arg in sys.argv: - commandline += arg + " " - - log_txt = "\n...reading input arguments..." - print log_txt - - if len(sys.argv) < 2: - print "\nERROR: More arguments are needed. Exit..." - log_txt += "\nERROR: More arguments are needed. Exit..." - usage() - sys.exit() - - elif sys.argv[1] not in arguments_sysargv: - print "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - log_txt += "\nINPUT ERROR: Input argument %s unknown. Please check the help screen." % sys.argv[1] - # usage() - sys.exit() - - try: - opts, args = getopt.getopt(sys.argv[1:], arguments_opts, arguments_args) - - except getopt.GetoptError: - print "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." % sys.argv[1:] - log_txt += "\nINPUT ERROR (getopt): Input argument %s unknown. Please check the help screen." 
% sys.argv[1:] - # usage() - sys.exit() - - for opt, arg in opts: - - if opt in ("-h", "--help"): - print "...fetch help screen" - log_txt += "\n...fetch help screen" - usage(), sys.exit() - - if opt in ("-v", "--verbose"): - print "...verbose output" - log_txt += "\n...verbose output" - verbose = True - - elif opt in ("-i", "--input_fasta"): - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: fasta_file '%s' was not found!" % str(temp_file) - sys.exit(message) - else: - input_fasta.append(str(temp_file)) - print "fasta file #%i: %s" % (len(input_fasta), str(temp_file)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: fasta_file '%s' was not found!" % str(arg) - log_txt += message - sys.exit(message) - else: - input_fasta.append(str(arg)) - print "fasta file #%i: %s" % (len(input_fasta), str(arg)) - log_txt += "\nfasta file #%i: %s" % (len(input_fasta), str(arg)) - - - elif opt in ("-a", "--auto_fas"): - auto_fas = True - - - # multiple gff files: reads them into a list - elif opt in ("-g", "--input_gff_files"): - - # append gff file only if existing - if "," in arg: - arg_list = arg.split(",") - for temp_file in arg_list: - if not os.path.exists(str(temp_file)): - message = "\nERROR: gff_file '%s' was not found!" % str(temp_file) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" - log_txt += "\n -->Running FlexiDot without this gff file!" - else: - print "GFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(temp_file)) - input_gff_files.append(str(temp_file)) - else: - if not os.path.exists(str(arg)): - message = "\nERROR: gff_file '%s' was not found!" % str(arg) - print message - log_txt += message - print " -->Running FlexiDot without this gff file!" 
- log_txt += "\n -->Running FlexiDot without this gff file!" - else: - input_gff_files.append(str(arg)) - print "GFF file #%i: %s" %(len(input_gff_files), str(arg)) - log_txt += "\nGFF file #%i: %s" %(len(input_gff_files), str(arg)) - - - elif opt in ("-G", "--gff_color_config_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: gff_color_config_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot with default gff coloring specification!" - log_txt += message + "\n -->Running FlexiDot with default gff coloring specification!" - else: - gff_color_config_file = str(arg) - - - elif opt in ("-u", "--input_user_matrix_file"): - if not os.path.exists(str(arg)): - message = "\nERROR: input_user_matrix_file '%s' was not found!" % str(arg) - print message + "\n -->Running FlexiDot without input_user_matrix_file %s!" % arg - log_txt += message + "\n -->Running FlexiDot withdefault matrix shading file!" - else: - input_user_matrix_file = str(arg) - - elif opt in ("-U", "--user_matrix_print"): - user_matrix_print = check_bools(str(arg), default=user_matrix_print) - - elif opt in ("-o", "--output_file_prefix"): - output_file_prefix = arg - - elif opt in ("-c", "--collage_output"): - collage_output = check_bools(str(arg), default=collage_output) - - elif opt in ("-m", "--m_col"): - try: m_col = int(arg) - except: - print "m_col - invalid argument - using default value" - log_txt += "\nm_col - invalid argument - using default value" - - elif opt in ("-n", "--n_row"): - try: n_row = int(arg) - except: - print "n_row - invalid argument - using default value" - log_txt += "\nn_row - invalid argument - using default value" - - elif opt in ("-f", "--filetype"): - if 0 <= int(arg) <= 2: - filetype = int(arg) - else: - print "\nERROR: Please provide valid filetype argument. %s is out of range. It will be set to -f 0 [default]." %(filetype) - log_txt += "\nERROR: Please provide valid filetype argument. %s is out of range. 
It will be set to -f 0 [default]." %(filetype) - - elif opt in ("-t", "--type_nuc"): - type_nuc = check_bools(str(arg), default=type_nuc) - - if type_nuc == False: - # interval default changed for amino acids - lcs_shading_interval_len = 10 - aa_bp_unit = "aa" - - elif opt in ("-k", "--wordsize"): - try: wordsize = int(arg) - except: - print "wordsize - invalid argument - using default value" - log_txt += "\nwordsize - invalid argument - using default value" - - elif opt in ("-p", "--plotting_mode"): - if "," in arg: - temp_modes = arg.split(",") - for item in temp_modes: - if item in ["0","1","2"]: - plotting_modes.append(int(item)) - elif arg in ["0","1","2"]: - plotting_modes = [int(arg)] - else: - print "Please provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" - log_txt += "\nPlease provide valid plotting_modes argument - e.g. 1 or 0,1,2 - using default [0]" - - elif opt in ("-w", "--wobble_conversion"): - wobble_conversion = check_bools(str(arg), default=wobble_conversion) - - elif opt in ("-S", "--substitution_count"): - try: substitution_count = int(arg) - except: - print "substitution_count - invalid argument - using default value" - log_txt += "\nsubstitution_count - invalid argument - using default value" - - elif opt in ("-r", "--rc_option"): - rc_option = check_bools(str(arg), default=rc_option) - - elif opt in ("-s", "--alphabetic_sorting"): - alphabetic_sorting = check_bools(str(arg), default=alphabetic_sorting) - - elif opt in ("-O", "--only_vs_first_seq"): - only_vs_first_seq = check_bools(str(arg), default=only_vs_first_seq) - - elif opt in ("-x", "--lcs_shading"): - lcs_shading = check_bools(str(arg), default=lcs_shading) - - elif opt in ("-X", "--lcs_shading_num"): - try: lcs_shading_num = int(arg) - 1 - except: - print "lcs_shading_num - invalid argument - using default value" - log_txt += "\nlcs_shading_num - invalid argument - using default value" - - elif opt in ("-y", "--lcs_shading_ref"): - try: - if 0 <= int(arg) 
<= 2: - lcs_shading_ref = int(arg) - else: - print "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - log_txt += "\nERROR: lcs_shading_ref %s out of range. It will be set to -y 0 [default]." %(lcs_shading_ref) - except: - print "lcs_shading_ref - invalid argument - using default value" - log_txt += "\nlcs_shading_ref - invalid argument - using default value" - - elif opt in ("-Y", "--lcs_shading_interval_len"): - try: lcs_shading_interval_len = int(arg) - except: - print "lcs_shading_interval_len - invalid argument - using default value" - log_txt += "\nlcs_shading_interval_len - invalid argument - using default value" - - elif opt in ("-z", "--lcs_shading_ori"): - if 0 <= int(arg) <= 2: - lcs_shading_ori = int(arg) - else: - print "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." %(lcs_shading_ori) - log_txt += "\nERROR: Please provide valid lcs_shading_ori argument. %s is out of range. It will be set to -z 0 [default]." 
%(lcs_shading_ori) - - elif opt in ("-P", "--plot_size"): - try: plot_size = float(arg) - except: - print "plot_size - invalid argument - using default value" - log_txt += "\nplot_size - invalid argument - using default value" - - - elif opt in ("-A", "--line_width"): - try: line_width = float(arg) - except: - print "line_width - invalid argument - using default value" - log_txt += "\nline_width - invalid argument - using default value" - - elif opt in ("-B", "--line_col_for"): - if mcolors.is_color_like(arg): - line_col_for = arg - else: - print "line_col_for - invalid argument - using default value" - log_txt += "\nline_col_for - invalid argument - using default value" - - elif opt in ("-C", "--line_col_rev"): - if mcolors.is_color_like(arg): - line_col_rev = arg - else: - print "line_col_rev - invalid argument - using default value" - log_txt += "\nline_col_rev - invalid argument - using default value" - - elif opt in ("-D", "--x_label_pos"): - x_label_pos = check_bools(str(arg), default=x_label_pos) - - elif opt in ("-E", "--label_size"): - try: label_size = float(arg) - except: - print "label_size - invalid argument - using default value" - log_txt += "\nlabel_size - invalid argument - using default value" - - elif opt in ("-F", "--spacing"): - try: spacing = float(arg) - except: - print "spacing - invalid argument - using default value" - log_txt += "\nspacing - invalid argument - using default value" - - elif opt in ("-L", "--length_scaling"): - length_scaling = check_bools(str(arg), default=length_scaling) - - elif opt in ("-M", "--mirror_y_axis"): - mirror_y_axis = check_bools(str(arg), default=mirror_y_axis) - - elif opt in ("-R", "--representation"): - if 0 <= int(arg) <= 2: - representation = int(arg) - else: - print "\nERROR: Please provide valid representation argument. %s is out of range. It will be set to -R 0 [default]." %(representation) - log_txt += "\nERROR: Please provide valid representation argument. %s is out of range. 
It will be set to -R 0 [default]." %(representation) - - elif opt in ("-T", "--title_length"): - try: title_length = int(arg) - except: - try: - title_length = int(str(arg)[:-1]) - if arg[-1].upper() in ["B", "E"]: # B (beginning), E (end) - title_clip_pos = arg[-1].upper() - else: - print "title_length position information invalid - using default value" - log_txt += "\ntitle_length position information invalid - using default value" - except: - print "title_length - invalid argument - using default value" - log_txt += "\ntitle_length - invalid argument - using default value" - - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - logprint(log_txt, start=False, printing=False) - - - # print chosen arguments - ###################################### - - text = "\n%s\n" % (70 * "-") - text += "\n" + "INPUT/OUTPUT OPTIONS...\n" - text += "\n" + "Input fasta file: " + ", ".join(input_fasta) - text += "\n" + "Automatic fasta collection from current directory: " + str(auto_fas) - text += "\n" + "Collage output: " + str(collage_output) - text += "\n" + "Number of columns per page: " + str(m_col) - text += "\n" + "Number of rows per page: " + str(n_row) - text += "\n" + "File format: " + filetype_dict[filetype] - text += "\n" + "Residue type is nucleotide: " + str(type_nuc) - - text += "\n" + "\n\nCALCULATION PARAMETERS...\n" - text += "\n" + "Wordsize: " + str(wordsize) - text += "\n" + "Sustitution count: " + str(substitution_count) - text += "\n" + "Plotting mode: " + str(plotting_modes).replace("[", "").replace("]", "") + "\n" + 51 * " " - for item in plotting_modes: - text += plotting_mode_dict[item] + " " - text += "\n" + "Ambiguity handling: " + str(wobble_conversion) - text += "\n" + "Reverse complement scanning: " + str(rc_option) - text += "\n" + "Alphabetic sorting: " + str(alphabetic_sorting) - - if 1 in plotting_modes: - text += "\n" + "Only matching sequences to first entry: " + str(only_vs_first_seq) - - if 0 
in plotting_modes and input_gff_files != []: - text += "\n" + "Input gff files: " + ", ".join(input_gff_files) - if gff_color_config_file != "": - text += "\n" + "GFF color config file: " + gff_color_config_file - text += "\n" + "Prefix for output files: " + str(output_file_prefix) - - if 2 in plotting_modes: - text += "\n" + "\n\nLCS SHADING OPTIONS (plotting_mode 'all-against-all' only)...\n" - text += "\n" + "LCS shading: " + str(lcs_shading) - text += "\n" + "LCS shading interval number: " + str(lcs_shading_num + 1) - text += "\n" + "LCS shading reference: " + lcs_shading_ref_dict[lcs_shading_ref] - if lcs_shading_ref == 2: - text += "\n" + "LCS shading interval size [%s]: " % (aa_bp_unit) + str(lcs_shading_interval_len) - text += "\n" + "LCS shading orientation: " + lcs_shading_ori_dict[lcs_shading_ori] - if input_user_matrix_file != "": - text += "\n" + "Custom user shading matrix file: " + input_user_matrix_file - text += "\n" + "Print user matrix values (instead of dotplot): " + str(user_matrix_print) - text += "\n" + "Displayed plot region: " + representation_dict[representation] - - text += "\n" + "\n\nGRAPHIC FORMATTING...\n" - text += "\n" + "Plot size: " + str(plot_size) - text += "\n" + "Line width: " + str(line_width) - text += "\n" + "Line color: " + line_col_for - text += "\n" + "Reverse line color: " + line_col_rev - text += "\n" + "X label position: " + str(x_label_pos) - text += "\n" + "Label size: " + str(label_size) - text += "\n" + "Spacing: " + str(spacing) - if mirror_y_axis: - text += "\n" + "Y-axis mirrored (bottom to top) " + str(mirror_y_axis) - if title_clip_pos == "E": - text += "\n" + "Title length (limit number of characters): " + "last" + str(title_length) + "characters" - else: - text += "\n" + "Title length (limit number of characters): " + "first" + str(title_length) + "characters" - text += "\n" + "Length scaling: " + str(length_scaling) - text += "\n%s\n" % (70 * "-") - logprint(text) - - - # collect settings - parameters = 
[commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype_dict[filetype], type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose] - - return parameters - - -############################### -# Helper Functions # -############################### - -def alphabets(type_nuc=True): - """ - provide ambiguity code for sequences - """ - - nucleotide_alphabet = ["A", "C", "G", "T"] - - nucleotide_alphabet_full = ["A", "C", "G", "T", "N", "B", "D", "H", - "V", "Y", "R", "W", "S", "K", "M"] - - nucleotide_ambiguity_code = {"N": ["A", "C", "G", "T"], # any - "B": ["C", "G", "T"], # not A - "D": ["A", "G", "T"], # not C - "H": ["A", "C", "T"], # not G - "V": ["A", "C", "G"], # not T - "Y": ["C", "T"], # pyrimidine - "R": ["A", "G"], # purine - "W": ["A", "T"], # weak - "S": ["C", "G"], # strong - "K": ["G", "T"], # keto - "M": ["A", "C"]} # amino - - nucleotide_match_dict = {"N": "[ACGTNBDHVYRWSKM]", # any - "B": "[CGTNBDHVYRWSKM]", # not A - "D": "[AGTNBDHVYRWSKM]", # not C - "H": "[ACTNBDHVYRWSKM]", # not G - "V": "[ACGNBDHVYRWSKM]", # not T - "K": "[GTNBDHVYRWSK]", # keto - not A,C,M - "M": "[ACNBDHVYRWSM]", # amino - not G,T,K - "W": "[ATNBDHVYRWKM]", # weak - not C,G,S - "S": "[CGNBDHVYRSKM]", # strong - not A,G,W - "Y": "[CTNBDHVYWSKM]", # pyrimidine - not A,G,R - "R": "[AGNBDHVRWSKM]", # purine - not C,T,Y - "A": "[ANDHVRWM]", - "C": "[CNBHVYSM]", - "G": "[GNBDVRSK]", - "T": "[TNBDHYWK]"} - - # nucleotide_match_dict = {"N": ".", # any - # "B": "[^A]", # not A - # "D": "[^C]", # not C - # "H": "[^G]", # not G - # 
"V": "[^T]", # not T - # "K": "[^ACM]", # keto - not A,C,M - # "M": "[^GTK]", # amino - not G,T,K - # "W": "[^CGS]", # weak - not C,G,S - # "S": "[^AGW]", # strong - not A,G,W - # "Y": "[^AGR]", # pyrimidine - not A,G,R - # "R": "[^CTY]", # purine - not C,T,Y - # "A": "[ANDHVRWM]", - # "C": "[CNBHVYSM]", - # "G": "[GNBDVRSK]", - # "T": "[TNBDHYWK]"} - - aminoacid_alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"] - - aminoacid_alphabet_full = ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*", "J", - "Z", "B", "X"] - - aminoacid_ambiguity_code = {"J": ["I", "L"], - "Z": ["Q", "E"], - "B": ["N", "D"], - "X": ["A", "R", "N", "D", "C", "E", "Q", "G", - "H", "I", "L", "K", "M", "F", "P", "S", - "T", "W", "Y", "V", "U", "O", "*"]} # any - - aminoacid_match_dict = {"J": "[ILJ]", - "Z": "[QEZ]", - "B": "[NDB]", - # "X": ".", - "X": "[ARNDCEQGHILKMFPSTWYVUO*XBZJ]", - "A": "[AX]", - "R": "[RX]", - "N": "[NXB]", - "D": "[DXB]", - "C": "[CX]", - "E": "[EXZ]", - "Q": "[QXZ]", - "G": "[GX]", - "H": "[HX]", - "I": "[IXJ]", - "L": "[LXJ]", - "K": "[KX]", - "M": "[MX]", - "F": "[FX]", - "P": "[PX]", - "S": "[SX]", - "T": "[TX]", - "W": "[WX]", - "Y": "[YX]", - "V": "[VX]", - "U": "[UX]", - "O": "[OX]", - "*": "[*X]"} - - aa_only = set(['E', 'F', 'I', 'J', 'L', 'O', 'Q', 'P', 'U', 'X', 'Z', '*']) - # return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aa_only - - if type_nuc: - return nucleotide_alphabet, nucleotide_alphabet_full, nucleotide_ambiguity_code, nucleotide_match_dict - else: - return aminoacid_alphabet, aminoacid_alphabet_full, aminoacid_ambiguity_code, aminoacid_match_dict - -def logprint(text, start=False, printing=True, prefix=""): - """ - log output to log_file and optionally print - """ - - # define log file name and open 
file - global log_file_name - if start and trial_mode: - log_file_name = "log_file.txt" - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - elif start: - date = datetime.date.today() - time = str(datetime.datetime.now()).split(" ")[1].split(".")[0].replace(":", "-") - log_file_name = "%s_%s_log_file.txt" % (date, time) - if prefix != "" and prefix != None: - if not prefix.endswith("-"): - prefix = prefix + "-" - log_file_name = prefix + log_file_name - log_file = open(log_file_name, 'w') - log_file.write("Date: %s\n\n" % str(datetime.datetime.now())) - else: - log_file = open(log_file_name, 'a') - - # write log (and print) - log_file.write(text + "\n") - if printing: - print text - log_file.close() - -def time_track(starting_time, show=True): - """ - calculate time passed since last time measurement - """ - now = time.time() - delta = now - starting_time - if show: - text = "\n\t %s seconds\n" % str(delta) - logprint(text, start=False, printing=True) - return now - -def calc_fig_ratio(ncols, nrows, plot_size, verbose=False): - """ - calculate size ratio for given number of columns (ncols) and rows (nrows) - with plot_size as maximum width and length - """ - ratio = ncols*1./nrows - if verbose: - text = " ".join([ncols, nrows, ratio]) - logprint(text, start=False, printing=True) - if ncols >= nrows: - figsize_x = plot_size - figsize_y = plot_size / ratio - else: - figsize_x = plot_size * ratio - figsize_y = plot_size - return figsize_x, figsize_y - -def shorten_name(seq_name, max_len=20, title_clip_pos="B"): #, delim="_"): - """ - shorten sequence names (for diagram titles) - """ - - if len(seq_name) <= max_len: - return seq_name - - # take last characters - if title_clip_pos == "E": - name = seq_name[len(seq_name)-max_len:] - - # take first characters - else: - name = 
seq_name[:max_len] - - """# keep first and last part if multiple parts separated by delimiter (e.g. species_prefix + sequence_id) - if delim in seq_name: - if seq_name.count(delim) >= 2: - name = "%s..." % delim.join(seq_name.split(delim)[:1]) + seq_name.split(delim)[-1] # .replace("_000", "-") - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - - if len(name) > max_len: - name = name[:((max_len-2)//2)] + "..." + name[((max_len-2)//2):] - else: - name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] - """ - - return name - -def unicode_name(name): - """ - replace non-ascii characters in string (e.g. for use in matplotlib) - """ - unicode_string = eval('u"%s"' % name) - return unicodedata.normalize('NFKD', unicode_string).encode('ascii','ignore') - -def check_bools(arg, update_log_txt = True, default=None): - """ - converts commandline arguments into boolean - """ - - - # convert valid arguments - if str(arg).lower() == "y" or str(arg) == "1": - return True - elif str(arg).lower() == "n" or str(arg) == "0": - return False - - # use default in case of invalid argument - else: - if update_log_txt: - global log_txt - log_txt += "using default for " + str(arg) - else: - try: - logprint("using default for " + str(arg)) - except: - print "using default for " + str(arg) - return default - -def create_color_list(number, color_map=None, logging=False, max_grey="#595959"): - """ - create color list with given number of entries - grey by default, matplotlib color_map can be provided - """ - - try: - # create pylab colormap - cmap = eval("P.cm." 
+ color_map) - # get descrete color list from pylab - cmaplist = [cmap(i) for i in range(cmap.N)] # extract colors from map - # determine positions for number of colors required - steps = (len(cmaplist)-1)/(number) - numbers = range(0, len(cmaplist), steps) - - # extract color and convert to hex code - colors = [] - for idx in numbers[:-1]: - rgb_color = cmaplist[idx] - col = rgb2hex(rgb_color[0]*255, rgb_color[1]*255, rgb_color[2]*255) - colors.append(col) - - # grey - except: - if not color_map == None: - logprint("Invalid color_map (%s) provided! - Examples: jet, Blues, OrRd, bwr,..." % color_map) - logprint("See https://matplotlib.org/users/colormaps.html\n") - old_max_grey = "#373737" - old_max_grey = "#444444" - colors = list(Color("#FFFFFF").range_to(Color(max_grey), number)) # grey - for idx in range(len(colors)): - colors[idx] = str(colors[idx]).replace("Color ", "") - if "#" in colors[idx] and len(colors[idx]) != 7: - # print colors[idx] - colors[idx] = colors[idx] + colors[idx][-(7-len(colors[idx])):] - - text = "%d Colors: %s" % (len(colors), ", ".join(colors)) - if logging: logprint(text, start=False, printing=True) - - if len(colors) < number: - logprint("\nError in color range definition! %d colors missing\n" % (number - len(colors))) - - return colors - - -############################### -# File Handling # -############################### - -def read_seq(input_fasta, verbose=False): - """ - read fasta sequences from (all) file(s) - """ - - # check if file provided - if input_fasta == [] or input_fasta == "": - text = "Attention: No valid file names provided: >%s<" % input_fasta - logprint(text, start=False, printing=True) - return {}, [] - - # combine sequence files, if required - if type(input_fasta) == list: - # concatenate fasta files - if len(input_fasta) > 1: - if verbose: - print "concatenating fastas...", - text = "concatenating fastas..." 
- input_fasta_combi = concatenate_files(input_fasta) - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - else: - input_fasta_combi = input_fasta[0] - else: - input_fasta_combi = input_fasta - - # read sequences - if verbose: - print "reading fasta...", - text = "reading fasta...", - try: - seq_dict = SeqIO.index(input_fasta_combi, "fasta") - except ValueError: - logprint("Error reading fasta sequences - please check input files, e.g. for duplicate names!") - return {}, [] - except: - logprint("Error reading fasta sequences - please check input files!") - return {}, [] - - if verbose: - print "done" - text += "done" - logprint(text, start=False, printing=False) - - for seq in seq_dict: - if "-" in seq_dict[seq].seq: - # ungapped = seq_dict[seq].seq.ungap("-") # cannot be assigned back to sequence record - text = "\nSequences degapped prior Analysis!!!" - logprint(text, start=False, printing=True) - return read_seq(degap_fasta(input_fasta), verbose=verbose) - - # get ordered sequence names - sequences = [] - for item in SeqIO.parse(input_fasta_combi, "fasta"): - sequences.append(item.id) - return seq_dict, sequences - -def read_gff_color_config(gff_color_config_file=""): - """ - define coloring options for gff-based color shading of self-dotplots - """ - - # default aestetics for annotation shading (e.g. 
if no user config file is provided) - # dictionary with feature_type as key and tuple(color, transparency, zoom) as value - gff_feat_colors = {"orf": ("#b41a31", 0.2, 0), - "orf_rev": ("#ff773b", 0.3, 0), - "gene": ("#b41a31", 0.2, 0), - "cds": ("darkorange", 0.2, 0), - "exon": ("orange", 0.2, 0), - "intron": ("lightgrey", 0.2, 0), - "utr": ("lightblue", 0.2, 0), - "repeat_region": ("green", 0.3, 0), - "repeat": ("green", 0.3, 0), - "tandem_repeat": ("red", 0.3, 0), - "transposable_element": ("blue", 0.3, 0), - "ltr_retrotransposon": ("#cccccc", 0.5, 0), - "ltr-retro": ("#cccccc", 0.5, 0), - "long_terminal_repeat": ("#2dd0f0", 0.75, 2), - "ltr": ("#2dd0f0", 0.75, 2), - "pbs": ("purple", 0.75, 2), - "ppt": ("#17805a", 0.5, 2), - "target_site_duplication": ("red", 0.75, 2), - "misc_feature": ("grey", 0.3, 0), - "misc_feat": ("grey", 0.3, 0), - "misc": ("grey", 0.3, 0), - "others": ("grey", 0.5, 0)} - if gff_color_config_file in ["", None] or not os.path.exists(str(gff_color_config_file)): - return gff_feat_colors - - text = "Updating GFF color configuration with custom specifications\n" - logprint(text, start=False, printing=True) - - # read custom gff_color_config_file - in_file = open(gff_color_config_file, 'rb') - overwritten = set([]) - for line in in_file: - if not line.startswith("#") and len(line.strip().split("\t")) >= 4: - data = line.strip().split("\t") - feat = data[0].lower() - color = data[1].lower() - - # check, if settings are valid - if not mcolors.is_color_like(color): - color = "grey" - text = "Invalid color specified for %s: %s - default grey" % (data[0], data[1]) - logprint(text) - try: - alpha = float(data[2]) - except: - alpha = 0.75 - text = "Invalid alpha specified for %s: %s - default 0.75" % (data[0], data[2]) - logprint(text) - try: - zoom = float(data[3]) - except: - zoom = 0 - text = "Invalid zoom specified for %s: %s - default 0" % (data[0], data[3]) - logprint(text) - - # track changes of predefined settings - if feat in 
gff_feat_colors.keys(): - overwritten.add(data[0].lower()) - - gff_feat_colors[feat] = (color, alpha, zoom) - in_file.close() - - # default coloring for unknown annotations - if not "others" in gff_feat_colors.keys(): - gff_feat_colors["others"] = ("grey", 0.5, 0) - - if verbose: - # print configuration - text = "\n\nGFF color specification:\n%s\n" % (60 * ".") - for item in sorted(gff_feat_colors.keys()): - text += "%-30s\t%-10s\t%-5s\t%s\n" % (item, str(gff_feat_colors[item][0]), str(gff_feat_colors[item][1]), str(gff_feat_colors[item][2])) - logprint (text, printing=True) - - # print overwritting feature type specifications - if len(overwritten) != 0: - text = "%d feature type specifications overwritten:" % len(overwritten) - text += "\n\t"+ ", ".join(overwritten) + "\n" - logprint(text, start=False, printing=True) - - text = "GFF color specification updated acc. to %s\n\t%s\n\n" % (gff_color_config_file, ", ".join(gff_feat_colors)) - logprint(text, start=False, printing=True) - - return gff_feat_colors - -def read_gffs(input_gff_files, color_dict={"others": ("grey", 1, 0)}, type_nuc=True, prefix="", filetype='png', verbose=False): - """ - create feature dictionary from input_gff - sequence name as key and (feature type, start, stop) as value - """ - if type(input_gff_files) != list: - input_gff_files = [input_gff_files] - - # create dictionary with seq_name as key and (type, start and stop) as value - unknown_feats = set([]) - used_feats = set([]) - feat_dict = {} - for input_gff in input_gff_files: - text = "...reading " + input_gff - logprint(text, start=False, printing=True) - - in_file = open(input_gff, 'rb') - for line in in_file: - if not line.startswith("#") and line.strip() != "": - data = line.strip().split("\t") - feat_type = data[2].lower() - if data[6] == "-": - feat_type += "_rev" - if not feat_type.lower() in color_dict.keys(): - if feat_type.lower().replace("_rev", "") in color_dict.keys(): - feat_type = feat_type.replace("_rev", "") - else: - 
unknown_feats.add(feat_type) - feat_type = "others" - used_feats.add(feat_type) - if not data[0] in feat_dict.keys(): - feat_dict[data[0]] = [(feat_type, int(data[3]), int(data[4]))] # feature type, start, stop - else: - feat_dict[data[0]].append((feat_type, int(data[3]), int(data[4]))) # feature type, start, stop - if verbose: - text = "\nAnnotations for: %s\n" % ", ".join(feat_dict.keys()[:10]) - if len(feat_dict.keys()) > 10: - text = text[:-1] + ", ...\n" - logprint(text, start=False, printing=True) - in_file.close() - - # print feature types without specific shading settings - if len(unknown_feats) != 0: - text = "Missing shading specification for %d feature type(s):\n\t%s\n" % (len(unknown_feats), ", ".join(sorted(unknown_feats))) - logprint(text, start=False, printing=True) - - # create color legend - colors, alphas = [], [] - for item in sorted(used_feats): - colors.append(color_dict[item][0]) - alphas.append(color_dict[item][1]) - legend_figure(colors=colors, lcs_shading_num=len(used_feats), type_nuc=type_nuc, bins=sorted(used_feats), alphas=alphas, gff_legend=True, prefix=prefix, filetype=filetype) - - # print settings - text = "GFF Feature Types: %s\nGFF Colors: %s" % (", ".join(sorted(used_feats)), ", ".join(sorted(colors))) - logprint(text, start=False, printing=True) - - return feat_dict - -def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False, verbose=False): - input_file = open(matrix_file_name, 'rb') - - # read sequence names from first column - names = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - names.append(line.strip().split(delim)[0]) - logprint("Delimiter '%s': %d names - %s\n" % (delim, len(names), ", ".join(names))) - - # check if names were found - otherwise try another delimiter - if names == [] and not recursion: - if delim == "\t": - new_delim = "," - else: - new_delim = "\t" - logprint("\nMatrix file not containing data delimited by '%s' - trying 
to read matrix with delimiter '%s'" % (delim.replace("\t", "\\t"), new_delim)) - info_dict = read_matrix(matrix_file_name, delim=new_delim, symmetric=symmetric, recursion=True, verbose=verbose) - return info_dict - elif names == []: - logprint("Empty matrix file with alternative delimiter!") - return info_dict - input_file.close() - - input_file = open(matrix_file_name, 'rb') - # read matrix entries as values in dictionary with tuple(names) as key - info_dict = {} - contradictory_entries = [] - for line in input_file: - if not line.startswith("#") and not line.startswith(delim) and delim in line: - data = line.strip().split(delim) - for idx in range(len(data[1:])): - # print tuple(sorted([data[0], names[idx]])), data[idx+1] - if symmetric: - key = tuple(sorted([names[idx], data[0]])) - else: - key = tuple(names[idx], data[0]) - if key in info_dict.keys(): - if symmetric and info_dict[key] != data[idx+1] and data[idx+1] not in ["", "-"] and info_dict[key] not in ["", "-"]: - contradictory_entries.append(key) - info_dict[key] = data[idx+1] - input_file.close() - - if len(contradictory_entries) != 0: - try: - logprint("\nContradictory entries in matrix file %s:\n\t%s" % (matrix_file_name, ", ".join(contradictory_entries))) - except: - log_txt = "\nContradictory entries in matrix file %s:\n\t" % (matrix_file_name) - for item in contradictory_entries: - log_txt += str(item).replace("'", "") + ", " - log_txt = log_txt[:-2] - logprint(log_txt) - logprint("Using value from bottom left triangle!") - if verbose: - logprint("\nMatrix information for Sequences named: " % ", ".join(names)) - - return info_dict - -def concatenate_files(file_list, combi_filename="temp_combined.fasta", verbose=False): - """ - concatenate content of all files in file_list into a combined file named combi_filename - """ - out_file = open(combi_filename, 'w') - text = "" - for item in file_list: - if verbose: - text += item + " " - print item, - # read in_file linewise and write to out_file - in_file 
= open(item, 'rb') - for line in in_file: - out_file.write(line.strip()+"\n") - in_file.close() - out_file.close() - if verbose: - logprint(text, start=False, printing=False) - return combi_filename - -def degap_fasta(input_fasta): - """ - remove gaps from fasta - new degapped sequence file created - """ - - # degap all sequence files - output_fastas = [] - if type(input_fasta) != list: - input_fasta = list(input_fasta) - for input_fas in input_fasta: - output_fas = input_fas[:input_fas.rfind(".")] + "_degapped.fas" - in_file = open(input_fas, 'rb') - out_file = open(output_fas, 'w') - for line in in_file: - if line.startswith(">"): - out_file.write(line.strip()+"\n") - else: - out_file.write(line.strip().replace("-", "")+"\n") - out_file.close() - in_file.close() - output_fastas.append(output_fas) - return output_fastas - -def legend_figure(colors, lcs_shading_num, type_nuc=True, unit="%", filetype="png", max_len=None, min_len=0, bins=[], alphas=[], gff_legend=False, prefix="", verbose=False): - """ - create figure color legend - """ - max_legend_length_row = 8 - max_legend_length_col = 4 - - # define output file - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg" - logprint(text, start=False, printing=True) - filetype="png" - - # check if length of information fit - if not gff_legend and ((bins != [] and len(colors) != lcs_shading_num+1) or (bins != [] and len(colors) != len(bins)+1)): - if bins != [] and len(colors) != lcs_shading_num+1: - text = "**Attention**\nlcs_shading_num (%d) does not match number of colors (%d)!\n"% (lcs_shading_num, len(bins)) - elif bins != [] and len(colors) != len(bins)+1: - text = "**Attention**\nnumber of LCS length bins (%d) does not match number of colors (%d)!\n" % (len(colors), len(bins)) - logprint(text, start=False, printing=True) - elif gff_legend and len(bins) != len(colors): - text = "**Attention**\nnumber of GFF Feature Types (%d) does not match number of colors (%d)!\n" % 
(len(colors), len(bins)) - logprint(text, start=False, printing=True) - - # set alpha values to opaque if none are provided - if alphas == []: - for item in colors: - alphas.append(1) - - # legend data points - data_points = range(len(colors)) - if not gff_legend: - - # specify intervals, if max_len provided - if max_len != None: - multi_factor = 100 # one digit - if max_len <= 1: - multi_factor = 1000 # two digits - # len_interval_size = (max_len-min_len) * multi_factor *1. // lcs_shading_num * (1./ multi_factor) - len_interval_size = (max_len-min_len) * 1. / lcs_shading_num - len_pos = [float("%.2f" % (min_len))] - # calculate interval positions - for idx in range(lcs_shading_num): - len_pos.append(float("%.2f" % (len_pos[-1] + len_interval_size))) - - if prefix.startswith("custom-matrix") and (0 <= max_len <= 100 and 0 <= min_len <= 100): - unit = "%" - elif prefix.startswith("custom-matrix"): - unit = "" - - text = "\n%d Legend intervals from %.2f to %.2f: \n\t%s - number: %d, step: %.2f, unit: %s\n" % (lcs_shading_num+1, min_len, max_len, str(len_pos), len(len_pos), len_interval_size, unit) - logprint(text, start=False, printing=True) - pos = len_pos - interval_size = len_interval_size - # generate legend labels acc. to standard interval notation - else: - # use default max_len = 100 and min_len = 0 - len_interval_size = 100. / lcs_shading_num - pos = [float("%.2f" % (0))] - # calculate interval positions - for idx in range(lcs_shading_num): - pos.append(float("%.2f" % (pos[-1] + len_interval_size))) - - # interval_size = 100 // lcs_shading_num - # pos = range(interval_size, 101+interval_size, interval_size) - - # remove unneccessary zeros in decimal places (i.e. if x.x00 in all entries) - while True: - last_digit_all_zero = True - no_delim = False - for idx in range(len(pos)): - # only process if fraction with decimal places - if not "." 
in str(pos[idx]): - no_delim = True - break - # only process when all entries end in zero - elif str(pos[idx])[-1] != "0": - last_digit_all_zero = False - break - if not last_digit_all_zero or no_delim: - break - # remove last decimal place (== 0) from all entries - else: - temp_pos = pos[:] - for idx in range(len(pos)): - if not str(pos[idx])[-2] == ".": - pos[idx] = float(str(pos[idx])[:-1]) - else: - pos[idx] = int(str(pos[idx])[:-2]) - logprint("Shortening legend entries: %s - %s" % (temp_pos, pos)) - - # eliminate fractions if unit == bp/aa - if unit in ["aa", "bp"]: - for idx in range(len(pos)): - temp_pos = pos[:] - rounded_unit = False - if "." in str(pos[idx]): - rounded_unit = True - # round values up to next integer (keep integer, if not a fraction) - pos[idx] = int(pos[idx] / 1) + int(pos[idx] % 1 > 0) - if idx == len(pos) - 1 and pos[idx] == 101: - pos[idx] = 100 - if rounded_unit: - logprint("Fractions not permitted for unit '%s': %s -> %s" % (unit, temp_pos, pos)) - - if bins != []: # labels provided - legend_labels = bins[:] - legend_labels.append("max") - legend_labels_lengths = [] - for item in bins: - legend_labels_lengths.append("[%d %s, %d %s)" % (item - min(bins), unit, item, unit)) - if len(bins) == len(colors) - 1: - legend_labels_lengths.append("[%d %s, %s]" % (max(bins), unit, u"\u221E")) # infinite - - else: - legend_labels = [] - legend_labels_lengths = [] - for idx in range(len(pos)): - num = pos[idx] - legend_labels.append("[%d%%, %d%%)" % (num - interval_size, num)) - if max_len != None: - num = len_pos[idx] - # as int or float - if num == int(num) and int(len_interval_size) == len_interval_size: - legend_labels_lengths.append("[%d %s, %d %s)" % (num, unit, num + len_interval_size, unit)) - else: - legend_labels_lengths.append("[%.2f %s, %.2f %s)" % (num, unit, num + len_interval_size, unit)) - legend_labels[-1] = "100" + unit - if max_len != None: - if num == int(num) and int(len_interval_size) == len_interval_size: - 
legend_labels_lengths[-1] = u"[%d %s, \u221E]" % (max_len, unit) - else: - legend_labels_lengths[-1] = u"[%.2f %s, \u221E]" % (max_len, unit) - - # set labels and choose file name - if gff_legend: - label_text = bins[:] - edge_col = None - legend_file_name = "GFF_Shading_Legend_n%d." % lcs_shading_num + filetype - elif max_len != None: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_max%d%s_n%d." % (max_len, unit, lcs_shading_num+1) + filetype - elif bins != []: - label_text = legend_labels_lengths[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%d%s_n%d." % (bins[0], unit, lcs_shading_num+1) + filetype - else: - label_text = legend_labels[:] - edge_col = "black" - legend_file_name = "Polydotplot_LCS_Shading_Legend_%%len_n%d." % (lcs_shading_num+1) + filetype - - if prefix != None and prefix != "": - if not prefix.endswith("-"): - prefix = prefix + "-" - legend_type = "LCS" - if prefix.startswith("custom-matrix"): - prefix = prefix.replace("custom-matrix", "")[1:] - legend_type = "CustomMatrix" - legend_file_name = prefix + legend_file_name.replace("LCS", legend_type) - - # plot legend figure - fig, ax = P.subplots(3, 1, figsize=(len(colors)*2, len(colors)*2)) - for idx in range(len(colors)): - ax[0].bar(data_points[idx]+1, data_points[idx]+1, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[2].bar(data_points[idx]+1, 0, color=colors[idx], label=label_text[idx], - alpha=alphas[idx], edgecolor=edge_col) - ax[1].set_ylim(0,1) - ax[2].set_ylim(0,1) - ax[1].legend(ncol=((len(colors)-1)//max_legend_length_row)+1, framealpha=1) # vertical legend - col_num = len(colors) - if len(colors) > max_legend_length_col: - remainder = 0 - if len(colors) % max_legend_length_col != 0: - remainder = 1 - row_num = len(colors) // 
max_legend_length_col + remainder - remainder = 0 - if len(colors) % row_num != 0: - remainder = 1 - col_num = len(colors) // row_num + remainder - ax[2].legend(ncol=col_num, framealpha=1) # horizontal legend - - P.savefig(legend_file_name) - - return legend_file_name - - -############################### -# Analysis Functions # -############################### - -def wobble_replacement(sequence, general_ambiguity_code, verbose=False): - """ - get all degenerated sequences for sequence with ambiguous residues - (only residues considered that are keys in wobble_dictionary) - """ - - # get positions of ambiguous residues - wobble_pos = [] - for idx in range(len(sequence)): - letter = sequence[idx] - if letter in general_ambiguity_code.keys(): - wobble_pos.append(idx) - - if verbose: - text = "\t%d wobbles" % len(wobble_pos) - logprint(text, start=False, printing=True) - - # replace one wobble through each iteration by all possible residues - # repeat if still wobbles in new kmers - kmer_variants = [sequence] - while True: - if verbose: - text = "\t\t%d kmer variants" % len(kmer_variants) - logprint(text, start=False, printing=True) - temp_kmers = set([]) - for kmer in kmer_variants: - for idx in wobble_pos: - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - for base in general_ambiguity_code[kmer[idx]]: - newkmer = kmer[:idx] + base + kmer[idx+1:] - temp_kmers.add(newkmer) - wobble = False - for kmer in temp_kmers: - for idx in range(len(kmer)): - letter = kmer[idx] - if letter in general_ambiguity_code.keys(): - wobble = True - break - if wobble: - break - kmer_variants = set(list(temp_kmers)[:]) - if not wobble: - break - - return kmer_variants - -def split_diagonals(data, stepsize=1): - """ - split array if point difference exceeds stepsize - data = sorted list of numbers - """ - return np.split(data, np.where(np.diff(data) != stepsize)[0]+1) - -def longest_common_substring(s1, s2): - m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))] - 
longest, x_longest = 0, 0 - for x in xrange(1, 1 + len(s1)): - for y in xrange(1, 1 + len(s2)): - if s1[x - 1] == s2[y - 1]: - m[x][y] = m[x - 1][y - 1] + 1 - if m[x][y] > longest: - longest = m[x][y] - x_longest = x - else: - m[x][y] = 0 - return longest - -def lcs_from_x_values(x_values): - """ - calculate length of longest common substring based on nested list of numbers - """ - if len(x_values) == 0: - return 0 - # get lengths of each subarray data - lengths = np.array([len(i) for i in x_values]) - return max(lengths) - - -############################### -# Matching Functions # -############################### - -def find_match_pos_diag(seq1, seq2, wordsize, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - - # forward - ################################# - kmer_pos_dict_one = {}; kmer_pos_dict_two = {} # dictionaries for both sequences - - # reverse complement - ################################# - kmer_pos_dict_three = {}; kmer_pos_dict_four = {} # dictionaries for both sequences - - # create dictionaries with kmers (wordsize) and there position(s) in the sequence - if rc_option: - data_list = [(str(seq_one), kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two), - (str(seq_one), kmer_pos_dict_three), - (str(seq_two.reverse_complement()), kmer_pos_dict_four)] - else: - data_list = [(str(seq_one), 
kmer_pos_dict_one), - (str(seq_two), kmer_pos_dict_two)] - for (seq, kmer_pos_dict) in data_list: - for i in range(len(seq)-wordsize+1): - kmer = seq[i:i+wordsize] - # discard kmer, if too many Ns included - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - if not convert_wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - wobbles = False - for item in general_ambiguity_code.keys(): - if item in kmer: - wobbles = True - break - if not wobbles: - try: - kmer_pos_dict[kmer].append(i) - except KeyError: - kmer_pos_dict[kmer] = [i] - else: - kmer_variants = wobble_replacement(kmer, general_ambiguity_code) - for new_kmer in kmer_variants: - # print "\t", new_kmer - try: - kmer_pos_dict[new_kmer].append(i) - except KeyError: - kmer_pos_dict[new_kmer] = [i] - - # find kmers shared between both sequences - matches_for = set(kmer_pos_dict_one).intersection(kmer_pos_dict_two) # forward - matches_rc = set(kmer_pos_dict_three).intersection(kmer_pos_dict_four) # reverse complement - - if verbose: - text = "[matches: %i for; %.i rc]" % (len(matches_for), len(matches_rc)) - logprint(text, start=False, printing=True) - - # create lists of x and y co-ordinates for scatter plot - # keep all coordinates of all shared kmers (may match multiple times) - diag_dict_for = {} - diag_dict_rc = {} - for (match_list, pos_dict1, pos_dict2, diag_dict) in [(matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for), - (matches_rc, kmer_pos_dict_three, kmer_pos_dict_four, diag_dict_rc)]: - for kmer in match_list: - for i in pos_dict1[kmer]: - for j in pos_dict2[kmer]: - diag = i-j - points = set(range(i+1, i+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = 
np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - -def find_match_pos_regex(seq1, seq2, wordsize, substitution_count=0, report_lcs=False, rc_option=True, convert_wobbles=False, max_N_percentage=49, type_nuc=True, verbose=False): - """ - find all matching positions with matches >= wordsize via regular expression search - fuzzy matching - allow up to substitution_count substitutions - convert matching points into lines of the length of the match - (+ optional handling of ambiguities) - """ - global t1 # timer - - # read sequences - seq_one = seq1.upper(); len_one = len(seq_one) - seq_two = seq2.upper(); len_two = len(seq_two) - - # set ambiguity code for wobble replacement - general_ambiguity_code = alphabets(type_nuc)[2] # nucleotide_ambiguity_code or aminoacid_ambiguity_code - ambiguity_match_dict = alphabets(type_nuc)[3] - - ambiq_residues = "[%s]" % "".join(general_ambiguity_code.keys()) - - # look for Ns in DNA or Xs in proeins (minimum word size) - if type_nuc == True: - any_residue = "N" - else: - any_residue = "X" - - # check for wobble presence - if not (regex.search(ambiq_residues, str(seq_one)) == None and regex.search(ambiq_residues, str(seq_two)) == None): - wobble_found = True - else: - wobble_found = False - - # dictionary for 
matches - diag_dict_for = {} - diag_dict_rc = {} - counter = [0, 0] - - # one-way matching - if rc_option: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0), - (str(seq_one), str(seq_two.reverse_complement()), diag_dict_rc, 1)] - else: - data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0)] - - for seq_query, seq_target, diag_dict, counter_pos in data_list: - # split query sequence into kmers - if not rc_option and counter_pos == 1: - break - - for idx in range(len(str(seq_query))-wordsize+1): - kmer = str(seq_query)[idx:idx+wordsize] - - # skip excessive N/X stretches (big black areas) - if kmer.count(any_residue)*100./wordsize <= max_N_percentage: - # convert kmer to regular expression for wobble_matching - if convert_wobbles and wobble_found: - kmer_string = "" - # replace each residue with matching residues or wobbles - for jdx in range(len(kmer)): - kmer_string += ambiguity_match_dict[kmer[jdx]] - else: - kmer_string = kmer - - # convert to regular expression tolerating substitution errors - if type(substitution_count) == int and substitution_count != 0: - kmer_string = "(%s){s<=%d}" % (kmer_string, substitution_count) - - # search for regular expression pattern in target sequence - result_matches = regex.finditer(kmer_string, seq_target, overlapped=True, concurrent=True) - - # investigate all hits - last_motif_start = 0 - for result in result_matches: - # skip hits only differing in length of TSD region 1 - if result.start() > last_motif_start: - counter[counter_pos] += 1 - last_motif_start += result.start() - - kmer2 = seq_target[result.start():result.end()] - - # skip excessive N/X stretches (big black areas) - if kmer2.count(any_residue)*100./wordsize <= max_N_percentage: - diag = idx-(result.start()) - points = set(range(idx+1, idx+wordsize+1)) - if not diag in diag_dict.keys(): - diag_dict[diag] = points - else: - diag_dict[diag].update(points) - - if verbose: - text = "%5.i \tforward matches" % counter[0] - text += "\n%5.i \treverse 
complementary matches" % counter[1] - logprint(text, start=False, printing=True) - - # convert coordinate points to line start and stop positions - x1 = [] # x values reverse - y1 = [] # y values forward - for diag in diag_dict_for.keys(): - x_values = np.array(sorted(diag_dict_for[diag])) - x1.extend(split_diagonals(x_values)) - y_values = split_diagonals(x_values - diag) - y1.extend(y_values) - - x2 = [] # x values rc - y2 = [] # y values rc - if rc_option: - for diag in diag_dict_rc.keys(): - factor = len_two + diag + 1 - x_values = np.array(sorted(diag_dict_rc[diag])) - x2.extend(split_diagonals(x_values)) - y_values = split_diagonals(factor - x_values, -1) - y2.extend(y_values) - - if verbose: - t1 = time_track(t1) - - if not report_lcs: - return np.array(x1), np.array(y1), np.array(x2), np.array(y2) - else: - # get length of longest common substring based on match lengths - lcs_for = lcs_from_x_values(x1) - lcs_rev = lcs_from_x_values(x2) - return np.array(x1), np.array(y1), np.array(x2), np.array(y2), lcs_for, lcs_rev - - -############################### -# Dot Plot Functions # -############################### - -def selfdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}): - """ - self-against-self dotplot - partially from biopython cookbook - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least one input sequence - if len(sequences) == 0: - text = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - text += " No sequences provided for 
selfdotplot!\n\nTerminating polydotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 1 and multi: - text = "\n\nCreating collage output for single selfdotplot!" - text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nSelfdotplot Collage: Invalid collage - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences): - ncols = len(sequences) - nrows = 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nSelfdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - if prefix != None and prefix != "": - legend_prefix = prefix + "-Selfdotplot" - else: legend_prefix = "Selfdotplot" - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=legend_prefix, filetype=filetype, verbose=verbose) - - global t1 - - print "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-"), - 
log_txt = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % (50*"=", len(sequences), 28*"-") - - # preparations for file name - name_graph = "Selfdotplots" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - list_of_png_names = [] - - counter = 0 - for seq_name in sequences: - print seq_name, - log_txt += " " + seq_name - - counter += 1 - if not multi: - P.cla() # clear any prior graph - - # read sequence - seq_record = seq_dict[seq_name] - name_seq = seq_record.id - seq_one = seq_record.seq.upper() - length_seq = len(seq_one) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex(seq_one, seq_one, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG", - x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag(seq_one, seq_one, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, type_nuc=type_nuc, verbose=verbose) - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] 
- start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - # print P.xticks()[0], P.yticks()[0] - P.axis('scaled') # make images quadratic - P.xlim(0, length_seq+1) - if mirror_y_axis: - P.ylim(0, length_seq+1) # rotate y axis (point upwards) - else: - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - # # use same tick labels for x and y axis - # tick_locs, tick_labels = P.yticks() - # P.xticks(tick_locs) - # P.xlim(0, length_seq+1) - - P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, fontweight='bold') - # P.title(unicode_name(name_seq), fontsize=label_size*1.3, fontweight='bold') - - # save figure and reinitiate if page is full - if counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' % (prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - else: # not multi - - fig = P.figure(figsize=(plot_size, plot_size)) # figure size needs to be a square - ax = P.subplot(1, 1, 1) # rows, columns, plotnumber - - # shade annotated regions - if gff_files != None and gff_files != []: - if seq_name in feat_dict.keys(): - features = feat_dict[seq_name] - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(length_seq+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # collect lines - lines = [] - number = 0 - color_list = [] - for (x_lines, y_lines, col) in [(x_lists_rc, y_lists_rc, line_col_rev), (x_lists, y_lists, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.axis('scaled') # make images quadratic - P.xlim(0, length_seq+1) - if mirror_y_axis: - P.ylim(0, length_seq+1) # rotate y axis (point upwards) - else: - P.ylim(length_seq+1, 0) # rotate y axis (point downwards) - P.xlabel("[%s]" % 
aa_bp_unit, fontsize=label_size) - P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - # # use same tick labels for x and y axis - # tick_locs, tick_labels = P.yticks() - # P.xticks(tick_locs) - # P.xlim(0, length_seq+1) - - P.title(unicode_name(shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size*1.3, fontweight='bold') - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s-%d_%s_wordsize%i%s.%s' %(prefix, name_graph, counter, shorten_name(name_seq, max_len=title_length, title_clip_pos=title_clip_pos), wordsize, suffix, filetype) - P.savefig(fig_name, bbox_inches='tight') - - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - if multi and counter >= 1: - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - P.subplots_adjust(hspace=0.5, wspace=0.5) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() # clear any prior graph - - list_of_png_names.append(fig_name) - - print "\n\nDrawing selfdotplots done" - log_txt += "\n\nDrawing selfdotplots done" - logprint(log_txt, start=False, printing=False) - - return list_of_png_names - -def pairdotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", max_N_percentage=49, verbose=False, multi=True, ncols=4, nrows=5, x_label_pos_top=True, only_vs_first_seq=False, length_scaling=True, scale_delim_col="red"): - """ - pairwise dotplot (all-against-all) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - # check if at least two input sequences - if len(sequences) < 2: - text = "\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += " Please provide at least two sequences for pairdotplot!\n\nTerminating paired dotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 2 and multi: - text = "\n\nCreating collage output for single pairdotplot!" 
- text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" - logprint(text, start=False, printing=True) - - if multi and (ncols == 0 or nrows == 0): - ncols = max(ncols, 1) - nrows = max(nrows, 1) - text = "\n\nPairdotplot Collage: Invalid collage settings - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if multi and ncols > len(sequences)*(len(sequences)-1): - ncols = len(sequences) - nrows = 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - elif multi and ncols*(nrows-1) > len(sequences)*(len(sequences)-1): - nrows = ((len(sequences)-1) // ncols) + 1 - text = "\n\nPairdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" % (ncols, nrows) - logprint(text, start=False, printing=True) - - if not only_vs_first_seq: - text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (50*"=", len(sequences)*(len(sequences)-1)/2, 36*"-") - text += ", ".join(sequences) + "\n" - else: - text = "\n%s\n\nCreating %d paired dotplot images against 1st sequence '%s':\n%s\n\n=>" % (50*"=", len(sequences)-1, sequences[0], 36*"-") - text += ", ".join(sequences[1:]) + "\n" - logprint(text, start=False, printing=True) - - if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size/2: - label_size = plot_size * 3 // 2 - text = "Reducing label size for better visualization to %d\n" % label_size - logprint(text, start=False, printing=True) - - y_label_rotation = "vertical" - # for cartesian coordinate system with mirrored y-axis: plot x labels below plot - if mirror_y_axis: - x_label_pos_top = False - - # preparations for file name - name_graph = "Pairdotplot" - if prefix != None: - if not prefix[-1] == "-": - prefix = prefix + "-" - else: - prefix = "" - suffix = "" - if convert_wobbles: - suffix += 
"_wobbles" - if substitution_count != 0: - suffix += "_S%d" % substitution_count - if length_scaling: - suffix += "_scaled" - if multi: - suffix += "_collage" - - # calculate fig ratios - if not multi: - ncols = 1 - nrows = 1 - figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) - - P.cla() # clear any prior graph - list_of_png_names = [] - if multi: - fig = P.figure(figsize=(figsize_x, figsize_y)) - page_counter = 1 - - # prepare LCS data file - lcs_data_file = open("%sPairdotplot_wordsize%d_lcs_data_file%s.txt" % (prefix, wordsize, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - counter, seq_counter = 0, 0 - print "Drawing pairwise dotplot...", - log_txt = "Drawing pairwise dotplot..." - if verbose: - seq_text = "" - for idx in range(len(sequences)-1): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." 
% ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx+1, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += " " + str(seq_counter) - - # get positions of matches - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - - # write LCS data file - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - - # plotting with matplotlib - ################################# - - # combined plotting - if multi: - # plotting subplot with matplotlib - ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber - - else: - # calculate figure size for separate figures - if len_one >= len_two: - sizing = (plot_size, max(2, (plot_size)*len_two*1./len_one)) - # sizing = (plot_size, min(plot_size, max(2, (plot_size-2)*len_two*1./len_one+2))) - else: - sizing = (max(2, (plot_size)*len_one*1./len_two), plot_size) - # sizing = (min(plot_size, max(2, (plot_size-2)*len_one*1./len_two+2)), plot_size) - fig = P.figure(figsize=(plot_size, plot_size)) - - ax = P.subplot(1, 1, 1) - - # collect lines - 
lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # format axes - P.xlabel(unicode_name(shorten_name(name_one, max_len=title_length, title_clip_pos=title_clip_pos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.ylabel(unicode_name(shorten_name(name_two, max_len=title_length, title_clip_pos=title_clip_pos)) + " [%s]" % aa_bp_unit, fontsize=label_size, fontweight='bold', labelpad=4) - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - - # P.axis('scaled') # make images scaled by size ### optional update ### - if not multi: - if length_scaling: - ax.set_aspect(aspect='equal', adjustable='box', anchor='NW') - P.xlim(0, len_one+1) - # xlimit = [0, len_one+1] - if mirror_y_axis: - P.ylim(0, len_two+1) # rotate y axis (point upwards) - else: - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - elif not length_scaling: - P.xlim(0, len_one+1) - # xlimit = [0, len_one+1] - if mirror_y_axis: - P.ylim(0, len_two+1) # rotate y axis (point upwards) - else: - P.ylim(len_two+1, 0) # rotate y axis (point downwards) - else: - max_len = max(len_one, len_two) - P.xlim(0, max_len+1) - # xlimit = [0, max_len+1] - if mirror_y_axis: - P.ylim(0, max_len+1) # rotate y axis (point upwards) - else: - P.ylim(max_len+1, 0) # rotate y axis (point downwards) - - # plot line deliminating shorter sequence - if max_len != len_one: - ax.plot((len_one+1, len_one+1), (0, len_two), marker="", linestyle="--", color=scale_delim_col, markerfacecolor="r") - if max_len != len_two: - ax.plot((0, len_one), (len_two+1, len_two+1), marker="", linestyle="--", 
color=scale_delim_col, markerfacecolor="r") - - # # use same tick labels for x and y axis - # if P.xlim() == P.ylim(): - # tick_locs, tick_labels = P.yticks() - # P.xticks(tick_locs) - # P.xlim(xlimit) - - # evtl. switch x axis position - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - P.setp(ax.get_xticklabels(), fontsize=label_size*.9) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) - - # save figure and reinitiate if page is full - if multi and counter == ncols * nrows: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=.5, wspace=.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=.5, wspace=.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - counter = 0 - page_counter += 1 - - fig = P.figure(figsize=(figsize_x, figsize_y)) - - # plotting separate figure files - elif not multi: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02, left=0.13, bottom=0.05) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.02, wspace=0.02) # space between rows - def 0.4 - - # name and create output files - fig_name = '%s%s-%d_wordsize%i%s.%s' % (prefix, name_graph, counter, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - list_of_png_names.append(fig_name) - fig = P.figure() - - if only_vs_first_seq: - break - - # save figure - if multi and counter >= 1: - - # finalize layout - margins & spacing between plots - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! Please check sequence names and layout settings!") - if x_label_pos_top: - P.subplots_adjust(hspace=0.5, wspace=0.5, top=0.95) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=0.5, wspace=0.5, bottom=0.05) # space between rows - def 0.4 - - # name and create output files (names derived from SEQNAME) - fig_name = '%s%s_wordsize%i%s-%.3d.%s' %(prefix, name_graph, wordsize, suffix, page_counter, filetype) - P.savefig(fig_name, bbox_inches='tight') - P.close() - P.cla() - - list_of_png_names.append(fig_name) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - print - logprint(seq_text, start=False, printing=False) - - return list_of_png_names - -def polydotplot(input_fasta, wordsize, prefix=None, plot_size=10, label_size=10, filetype='png', type_nuc=True, convert_wobbles=False, substitution_count=0, alphabetic_sorting=False, mirror_y_axis=False, title_length=float("Inf"), title_clip_pos="B", 
max_N_percentage=49, verbose=False, gff_files=[], gff_color_dict={"others": ("grey", 1, 0)}, x_label_pos_top=True, lcs_shading=True, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, lcs_shading_num=5, spacing=0.04, input_user_matrix_file="", user_matrix_print=True, rotate_labels=False): - """ - all-against-all dotplot - derived from dotplot function - - lcs_shading_refs: - 0 color relative to maximum lcs observed in dataset [default] - 1 color by coverage of shorter sequence (e.g. lcs = 70% of seq1) - lcs_shading_ori - 0 forward only - 1 reverse only - 2 both orientations (in opposite plot) - """ - - # read sequences - seq_dict, sequences = read_seq(input_fasta) - if seq_dict == {}: - logprint("\nFailed to load sequences") - return [] - - if alphabetic_sorting: - sequences = sorted(sequences) - - if len(sequences) == 0: - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " No sequences provided for polydotplot!\n\nTerminating polydotplot!" - logprint(text, start=False, printing=True) - return - elif len(sequences) == 1: - text = "\n\nCreating polydotplot for single sequence!" 
- text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n\n" - logprint(text, start=False, printing=True) - - text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % (50*"=", len(sequences), len(sequences), 30*"-") - text += " " + " ".join(sequences) + "\n" - logprint(text, start=False, printing=True) - - # read gff annotation data if provided for shading - if gff_files != None and gff_files != []: - text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % (50*"=", len(gff_files), 28*"-", ", ".join(gff_files)) - logprint(text, start=False, printing=True) - if prefix != None and prefix != "": - legend_prefix = prefix + "-Polydotplot" - else: legend_prefix = "Polydotplot" - feat_dict = read_gffs(gff_files, color_dict=gff_color_dict, type_nuc=type_nuc, prefix=legend_prefix, filetype=filetype, verbose=verbose) - - if lcs_shading and not type_nuc: - if lcs_shading_ori != 0: - lcs_shading_ori = 0 - text = "Protein shading does not support reverse complementary matching!\n" - logprint(text, start=False, printing=True) - - # read custom shading matrix & match names of sequences to fasta - if input_user_matrix_file != "" and input_user_matrix_file != None: - logprint("Reading user matrix file: %s" % input_user_matrix_file) - # lcs_shading_ori = 2 - custom_dict = read_matrix(input_user_matrix_file) - if custom_dict != {}: - custom_shading = True - custom_similarity_dict = {} - invalid_entries = [] - custom_max = 0 - custom_min = float("Inf") - for key in custom_dict.keys(): - number_key = [] - - # convert number into float - try: - value = float(custom_dict[key]) - if not "." 
in custom_dict[key]: - value = int(custom_dict[key]) - custom_max = max(custom_max, value) - custom_min = min(custom_min, value) - except: - value = custom_dict[key] - if value == "": - value = None - invalid_entries.append(key) - # match matrix names with sequence names - for item in key: - if item in sequences: - number_key.append(sequences.index(item)) - else: - number_key.append(-1) - # dictionary with tuple of sorted sequence indices as key and number as value - custom_similarity_dict[tuple(sorted(number_key))] = value - if len(invalid_entries) != 0: - text = "No valid number in custom similarity matrix for %d entries: \n\t" % (len(invalid_entries)) - for key in invalid_entries: - text += str(key) + " - " + str(custom_dict[key]) + "; " - logprint(text[:-2]+"\n") - - text = "Custom user matrix given: min %.2f, max %.2f\n" % (custom_min, custom_max) - - # artificially rounding intervals if likely identity/divergence percentages - if 0 <= custom_min < 1 and 0 < custom_max <= 1: - rounding_factor = 5 - multi_factor = 100 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (multi_factor*custom_min // rounding_factor) * (1.*rounding_factor/multi_factor)) - custom_max = min((multi_factor*custom_max // rounding_factor) * (1.*rounding_factor/multi_factor), 1) - text += "new (%.2f, >%2f)\n" % (custom_min, custom_max) - - elif 0 <= custom_min < 100 and 0 < custom_max <= 100: - rounding_factor = 5 - text += " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " % (custom_min, custom_max) - custom_min = max(0, (custom_min // rounding_factor) * rounding_factor) - custom_max = min((custom_max // rounding_factor) * rounding_factor, 100) - text += "new (%d, >%d)\n" % (custom_min, custom_max) - - logprint(text) - - else: - custom_shading = False - - name_graph = "Polydotplot" - suffix = "" - if convert_wobbles: - suffix += "_wobbles" - if substitution_count != 0: - suffix += "_S%d" 
% substitution_count - if custom_shading: - suffix += "_matrix" - if lcs_shading: - suffix += "_%dshades_ref%d_ori%s" % (lcs_shading_num+1, lcs_shading_ref, lcs_shading_ori) - if "ref2" in suffix and type_nuc: - suffix = suffix.replace("ref2", "%dbp" % lcs_shading_interval_len) - elif "ref2" in suffix: - suffix = suffix.replace("ref2", "%daa" % lcs_shading_interval_len) - - - # name and create output files (names derived from SEQNAME) - if prefix != None and str(prefix) != "": - prefix = str(prefix) + "-" - else: - prefix = "" - - # preparations for background shading - if lcs_shading or custom_shading: - # create color range white to grey - colors = create_color_list(lcs_shading_num+1, color_map=None, logging=True) - colors_2 = create_color_list(lcs_shading_num+1, color_map="OrRd", logging=True) - - if custom_shading: - text = "Custom Matrix Colors: " + ", ".join(colors_2) - - # write lcs lengths to file - lcs_data_file = open("%sPolydotplot_lcs_data_file%s.txt" % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), 'w') - lcs_data_file.write("\t".join(["#title1", "title2", "len_seq1", "len_seq2", "len_lcs_for", "%_min_seq_len", "len_lcs_rev", "%_min_seq_len"])+"\n") - - # compare sequences pairwise - save lcs and line information in dictionary for plotting - data_dict = {} # keys = tuple(idx, jdx), value = x1, y1, x2, y2 (line positions) - lcs_dict = {} # keys = tuple(idx, jdx), value = length of lcs: lcs_len or (lcs_for, lcs_rev) - for_lcs_set = set([]) # keep lengths to calculate max (excluding self comparisons) - rev_lcs_set = set([]) # keep lengths to calculate max (all) - - text = "\nTotal plot count: %d" % (len(sequences)*(len(sequences))) - text += "\nTotal calculations: %d" % (len(sequences)*(len(sequences)+1)/2) - logprint(text, start=False, printing=True) - - print "\nCalculating shared regions and lengths of longest_common_substring...", - log_txt = "\nCalculating shared regions and lengths of longest_common_substring..." 
- # determine matches and length of lcs by comparing all sequence pairs - if verbose: - seq_text = "" - counter = 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - name_two = rec_two.id - seq_two = rec_two.seq - len_two = len(seq_two) - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - name_one = rec_one.id - seq_one = rec_one.seq - len_one = len(seq_one) - - counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif len(sequences) < 5: - print "\t%s (%d %s), %s (%d %s)" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - log_txt += "\t%s (%d %s), %s (%d %s)\n" % (name_one, len_one, aa_bp_unit, name_two, len_two, aa_bp_unit) - else: - if not counter % 25: - print counter, - log_txt += str(counter) - - # get positions of matches & length of longest common substring based on match lengths - if substitution_count != 0: - # print "RE" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(seq_one, seq_two, wordsize, substitution_count=substitution_count, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - else: - # print "DIAG" - x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(seq_one, seq_two, wordsize, convert_wobbles=convert_wobbles, max_N_percentage=max_N_percentage, report_lcs=True, type_nuc=type_nuc, verbose=verbose) - data_dict[(idx, jdx)] = x1[:], y1[:], x2[:], y2[:] - lcs_dict[idx, jdx] = lcs_for, lcs_rev - - if idx != jdx: - for_lcs_set.add(lcs_for) - rev_lcs_set.add(lcs_rev) - - lcs_data_file.write("\t".join([name_one, name_two, str(len_one), str(len_two), - str(lcs_for), str(round((lcs_for*100./min(len_one, len_two)), 3)), - str(lcs_rev), str(round((lcs_rev*100./min(len_one, len_two)), 3))]) + "\n") - - if not verbose: - print 
len(sequences)*(len(sequences)+1)/2, " done\n" - log_txt += str(len(sequences)*(len(sequences)+1)/2) + " done\n" - else: - print "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - log_txt += "\n%d done" % (len(sequences)*(len(sequences)+1)/2) - logprint(log_txt, start=False, printing=False) - - if verbose: - logprint ("\n\nlcs_dict\n" + str(lcs_dict)) - if custom_shading: - logprint ("\ncustom_dict\n" + str(custom_dict)) - logprint ("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) - - if verbose: - print - logprint(seq_text+"\n", start=False, printing=False) - - if lcs_shading_ref == 2: - color_bins = [] - text = "\nLCS lengh bins: " - for idx in range(lcs_shading_num): - color_bins.append(lcs_shading_interval_len*(idx+1)) - text += " " + str(lcs_shading_interval_len*(idx+1)) - logprint(text, start=False, printing=True) - - # calculate maximum lcs length - if lcs_shading_ori == 0: # forward only - if len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - elif lcs_shading_ori == 1: # reverse complement only - if len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - else: - max_lcs = None - else: # both orientations - if len(rev_lcs_set) != 0 and len(for_lcs_set) != 0: - max_lcs = max(max(rev_lcs_set), max(for_lcs_set)) - elif len(rev_lcs_set) != 0: - max_lcs = max(rev_lcs_set) - elif len(for_lcs_set) != 0: - max_lcs = max(for_lcs_set) - else: - max_lcs = None - - if not max_lcs == None: - text = "Maximum LCS: %d %s" % (max_lcs, aa_bp_unit) - logprint(text, start=False, printing=True) - if custom_shading: - text = "Maximum custom value: %d\n" % custom_max - logprint(text, start=False, printing=True) - - # count sequences - ncols = len(sequences); nrows = len(sequences) - - # get sequence lengths to scale plot widths and heights accordingly - size_ratios = [] - for item in sequences: - size_ratios.append(len(seq_dict[item].seq)) - - P.cla() # clear any prior graph - # use GridSpec to resize plots according to sequence length - 
if mirror_y_axis: - height_ratios = size_ratios[::-1] - else: - height_ratios = size_ratios[:] - gs = gridspec.GridSpec(nrows, ncols, - width_ratios=size_ratios, - height_ratios=height_ratios) - fig = P.figure(figsize=(plot_size, plot_size)) - - # for cartesian coordinate system with mirrored y-axis: plot x labels below plot - if mirror_y_axis and representation == 1: - x_label_pos_top = True - elif mirror_y_axis or representation == 2: - x_label_pos_top = False - - # print y labels on the right, if upper right triangle is displayed - if (representation == 1 and not mirror_y_axis) or (representation == 2 and mirror_y_axis): - y_label_pos = 0 # last column - else: # left y label - y_label_pos = 1 # first column - - # determine label orientations - if len(sequences) > 5 or rotate_labels: - x_label_rotation = 45 - y_label_rotation = "horizontal" - if x_label_pos_top: - xhalign = 'left' - xvalign = 'bottom' - else: - xhalign = 'right' - xvalign = 'top' - yhalign = "right" - else: - x_label_rotation = "horizontal" - y_label_rotation = "vertical" - xvalign = "center" - xhalign = "center" - yhalign = "center" - yvalign = 'center' - - # check combination of shading parameters for triangular output - if representation != 0 and lcs_shading and custom_shading: # both directions in triangle - logprint("\nAttention: For triangular output custom-shading and LCS shading cannot be combined!\n") - elif representation != 0 and lcs_shading and lcs_shading_ori == 2: # both directions in triangle - logprint("\nAttention: For triangular output LCS shading for both orientations is combined to max of both orientations!\n") - - print "\nDrawing polydotplot...", - log_txt = "\nDrawing polydotplot..." 
- - # draw subplots - if verbose: - if lcs_shading and custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "Custom matrix value", "Matrix color index", "LCS color index"]) + "\n" - elif lcs_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "LCS for [%s]" %aa_bp_unit, "LCS for [%s]" %aa_bp_unit, "LCS color index for", "LCS color index rev"]) + "\n" - elif custom_shading: - lcs_text = "\n" + "\t".join(["#Seq1", "Seq2", "Custom matrix value", "Color index for", "Color index rev"]) + "\n" - - if verbose: - seq_text = "" - counter, seq_counter = 0, 0 - for idx in range(len(sequences)): - if verbose: - print "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]), - seq_text += "\n%d\t%s vs." % ((seq_counter+1), sequences[idx]) - rec_two = seq_dict[sequences[idx]] - len_two = len(rec_two.seq) - name_two = rec_two.id - - for jdx in range(idx, len(sequences)): - rec_one = seq_dict[sequences[jdx]] - len_one = len(rec_one.seq) - name_one = rec_one.id - - counter += 1 - seq_counter += 1 - if verbose: - print sequences[jdx], - seq_text += " " + sequences[jdx] - elif not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - # optional shade background according to length of LCS and/or user matrix - ######################################################################### - - # get interval based on LCS - background_colors = [None, None] - if lcs_shading and (lcs_shading_ref==1 or lcs_shading_ref==2 or max_lcs!=None): # self plot max_lcs_for == None - lcs_len = lcs_dict[(idx, jdx)] - l1 = lcs_len[0] # forward - l2 = lcs_len[1] # reverse complement - - lcs_shading_bool = True - - # calculate shading acc. 
to chosen option - if lcs_shading_ref == 1: # percentage of shorter sequence - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // min(len_one, len_two)) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // min(len_one, len_two)) - elif lcs_shading_ref == 2: # by given interval size - color_idx0 = min(len(colors)-1, l1 // lcs_shading_interval_len) - color_idx1 = min(len(colors)-1, l2 // lcs_shading_interval_len) - if color_idx0 >= len(colors): - color_idx0 = len(colors) - if color_idx1 >= len(colors): - color_idx1 = len(colors) - else: # percentage of maximum lcs length - color_idx0 = min(len(colors)-1, l1*lcs_shading_num // max_lcs) - color_idx1 = min(len(colors)-1, l2*lcs_shading_num // max_lcs) - else: - lcs_shading_bool = False - - # get interval based on custom matrix - if custom_shading: - # matrix value - try: - custom_value = custom_similarity_dict[(idx, jdx)] - except: - custom_value = "" - - # bottom left triangle = LCS forward/reverse or best of both - if lcs_shading_bool: - if lcs_shading_ori == 0: # forward - color_idx1 = color_idx0 - elif lcs_shading_ori == 2: # both directions - color_idx1 = max(color_idx0, color_idx1) - - # top right triangle = custom value (not colored if text matrix provided) - if type(custom_value) == int or type(custom_value) == float: - color_idx0 = int((custom_value-custom_min)*lcs_shading_num // (custom_max-custom_min)) - # no color if string is proviced - else: - color_idx0 = 0 - - # use best LCS of both orientations for coloring triangle with two-ori-LCS - if representation != 0 and lcs_shading_ori == 2: # both directions in triangle - color_idx0, color_idx1 = max(color_idx0, color_idx1), max(color_idx0, color_idx1) - - # set colors dependent on lcs dependent on orientation - if lcs_shading_bool and not custom_shading: - if idx != jdx: - if lcs_shading_ori == 0: - color_idx1 = color_idx0 - elif lcs_shading_ori == 1: - color_idx0 = color_idx1 - background_colors[0] = colors[color_idx0] - background_colors[1] = 
colors[color_idx1] - # for selfcomparison, only color reverse complement - elif lcs_shading_ori != 0 and not custom_shading: - background_colors[0] = colors[color_idx1] - # set different colors for shading by LCS + user matrix - elif lcs_shading_bool and custom_shading: - # print colors, background_colors, color_idx0, color_idx1 - background_colors[0] = colors_2[color_idx0] - background_colors[1] = colors[color_idx1] - # set grey color range for user matrix if no LCS shading - elif custom_shading: - background_colors[0] = colors[color_idx0] - background_colors[1] = colors[color_idx0] - - if verbose: - if custom_shading and lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - elif lcs_shading_bool: - lcs_text += "\t".join([name_one, name_two, str(lcs_len[0]), str(lcs_len[1]), str(color_idx0), str(color_idx1)]) + "\n" - elif custom_shading: - lcs_text += "\t".join([name_one, name_two, str(custom_value), str(color_idx0), str(color_idx1)]) + "\n" - - # calculate figure position in polyplot - # diagonal (self-dotplots) - if idx == jdx: - if mirror_y_axis: - seq_num = sequences.index(name_one)+1 - counter1 = seq_num + len(sequences) * (len(sequences)-seq_num) - counter = counter + (counter - 1) // (nrows) - else: - # skip positions below diagonal - counter1 = counter + (counter - 1) // (nrows) # + row_pos - counter = counter1 - counters = [counter1] - - # draw both graphs at once (due to symmetry) - else: - if mirror_y_axis: - col_pos = sequences.index(name_two)+1 - row_pos = len(sequences) - (sequences.index(name_one)+1) - counter1 = row_pos * ncols + col_pos - counter2 = (ncols - col_pos) * ncols + ncols - row_pos - else: - counter1 = counter - col_pos = (counter - 1) % ncols - row_pos = (counter - 1) // (nrows) - counter2 = col_pos * ncols + row_pos + 1 - counters = [counter1, counter2] # lower, upper - - if len(counters) == 2: - seq_counter += 1 - if not verbose 
and not seq_counter % 25: - print seq_counter, - log_txt += str(seq_counter) - - x_lists, y_lists, x_lists_rc, y_lists_rc = data_dict[(idx, jdx)] - - # plot diagram(s) - for kdx in range(len(counters)): - - if representation == 0 or len(counters) == 1 or (representation == 1 and kdx == 0) or (representation == 2 and kdx == 1): - - fig_pos = counters[kdx] - # plotting subplot with matplotlib - ax = P.subplot(gs[fig_pos-1]) # rows, columns, plotnumber - - # shade annotated regions if gff file(s) provided - if idx == jdx and gff_files != None and gff_files != []: - if name_one in feat_dict.keys(): - features = feat_dict[name_one] - if len_two != len_one: - logprint("Polydot GFF shading for diagonal fields - nequal length error!") - return - for item in features: - feat_type, start, stop = item - feat_color, strength, zoom = gff_color_dict[feat_type.lower()] - start = max(0, start - zoom - 0.5) - stop = min(len_one+1, stop + zoom + 0.5) - width = stop - start - ax.add_patch(patches.Rectangle((start, start), # (x,y) - width, width, # width, height - edgecolor=None, linewidth=line_width+zoom, - fill=True, facecolor=feat_color, - alpha=strength)) - - # if custom matrix value printed into upper matrix triangle, skip data plotting - # text print in top triangle - if user_matrix_print and custom_shading and kdx==0 and idx!=jdx: - data_plotting = False - # dotplot in bottom triangle - else: - data_plotting = True - - # mirror plot, if plotting below diagonal - if kdx == 0: - l1, l2 = len_one, len_two - n1, n2 = name_one, name_two - x1, y1 = x_lists, y_lists - x2, y2 = x_lists_rc, y_lists_rc - else: - l2, l1 = len_one, len_two - n2, n1 = name_one, name_two - x1, y1 = y_lists, x_lists - x2, y2 = y_lists_rc, x_lists_rc - - if mirror_y_axis: - x1, y1, x2, y2 = y1, x1, y2, x2 - n1, n2 = n2, n1 - - if data_plotting: - # collect lines - lines = [] - color_list = [] - for (x_lines, y_lines, col) in [(x2, y2, line_col_rev), (x1, y1, line_col_for)]: - if col != "white": - for ldx in 
range(len(x_lines)): - lines.append([(x_lines[ldx][0], y_lines[ldx][0]), (x_lines[ldx][-1], y_lines[ldx][-1])]) - color_list.append(col) - color_list = np.array(color_list) - - # draw lines - lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) - ax.add_collection(lc) - - # plot value provided by customer instead of dotplot - else: - alignment = {'horizontalalignment': 'center', 'verticalalignment': 'center'} - # P.text(0.5, 0.5, custom_value, size='medium', transform=ax.transAxes, **alignment) - P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, **alignment) - # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, - # horizontalalignment='center', verticalalignment='center', color="black") - - if custom_shading: - # omit diagonal - if idx == jdx: - ax.set_facecolor("white") - # use white background for text fields (top right triangle only [kdx 0]) - elif type(custom_value) != int and type(custom_value) != float and kdx == 0: - ax.set_facecolor("white") - else: - ax.set_facecolor(background_colors[kdx]) - # set background color if lcs shading - elif lcs_shading_bool and background_colors[kdx] != None: - ax.set_facecolor(background_colors[kdx]) - - # set axis limits - # P.xlim(0, l1+1) - if mirror_y_axis: - P.xlim(0, l2+1) - P.ylim(0, l1+1) # rotate y axis (point upwards) - else: - P.xlim(0, l1+1) - P.ylim(l2+1, 0) # rotate y axis (point downwards) - - ## axis labelling - ################## - - # determine axis positions - if x_label_pos_top: - ax.xaxis.tick_top() - ax.xaxis.set_label_position('top') - x_label_bool = fig_pos <= ncols - x_tick_bool = fig_pos > ncols*(ncols-1) - else: - x_label_bool = fig_pos > ncols*(ncols-1) - x_tick_bool = fig_pos <= ncols - - # settings for y labels on right side - if y_label_pos == 0: # right label - ax.yaxis.tick_right() - ax.yaxis.set_label_position("right") - label_dist = 30 - else: - label_dist = 8 - - # x axis labels dependent on plot position/number - if 
x_label_bool: # x title and labels on top or bottom - P.xlabel(unicode_name(shorten_name(n1, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, rotation=x_label_rotation, verticalalignment=xvalign, horizontalalignment=xhalign, fontweight='bold', labelpad=8) # axis naming - if not x_label_rotation in ["horizontal", "vertical"]: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation="vertical") - else: - P.setp(ax.get_xticklabels(), fontsize=label_size*.9, rotation=x_label_rotation) - elif x_tick_bool and x_label_pos_top: # x ticks on bottom row - ax.xaxis.tick_bottom() # ticks without labels on bottom - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) - elif x_tick_bool: # x ticks on top row - ax.xaxis.tick_top() # # ticks without labels on top - P.setp(ax.get_xticklabels(), fontsize=label_size, rotation=x_label_rotation, visible=False) # inner diagrams without labelling - elif idx == jdx and representation != 0: - if not mirror_y_axis and representation == 1: # upper - ax.xaxis.tick_bottom() - elif mirror_y_axis and representation == 2: # lower - ax.xaxis.tick_top() - elif mirror_y_axis and representation == 1: # upper - ax.xaxis.tick_bottom() - elif not mirror_y_axis and representation == 2: # lower - ax.xaxis.tick_top() - P.setp(ax.get_xticklabels(), visible=False) # inner diagrams without labelling - else: # no x ticks on internal rows - ax.axes.get_xaxis().set_visible(False) - - # y axis labels dependent on plot position/number - if fig_pos % ncols == y_label_pos or (ncols == 1 and nrows == 1): # y title and labels in 1st column - P.ylabel(unicode_name(shorten_name(n2, max_len=title_length, title_clip_pos=title_clip_pos)), fontsize=label_size, rotation=y_label_rotation, verticalalignment=yvalign, horizontalalignment=yhalign, fontweight='bold', labelpad=label_dist) - P.setp(ax.get_yticklabels(), fontsize=label_size*.9) # axis naming - elif fig_pos % ncols == 0: # y ticks in last column 
- ax.yaxis.tick_right() - P.setp(ax.get_yticklabels(), visible=False) # inner diagrams without labelling - elif idx == jdx and representation != 0: - if not mirror_y_axis and representation == 1: # upper - ax.yaxis.tick_left() - elif mirror_y_axis and representation == 2: # lower - ax.yaxis.tick_left() - elif mirror_y_axis and representation == 1: # upper - ax.yaxis.tick_right() - elif not mirror_y_axis and representation == 2: # lower - ax.yaxis.tick_right() - P.setp(ax.get_yticklabels(), visible=False) # inner diagrams without labelling - else: - ax.axes.get_yaxis().set_visible(False) - - if not verbose: - print seq_counter, "done" - log_txt += str(seq_counter) + " done" - else: - print "\n%d done" % seq_counter - log_txt += "\n%d done" % seq_counter - logprint(log_txt, start=False, printing=False) - - if verbose: - try: - logprint(lcs_text, start=False, printing=True) - except: - pass - - # finalize layout - margins & spacing between plots - P.tick_params(axis='both', which='major', labelsize=label_size*.9) - try: - P.tight_layout(h_pad=.02, w_pad=.02) - except: - logprint("Attention - pylab.tight_layout failed! 
Please check sequence names and layout settings!") - # gs.tight_layout(fig, h_pad=.02, w_pad=.02) # less overlapping tick labels, but also disturbingly large spacing - if y_label_rotation == "horizontal": - if x_label_pos_top: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, top=0.87) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing, left=0.13, bottom=0.13) # space between rows - def 0.4 - else: - P.subplots_adjust(hspace=spacing, wspace=spacing) # space between rows - def 0.4 - - # save figure and close instance - fig_name = '%s%s_wordsize%i%s.%s' % (prefix, name_graph, wordsize, suffix, filetype) - P.savefig(fig_name) - P.close() - P.cla() - - - # create figure color legend - if lcs_shading: - if lcs_shading_ref == 1: # percentage of shorter sequence - legend_file_name = legend_figure(colors, lcs_shading_num, unit="%", filetype=filetype, prefix=prefix) - elif lcs_shading_ref == 2: # interval sizes - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, bins=color_bins) - else: # relative of maximum lcs - legend_file_name = legend_figure(colors, lcs_shading_num, unit=aa_bp_unit, filetype=filetype, prefix=prefix, max_len=max_lcs) - - if custom_shading: - custom_prefix = "custom-matrix-" + prefix - legend_file_name_custom = legend_figure(colors_2, lcs_shading_num, unit="%", filetype=filetype, prefix=custom_prefix, max_len=custom_max, min_len=custom_min) - - if lcs_shading and custom_shading: - return [fig_name, legend_file_name, legend_file_name_custom] - elif lcs_shading: - return [fig_name, legend_file_name] - elif custom_shading: - return [fig_name, legend_file_name_custom] - else: - return [fig_name] - - -############################### -# Function Call # -############################### - -def main(seq_list, wordsize, modes=[0, 1, 2], prefix=None, plot_size=10, label_size=10, filetype="png", type_nuc=True, convert_wobbles=False, substitution_count=0, 
rc_option=True, alphabetic_sorting=False, only_vs_first_seq=False, gff=None, multi=True, ncols=1, nrows=1, lcs_shading=True, lcs_shading_num=5, lcs_shading_ref=0, lcs_shading_interval_len=100, lcs_shading_ori=0, gff_color_config_file="", input_user_matrix_file="", user_matrix_print=False, length_scaling=True, title_length=50, title_clip_pos="B", spacing=0.04, max_N_percentage=49, mirror_y_axis=False, verbose=False): - - global t1, line_col_rev - - # check input variables - if convert_wobbles and max_N_percentage > 49: - max_N_percentage = 49 - if type_nuc: - ambiq_res = "N" - else: - ambiq_res = "X" - text = "Provide valid max_N_percentage, kmers with >50%% %ss are ignored\n" % (ambiq_res) - logprint(text, start=False, printing=True) - - if filetype not in ["png", "pdf", "svg"]: - text = "Provide valid file type - png, pdf, or svg - given:%s\n" % filetype - logprint(text, start=False, printing=True) - filetype = "png" - - # read gff color config file if provided - if len(input_gff_files) != 0 and input_gff_files != None: - if gff_color_config_file not in ["", None]: - text = "\n%s\n\nReading GFF color configuration file\n%s\n\n=> %s\n" % (50*"=", 28*"-", gff_color_config_file) - logprint(text, start=False, printing=True) - gff_feat_colors = read_gff_color_config(gff_color_config_file) - else: - gff_feat_colors = {} - if gff_color_config_file not in ["", None]: - text = "Please provide GFF annotation files to use configuration file", gff_color_config_file - logprint(text, start=False, printing=True) - - # if color is set to white, reverse complementary matches are skipped - if not rc_option: - line_col_rev = "white" # reverse matches not calculated - elif not type_nuc: - logprint("Reverse complement deactivated for proteins!") - line_col_rev = "white" # reverse matches not calculated - - mode_text = [] - for item in modes: - mode_text.append(str(item)) - text = "%s\n\nRunning plotting modes %s" % (50*"=", ", ".join(mode_text)) - logprint(text, start=False, 
printing=True) - - - # create dotplots - ########################################## - - # self dotplots - t1 = time.time() - if 0 in modes: - list_of_png_names = selfdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, multi=multi, ncols=ncols, nrows=nrows, gff_files=gff, gff_color_dict=gff_feat_colors, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # paired dotplots - if 1 in modes: - if multi: - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, multi=multi, ncols=ncols, nrows=nrows, length_scaling=length_scaling, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - else: - if not length_scaling: - text = "\nPairwise dotplot with individual output files scaled by sequence length automatically!" 
- logprint(text, start=False, printing=True) - list_of_png_names = pairdotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, multi=multi, ncols=ncols, nrows=nrows, length_scaling=True, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - # all-against-all dotplot - if 2 in modes: - list_of_png_names = polydotplot(seq_list, wordsize, prefix=prefix, label_size=label_size, title_length=title_length, title_clip_pos=title_clip_pos, plot_size=plot_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=convert_wobbles, substitution_count=substitution_count, alphabetic_sorting=alphabetic_sorting, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, spacing=spacing, gff_files=gff, gff_color_dict=gff_feat_colors, mirror_y_axis=mirror_y_axis, max_N_percentage=max_N_percentage, verbose=verbose) - t1 = time_track(t1) - if list_of_png_names != [] and list_of_png_names != None: - text = "-> Image file(s): %s\n" % ", ".join(list_of_png_names) - else: - text = "No image files were created!\n" - logprint(text, start=False, printing=True) - logprint(50*"=") - - text = "\n" + 50 * "#" + "\n" + 50 * "#" - text += "\n\nThank you for using FlexiDot!\n" - logprint(text, start=False, printing=True) - - -load_modules() - -# 
testing mode for debugging -trial_mode = True -trial_mode = False - -# parameters = check_input(sys.argv) -parameters = check_input(sys.argv, trial_mode=trial_mode) - -# read out parameters -commandline, auto_fas, input_fasta, output_file_prefix, collage_output, m_col, n_row, filetype, type_nuc, input_gff_files, gff_color_config_file, wordsize, plotting_modes, wobble_conversion, substitution_count, rc_option, alphabetic_sorting, only_vs_first_seq, lcs_shading, lcs_shading_num, lcs_shading_ref, lcs_shading_interval_len, lcs_shading_ori, input_user_matrix_file, user_matrix_print, plot_size, line_width, line_col_for, line_col_rev, x_label_pos_top, label_size, spacing, length_scaling, title_length, title_clip_pos, max_N_percentage, mirror_y_axis, representation, verbose = parameters - -# evtl. overwrite parameters for testing purposes in trial mode -if trial_mode: - input_fasta = ["test-sequences-8.fas"] - input_gff_files = ["Seq2_annotations.gff3"] - # input_user_matrix_file = "matrix.txt" - # user_matrix_print = True - output_file_prefix = "#Test" - plot_size = 10 - plotting_modes = [0,1,2] - plotting_modes = [2] - lcs_shading = False - lcs_shading = True - lcs_shading_ref = 2 - lcs_shading_num = 4 - lcs_shading_ori = 0 - lcs_shading_interval_len = 15 - wordsize = 10 - wordsize = 7 - x_label_pos_top = True - filetype = "pdf" - filetype = "png" - mirror_y_axis = False - mirror_y_axis = True - - output_file_prefix = "#R-upper" - representation = 0 # both - representation = 1 # upper - representation = 2 # lower - - wobble_conversion = False - wobble_conversion = True - - substitution_count = 0 - - rc_option = True - rc_option = False - label_size = 10 - - verbose = False - verbose = True - -if auto_fas: - path = os.path.dirname(os.path.abspath(__file__)) - files_long = glob.glob(path+"/*.fasta") - files_long.extend(glob.glob(path+"/*.fas")) - files_long.extend(glob.glob(path+"/*.fa")) - files_long.extend(glob.glob(path+"/*.fna")) - input_fasta = [] - for i in 
files_long: - if not "combined" in i: - filename = i[i.rfind('\\')+1:] - input_fasta.append(filename) - -if trial_mode: - # start logging file - logprint(commandline, start=True, printing=False, prefix=output_file_prefix) - - - - - -main(input_fasta, wordsize, modes=plotting_modes, prefix=output_file_prefix, plot_size=plot_size, label_size=label_size, filetype=filetype, type_nuc=type_nuc, convert_wobbles=wobble_conversion, substitution_count=substitution_count, rc_option=rc_option, alphabetic_sorting=alphabetic_sorting, only_vs_first_seq=only_vs_first_seq, gff=input_gff_files, multi=collage_output, ncols=m_col, nrows=n_row, lcs_shading=lcs_shading, lcs_shading_num=lcs_shading_num, lcs_shading_ref=lcs_shading_ref, lcs_shading_interval_len=lcs_shading_interval_len, lcs_shading_ori=lcs_shading_ori, gff_color_config_file=gff_color_config_file, input_user_matrix_file=input_user_matrix_file, user_matrix_print=user_matrix_print, length_scaling=length_scaling, title_length=title_length, title_clip_pos=title_clip_pos, spacing=spacing, max_N_percentage=max_N_percentage, mirror_y_axis=mirror_y_axis, verbose=verbose) - - diff --git a/documentation/FlexiDot__Highly_customizable_ambiguity_aware_dotplots__preprint.pdf b/docs/FlexiDot__Highly_customizable_ambiguity_aware_dotplots__preprint.pdf similarity index 100% rename from documentation/FlexiDot__Highly_customizable_ambiguity_aware_dotplots__preprint.pdf rename to docs/FlexiDot__Highly_customizable_ambiguity_aware_dotplots__preprint.pdf diff --git a/documentation/SupplementaryData.pdf b/docs/SupplementaryData.pdf similarity index 100% rename from documentation/SupplementaryData.pdf rename to docs/SupplementaryData.pdf diff --git a/images/Beetle_matrix_shading.png b/docs/images/Beetle_matrix_shading.png similarity index 100% rename from images/Beetle_matrix_shading.png rename to docs/images/Beetle_matrix_shading.png diff --git a/images/Fig-Suppl-MismatchesWobbles.png b/docs/images/Fig-Suppl-MismatchesWobbles.png similarity 
index 100% rename from images/Fig-Suppl-MismatchesWobbles.png rename to docs/images/Fig-Suppl-MismatchesWobbles.png diff --git a/images/FlexiLogo.png b/docs/images/FlexiLogo.png similarity index 100% rename from images/FlexiLogo.png rename to docs/images/FlexiLogo.png diff --git a/images/Selfdotplot_shaded.png b/docs/images/Selfdotplot_shaded.png similarity index 100% rename from images/Selfdotplot_shaded.png rename to docs/images/Selfdotplot_shaded.png diff --git a/images/Selfdotplots_banner.png b/docs/images/Selfdotplots_banner.png similarity index 100% rename from images/Selfdotplots_banner.png rename to docs/images/Selfdotplots_banner.png diff --git a/images/Selfdotplots_banner2.png b/docs/images/Selfdotplots_banner2.png similarity index 100% rename from images/Selfdotplots_banner2.png rename to docs/images/Selfdotplots_banner2.png diff --git a/images/Selfdotplots_banner3.png b/docs/images/Selfdotplots_banner3.png similarity index 100% rename from images/Selfdotplots_banner3.png rename to docs/images/Selfdotplots_banner3.png diff --git a/images/Selfdotplots_banner4.png b/docs/images/Selfdotplots_banner4.png similarity index 100% rename from images/Selfdotplots_banner4.png rename to docs/images/Selfdotplots_banner4.png diff --git a/images/all_against_all.png b/docs/images/all_against_all.png similarity index 100% rename from images/all_against_all.png rename to docs/images/all_against_all.png diff --git a/images/all_against_all_Flexi1.04_Para_Mirror.png b/docs/images/all_against_all_Flexi1.04_Para_Mirror.png similarity index 100% rename from images/all_against_all_Flexi1.04_Para_Mirror.png rename to docs/images/all_against_all_Flexi1.04_Para_Mirror.png diff --git a/images/all_against_all_Flexi1.04_Para_Representation.png b/docs/images/all_against_all_Flexi1.04_Para_Representation.png similarity index 100% rename from images/all_against_all_Flexi1.04_Para_Representation.png rename to docs/images/all_against_all_Flexi1.04_Para_Representation.png diff --git 
a/images/all_against_all_annotation_based_shading_cool.png b/docs/images/all_against_all_annotation_based_shading_cool.png similarity index 100% rename from images/all_against_all_annotation_based_shading_cool.png rename to docs/images/all_against_all_annotation_based_shading_cool.png diff --git a/images/all_against_all_shaded_orientation2.png b/docs/images/all_against_all_shaded_orientation2.png similarity index 100% rename from images/all_against_all_shaded_orientation2.png rename to docs/images/all_against_all_shaded_orientation2.png diff --git a/images/all_against_all_shaded_orientation_custom_matrix.png b/docs/images/all_against_all_shaded_orientation_custom_matrix.png similarity index 100% rename from images/all_against_all_shaded_orientation_custom_matrix.png rename to docs/images/all_against_all_shaded_orientation_custom_matrix.png diff --git a/images/ambiguities.png b/docs/images/ambiguities.png similarity index 100% rename from images/ambiguities.png rename to docs/images/ambiguities.png diff --git a/images/pairwise_low_res.png b/docs/images/pairwise_low_res.png similarity index 100% rename from images/pairwise_low_res.png rename to docs/images/pairwise_low_res.png diff --git a/images/sSaTar_cluster_flexi_300b.png b/docs/images/sSaTar_cluster_flexi_300b.png similarity index 100% rename from images/sSaTar_cluster_flexi_300b.png rename to docs/images/sSaTar_cluster_flexi_300b.png diff --git a/documentation/tutorial_add_annotation.md b/docs/tutorial_add_annotation.md similarity index 79% rename from documentation/tutorial_add_annotation.md rename to docs/tutorial_add_annotation.md index 5be642f..5392c36 100644 --- a/documentation/tutorial_add_annotation.md +++ b/docs/tutorial_add_annotation.md @@ -7,14 +7,14 @@ Combination of structural annotation with a dotplot, as possible with FlexiDot, [Franco et al. 
(2018)](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4653-6) "Modular assembly of transposable element arrays by microsatellite targeting in the guayule and rice genomes". BMC Genomics 19:271 ## FlexiDot illustration of this region: - + ## Input files: -- [*sSaTar.fas*: sSaTar cluster on Sorghum chromosome 1](https://github.com/molbio-dresden/flexidot/blob/master/test-data/sSaTar_example/sSaTar.fas)* -- [*sSaTar.gff3*: sSaTar annotations as gff3](https://github.com/molbio-dresden/flexidot/blob/master/test-data/sSaTar_example/sSaTar.gff3)* -- [*sSaTar.config*: sSatar config file to define colors for FlexiDot](https://github.com/molbio-dresden/flexidot/blob/master/test-data/sSaTar_example/sSaTar.config) +- [*sSaTar.fas*: sSaTar cluster on Sorghum chromosome 1](https://github.com/molbio-dresden/flexidot/blob/master/tests/test-data/sSaTar_example/sSaTar.fas)* +- [*sSaTar.gff3*: sSaTar annotations as gff3](https://github.com/molbio-dresden/flexidot/blob/master/tests/test-data/sSaTar_example/sSaTar.gff3)* +- [*sSaTar.config*: sSatar config file to define colors for FlexiDot](https://github.com/molbio-dresden/flexidot/blob/master/tests/test-data/sSaTar_example/sSaTar.config) \* *fasta* and *gff3 files* have been deduced from Franco et al.'s [Supplemental File 14](https://static-content.springer.com/esm/art%3A10.1186%2Fs12864-018-4653-6/MediaObjects/12864_2018_4653_MOESM14_ESM.pdf), showing the annotated sequence of this region. @@ -22,12 +22,12 @@ The *config file* defines color, alpha and zoom of each sequence type. 
Please no ## Command: -``` -python flexidot.py -i sSaTar.fas -g sSaTar.gff3 -G sSaTar.config -k 10 -S 1 -T 30 -c n -E 15 -A 2 -C black -f 1 +```bash +flexidot -i sSaTar.fas -g sSaTar.gff3 -G sSaTar.config -k 10 -S 1 -T 30 -E 15 -A 2 -C black -f pdf ``` --- -For additional application use cases, please see the [FlexiDot in-depth documentation (pdf)](https://github.com/molbio-dresden/flexidot/blob/master/documentation/SupplementaryData.pdf). +For additional application use cases, please see the [FlexiDot in-depth documentation (pdf)](https://github.com/molbio-dresden/flexidot/blob/master/docs/SupplementaryData.pdf). Back to [FlexiDot home](https://github.com/molbio-dresden/flexidot). diff --git a/documentation/usage_v1.06.pdf b/docs/usage_v1.06.pdf similarity index 100% rename from documentation/usage_v1.06.pdf rename to docs/usage_v1.06.pdf diff --git a/documentation/usage_v1.01.pdf b/documentation/usage_v1.01.pdf deleted file mode 100644 index c95bad2..0000000 Binary files a/documentation/usage_v1.01.pdf and /dev/null differ diff --git a/documentation/usage_v1.02.pdf b/documentation/usage_v1.02.pdf deleted file mode 100644 index e4f5bf1..0000000 Binary files a/documentation/usage_v1.02.pdf and /dev/null differ diff --git a/documentation/usage_v1.03.pdf b/documentation/usage_v1.03.pdf deleted file mode 100644 index b64f6e9..0000000 Binary files a/documentation/usage_v1.03.pdf and /dev/null differ diff --git a/documentation/usage_v1.04.pdf b/documentation/usage_v1.04.pdf deleted file mode 100644 index c750ad3..0000000 Binary files a/documentation/usage_v1.04.pdf and /dev/null differ diff --git a/documentation/usage_v1.05.pdf b/documentation/usage_v1.05.pdf deleted file mode 100644 index 20e1c8b..0000000 Binary files a/documentation/usage_v1.05.pdf and /dev/null differ diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..b7a04df --- /dev/null +++ b/environment.yml @@ -0,0 +1,20 @@ +name: flexidot +channels: + - conda-forge + - bioconda 
+dependencies: + - python>=3.8,<3.13 + - biopython + - colormap + - colour + - easydev + - matplotlib + - numpy + - regex + - pip + - pip: + - pytest + - pytest-cov + - hatch + + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8b1f287 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,75 @@ +# Build system configuration +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +# Project metadata +[project] +name = "flexidot" +description = "Flexible dotplotting of genomic sequences." +readme = "README.md" +requires-python = ">=3.8" +license = { text = "GNU Lesser General Public License v3 (LGPLv3)"} +authors = [ + { name = "Kathrin M. Seibt"}, + { name = "Thomas Schmidt"}, + { name = "Tony Heitkam"}, +] + +# Classifiers for project categorization +classifiers = [ + "Programming Language :: Python :: 3", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)", +] + +# Project dependencies +dependencies = [ + "biopython", + "colormap", + "colour", + "easydev", + "matplotlib", + "numpy", + "regex", +] + +# Dynamic versioning +dynamic = ["version"] + +# Project URLs +[project.urls] +homepage = "https://github.com/molbio-dresden/flexidot" +documentation = "https://github.com/molbio-dresden/flexidot" +repository = "https://github.com/molbio-dresden/flexidot" + +# Command-line script entry point +[project.scripts] +flexidot="flexidot.app:main" + +# Hatch build configuration +[tool.hatch.build] +source = "src" + +# Exclude files and directories from the build +exclude = [ + "environment.yml", +] + +# Hatch versioning configuration +[tool.hatch.version] +source = "vcs" + +# Version control system (VCS) versioning +[tool.hatch.version.vcs] +tag-pattern = "v*" # Git tags starting with 'v' will be used for versioning +fallback-version = "0.0.0" + +# Version file location for VCS 
+[tool.hatch.build.hooks.vcs] +version-file = "src/flexidot/_version.py" + +# Optional dependencies for testing and development +[project.optional-dependencies] +tests = ["pytest", "pytest-cov", "hatch"] \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..98a6b49 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +addopts = -v +testpaths = tests +python_files = test_*.py \ No newline at end of file diff --git a/src/flexidot/__init__.py b/src/flexidot/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/flexidot/app.py b/src/flexidot/app.py new file mode 100644 index 0000000..317e531 --- /dev/null +++ b/src/flexidot/app.py @@ -0,0 +1,304 @@ +import logging +import sys + +from matplotlib import rc as mplrc +from matplotlib import rcParams +import pylab as P + +from flexidot._version import __version__ +from flexidot.plotting import selfdotplot, pairdotplot, polydotplot +from flexidot.utils.args import parse_args +from flexidot.utils.checks import print_summary, check_kmer_length +from flexidot.utils.file_handling import read_gff_color_config +from flexidot.utils.logs import init_logging + +# Matplotlib settings + +# Switch to non-interactive backend to avoid _tkinter.TclError on CentOs 7 servers see Github Issue #5 +P.switch_backend("agg") + +# Font settings +mplrc("pdf", fonttype=42, compression=0) + +rcParams["font.family"] = "sans-serif" +rcParams["font.sans-serif"] = [ + "Helvetica", + "Verdana", + "Tahoma", + "DejaVu Sans", + "Droid Sans Mono", + "Sans", + "Liberation", + "Ubuntu", + "Arial", +] + +__citation__ = ( + "Please remember to cite FlexiDot as follows:\n\n" + "Kathrin M Seibt, Thomas Schmidt, Tony Heitkam,\n" + "FlexiDot: highly customizable, ambiguity-aware dotplots for visual sequence analyses,\n" + "Bioinformatics, Volume 34, Issue 20, October 2018, Pages 3575–3577,\n" + "https://doi.org/10.1093/bioinformatics/bty395" +) + +############################### +# Function Call # 
+############################### + + +def main(): + # Parse command line arguments + args = parse_args() + + # Set up logging + init_logging(loglevel=args.loglevel, logfile=args.logfile) + + # Print summary of arguments + print_summary(args) + + # Log version and command line arguments if debug is enabled + logging.debug("FlexiDot version: %s" % __version__) + logging.debug(" ${0}\n\n".format(" ".join(sys.argv))) + logging.debug("Command line arguments: %s" % args) + + # Check valid kmer length + check_kmer_length(args.wordsize) + + # Set up variables + alphabetic_sorting = args.sort + convert_wobbles = args.wobble_conversion + filetype = args.filetype + gff = args.gff + gff_color_config_file = args.gff_color_config + input_user_matrix_file = args.user_matrix_file + label_size = args.label_size + lcs_shading_interval_len = args.lcs_shading_interval_len + lcs_shading_num = args.lcs_shading_num + lcs_shading_ori = args.lcs_shading_ori + lcs_shading_ref = args.lcs_shading_ref + lcs_shading = args.lcs_shading + length_scaling = args.length_scaling + max_N_percentage = args.max_n + mirror_y_axis = args.mirror_y_axis + modes = args.mode + multi = args.collage + ncols = args.n_col + nrows = args.n_row + only_vs_first_seq = args.only_vs_first_seq + plot_size = args.plot_size + prefix = f"{args.outdir}/{args.output_prefix}" + norevcomp = args.norev + seq_list = args.infiles + spacing = args.spacing + substitution_count = args.substitution_count + title_clip_pos = ( + "B" # Note: This was processed out of title_length in previouis versions + ) + title_length = args.title_length + user_matrix_print = args.user_matrix_print + wordsize = args.wordsize + line_col_rev = args.line_col_rev + line_col_for = args.line_col_for + line_width = args.line_width + + # Set True if nucleotide sequence + if args.type_seq == "nuc": + type_nuc = True + elif args.type_seq == "aa": + type_nuc = False + + # Set x label position + if args.x_label_pos == "top": + x_label_pos_top = True + elif 
args.x_label_pos == "bottom": + x_label_pos_top = False + + # Read gff color config file if provided + if args.gff: + if gff_color_config_file: + logging.info( + f"Reading GFF color configuration file: {gff_color_config_file}" + ) + gff_feat_colors = read_gff_color_config(gff_color_config_file) + else: + gff_feat_colors = {} + if gff_color_config_file: + logging.warning( + f"Provide GFF annotation files to use configuration file: {gff_color_config_file}" + ) + + # If color is set to white, reverse complementary matches are skipped + if norevcomp: # if norev is set + line_col_rev = "white" # reverse matches not calculated + + if not type_nuc and not norevcomp: + logging.warning("Reverse complement deactivated for proteins.") + line_col_rev = "white" # reverse matches not calculated + elif not type_nuc: + line_col_rev = "white" + + # Log plotting modes + mode_text = [] + mode_names = {"0": "self", "1": "paired", "2": "poly"} + for item in modes: + mode_text.append(str(item) + ": " + mode_names[item]) + + logging.info(f"Requested plotting modes: {', '.join(mode_text)}\n\n{50 * '='}") + + # Create dotplots + ########################################## + + # Init empty list for image file names + list_of_png_names = list() + + # self dotplots + if "0" in modes: + logging.info("Calling selfdotplot") + list_of_png_names = selfdotplot( + seq_list, + wordsize, + alphabetic_sorting=alphabetic_sorting, + convert_wobbles=convert_wobbles, + filetype=args.filetype, + gff_color_dict=gff_feat_colors, + gff_files=gff, + label_size=label_size, + line_col_rev=line_col_rev, + line_col_for=line_col_for, + max_N_percentage=max_N_percentage, + mirror_y_axis=mirror_y_axis, + multi=multi, + ncols=ncols, + nrows=nrows, + plot_size=plot_size, + prefix=prefix, + substitution_count=substitution_count, + title_clip_pos=title_clip_pos, + title_length=title_length, + type_nuc=type_nuc, + line_width=line_width, + ) + if list_of_png_names: + logging.info( + f"\n-> Image 
file(s):\t{',\n\t\t\t'.join(list_of_png_names)}\n\n{50 * '='}" + ) + else: + logging.warning(f"No image files were created!\n\n{50 * '='}\n") + + # paired dotplots + if "1" in modes: + if multi: + logging.info("Calling pairdotplot with collage") + list_of_png_names = pairdotplot( + seq_list, + wordsize, + alphabetic_sorting=alphabetic_sorting, + convert_wobbles=convert_wobbles, + filetype=filetype, + label_size=label_size, + length_scaling=length_scaling, + line_col_rev=line_col_rev, + line_col_for=line_col_for, + max_N_percentage=max_N_percentage, + mirror_y_axis=mirror_y_axis, + multi=multi, + ncols=ncols, + nrows=nrows, + only_vs_first_seq=only_vs_first_seq, + plot_size=plot_size, + prefix=prefix, + substitution_count=substitution_count, + title_clip_pos=title_clip_pos, + title_length=title_length, + type_nuc=type_nuc, + x_label_pos_top=x_label_pos_top, + line_width=line_width, + ) + # t1 = time_track(t1) + else: + if not length_scaling: + logging.info( + "Pairwise dotplot with individual output files scaled by sequence length automatically." 
+ ) + + logging.info("Calling pairdotplot") + list_of_png_names = pairdotplot( + seq_list, + wordsize, + alphabetic_sorting=alphabetic_sorting, + convert_wobbles=convert_wobbles, + filetype=filetype, + label_size=label_size, + length_scaling=True, + line_col_rev=line_col_rev, + line_col_for=line_col_for, + max_N_percentage=max_N_percentage, + mirror_y_axis=mirror_y_axis, + multi=multi, + ncols=ncols, + nrows=nrows, + only_vs_first_seq=only_vs_first_seq, + plot_size=plot_size, + prefix=prefix, + substitution_count=substitution_count, + title_clip_pos=title_clip_pos, + title_length=title_length, + type_nuc=type_nuc, + line_width=line_width, + ) + if list_of_png_names: + logging.info( + f"\n-> Image file(s):\t{',\n\t\t\t'.join(list_of_png_names)}\n\n{50 * '='}" + ) + else: + logging.warning(f"No image files were created!\n\n{50 * '='}\n") + + # all-against-all dotplot + if "2" in modes: + logging.info("Calling polydotplot") + list_of_png_names = polydotplot( + seq_list, + wordsize, + alphabetic_sorting=alphabetic_sorting, + convert_wobbles=convert_wobbles, + filetype=filetype, + gff_color_dict=gff_feat_colors, + gff_files=gff, + input_user_matrix_file=input_user_matrix_file, + label_size=label_size, + line_col_rev=line_col_rev, + line_col_for=line_col_for, + lcs_shading_interval_len=lcs_shading_interval_len, + lcs_shading_num=lcs_shading_num, + lcs_shading_ori=lcs_shading_ori, + lcs_shading_ref=lcs_shading_ref, + lcs_shading=lcs_shading, + max_N_percentage=max_N_percentage, + mirror_y_axis=mirror_y_axis, + plot_size=plot_size, + prefix=prefix, + representation=args.representation, + spacing=spacing, + substitution_count=substitution_count, + title_clip_pos=title_clip_pos, + title_length=title_length, + type_nuc=type_nuc, + user_matrix_print=user_matrix_print, + line_width=line_width, + ) + + if list_of_png_names: + logging.info( + f"\n-> Image file(s):\t{',\n\t\t\t'.join(list_of_png_names)}\n\n{50 * '='}" + ) + else: + logging.warning(f"No image files were 
created!\n\n{50 * '='}\n") + + logging.info(f"\nFinished! Thank you for using FlexiDot.\n\n{__citation__}") + + +###################### +# FlexiDot Execution # +###################### +if __name__ == "__main__": + main() diff --git a/src/flexidot/plotting.py b/src/flexidot/plotting.py new file mode 100644 index 0000000..b9f5711 --- /dev/null +++ b/src/flexidot/plotting.py @@ -0,0 +1,1972 @@ +############################### +# Dot Plot Functions # +############################### + +import logging + +import matplotlib.collections as cllct +import matplotlib.gridspec as gridspec +import matplotlib.patches as patches +import numpy as np +import pylab as P + +from flexidot.utils.file_handling import read_seq, read_gffs, legend_figure, read_matrix +from flexidot.utils.utils import ( + calc_fig_ratio, + create_color_list, + shorten_name, + unicode_name, +) +from flexidot.utils.matching import find_match_pos_regex, find_match_pos_diag + + +def selfdotplot( + input_fasta, + wordsize, + alphabetic_sorting=False, + convert_wobbles=False, + filetype="png", + gff_color_dict={"others": ("grey", 1, 0)}, + gff_files=[], + label_size=10, + line_col_for="#000000", # defalut black + line_col_rev="#009243", # default green + line_width=1, + max_N_percentage=10, + mirror_y_axis=False, + multi=True, + ncols=4, + nrows=5, + plot_size=10, + prefix=None, + substitution_count=0, + title_clip_pos="B", + title_length=float("Inf"), + type_nuc=True, +): + """ + self-against-self dotplot + partially from biopython cookbook + """ + + # read sequences + seq_dict, sequences = read_seq(input_fasta) + if seq_dict == {}: + logging.warning("Failed to load sequences.") + return [] + + if type_nuc: + aa_bp_unit = "bp" + else: + aa_bp_unit = "aa" + + if alphabetic_sorting: + sequences = sorted(sequences) + + # check if at least one input sequence + if len(sequences) == 0: + text = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % ( + 50 * "=", + len(sequences), + 28 * "-", + ) + text += " No sequences 
provided for selfdotplot!\n\nTerminating polydotplot!" + logging.info(text) + return + elif len(sequences) == 1 and multi: + text = "\n\nCreating collage output for single selfdotplot!" + text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n" + logging.info(text) + + if multi and (ncols == 0 or nrows == 0): + ncols = max(ncols, 1) + nrows = max(nrows, 1) + text = ( + "\n\nSelfdotplot Collage: Invalid collage - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" + % (ncols, nrows) + ) + logging.info(text) + + if multi and ncols > len(sequences): + ncols = len(sequences) + nrows = 1 + text = ( + "\n\nSelfdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n" + % (ncols, nrows) + ) + logging.info(text) + elif multi and ncols * (nrows - 1) > len(sequences): + nrows = ((len(sequences) - 1) // ncols) + 1 + text = ( + "\n\nSelfdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n" + % (ncols, nrows) + ) + logging.info(text) + + if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size / 2: + label_size = plot_size * 3 // 2 + text = "Reducing label size for better visualization to %d\n" % label_size + logging.info(text) + + # read gff annotation data if provided for shading + if gff_files: + text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % ( + 50 * "=", + len(gff_files), + 28 * "-", + ", ".join(gff_files), + ) + logging.info(text) + if prefix: + legend_prefix = prefix + "-Selfdotplot" + else: + legend_prefix = "Selfdotplot" + feat_dict = read_gffs( + gff_files, + color_dict=gff_color_dict, + type_nuc=type_nuc, + prefix=legend_prefix, + filetype=filetype, + ) + + log_txt = "\n%s\n\nCreating %s selfdotplot images\n%s\n\n=>" % ( + 50 * "=", + len(sequences), + 28 * "-", + ) + + # preparations for file name + name_graph = "Selfdotplots" + if prefix: + if not prefix[-1] == "-": + prefix = prefix + "-" + else: + prefix = "" + 
suffix = "" + if convert_wobbles: + suffix += "_wobbles" + if substitution_count != 0: + suffix += "_S%d" % substitution_count + if multi: + suffix += "_collage" + + # calculate fig ratios + if not multi: + ncols = 1 + nrows = 1 + figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size) + + P.cla() # clear any prior graph + if multi: + fig = P.figure(figsize=(figsize_x, figsize_y)) + page_counter = 1 + list_of_png_names = [] + + counter = 0 + for seq_name in sequences: + log_txt += "\n- " + seq_name + + counter += 1 + if not multi: + P.cla() # clear any prior graph + + # read sequence + seq_record = seq_dict[seq_name] + name_seq = seq_record.id + seq_one = seq_record.seq.upper() + length_seq = len(seq_one) + + # get positions of matches + if substitution_count != 0: + # print "RE" + x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_regex( + seq_one, + seq_one, + wordsize, + substitution_count=substitution_count, + convert_wobbles=convert_wobbles, + max_N_percentage=max_N_percentage, + type_nuc=type_nuc, + ) + else: + # print "DIAG", + x_lists, y_lists, x_lists_rc, y_lists_rc = find_match_pos_diag( + seq_one, + seq_one, + wordsize, + convert_wobbles=convert_wobbles, + max_N_percentage=max_N_percentage, + type_nuc=type_nuc, + ) + + # plotting with matplotlib + ################################# + + # combined plotting + if multi: + # plotting subplot with matplotlib + ax = P.subplot(nrows, ncols, counter) # rows, columns, plotnumber + + # shade annotated regions + if gff_files: + if seq_name in list(feat_dict.keys()): + features = feat_dict[seq_name] + for item in features: + feat_type, start, stop = item + feat_color, strength, zoom = gff_color_dict[feat_type.lower()] + start = max(0, start - zoom - 0.5) + stop = min(length_seq + 1, stop + zoom + 0.5) + width = stop - start + ax.add_patch( + patches.Rectangle( + (start, start), # (x,y) + width, + width, # width, height + edgecolor=None, + linewidth=line_width + zoom, + fill=True, + facecolor=feat_color, 
+ alpha=strength, + ) + ) + + # collect lines + lines = [] + color_list = [] + for x_lines, y_lines, col in [ + (x_lists_rc, y_lists_rc, line_col_rev), + (x_lists, y_lists, line_col_for), + ]: + # If color is not white, add lines to plot + if col != "white": + for ldx in range(len(x_lines)): + lines.append( + [ + (x_lines[ldx][0], y_lines[ldx][0]), + (x_lines[ldx][-1], y_lines[ldx][-1]), + ] + ) + color_list.append(col) + color_list = np.array(color_list) + + # draw lines + lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) + ax.add_collection(lc) + + # format axes + # print P.xticks()[0], P.yticks()[0] + P.axis("scaled") # make images quadratic + P.xlim(0, length_seq + 1) + if mirror_y_axis: + P.ylim(0, length_seq + 1) # rotate y axis (point upwards) + else: + P.ylim(length_seq + 1, 0) # rotate y axis (point downwards) + P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) + P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) + P.tick_params(axis="both", which="major", labelsize=label_size * 0.9) + + # # use same tick labels for x and y axis + # tick_locs, tick_labels = P.yticks() + # P.xticks(tick_locs) + # P.xlim(0, length_seq+1) + + P.title( + unicode_name( + shorten_name( + name_seq, max_len=title_length, title_clip_pos=title_clip_pos + ) + ), + fontsize=label_size, + fontweight="bold", + ) + # P.title(unicode_name(name_seq), fontsize=label_size*1.3, fontweight='bold') + + # save figure and reinitiate if page is full + if counter == ncols * nrows: + # finalize layout - margins & spacing between plots + try: + P.tight_layout(h_pad=0.02, w_pad=0.02) + except Exception as e: + logging.info( + "Attention - pylab.tight_layout failed! Please check sequence names and layout settings! 
Error: %s" + % str(e) + ) + P.subplots_adjust( + hspace=0.5, wspace=0.5 + ) # space between rows - def 0.4 + + # name and create output files (names derived from SEQNAME) + fig_name = "%s%s_wordsize%i%s-%.3d.%s" % ( + prefix, + name_graph, + wordsize, + suffix, + page_counter, + filetype, + ) + P.savefig(fig_name, bbox_inches="tight") + P.close() + P.cla() + + list_of_png_names.append(fig_name) + + counter = 0 + page_counter += 1 + + fig = P.figure(figsize=(figsize_x, figsize_y)) + + # plotting separate figure files + else: # not multi + fig = P.figure( + figsize=(plot_size, plot_size) + ) # figure size needs to be a square + ax = P.subplot(1, 1, 1) # rows, columns, plotnumber + + # shade annotated regions + if gff_files: + if seq_name in list(feat_dict.keys()): + features = feat_dict[seq_name] + for item in features: + feat_type, start, stop = item + feat_color, strength, zoom = gff_color_dict[feat_type.lower()] + start = max(0, start - zoom - 0.5) + stop = min(length_seq + 1, stop + zoom + 0.5) + width = stop - start + ax.add_patch( + patches.Rectangle( + (start, start), # (x,y) + width, + width, # width, height + edgecolor=None, + linewidth=line_width + zoom, + fill=True, + facecolor=feat_color, + alpha=strength, + ) + ) + + # collect lines + lines = [] + number = 0 + color_list = [] + for x_lines, y_lines, col in [ + (x_lists_rc, y_lists_rc, line_col_rev), + (x_lists, y_lists, line_col_for), + ]: + # If color is not white, add lines to plot + if col != "white": + for ldx in range(len(x_lines)): + lines.append( + [ + (x_lines[ldx][0], y_lines[ldx][0]), + (x_lines[ldx][-1], y_lines[ldx][-1]), + ] + ) + color_list.append(col) + + color_list = np.array(color_list) + + # draw lines + lc = cllct.LineCollection(lines, colors=color_list, linewidths=line_width) + ax.add_collection(lc) + + # format axes + P.axis("scaled") # make images quadratic + P.xlim(0, length_seq + 1) + if mirror_y_axis: + P.ylim(0, length_seq + 1) # rotate y axis (point upwards) + else: + 
P.ylim(length_seq + 1, 0) # rotate y axis (point downwards) + P.xlabel("[%s]" % aa_bp_unit, fontsize=label_size) + P.ylabel("[%s]" % aa_bp_unit, fontsize=label_size) + P.tick_params(axis="both", which="major", labelsize=label_size * 0.9) + + # # use same tick labels for x and y axis + # tick_locs, tick_labels = P.yticks() + # P.xticks(tick_locs) + # P.xlim(0, length_seq+1) + + P.title( + unicode_name( + shorten_name( + name_seq, max_len=title_length, title_clip_pos=title_clip_pos + ) + ), + fontsize=label_size * 1.3, + fontweight="bold", + ) + + # name and create output files (names derived from SEQNAME) + fig_name = "%s%s-%d_%s_wordsize%i%s.%s" % ( + prefix, + name_graph, + counter, + shorten_name( + name_seq, max_len=title_length, title_clip_pos=title_clip_pos + ), + wordsize, + suffix, + filetype, + ) + P.savefig(fig_name, bbox_inches="tight") + + P.close() + P.cla() # clear any prior graph + + list_of_png_names.append(fig_name) + + if multi and counter >= 1: + # finalize layout - margins & spacing between plots + try: + P.tight_layout(h_pad=0.02, w_pad=0.02) + except: + logging.info( + "Attention - pylab.tight_layout failed! Please check sequence names and layout settings!" 
def pairdotplot(
    input_fasta,
    wordsize,
    alphabetic_sorting=False,
    convert_wobbles=False,
    filetype="png",
    label_size=10,
    length_scaling=True,
    line_col_for="#000000",  # default black
    line_col_rev="#009243",  # default green
    line_width=1,
    max_N_percentage=10,
    mirror_y_axis=False,
    multi=True,
    ncols=4,
    nrows=5,
    only_vs_first_seq=False,
    plot_size=10,
    prefix=None,
    scale_delim_col="red",
    substitution_count=0,
    title_clip_pos="B",
    title_length=float("Inf"),
    type_nuc=True,
    x_label_pos_top=True,
):
    """
    Pairwise dotplot (all-against-all, or all against the first sequence).

    Every sequence pair is compared once; the matches are drawn either as
    subplots of collage pages (``multi=True``) or as one figure per pair.
    The longest-common-substring lengths of each pair are written to a
    tab-separated "lcs_data_file" next to the images.

    Parameters:
        input_fasta: passed to ``read_seq`` to load the sequences.
        wordsize: word size handed to the match finder; part of file names.
        alphabetic_sorting: sort the sequence names before plotting.
        convert_wobbles, max_N_percentage, type_nuc: forwarded unchanged to
            the match finder; ``type_nuc`` also selects the axis unit
            ("bp" if true, "aa" otherwise).
        substitution_count: if non-zero, ``find_match_pos_regex`` is used
            instead of ``find_match_pos_diag``.
        filetype: file extension of the saved figures.
        label_size: base font size of axis labels and ticks.
        length_scaling: in collage mode, give every subplot the axis limits
            of the longer sequence and mark the shorter one with dashed
            lines; in single mode, force an equal aspect ratio.
        line_col_for, line_col_rev: colors of forward / reverse matches;
            the literal "white" suppresses that orientation.
        line_width: line width of the match lines.
        mirror_y_axis: let the y axis point upwards (x labels go below).
        multi: collage output (True) or one file per pair (False).
        ncols, nrows: collage grid; corrected if zero or oversized.
        only_vs_first_seq: only compare the first sequence with the others.
        plot_size: figure size passed to ``calc_fig_ratio`` / ``P.figure``.
        prefix: optional file name prefix ("-" is appended if missing).
        scale_delim_col: color of the dashed delimiter lines.
        title_clip_pos, title_length: forwarded to ``shorten_name``.
        x_label_pos_top: draw x ticks and label above the plot.

    Returns:
        List with the file names of all saved figures. An empty list is
        returned if no sequences could be loaded or fewer than two
        sequences are available.
    """

    # read sequences
    seq_dict, sequences = read_seq(input_fasta)
    if seq_dict == {}:
        logging.warning("Failed to load sequences.")
        return []

    aa_bp_unit = "bp" if type_nuc else "aa"

    if alphabetic_sorting:
        sequences = sorted(sequences)

    num_seqs = len(sequences)
    # integer division: the product of two consecutive integers is even
    num_pairs = num_seqs * (num_seqs - 1) // 2

    # check if at least two input sequences
    if num_seqs < 2:
        text = "\n%s\n\nCreating %d paired dotplot image \n%s\n\n=>" % (
            50 * "=",
            num_pairs,
            36 * "-",
        )
        text += " Please provide at least two sequences for pairdotplot!\n\nTerminating paired dotplot!"
        logging.info(text)
        # return an empty list (not None) so that all exits share one type
        return []
    elif num_seqs == 2 and multi:
        text = "\n\nCreating collage output for single pairdotplot!"
        text += "\nRecommendation: Change to individual mode by using '--collage_output n'!\n\n"
        logging.info(text)

    if multi and (ncols == 0 or nrows == 0):
        ncols = max(ncols, 1)
        nrows = max(nrows, 1)
        text = (
            "\n\nPairdotplot Collage: Invalid collage settings - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n"
            % (ncols, nrows)
        )
        logging.info(text)

    if multi and ncols > num_seqs * (num_seqs - 1):
        ncols = num_seqs
        nrows = 1
        text = (
            "\n\nPairdotplot Collage: Few sequences - correcting number of rows and columns:\n\tncols=%d, nrows=%d\n"
            % (ncols, nrows)
        )
        logging.info(text)
    elif multi and ncols * (nrows - 1) > num_seqs * (num_seqs - 1):
        nrows = ((num_seqs - 1) // ncols) + 1
        text = (
            "\n\nPairdotplot Collage: Few sequences - correcting number of rows:\n\tncols=%d, nrows=%d\n"
            % (ncols, nrows)
        )
        logging.info(text)

    if not only_vs_first_seq:
        text = "\n%s\n\nCreating %d paired dotplot image for\n%s\n\n=>" % (
            50 * "=",
            num_pairs,
            36 * "-",
        )
        text += ",\n".join(sequences) + "\n"
    else:
        text = (
            "\n%s\n\nCreating %d paired dotplot images against 1st sequence '%s':\n%s\n\n=>"
            % (50 * "=", num_seqs - 1, sequences[0], 36 * "-")
        )
        text += ",\n".join(sequences[1:]) + "\n"
    logging.info(text)

    if multi and not (nrows == 1 and ncols == 1) and plot_size <= label_size / 2:
        label_size = plot_size * 3 // 2
        text = "Reducing label size for better visualization to %d\n" % label_size
        logging.info(text)

    # fixed to "vertical"; the "horizontal" layout branch below is kept for
    # anyone flipping this switch by hand
    y_label_rotation = "vertical"
    # for cartesian coordinate system with mirrored y-axis: plot x labels below plot
    if mirror_y_axis:
        x_label_pos_top = False

    # preparations for file name
    name_graph = "Pairdotplot"
    if prefix:
        if not prefix[-1] == "-":
            prefix = prefix + "-"
    else:
        prefix = ""
    suffix = ""
    if convert_wobbles:
        suffix += "_wobbles"
    if substitution_count != 0:
        suffix += "_S%d" % substitution_count
    if length_scaling:
        suffix += "_scaled"
    if multi:
        suffix += "_collage"

    # calculate fig ratios
    if not multi:
        ncols = 1
        nrows = 1
    figsize_x, figsize_y = calc_fig_ratio(ncols, nrows, plot_size)

    P.cla()  # clear any prior graph
    list_of_png_names = []
    page_counter = 1
    if multi:
        P.figure(figsize=(figsize_x, figsize_y))

    def _apply_tight_layout():
        # best effort: a failing tight_layout must not abort the plotting,
        # but only real errors are swallowed (no KeyboardInterrupt/SystemExit)
        try:
            P.tight_layout(h_pad=0.02, w_pad=0.02)
        except Exception as e:
            logging.warning(
                "Attention - pylab.tight_layout failed! Please check sequence names and layout settings! Error: %s"
                % str(e)
            )

    def _save_collage_page(page):
        # finalize layout, save the current collage page and register its name
        _apply_tight_layout()
        if x_label_pos_top:
            P.subplots_adjust(
                hspace=0.5, wspace=0.5, top=0.95
            )  # space between rows - def 0.4
        else:
            P.subplots_adjust(
                hspace=0.5, wspace=0.5, bottom=0.05
            )  # space between rows - def 0.4

        # name and create output files (names derived from SEQNAME)
        page_name = "%s%s_wordsize%i%s-%.3d.%s" % (
            prefix,
            name_graph,
            wordsize,
            suffix,
            page,
            filetype,
        )
        P.savefig(page_name, bbox_inches="tight")
        P.close()
        P.cla()
        list_of_png_names.append(page_name)

    counter, seq_counter = 0, 0
    log_txt = "\nDrawing pairwise dotplots"
    seq_text = ""

    # prepare LCS data file; the context manager guarantees that the file is
    # flushed and closed, even if plotting raises
    lcs_data_path = "%sPairdotplot_wordsize%d_lcs_data_file%s.txt" % (
        prefix,
        wordsize,
        suffix.replace("_scaled", "").replace("_collage", ""),
    )
    with open(lcs_data_path, "w") as lcs_data_file:
        lcs_data_file.write(
            "\t".join(
                [
                    "#title1",
                    "title2",
                    "len_seq1",
                    "len_seq2",
                    "len_lcs_for",
                    "%_min_seq_len",
                    "len_lcs_rev",
                    "%_min_seq_len",
                ]
            )
            + "\n"
        )

        for idx in range(num_seqs - 1):
            logging.debug("\n%d\t%s vs." % ((seq_counter + 1), sequences[idx]))
            seq_text += "\n%d\t%s vs." % ((seq_counter + 1), sequences[idx])

            rec_two = seq_dict[sequences[idx]]
            name_two = rec_two.id
            seq_two = rec_two.seq
            len_two = len(seq_two)

            for jdx in range(idx + 1, num_seqs):
                rec_one = seq_dict[sequences[jdx]]
                name_one = rec_one.id
                seq_one = rec_one.seq
                len_one = len(seq_one)

                counter += 1
                seq_counter += 1

                logging.debug(sequences[jdx])
                seq_text += " " + sequences[jdx]

                if not seq_counter % 25:
                    log_txt += " " + str(seq_counter)

                # get positions of matches
                if substitution_count != 0:
                    x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex(
                        seq_one,
                        seq_two,
                        wordsize,
                        substitution_count=substitution_count,
                        convert_wobbles=convert_wobbles,
                        max_N_percentage=max_N_percentage,
                        report_lcs=True,
                        type_nuc=type_nuc,
                    )
                else:
                    x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(
                        seq_one,
                        seq_two,
                        wordsize,
                        convert_wobbles=convert_wobbles,
                        max_N_percentage=max_N_percentage,
                        report_lcs=True,
                        type_nuc=type_nuc,
                    )

                # write LCS data file; an empty sequence yields 0 % instead
                # of a ZeroDivisionError
                min_len = min(len_one, len_two)
                if min_len:
                    pct_for = round((lcs_for * 100.0 / min_len), 3)
                    pct_rev = round((lcs_rev * 100.0 / min_len), 3)
                else:
                    pct_for = pct_rev = 0.0
                lcs_data_file.write(
                    "\t".join(
                        [
                            name_one,
                            name_two,
                            str(len_one),
                            str(len_two),
                            str(lcs_for),
                            str(pct_for),
                            str(lcs_rev),
                            str(pct_rev),
                        ]
                    )
                    + "\n"
                )

                # Plotting with matplotlib
                #################################

                if multi:
                    # combined plotting: next free cell of the collage page
                    ax = P.subplot(nrows, ncols, counter)  # rows, columns, plotnumber
                else:
                    # separate figure per sequence pair
                    P.figure(figsize=(plot_size, plot_size))
                    ax = P.subplot(1, 1, 1)

                # collect lines (reverse matches first, forward matches on top)
                lines = []
                color_list = []
                for x_lines, y_lines, col in [
                    (x2, y2, line_col_rev),
                    (x1, y1, line_col_for),
                ]:
                    # If color is not white, add lines to plot
                    if col != "white":
                        for ldx in range(len(x_lines)):
                            lines.append(
                                [
                                    (x_lines[ldx][0], y_lines[ldx][0]),
                                    (x_lines[ldx][-1], y_lines[ldx][-1]),
                                ]
                            )
                            color_list.append(col)
                color_list = np.array(color_list)

                # draw lines
                lc = cllct.LineCollection(
                    lines, colors=color_list, linewidths=line_width
                )
                ax.add_collection(lc)

                # format axes
                P.xlabel(
                    unicode_name(
                        shorten_name(
                            name_one, max_len=title_length, title_clip_pos=title_clip_pos
                        )
                    )
                    + " [%s]" % aa_bp_unit,
                    fontsize=label_size,
                    fontweight="bold",
                    labelpad=4,
                )
                P.ylabel(
                    unicode_name(
                        shorten_name(
                            name_two, max_len=title_length, title_clip_pos=title_clip_pos
                        )
                    )
                    + " [%s]" % aa_bp_unit,
                    fontsize=label_size,
                    fontweight="bold",
                    labelpad=4,
                )
                P.tick_params(axis="both", which="major", labelsize=label_size * 0.9)

                if multi and length_scaling:
                    # collage with common scale: both axes span the longer sequence
                    max_len = max(len_one, len_two)
                    P.xlim(0, max_len + 1)
                    if mirror_y_axis:
                        P.ylim(0, max_len + 1)  # rotate y axis (point upwards)
                    else:
                        P.ylim(max_len + 1, 0)  # rotate y axis (point downwards)

                    # plot line delimiting the shorter sequence
                    if max_len != len_one:
                        ax.plot(
                            (len_one + 1, len_one + 1),
                            (0, len_two),
                            marker="",
                            linestyle="--",
                            color=scale_delim_col,
                            markerfacecolor="r",
                        )
                    if max_len != len_two:
                        ax.plot(
                            (0, len_one),
                            (len_two + 1, len_two + 1),
                            marker="",
                            linestyle="--",
                            color=scale_delim_col,
                            markerfacecolor="r",
                        )
                else:
                    # each axis spans its own sequence; single figures are
                    # additionally forced to an equal aspect when scaling
                    if not multi and length_scaling:
                        ax.set_aspect(aspect="equal", adjustable="box", anchor="NW")
                    P.xlim(0, len_one + 1)
                    if mirror_y_axis:
                        P.ylim(0, len_two + 1)  # rotate y axis (point upwards)
                    else:
                        P.ylim(len_two + 1, 0)  # rotate y axis (point downwards)

                # evtl. switch x axis position
                if x_label_pos_top:
                    ax.xaxis.tick_top()
                    ax.xaxis.set_label_position("top")
                P.setp(ax.get_xticklabels(), fontsize=label_size * 0.9)
                P.setp(ax.get_yticklabels(), fontsize=label_size * 0.9)

                # save figure and reinitiate if page is full
                if multi and counter == ncols * nrows:
                    _save_collage_page(page_counter)

                    counter = 0
                    page_counter += 1

                    P.figure(figsize=(figsize_x, figsize_y))

                # plotting separate figure files
                elif not multi:
                    # finalize layout - margins & spacing between plots
                    _apply_tight_layout()
                    if y_label_rotation == "horizontal":
                        if x_label_pos_top:
                            P.subplots_adjust(
                                hspace=0.02, wspace=0.02, left=0.13, top=0.95
                            )  # space between rows - def 0.4
                        else:
                            P.subplots_adjust(
                                hspace=0.02, wspace=0.02, left=0.13, bottom=0.05
                            )  # space between rows - def 0.4
                    else:
                        P.subplots_adjust(
                            hspace=0.02, wspace=0.02
                        )  # space between rows - def 0.4

                    # name and create output files
                    fig_name = "%s%s-%d_wordsize%i%s.%s" % (
                        prefix,
                        name_graph,
                        counter,
                        wordsize,
                        suffix,
                        filetype,
                    )
                    P.savefig(fig_name)
                    P.close()
                    P.cla()

                    list_of_png_names.append(fig_name)
                    P.figure()

            # the first sequence has been compared with all others - done
            if only_vs_first_seq:
                break

    # save last, partially filled collage page
    if multi and counter >= 1:
        _save_collage_page(page_counter)

    log_txt += "\n%d done" % seq_counter
    logging.info(log_txt)

    logging.debug(seq_text)

    return list_of_png_names
lcs = 70% of seq1) + lcs_shading_ori + 0 forward only + 1 reverse only + 2 both orientations (in opposite plot) + """ + + # read sequences + seq_dict, sequences = read_seq(input_fasta) + if seq_dict == {}: + logging.warning("Failed to load sequences.") + return [] + + if type_nuc: + aa_bp_unit = "bp" + else: + aa_bp_unit = "aa" + + if alphabetic_sorting: + sequences = sorted(sequences) + + if len(sequences) == 0: + text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % ( + 50 * "=", + len(sequences), + len(sequences), + 30 * "-", + ) + text += " No sequences provided for polydotplot!\n\nTerminating polydotplot!" + logging.info(text) + return + elif len(sequences) == 1: + text = "\n\nCreating polydotplot for single sequence!" + text += "\nRecommendation: Use selfdotplot via '--plotting_mode 0'!\n\n" + logging.info(text) + + text = "\n%s\n\nCreating %dx%d polydotplot image\n%s\n\n=>" % ( + 50 * "=", + len(sequences), + len(sequences), + 30 * "-", + ) + text += " " + " ".join(sequences) + "\n" + logging.info(text) + + # read gff annotation data if provided for shading + if gff_files is not None and gff_files != []: + text = "\n%s\n\nReading %s GFF annotation files\n%s\n\n=> %s\n" % ( + 50 * "=", + len(gff_files), + 28 * "-", + ", ".join(gff_files), + ) + logging.info(text) + if prefix is not None and prefix != "": + legend_prefix = prefix + "-Polydotplot" + else: + legend_prefix = "Polydotplot" + feat_dict = read_gffs( + gff_files, + color_dict=gff_color_dict, + type_nuc=type_nuc, + prefix=legend_prefix, + filetype=filetype, + ) + + if lcs_shading and not type_nuc: + if lcs_shading_ori != 0: + lcs_shading_ori = 0 + text = "Protein shading does not support reverse complementary matching!\n" + logging.info(text) + + # read custom shading matrix & match names of sequences to fasta + if input_user_matrix_file != "" and input_user_matrix_file != None: + logging.info("Reading user matrix file: %s" % input_user_matrix_file) + # lcs_shading_ori = 2 + custom_dict = 
read_matrix(input_user_matrix_file) + if custom_dict != {}: + custom_shading = True + custom_similarity_dict = {} + invalid_entries = [] + custom_max = 0 + custom_min = float("Inf") + for key in list(custom_dict.keys()): + number_key = [] + + # convert number into float + try: + value = float(custom_dict[key]) + if not "." in custom_dict[key]: + value = int(custom_dict[key]) + custom_max = max(custom_max, value) + custom_min = min(custom_min, value) + except: + value = custom_dict[key] + if value == "": + value = None + invalid_entries.append(key) + # match matrix names with sequence names + for item in key: + if item in sequences: + number_key.append(sequences.index(item)) + else: + number_key.append(-1) + # dictionary with tuple of sorted sequence indices as key and number as value + custom_similarity_dict[tuple(sorted(number_key))] = value + if len(invalid_entries) != 0: + text = ( + "No valid number in custom similarity matrix for %d entries: \n\t" + % (len(invalid_entries)) + ) + for key in invalid_entries: + text += str(key) + " - " + str(custom_dict[key]) + "; " + logging.info(text[:-2] + "\n") + + text = "Custom user matrix given: min %.2f, max %.2f\n" % ( + custom_min, + custom_max, + ) + + # artificially rounding intervals if likely identity/divergence percentages + if 0 <= custom_min < 1 and 0 < custom_max <= 1: + rounding_factor = 5 + multi_factor = 100 + text += ( + " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " + % (custom_min, custom_max) + ) + custom_min = max( + 0, + (multi_factor * custom_min // rounding_factor) + * (1.0 * rounding_factor / multi_factor), + ) + custom_max = min( + (multi_factor * custom_max // rounding_factor) + * (1.0 * rounding_factor / multi_factor), + 1, + ) + text += "new (%.2f, >%2f)\n" % (custom_min, custom_max) + + elif 0 <= custom_min < 100 and 0 < custom_max <= 100: + rounding_factor = 5 + text += ( + " > artificially rounding custom shading intervals: old (%.2f, %.2f) - " + % (custom_min, 
custom_max) + ) + custom_min = max(0, (custom_min // rounding_factor) * rounding_factor) + custom_max = min((custom_max // rounding_factor) * rounding_factor, 100) + text += "new (%d, >%d)\n" % (custom_min, custom_max) + + logging.info(text) + + else: + custom_shading = False + + name_graph = "Polydotplot" + suffix = "" + if convert_wobbles: + suffix += "_wobbles" + if substitution_count != 0: + suffix += "_S%d" % substitution_count + if custom_shading: + suffix += "_matrix" + if lcs_shading: + suffix += "_%dshades_ref%d_ori%s" % ( + lcs_shading_num + 1, + lcs_shading_ref, + lcs_shading_ori, + ) + if "ref2" in suffix and type_nuc: + suffix = suffix.replace("ref2", "%dbp" % lcs_shading_interval_len) + elif "ref2" in suffix: + suffix = suffix.replace("ref2", "%daa" % lcs_shading_interval_len) + + # name and create output files (names derived from SEQNAME) + if prefix: + prefix = str(prefix) + "-" + else: + prefix = "" + + # preparations for background shading + if lcs_shading or custom_shading: + # create color range white to grey + colors = create_color_list(lcs_shading_num + 1, color_map="Greys") + colors_2 = create_color_list(lcs_shading_num + 1, color_map="OrRd") + + if custom_shading: + text = "Custom Matrix Colors: " + ", ".join(colors_2) + + # write lcs lengths to file + lcs_data_file = open( + "%sPolydotplot_lcs_data_file%s.txt" + % (prefix, suffix.replace("_scaled", "").replace("_collage", "")), + "w", + ) + lcs_data_file.write( + "\t".join( + [ + "#title1", + "title2", + "len_seq1", + "len_seq2", + "len_lcs_for", + "%_min_seq_len", + "len_lcs_rev", + "%_min_seq_len", + ] + ) + + "\n" + ) + + # compare sequences pairwise - save lcs and line information in dictionary for plotting + data_dict = {} # keys = tuple(idx, jdx), value = x1, y1, x2, y2 (line positions) + lcs_dict = {} # keys = tuple(idx, jdx), value = length of lcs: lcs_len or (lcs_for, lcs_rev) + for_lcs_set = set([]) # keep lengths to calculate max (excluding self comparisons) + rev_lcs_set = 
set([]) # keep lengths to calculate max (all) + + text = "\nTotal plot count: %d" % (len(sequences) * (len(sequences))) + text += "\nTotal calculations: %d" % (len(sequences) * (len(sequences) + 1) / 2) + logging.info(text) + + logging.info( + "\nCalculating shared regions and lengths of longest_common_substring..." + ) + log_txt = "\nCalculating shared regions and lengths of longest_common_substring..." + # determine matches and length of lcs by comparing all sequence pairs + + seq_text = "" + counter = 0 + for idx in range(len(sequences)): + logging.debug("\n%d\t%s vs." % ((counter + 1), sequences[idx])) + seq_text += "\n%d\t%s vs." % ((counter + 1), sequences[idx]) + rec_two = seq_dict[sequences[idx]] + name_two = rec_two.id + seq_two = rec_two.seq + len_two = len(seq_two) + + for jdx in range(idx, len(sequences)): + rec_one = seq_dict[sequences[jdx]] + name_one = rec_one.id + seq_one = rec_one.seq + len_one = len(seq_one) + + counter += 1 + logging.debug(sequences[jdx]) + seq_text += " " + sequences[jdx] + + if len(sequences) < 5: + log_txt += "\n\t%s (%d %s), %s (%d %s)" % ( + name_one, + len_one, + aa_bp_unit, + name_two, + len_two, + aa_bp_unit, + ) + else: + if not counter % 25: + print(counter) + log_txt += str(counter) + + # Get positions of matches & length of longest common substring based on match lengths + if substitution_count != 0: + # print "RE" + x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_regex( + seq_one, + seq_two, + wordsize, + substitution_count=substitution_count, + convert_wobbles=convert_wobbles, + max_N_percentage=max_N_percentage, + report_lcs=True, + type_nuc=type_nuc, + ) + else: + # print "DIAG" + x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag( + seq_one, + seq_two, + wordsize, + convert_wobbles=convert_wobbles, + max_N_percentage=max_N_percentage, + report_lcs=True, + type_nuc=type_nuc, + ) + data_dict[(idx, jdx)] = x1[:], y1[:], x2[:], y2[:] + lcs_dict[idx, jdx] = lcs_for, lcs_rev + + if idx != jdx: + 
for_lcs_set.add(lcs_for) + rev_lcs_set.add(lcs_rev) + + lcs_data_file.write( + "\t".join( + [ + name_one, + name_two, + str(len_one), + str(len_two), + str(lcs_for), + str(round((lcs_for * 100.0 / min(len_one, len_two)), 3)), + str(lcs_rev), + str(round((lcs_rev * 100.0 / min(len_one, len_two)), 3)), + ] + ) + + "\n" + ) + + log_txt += "\n" + str(len(sequences) * (len(sequences) + 1) / 2) + " done" + + logging.info(log_txt) + + logging.debug("\n\nlcs_dict\n" + str(lcs_dict)) + if custom_shading: + logging.debug("\ncustom_dict\n" + str(custom_dict)) + logging.debug("\ncustom_similarity_dict\n\n" + str(custom_similarity_dict)) + + logging.info(seq_text + "\n") + + if lcs_shading_ref == 2: + color_bins = [] + text = "\nLCS lengh bins: " + for idx in range(lcs_shading_num): + color_bins.append(lcs_shading_interval_len * (idx + 1)) + text += " " + str(lcs_shading_interval_len * (idx + 1)) + logging.info(text) + + # Calculate maximum lcs length + if lcs_shading_ori == 0: # forward only + if len(for_lcs_set) != 0: + max_lcs = max(for_lcs_set) + else: + max_lcs = None + elif lcs_shading_ori == 1: # reverse complement only + if len(rev_lcs_set) != 0: + max_lcs = max(rev_lcs_set) + else: + max_lcs = None + else: # both orientations + if len(rev_lcs_set) != 0 and len(for_lcs_set) != 0: + max_lcs = max(max(rev_lcs_set), max(for_lcs_set)) + elif len(rev_lcs_set) != 0: + max_lcs = max(rev_lcs_set) + elif len(for_lcs_set) != 0: + max_lcs = max(for_lcs_set) + else: + max_lcs = None + + if max_lcs: + text = "Maximum LCS: %d %s" % (max_lcs, aa_bp_unit) + logging.info(text) + if custom_shading: + text = "Maximum custom value: %d\n" % custom_max + logging.info(text) + + # count sequences + ncols = len(sequences) + nrows = len(sequences) + + # get sequence lengths to scale plot widths and heights accordingly + size_ratios = [] + for item in sequences: + size_ratios.append(len(seq_dict[item].seq)) + + P.cla() # clear any prior graph + # use GridSpec to resize plots according to sequence 
length + if mirror_y_axis: + height_ratios = size_ratios[::-1] + else: + height_ratios = size_ratios[:] + gs = gridspec.GridSpec( + nrows, ncols, width_ratios=size_ratios, height_ratios=height_ratios + ) + fig = P.figure(figsize=(plot_size, plot_size)) + + # for cartesian coordinate system with mirrored y-axis: plot x labels below plot + if mirror_y_axis and representation == 1: + x_label_pos_top = True + elif mirror_y_axis or representation == 2: + x_label_pos_top = False + + # print y labels on the right, if upper right triangle is displayed + if (representation == 1 and not mirror_y_axis) or ( + representation == 2 and mirror_y_axis + ): + y_label_pos = 0 # last column + else: # left y label + y_label_pos = 1 # first column + + # determine label orientations + if len(sequences) > 5 or rotate_labels: + x_label_rotation = 45 + y_label_rotation = "horizontal" + if x_label_pos_top: + xhalign = "left" + xvalign = "bottom" + else: + xhalign = "right" + xvalign = "top" + yhalign = "right" + else: + x_label_rotation = "horizontal" + y_label_rotation = "vertical" + xvalign = "center" + xhalign = "center" + yhalign = "center" + yvalign = "center" + + # check combination of shading parameters for triangular output + if ( + representation != 0 and lcs_shading and custom_shading + ): # both directions in triangle + logging.info( + "\nAttention: For triangular output custom-shading and LCS shading cannot be combined!\n" + ) + elif ( + representation != 0 and lcs_shading and lcs_shading_ori == 2 + ): # both directions in triangle + logging.info( + "\nAttention: For triangular output LCS shading for both orientations is combined to max of both orientations!\n" + ) + + log_txt = "\nDrawing polydotplot..." 
+ + # draw subplots + if lcs_shading and custom_shading: + lcs_text = ( + "\n" + + "\t".join( + [ + "#Seq1", + "Seq2", + "LCS for [%s]" % aa_bp_unit, + "LCS for [%s]" % aa_bp_unit, + "Custom matrix value", + "Matrix color index", + "LCS color index", + ] + ) + + "\n" + ) + elif lcs_shading: + lcs_text = ( + "\n" + + "\t".join( + [ + "#Seq1", + "Seq2", + "LCS for [%s]" % aa_bp_unit, + "LCS for [%s]" % aa_bp_unit, + "LCS color index for", + "LCS color index rev", + ] + ) + + "\n" + ) + elif custom_shading: + lcs_text = ( + "\n" + + "\t".join( + [ + "#Seq1", + "Seq2", + "Custom matrix value", + "Color index for", + "Color index rev", + ] + ) + + "\n" + ) + + seq_text = "" + + counter, seq_counter = 0, 0 + for idx in range(len(sequences)): + logging.debug("\n%d\t%s vs." % ((seq_counter + 1), sequences[idx])) + seq_text += "\n%d\t%s vs." % ((seq_counter + 1), sequences[idx]) + + rec_two = seq_dict[sequences[idx]] + len_two = len(rec_two.seq) + name_two = rec_two.id + + for jdx in range(idx, len(sequences)): + rec_one = seq_dict[sequences[jdx]] + len_one = len(rec_one.seq) + name_one = rec_one.id + + counter += 1 + seq_counter += 1 + + logging.debug(sequences[jdx]) + seq_text += " " + sequences[jdx] + + if not seq_counter % 25: + # print(seq_counter) + log_txt += str(seq_counter) + + # optional shade background according to length of LCS and/or user matrix + ######################################################################### + + # get interval based on LCS + background_colors = [None, None] + if lcs_shading and ( + lcs_shading_ref == 1 or lcs_shading_ref == 2 or max_lcs != None + ): # self plot max_lcs_for == None + lcs_len = lcs_dict[(idx, jdx)] + l1 = lcs_len[0] # forward + l2 = lcs_len[1] # reverse complement + + lcs_shading_bool = True + + # calculate shading acc. 
to chosen option + if lcs_shading_ref == 1: # percentage of shorter sequence + color_idx0 = min( + len(colors) - 1, l1 * lcs_shading_num // min(len_one, len_two) + ) + color_idx1 = min( + len(colors) - 1, l2 * lcs_shading_num // min(len_one, len_two) + ) + elif lcs_shading_ref == 2: # by given interval size + color_idx0 = min(len(colors) - 1, l1 // lcs_shading_interval_len) + color_idx1 = min(len(colors) - 1, l2 // lcs_shading_interval_len) + if color_idx0 >= len(colors): + color_idx0 = len(colors) + if color_idx1 >= len(colors): + color_idx1 = len(colors) + else: # percentage of maximum lcs length + color_idx0 = min(len(colors) - 1, l1 * lcs_shading_num // max_lcs) + color_idx1 = min(len(colors) - 1, l2 * lcs_shading_num // max_lcs) + else: + lcs_shading_bool = False + + # get interval based on custom matrix + if custom_shading: + # matrix value + try: + custom_value = custom_similarity_dict[(idx, jdx)] + except: + custom_value = "" + + # bottom left triangle = LCS forward/reverse or best of both + if lcs_shading_bool: + if lcs_shading_ori == 0: # forward + color_idx1 = color_idx0 + elif lcs_shading_ori == 2: # both directions + color_idx1 = max(color_idx0, color_idx1) + + # top right triangle = custom value (not colored if text matrix provided) + if type(custom_value) is int or type(custom_value) is float: + color_idx0 = int( + (custom_value - custom_min) + * lcs_shading_num + // (custom_max - custom_min) + ) + # no color if string is proviced + else: + color_idx0 = 0 + + # use best LCS of both orientations for coloring triangle with two-ori-LCS + if ( + representation != 0 and lcs_shading_ori == 2 + ): # both directions in triangle + color_idx0, color_idx1 = ( + max(color_idx0, color_idx1), + max(color_idx0, color_idx1), + ) + + # set colors dependent on lcs dependent on orientation + if lcs_shading_bool and not custom_shading: + if idx != jdx: + if lcs_shading_ori == 0: + color_idx1 = color_idx0 + elif lcs_shading_ori == 1: + color_idx0 = color_idx1 + 
background_colors[0] = colors[color_idx0] + background_colors[1] = colors[color_idx1] + # for selfcomparison, only color reverse complement + elif lcs_shading_ori != 0 and not custom_shading: + background_colors[0] = colors[color_idx1] + # set different colors for shading by LCS + user matrix + elif lcs_shading_bool and custom_shading: + # print colors, background_colors, color_idx0, color_idx1 + background_colors[0] = colors_2[color_idx0] + background_colors[1] = colors[color_idx1] + # set grey color range for user matrix if no LCS shading + elif custom_shading: + background_colors[0] = colors[color_idx0] + background_colors[1] = colors[color_idx0] + + if custom_shading and lcs_shading_bool: + lcs_text += ( + "\t".join( + [ + name_one, + name_two, + str(lcs_len[0]), + str(lcs_len[1]), + str(custom_value), + str(color_idx0), + str(color_idx1), + ] + ) + + "\n" + ) + elif lcs_shading_bool: + lcs_text += ( + "\t".join( + [ + name_one, + name_two, + str(lcs_len[0]), + str(lcs_len[1]), + str(color_idx0), + str(color_idx1), + ] + ) + + "\n" + ) + elif custom_shading: + lcs_text += ( + "\t".join( + [ + name_one, + name_two, + str(custom_value), + str(color_idx0), + str(color_idx1), + ] + ) + + "\n" + ) + + # calculate figure position in polyplot + # diagonal (self-dotplots) + if idx == jdx: + if mirror_y_axis: + seq_num = sequences.index(name_one) + 1 + counter1 = seq_num + len(sequences) * (len(sequences) - seq_num) + counter = counter + (counter - 1) // (nrows) + else: + # skip positions below diagonal + counter1 = counter + (counter - 1) // (nrows) # + row_pos + counter = counter1 + counters = [counter1] + + # draw both graphs at once (due to symmetry) + else: + if mirror_y_axis: + col_pos = sequences.index(name_two) + 1 + row_pos = len(sequences) - (sequences.index(name_one) + 1) + counter1 = row_pos * ncols + col_pos + counter2 = (ncols - col_pos) * ncols + ncols - row_pos + else: + counter1 = counter + col_pos = (counter - 1) % ncols + row_pos = (counter - 1) // 
(nrows) + counter2 = col_pos * ncols + row_pos + 1 + counters = [counter1, counter2] # lower, upper + + if len(counters) == 2: + seq_counter += 1 + if not seq_counter % 25: + # print(seq_counter) + log_txt += str(seq_counter) + + x_lists, y_lists, x_lists_rc, y_lists_rc = data_dict[(idx, jdx)] + + # plot diagram(s) + for kdx in range(len(counters)): + if ( + representation == 0 + or len(counters) == 1 + or (representation == 1 and kdx == 0) + or (representation == 2 and kdx == 1) + ): + fig_pos = counters[kdx] + # plotting subplot with matplotlib + ax = P.subplot(gs[fig_pos - 1]) # rows, columns, plotnumber + + # shade annotated regions if gff file(s) provided + if idx == jdx and gff_files: + if name_one in list(feat_dict.keys()): + features = feat_dict[name_one] + if len_two != len_one: + logging.info( + "Polydot GFF shading for diagonal fields - nequal length error!" + ) + return + for item in features: + feat_type, start, stop = item + feat_color, strength, zoom = gff_color_dict[ + feat_type.lower() + ] + start = max(0, start - zoom - 0.5) + stop = min(len_one + 1, stop + zoom + 0.5) + width = stop - start + ax.add_patch( + patches.Rectangle( + (start, start), # (x,y) + width, + width, # width, height + edgecolor=None, + linewidth=line_width + zoom, + fill=True, + facecolor=feat_color, + alpha=strength, + ) + ) + + # if custom matrix value printed into upper matrix triangle, skip data plotting + # text print in top triangle + if user_matrix_print and custom_shading and kdx == 0 and idx != jdx: + data_plotting = False + # dotplot in bottom triangle + else: + data_plotting = True + + # mirror plot, if plotting below diagonal + if kdx == 0: + l1, l2 = len_one, len_two + n1, n2 = name_one, name_two + x1, y1 = x_lists, y_lists + x2, y2 = x_lists_rc, y_lists_rc + else: + l2, l1 = len_one, len_two + n2, n1 = name_one, name_two + x1, y1 = y_lists, x_lists + x2, y2 = y_lists_rc, x_lists_rc + + if mirror_y_axis: + x1, y1, x2, y2 = y1, x1, y2, x2 + n1, n2 = n2, n1 + + if 
data_plotting: + # collect lines + lines = [] + color_list = [] + for x_lines, y_lines, col in [ + (x2, y2, line_col_rev), + (x1, y1, line_col_for), + ]: + # If color is not white, add lines to plot + if col != "white": + for ldx in range(len(x_lines)): + lines.append( + [ + (x_lines[ldx][0], y_lines[ldx][0]), + (x_lines[ldx][-1], y_lines[ldx][-1]), + ] + ) + color_list.append(col) + color_list = np.array(color_list) + + # draw lines + lc = cllct.LineCollection( + lines, colors=color_list, linewidths=line_width + ) + ax.add_collection(lc) + + # plot value provided by customer instead of dotplot + else: + alignment = { + "horizontalalignment": "center", + "verticalalignment": "center", + } + # P.text(0.5, 0.5, custom_value, size='medium', transform=ax.transAxes, **alignment) + P.text( + 0.5, + 0.5, + custom_value, + size=label_size * 1.5, + transform=ax.transAxes, + **alignment, + ) + # P.text(0.5, 0.5, custom_value, size=label_size*1.5, transform=ax.transAxes, + # horizontalalignment='center', verticalalignment='center', color="black") + + if custom_shading: + # omit diagonal + if idx == jdx: + ax.set_facecolor("white") + # use white background for text fields (top right triangle only [kdx 0]) + elif ( + type(custom_value) is not int + and type(custom_value) is not float + and kdx == 0 + ): + ax.set_facecolor("white") + else: + ax.set_facecolor(background_colors[kdx]) + # set background color if lcs shading + elif lcs_shading_bool and background_colors[kdx]: + ax.set_facecolor(background_colors[kdx]) + + # set axis limits + # P.xlim(0, l1+1) + if mirror_y_axis: + P.xlim(0, l2 + 1) + P.ylim(0, l1 + 1) # rotate y axis (point upwards) + else: + P.xlim(0, l1 + 1) + P.ylim(l2 + 1, 0) # rotate y axis (point downwards) + + ## axis labelling + ################## + + # determine axis positions + if x_label_pos_top: + ax.xaxis.tick_top() + ax.xaxis.set_label_position("top") + x_label_bool = fig_pos <= ncols + x_tick_bool = fig_pos > ncols * (ncols - 1) + else: + x_label_bool 
= fig_pos > ncols * (ncols - 1) + x_tick_bool = fig_pos <= ncols + + # settings for y labels on right side + if y_label_pos == 0: # right label + ax.yaxis.tick_right() + ax.yaxis.set_label_position("right") + label_dist = 30 + else: + label_dist = 8 + + # x axis labels dependent on plot position/number + if x_label_bool: # x title and labels on top or bottom + P.xlabel( + unicode_name( + shorten_name( + n1, + max_len=title_length, + title_clip_pos=title_clip_pos, + ) + ), + fontsize=label_size, + rotation=x_label_rotation, + verticalalignment=xvalign, + horizontalalignment=xhalign, + fontweight="bold", + labelpad=8, + ) # axis naming + if x_label_rotation not in ["horizontal", "vertical"]: + P.setp( + ax.get_xticklabels(), + fontsize=label_size * 0.9, + rotation="vertical", + ) + else: + P.setp( + ax.get_xticklabels(), + fontsize=label_size * 0.9, + rotation=x_label_rotation, + ) + elif x_tick_bool and x_label_pos_top: # x ticks on bottom row + ax.xaxis.tick_bottom() # ticks without labels on bottom + P.setp( + ax.get_xticklabels(), + fontsize=label_size, + rotation=x_label_rotation, + visible=False, + ) + elif x_tick_bool: # x ticks on top row + ax.xaxis.tick_top() # # ticks without labels on top + P.setp( + ax.get_xticklabels(), + fontsize=label_size, + rotation=x_label_rotation, + visible=False, + ) # inner diagrams without labelling + elif idx == jdx and representation != 0: + if not mirror_y_axis and representation == 1: # upper + ax.xaxis.tick_bottom() + elif mirror_y_axis and representation == 2: # lower + ax.xaxis.tick_top() + elif mirror_y_axis and representation == 1: # upper + ax.xaxis.tick_bottom() + elif not mirror_y_axis and representation == 2: # lower + ax.xaxis.tick_top() + P.setp( + ax.get_xticklabels(), visible=False + ) # inner diagrams without labelling + else: # no x ticks on internal rows + ax.axes.get_xaxis().set_visible(False) + + # y axis labels dependent on plot position/number + if fig_pos % ncols == y_label_pos or ( + ncols == 1 and 
nrows == 1 + ): # y title and labels in 1st column + P.ylabel( + unicode_name( + shorten_name( + n2, + max_len=title_length, + title_clip_pos=title_clip_pos, + ) + ), + fontsize=label_size, + rotation=y_label_rotation, + verticalalignment=yvalign, + horizontalalignment=yhalign, + fontweight="bold", + labelpad=label_dist, + ) + P.setp( + ax.get_yticklabels(), fontsize=label_size * 0.9 + ) # axis naming + elif fig_pos % ncols == 0: # y ticks in last column + ax.yaxis.tick_right() + P.setp( + ax.get_yticklabels(), visible=False + ) # inner diagrams without labelling + elif idx == jdx and representation != 0: + if not mirror_y_axis and representation == 1: # upper + ax.yaxis.tick_left() + elif mirror_y_axis and representation == 2: # lower + ax.yaxis.tick_left() + elif mirror_y_axis and representation == 1: # upper + ax.yaxis.tick_right() + elif not mirror_y_axis and representation == 2: # lower + ax.yaxis.tick_right() + P.setp( + ax.get_yticklabels(), visible=False + ) # inner diagrams without labelling + else: + ax.axes.get_yaxis().set_visible(False) + + log_txt += "\n%d done" % seq_counter + logging.info(log_txt) + + try: + logging.debug(lcs_text) + except: + pass + + # finalize layout - margins & spacing between plots + P.tick_params(axis="both", which="major", labelsize=label_size * 0.9) + try: + P.tight_layout(h_pad=0.02, w_pad=0.02) + except: + logging.info( + "Attention - pylab.tight_layout failed! Please check sequence names and layout settings!" 
+ ) + # gs.tight_layout(fig, h_pad=.02, w_pad=.02) # less overlapping tick labels, but also disturbingly large spacing + if y_label_rotation == "horizontal": + if x_label_pos_top: + P.subplots_adjust( + hspace=spacing, wspace=spacing, left=0.13, top=0.87 + ) # space between rows - def 0.4 + else: + P.subplots_adjust( + hspace=spacing, wspace=spacing, left=0.13, bottom=0.13 + ) # space between rows - def 0.4 + else: + P.subplots_adjust( + hspace=spacing, wspace=spacing + ) # space between rows - def 0.4 + + # save figure and close instance + fig_name = "%s%s_wordsize%i%s.%s" % (prefix, name_graph, wordsize, suffix, filetype) + P.savefig(fig_name) + P.close() + P.cla() + + # create figure color legend + if lcs_shading: + if lcs_shading_ref == 1: # percentage of shorter sequence + legend_file_name = legend_figure( + colors, lcs_shading_num, unit="%", filetype=filetype, prefix=prefix + ) + elif lcs_shading_ref == 2: # interval sizes + legend_file_name = legend_figure( + colors, + lcs_shading_num, + unit=aa_bp_unit, + filetype=filetype, + prefix=prefix, + bins=color_bins, + ) + else: # relative of maximum lcs + legend_file_name = legend_figure( + colors, + lcs_shading_num, + unit=aa_bp_unit, + filetype=filetype, + prefix=prefix, + max_len=max_lcs, + ) + + if custom_shading: + custom_prefix = "custom-matrix-" + prefix + legend_file_name_custom = legend_figure( + colors_2, + lcs_shading_num, + unit="%", + filetype=filetype, + prefix=custom_prefix, + max_len=custom_max, + min_len=custom_min, + ) + + if lcs_shading and custom_shading: + return [fig_name, legend_file_name, legend_file_name_custom] + elif lcs_shading: + return [fig_name, legend_file_name] + elif custom_shading: + return [fig_name, legend_file_name_custom] + else: + return [fig_name] diff --git a/src/flexidot/utils/alphabets.py b/src/flexidot/utils/alphabets.py new file mode 100644 index 0000000..960330d --- /dev/null +++ b/src/flexidot/utils/alphabets.py @@ -0,0 +1,192 @@ +def alphabets(type_nuc=True): + """ 
def alphabets(type_nuc=True):
    """
    Return the residue tables for one sequence type.

    Args:
        type_nuc: True selects the nucleotide tables, False the amino acid
            tables.

    Returns:
        A 4-tuple ``(alphabet, alphabet_full, ambiguity_code, match_dict)``:
        the unambiguous residues, the residues plus ambiguity letters, a dict
        mapping each ambiguity letter to the list of residues it stands for,
        and a dict mapping each letter to a regex character class.
    """
    if type_nuc:
        bases = list("ACGT")
        bases_full = list("ACGTNBDHVYRWSKM")

        # ambiguity letter -> concrete bases it may represent
        base_ambiguity = {
            letter: list(members)
            for letter, members in (
                ("N", "ACGT"),  # any
                ("B", "CGT"),  # not A
                ("D", "AGT"),  # not C
                ("H", "ACT"),  # not G
                ("V", "ACG"),  # not T
                ("Y", "CT"),  # pyrimidine
                ("R", "AG"),  # purine
                ("W", "AT"),  # weak
                ("S", "CG"),  # strong
                ("K", "GT"),  # keto
                ("M", "AC"),  # amino
            )
        }

        # letter -> character class used for matching
        base_match = dict(
            N="[ACGTNBDHVYRWSKM]",
            B="[CGTNBDHVYRWSKM]",  # class omits A
            D="[AGTNBDHVYRWSKM]",  # class omits C
            H="[ACTNBDHVYRWSKM]",  # class omits G
            V="[ACGNBDHVYRWSKM]",  # class omits T
            K="[GTNBDHVYRWSK]",  # class omits A, C, M
            M="[ACNBDHVYRWSM]",  # class omits G, T, K
            W="[ATNBDHVYRWKM]",  # class omits C, G, S
            S="[CGNBDHVYRSKM]",  # class omits A, T, W
            Y="[CTNBDHVYWSKM]",  # class omits A, G, R
            R="[AGNBDHVRWSKM]",  # class omits C, T, Y
            A="[ANDHVRWM]",
            C="[CNBHVYSM]",
            G="[GNBDVRSK]",
            T="[TNBDHYWK]",
        )
        return (bases, bases_full, base_ambiguity, base_match)

    residues = list("ARNDCEQGHILKMFPSTWYVUO*")
    residues_full = residues + list("JZBX")

    residue_ambiguity = {
        "J": ["I", "L"],
        "Z": ["Q", "E"],
        "B": ["N", "D"],
        "X": list(residues),  # any; separate list object from `residues`
    }

    # Residues covered by a two-letter ambiguity code also list that code.
    extra_code = {"N": "B", "D": "B", "E": "Z", "Q": "Z", "I": "J", "L": "J"}
    residue_match = {
        "J": "[ILJ]",
        "Z": "[QEZ]",
        "B": "[NDB]",
        "X": "[ARNDCEQGHILKMFPSTWYVUO*XBZJ]",
    }
    for residue in residues:
        residue_match[residue] = "[" + residue + "X" + extra_code.get(residue, "") + "]"

    return (residues, residues_full, residue_ambiguity, residue_match)
def wobble_replacement(sequence, general_ambiguity_code):
    """
    Expand a sequence with ambiguous residues into all concrete variants.

    Only letters that are keys of ``general_ambiguity_code`` are expanded.

    Args:
        sequence: Residue string (e.g. a kmer).
        general_ambiguity_code: Dict mapping an ambiguity letter to the list
            of residues it may stand for.

    Returns:
        Set of strings in which no originally ambiguous position still holds
        a key of ``general_ambiguity_code``. A sequence without any ambiguous
        letter yields ``{sequence}`` (previously an empty set).
    """
    # Only positions that are ambiguous in the input can ever need expansion.
    wobble_pos = [
        idx for idx, letter in enumerate(sequence) if letter in general_ambiguity_code
    ]
    logging.debug("\t%d wobbles", len(wobble_pos))

    # Each pass replaces one ambiguous position per variant; repeat until
    # no variant has an ambiguous letter left.
    kmer_variants = {sequence}
    while True:
        logging.debug("\t\t%d kmer variants", len(kmer_variants))
        temp_kmers = set()
        for kmer in kmer_variants:
            expanded = False
            for idx in wobble_pos:
                letter = kmer[idx]
                if letter in general_ambiguity_code:
                    expanded = True
                    for base in general_ambiguity_code[letter]:
                        temp_kmers.add(kmer[:idx] + base + kmer[idx + 1 :])
            if not expanded:
                # Already fully resolved: carry it over instead of losing it.
                temp_kmers.add(kmer)
        kmer_variants = temp_kmers
        still_ambiguous = any(
            kmer[idx] in general_ambiguity_code
            for kmer in kmer_variants
            for idx in wobble_pos
        )
        if not still_ambiguous:
            break

    return kmer_variants


def split_diagonals(data, stepsize=1):
    """
    Split ``data`` wherever two neighbouring values do not differ by ``stepsize``.

    Args:
        data: Sorted sequence of numbers.
        stepsize: Difference between neighbours that keeps them in one run.

    Returns:
        List of arrays as produced by ``numpy.split``, one per run.
    """
    return np.split(data, np.where(np.diff(data) != stepsize)[0] + 1)


def longest_common_substring(s1, s2):
    """
    Return the length of the longest substring shared by ``s1`` and ``s2``.

    Uses the classic suffix-length dynamic programme, keeping only the
    previous row, so memory is proportional to ``len(s2)``.
    """
    # previous[y]: length of the common suffix of s1[:x-1] and s2[:y]
    previous = [0] * (1 + len(s2))
    longest = 0
    for x in range(1, 1 + len(s1)):
        current = [0] * (1 + len(s2))
        for y in range(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                current[y] = previous[y - 1] + 1
                if current[y] > longest:
                    longest = current[y]
        previous = current
    return longest


def lcs_from_x_values(x_values):
    """
    Return the length of the longest sub-list in ``x_values``.

    Args:
        x_values: Nested list of numbers (one sub-list per run).

    Returns:
        0 for an empty input, otherwise the largest sub-list length
        (a NumPy integer, as it is taken from a NumPy array).
    """
    if len(x_values) == 0:
        return 0
    # get lengths of each subarray
    lengths = np.array([len(i) for i in x_values])
    return max(lengths)
def parse_args():
    """
    Build the FlexiDot command line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace with one attribute per option. ``mode`` is None
        when ``-m`` is never given, because it uses ``action="append"``
        without a default.

    Note:
        argparse applies %-formatting to help strings, so a literal percent
        sign must be written as ``%%``.
    """
    parser = ArgumentParser(
        prog="flexidot",
        description="FlexiDot: Flexible dotplot generation tool",
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument

    # Input/output arguments
    add(
        "-i",
        "--infiles",
        required=True,
        nargs="+",
        help="Input fasta files (fasta file name or space-separated file list.)",
    )
    add(
        "-o",
        "--output_prefix",
        default="flexidot_output",
        help="File prefix to be added to the generated filenames.",
    )
    add("--outdir", default=".", help="Output directory. Default: current directory.")

    # Collage arguments
    add(
        "-c",
        "--collage",
        action="store_true",
        help="Combine multiple dotplots in a collage.",
    )
    add(
        "--n_col",
        type=int,
        default=4,
        help="Number of columns per page (if collage is ON).",
    )
    add(
        "--n_row",
        type=int,
        default=5,
        help="Number of rows per page (if collage is ON).",
    )

    # File type
    add(
        "-f",
        "--filetype",
        choices=["png", "pdf", "svg"],
        default="png",
        help="Output file format: png, pdf, svg",
    )

    # Sorting
    add(
        "-s",
        "--sort",
        action="store_true",
        help="Sort sequences alphabetically by name.",
    )

    # Calculation parameters
    add(
        "-k",
        "--wordsize",
        type=int,
        default=10,
        help="Wordsize (kmer length) for dotplot comparison.",
    )
    add(
        "-m",
        "--mode",
        action="append",
        choices=["0", "1", "2"],
        help="Mode of FlexiDot dotplotting. 0 = self [default], 1 = paired, 2 = poly (matrix with all-against-all dotplots). Call -m multiple times to run multiple modes.",
    )
    add(
        "-t",
        "--type_seq",
        choices=["aa", "nuc"],
        default="nuc",
        help="Biological sequence type: aa (amino acid) or nuc (nucleotide).",
    )
    add(
        "-w",
        "--wobble_conversion",
        action="store_true",
        help="Ambiguity handling for relaxed matching. Note: This may make kmer matching slower.",
    )
    add(
        "-S",
        "--substitution_count",
        type=int,
        default=0,
        help="Number of substitutions allowed per window.",
    )
    add(
        "--max_n",
        type=float,
        default=10,
        # "%%" renders as a single "%"; a bare "%" breaks help formatting.
        help="Maximum percentage of Ns allowed in a kmer window. Applies only if --wobble_conversion is set, else kmers with Ns are skipped. Default: 10%%",
    )
    add(
        "-r",
        "--norev",
        action="store_true",
        help="Do not calculate reverse complementary matches (only for nucleotide sequences.)",
    )
    add(
        "-O",
        "--only_vs_first_seq",
        action="store_true",
        help="Limit pairwise comparisons to the 1st sequence only (if plotting mode=1 paired.)",
    )

    # Graphic formatting
    add("-A", "--line_width", type=float, default=1, help="Line width")
    add("-B", "--line_col_for", default="black", help="Line color")
    add("-C", "--line_col_rev", default="green", help="Reverse line color")
    add(
        "-D",
        "--x_label_pos",
        choices=["top", "bottom"],
        default="top",
        help="Position of the X-label. Default: 'top'",
    )
    add("-E", "--label_size", type=int, default=10, help="Font size")
    add(
        "-F",
        "--spacing",
        type=float,
        default=0.04,
        help="Spacing between dotplots (if plotting mode=2 polyplot).",
    )
    add(
        "-L",
        "--length_scaling",
        action="store_true",
        help="Scale plot size for pairwise comparison.",
    )
    add(
        "-M",
        "--mirror_y_axis",
        action="store_true",
        help="Flip y-axis (bottom-to-top or top-to-bottom)",
    )
    add("-P", "--plot_size", type=int, default=10, help="Plot size")
    add(
        "-R",
        "--representation",
        type=int,
        choices=[0, 1, 2],
        default=0,
        help=(
            "Region of plot to display. Only if plotting mode is 2: polyplot\n"
            "0 = full [default]\n"
            "1 = upper\n"
            "2 = lower"
        ),
    )
    add(
        "-T",
        "--title_length",
        type=int,
        default=50,
        help="Limit title length for comparisons. Default: 50 characters",
    )

    # GFF shading
    add(
        "-g",
        "--gff",
        nargs="+",
        default=None,
        help="GFF3 files for markup in self-dotplots. Provide a space-delimited list of GFF files.",
    )
    add(
        "-G",
        "--gff_color_config",
        default=None,
        help="Config file for custom GFF shading.",
    )

    # longest common subsequence (LCS) shading
    add(
        "-x",
        "--lcs_shading",
        action="store_true",
        help="Shade subdotplot based on longest common subsequence (LCS).",
    )
    add(
        "-X",
        "--lcs_shading_num",
        type=int,
        default=5,
        help="Number of shading intervals.",
    )
    add(
        "-y",
        "--lcs_shading_ref",
        type=int,
        choices=[0, 1, 2],
        default=0,
        # Interval default quoted here matches the default of -Y below.
        help=(
            "Reference for LCS shading.\n"
            "0 = maximal LCS length [default]\n"
            "1 = maximally possible length (length of shorter sequence in pairwise comparison) \n"
            "2 = given interval sizes - DNA [default 50 bp] or proteins [default 10 aa] - see -Y"
        ),
    )
    add(
        "-Y",
        "--lcs_shading_interval_len",
        type=int,
        default=50,
        help="Length of intervals for LCS shading (only if --lcs_shading_ref=2) [default for nucleotides = 50; default for amino acids = 10]",
    )
    add(
        "-z",
        "--lcs_shading_ori",
        type=int,
        choices=[0, 1, 2],
        default=0,
        help=(
            "Shade subdotplots based on LCS\n"
            "0 = forward [default]\n"
            "1 = reverse, or\n"
            "2 = both strands (forward shading above diagonal, reverse shading on diagonal and below; if using --user_matrix_file, best LCS is used below diagonal)"
        ),
    )

    # Custom matrix shading
    add("-u", "--user_matrix_file", help="Matrix file for shading above diagonal.")
    add(
        "-U",
        "--user_matrix_print",
        action="store_true",
        help="Display matrix entries above diagonal.",
    )

    # Other options
    add(
        "--loglevel",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Set logging level. Default: 'INFO'",
    )
    add(
        "-v",
        "--version",
        action="version",
        version="%(prog)s {version}".format(version=__version__),
    )
    add("--logfile", default=None, help="Name of log file")

    return parser.parse_args()
def check_kmer_length(kmer_length):
    """
    Log a warning if the kmer length is less than 10.

    Parameters:
        kmer_length (int): The kmer length to check.
    """
    if kmer_length < 10:
        logging.warning(
            "Kmer length is less than 10. This may result in less accurate dotplots."
        )


def check_input(filename):
    """
    Confirm that ``filename`` is an existing file.

    Writes a confirmation line to stderr on success.

    Raises:
        FileNotFoundError: if ``filename`` is not an existing regular file.
    """
    if not os.path.isfile(filename):
        logging.error("Input file does not exist. Quitting.")
        raise FileNotFoundError(f"Input sequence file '{filename}' does not exist.")
    sys.stderr.write(f"\033[92m Input file found: {filename}\033[0m\n")


def check_output_dir(outdir):
    """
    Confirm that ``outdir`` is an existing directory.

    Writes a confirmation line to stderr on success.

    Raises:
        FileNotFoundError: if ``outdir`` is not an existing directory.
    """
    if not os.path.isdir(outdir):
        logging.error("Output directory does not exist. Quitting.")
        raise FileNotFoundError(f"Output directory '{outdir}' does not exist.")
    sys.stderr.write(f"\033[92m Output directory found: {outdir}\033[0m\n\n")


def print_summary(args):
    """
    Print a summary of the selected options to stderr.

    Every file name given in ``args`` (input files, GFF files, GFF colour
    config, user matrix) is checked with ``check_input`` and the output
    directory with ``check_output_dir``.

    Args:
        args: Parsed command line namespace as returned by ``parse_args``.

    Raises:
        FileNotFoundError: propagated from the existence checks.
    """
    write = sys.stderr.write

    # Define color for headings
    heading_color = "\033[1;34m"
    reset_color = "\033[0m"

    # Log command line arguments
    write(f"\n{heading_color}CMD line input{reset_color}\n")
    write(" ${0}\n\n".format(" ".join(sys.argv)))

    # Input/output arguments
    write(f"{heading_color}Input/Output Arguments:{reset_color}\n")
    write(f" Input files [-i]: {args.infiles}\n")
    for infile in args.infiles:
        check_input(infile)
    write(f" Output prefix [-o]: {args.output_prefix}\n")
    write(f" Output directory [--outdir]: {args.outdir}\n")
    check_output_dir(args.outdir)

    # Collage arguments
    write(f"{heading_color}Collage Arguments:{reset_color}\n")
    write(f" Collage [-c]: {args.collage}\n")
    write(f" Number of columns per page [--n_col]: {args.n_col}\n")
    write(f" Number of rows per page [--n_row]: {args.n_row}\n\n")

    # File type
    write(f"{heading_color}File Type:{reset_color}\n")
    write(f" Output file format [-f]: {args.filetype}\n\n")

    # Sorting
    write(f"{heading_color}Sorting:{reset_color}\n")
    write(f" Sort sequences alphabetically [-s]: {args.sort}\n\n")

    # Calculation parameters
    write(f"{heading_color}Calculation Parameters:{reset_color}\n")
    write(f" Wordsize (kmer length) [-k]: {args.wordsize}\n")
    mode_descriptions = {
        "0": "self [default]",
        "1": "paired",
        "2": "poly (matrix with all-against-all dotplots)",
    }
    # -m uses action="append" without a default, so it is None when omitted;
    # report the documented default mode ("0") in that case.
    modes = [mode_descriptions[mode] for mode in (args.mode or ["0"])]
    write(f" Modes [-m]: {', '.join(modes)}\n")
    write(f" Sequence type [-t]: {args.type_seq}\n")
    write(f" Wobble conversion [-w]: {args.wobble_conversion}\n")
    write(f" Maximum 'N' percentage [--max_n]: {args.max_n}%\n")
    write(f" Substitution count [-S]: {args.substitution_count}\n")
    write(f" No reverse complementary matches [-r]: {args.norev}\n")
    write(f" Only vs first sequence [-O]: {args.only_vs_first_seq}\n\n")

    # Graphic formatting
    write(f"{heading_color}Graphic Formatting:{reset_color}\n")
    write(f" Line width [-A]: {args.line_width}\n")
    write(f" Line color (forward) [-B]: {args.line_col_for}\n")
    write(f" Line color (reverse) [-C]: {args.line_col_rev}\n")
    write(f" X-label position [-D]: {args.x_label_pos}\n")
    write(f" Font size [-E]: {args.label_size}\n")
    write(f" Spacing between dotplots [-F]: {args.spacing}\n")
    write(f" Length scaling [-L]: {args.length_scaling}\n")
    write(f" Mirror y-axis [-M]: {args.mirror_y_axis}\n")
    write(f" Plot size [-P]: {args.plot_size}\n")
    representation_descriptions = {0: "full [default]", 1: "upper", 2: "lower"}
    write(
        f" Representation [-R]: {representation_descriptions[args.representation]}\n"
    )
    write(f" Title length [-T]: {args.title_length}\n\n")

    # GFF shading
    write(f"{heading_color}GFF Shading:{reset_color}\n")
    write(f" GFF files [-g]: {args.gff}\n")
    if args.gff:
        for gff_file in args.gff:
            check_input(gff_file)
    # NOTE(review): nesting relative to the -g check could not be recovered
    # from the source dump; treated as an independent check - confirm.
    if args.gff_color_config:
        write(f" GFF color config [-G]: {args.gff_color_config}\n")
        check_input(args.gff_color_config)
    write("\n")

    # Longest Common Subsequence (LCS) shading
    write(
        f"\n{heading_color}Longest Common Subsequence (LCS) Shading:{reset_color}\n"
    )
    write(f" LCS shading [-x]: {args.lcs_shading}\n")
    write(f" LCS shading intervals [-X]: {args.lcs_shading_num}\n")
    lcs_shading_ref_descriptions = {
        0: "maximal LCS length [default]",
        1: "maximally possible length",
        2: "given interval sizes",
    }
    write(
        f" LCS shading reference [-y]: {lcs_shading_ref_descriptions[args.lcs_shading_ref]}\n"
    )
    write(f" LCS shading interval length [-Y]: {args.lcs_shading_interval_len}\n")
    lcs_shading_ori_descriptions = {
        0: "forward [default]",
        1: "reverse",
        2: "both strands",
    }
    write(
        f" LCS shading orientation [-z]: {lcs_shading_ori_descriptions[args.lcs_shading_ori]}\n\n"
    )

    # Custom matrix shading
    write(f"{heading_color}Custom Matrix Shading:{reset_color}\n")
    write(f" User matrix file [-u]: {args.user_matrix_file}\n")
    if args.user_matrix_file:
        check_input(args.user_matrix_file)
    write(f" User matrix print [-U]: {args.user_matrix_print}\n\n")

    # Other options
    write(f"{heading_color}Other Options:{reset_color}\n")
    write(f" Logging level [--loglevel]: {args.loglevel}\n")
    write(f" Log file [--logfile]: {args.logfile}\n")
    write(f" Version [-v]: {__version__}\n\n")
def read_seq(input_fasta, degap=False):
    """
    Read fasta sequences from one file or a list of files.

    Several files are first concatenated into a temporary combined file.
    If any sequence contains a gap character ("-"), the input is passed
    through ``degap_fasta`` and read again with ``degap=True``.

    Args:
        input_fasta: File name or list of file names.
        degap: True when ``input_fasta`` refers to temporary degapped files
            that must be deleted after reading.

    Returns:
        Tuple ``(seq_dict, sequences)``: the index returned by
        ``SeqIO.index`` and the list of sequence ids in file order.
        ``({}, [])`` is returned if indexing fails.

    Raises:
        ValueError: if ``input_fasta`` is an empty list or empty string.
    """
    # Check if file provided
    if input_fasta == [] or input_fasta == "":
        raise ValueError("No file names provided: %s" % input_fasta)

    # Decide which single file to read; combine several files if required.
    given_as_list = type(input_fasta) is list
    concat_created = False
    if given_as_list and len(input_fasta) > 1:
        logging.info(
            "Concatenating sequences from multiple files: {}".format(input_fasta)
        )
        input_fasta_combi = concatenate_files(input_fasta)
        concat_created = True
    elif given_as_list:
        input_fasta_combi = input_fasta[0]
    else:
        input_fasta_combi = input_fasta

    # read sequences
    logging.info(f"Reading sequences from {input_fasta_combi}")
    try:
        # index keyed by sequence id
        seq_dict = SeqIO.index(input_fasta_combi, "fasta")
    except Exception as e:
        if isinstance(e, ValueError):
            logging.error(
                f"ValueError: {e} - please check input files, e.g. for duplicate names!"
            )
        elif isinstance(e, FileNotFoundError):
            logging.error(f"FileNotFoundError: {e} - please check if the file exists!")
        else:
            logging.error(f"Unexpected error: {e} - please check input files!")
        return {}, []

    # Restart on degapped copies as soon as one gapped sequence is found.
    for seq_id in seq_dict:
        if "-" in seq_dict[seq_id].seq:
            logging.warning("Gaps detected in sequence: %s" % seq_id)
            return read_seq(degap_fasta(input_fasta), degap=True)

    # Get sequence names for sorting
    sequences = [record.id for record in SeqIO.parse(input_fasta_combi, "fasta")]

    # If degap=True remove input file(s) after processing
    if degap:
        if given_as_list:
            logging.info(
                "Removing temp degapped input fasta files: {}".format(input_fasta)
            )
            for item in input_fasta:
                os.remove(item)
        else:
            logging.info(
                "Removing temp degapped input fasta file: {}".format(input_fasta)
            )
            os.remove(input_fasta)

    # If concatenation was required, remove combined file
    if concat_created:
        logging.info("Removing concatenated tempfile: {}".format(input_fasta_combi))
        os.remove(input_fasta_combi)

    return seq_dict, sequences
def read_gff_color_config(gff_color_config_file=""):
    """
    Define coloring options for GFF-based shading of self-dotplots.

    Starts from built-in defaults and, if a config file is given and exists,
    overrides or extends them. Config lines are tab-separated with at least
    four columns: feature type, color, alpha, zoom. Lines starting with "#"
    or with fewer columns are ignored. Invalid colors fall back to "grey",
    non-numeric alpha to 0.75 and non-numeric zoom to 0.

    Args:
        gff_color_config_file: Path of the config file; empty/None or a
            missing path selects the defaults.

    Returns:
        Dict mapping lower-cased feature type to ``(color, alpha, zoom)``.
        The key "others" is always present.
    """
    # default aesthetics for annotation shading (e.g. if no user config file is provided)
    # dictionary with feature_type as key and tuple(color, transparency, zoom) as value
    gff_feat_colors = {
        "orf": ("#b41a31", 0.2, 0),
        "orf_rev": ("#ff773b", 0.3, 0),
        "gene": ("#b41a31", 0.2, 0),
        "cds": ("darkorange", 0.2, 0),
        "exon": ("orange", 0.2, 0),
        "intron": ("lightgrey", 0.2, 0),
        "utr": ("lightblue", 0.2, 0),
        "repeat_region": ("green", 0.3, 0),
        "repeat": ("green", 0.3, 0),
        "tandem_repeat": ("red", 0.3, 0),
        "transposable_element": ("blue", 0.3, 0),
        "ltr_retrotransposon": ("#cccccc", 0.5, 0),
        "ltr-retro": ("#cccccc", 0.5, 0),
        "long_terminal_repeat": ("#2dd0f0", 0.75, 2),
        "ltr": ("#2dd0f0", 0.75, 2),
        "pbs": ("purple", 0.75, 2),
        "ppt": ("#17805a", 0.5, 2),
        "target_site_duplication": ("red", 0.75, 2),
        "misc_feature": ("grey", 0.3, 0),
        "misc_feat": ("grey", 0.3, 0),
        "misc": ("grey", 0.3, 0),
        "others": ("grey", 0.5, 0),
    }
    if not gff_color_config_file or not os.path.exists(str(gff_color_config_file)):
        logging.info("No custom GFF color configuration found. Using defaults.")
        return gff_feat_colors

    logging.info("Updating GFF color configuration with custom specifications.")

    # Read custom gff_color_config_file; the context manager closes the
    # handle even if a line cannot be processed.
    overwritten = set()
    with open(gff_color_config_file, "r") as in_file:
        for line in in_file:
            data = line.strip().split("\t")
            if len(data) < 4 or line.startswith("#"):
                continue
            feat = data[0].lower()
            color = data[1].lower()

            # Check, if settings are valid
            if not mcolors.is_color_like(color):
                color = "grey"
                logging.info(
                    "Invalid color specified for %s: %s - default grey"
                    % (data[0], data[1])
                )
            try:
                alpha = float(data[2])
            except ValueError:
                alpha = 0.75
                logging.info(
                    "Invalid alpha specified for %s: %s - default 0.75"
                    % (data[0], data[2])
                )
            try:
                zoom = float(data[3])
            except ValueError:
                zoom = 0
                logging.info(
                    "Invalid zoom specified for %s: %s - default 0"
                    % (data[0], data[3])
                )

            # Track changes of predefined settings
            if feat in gff_feat_colors:
                overwritten.add(feat)

            gff_feat_colors[feat] = (color, alpha, zoom)

    # Default coloring for unknown annotations
    if "others" not in gff_feat_colors:
        gff_feat_colors["others"] = ("grey", 0.5, 0)

    # Print configuration
    text = "\n\nGFF color specification:\n%s\n" % (60 * ".")
    for item in sorted(gff_feat_colors.keys()):
        text += "%-30s\t%-10s\t%-5s\t%s\n" % (
            item,
            str(gff_feat_colors[item][0]),
            str(gff_feat_colors[item][1]),
            str(gff_feat_colors[item][2]),
        )
    logging.debug(text)

    # print overwritten feature type specifications
    if len(overwritten) != 0:
        text = "%d feature type specifications overwritten:" % len(overwritten)
        text += "\n\t" + ", ".join(overwritten) + "\n"
        logging.info(text)

    text = "GFF color specification updated acc. to %s\n\t%s\n\n" % (
        gff_color_config_file,
        ", ".join(gff_feat_colors),
    )
    logging.info(text)

    return gff_feat_colors
def read_gffs(
    input_gff_files,
    color_dict={"others": ("grey", 1, 0)},
    type_nuc=True,
    prefix="",
    filetype="png",
):
    """
    Create a feature dictionary from one or more GFF files.

    Columns are tab-separated; column 1 is the sequence name, column 3 the
    feature type, columns 4/5 start and stop, column 7 the strand. Features
    on the "-" strand are looked up as "<type>_rev" first, then as "<type>";
    types missing from ``color_dict`` are filed under "others". Comment
    lines, blank lines and lines with fewer than 7 columns are skipped.

    A color legend is drawn via ``legend_figure`` for the feature types used.

    Args:
        input_gff_files: File name or list of file names.
        color_dict: Feature type -> (color, alpha, zoom). Only read, never
            modified, so the shared default dict is safe.
        type_nuc, prefix, filetype: Passed through to ``legend_figure``.

    Returns:
        Dict mapping sequence name to a list of
        ``(feature_type, start, stop)`` tuples.
    """
    if type(input_gff_files) is not list:
        input_gff_files = [input_gff_files]

    # Create dictionary with seq_name as key and (type, start and stop) as value
    unknown_feats = set()
    used_feats = set()
    feat_dict = {}
    for input_gff in input_gff_files:
        logging.info("...reading " + input_gff)

        skipped = 0
        # The context manager closes the file even if a line fails to parse.
        with open(input_gff, "r") as in_file:
            for line in in_file:
                if line.startswith("#") or line.strip() == "":
                    continue
                data = line.strip().split("\t")
                if len(data) < 7:
                    # e.g. sequence lines after a ##FASTA directive
                    skipped += 1
                    continue
                feat_type = data[2].lower()
                if data[6] == "-":
                    feat_type += "_rev"
                if feat_type not in color_dict:
                    unstranded = feat_type.replace("_rev", "")
                    if unstranded in color_dict:
                        feat_type = unstranded
                    else:
                        unknown_feats.add(feat_type)
                        feat_type = "others"
                used_feats.add(feat_type)
                # feature type, start, stop
                feat_dict.setdefault(data[0], []).append(
                    (feat_type, int(data[3]), int(data[4]))
                )

        if skipped:
            logging.warning(
                "Skipped %d line(s) with fewer than 7 columns in %s"
                % (skipped, input_gff)
            )

        # NOTE(review): per-file placement of this message is inferred; the
        # indentation of the original could not be recovered - confirm.
        text = "\nAnnotations for: %s\n" % ", ".join(list(feat_dict.keys())[:10])
        if len(feat_dict) > 10:
            text = text[:-1] + ", ...\n"
        logging.info(text)

    # print feature types without specific shading settings
    if len(unknown_feats) != 0:
        text = "Missing shading specification for %d feature type(s):\n\t%s\n" % (
            len(unknown_feats),
            ", ".join(sorted(unknown_feats)),
        )
        logging.info(text)

    # create color legend
    legend_feats = sorted(used_feats)
    colors = [color_dict[item][0] for item in legend_feats]
    alphas = [color_dict[item][1] for item in legend_feats]
    legend_figure(
        colors=colors,
        lcs_shading_num=len(used_feats),
        type_nuc=type_nuc,
        bins=legend_feats,
        alphas=alphas,
        gff_legend=True,
        prefix=prefix,
        filetype=filetype,
    )

    # print settings
    text = "GFF Feature Types: %s\nGFF Colors: %s" % (
        ", ".join(legend_feats),
        ", ".join(sorted(colors)),
    )
    logging.info(text)

    return feat_dict
alphas.append(color_dict[item][1]) + legend_figure( + colors=colors, + lcs_shading_num=len(used_feats), + type_nuc=type_nuc, + bins=sorted(used_feats), + alphas=alphas, + gff_legend=True, + prefix=prefix, + filetype=filetype, + ) + + # print settings + text = "GFF Feature Types: %s\nGFF Colors: %s" % ( + ", ".join(sorted(used_feats)), + ", ".join(sorted(colors)), + ) + logging.info(text) + + return feat_dict + + +def read_matrix(matrix_file_name, delim="\t", symmetric=True, recursion=False): + input_file = open(matrix_file_name, "r") + + # read sequence names from first column + names = [] + for line in input_file: + if not line.startswith("#") and not line.startswith(delim) and delim in line: + names.append(line.strip().split(delim)[0]) + logging.info( + "Delimiter '%s': %d names - %s\n" % (delim, len(names), ", ".join(names)) + ) + + # check if names were found - otherwise try another delimiter + if names == [] and not recursion: + if delim == "\t": + new_delim = "," + else: + new_delim = "\t" + logging.info( + "\nMatrix file not containing data delimited by '%s' - trying to read matrix with delimiter '%s'" + % (delim.replace("\t", "\\t"), new_delim) + ) + info_dict = read_matrix( + matrix_file_name, + delim=new_delim, + symmetric=symmetric, + recursion=True, + ) + return info_dict + elif names == []: + logging.info("Empty matrix file with alternative delimiter!") + return info_dict + input_file.close() + + input_file = open(matrix_file_name, "r") + # read matrix entries as values in dictionary with tuple(names) as key + info_dict = {} + contradictory_entries = [] + for line in input_file: + if not line.startswith("#") and not line.startswith(delim) and delim in line: + data = line.strip().split(delim) + for idx in range(len(data[1:])): + # print tuple(sorted([data[0], names[idx]])), data[idx+1] + if symmetric: + key = tuple(sorted([names[idx], data[0]])) + else: + key = tuple(names[idx], data[0]) + if key in list(info_dict.keys()): + if ( + symmetric + and 
info_dict[key] != data[idx + 1] + and data[idx + 1] not in ["", "-"] + and info_dict[key] not in ["", "-"] + ): + contradictory_entries.append(key) + info_dict[key] = data[idx + 1] + input_file.close() + + if len(contradictory_entries) != 0: + try: + logging.info( + "\nContradictory entries in matrix file %s:\n\t%s" + % (matrix_file_name, ", ".join(contradictory_entries)) + ) + except: + log_txt = "\nContradictory entries in matrix file %s:\n\t" % ( + matrix_file_name + ) + for item in contradictory_entries: + log_txt += str(item).replace("'", "") + ", " + log_txt = log_txt[:-2] + logging.info(log_txt) + logging.info("Using value from bottom left triangle!") + + logging.debug(f"Matrix information: {', '.join(names)}") + + return info_dict + + +def concatenate_files(file_list, combi_filename="temp_combined.fasta"): + """ + concatenate content of all files in file_list into a combined file named combi_filename + """ + out_file = open(combi_filename, "w") + text = "" + for item in file_list: + text += item + " " + # read in_file linewise and write to out_file + in_file = open(item, "r") + for line in in_file: + out_file.write(line.strip() + "\n") + in_file.close() + out_file.close() + + logging.debug(text) + + return combi_filename + + +def degap_fasta(input_fasta): + """ + Remove gaps from fasta - Write new degapped fasta. 
+ """ + logging.info("Removing gaps from fasta files: %s" % input_fasta) + # degap all sequence files + output_fastas = [] + if type(input_fasta) is not list: + input_fasta = list(input_fasta) + for input_fas in input_fasta: + output_fas = input_fas[: input_fas.rfind(".")] + "_degapped.fas" + in_file = open(input_fas, "r") + out_file = open(output_fas, "w") + for line in in_file: + if line.startswith(">"): + out_file.write(line.strip() + "\n") + else: + out_file.write(line.strip().replace("-", "") + "\n") + out_file.close() + in_file.close() + output_fastas.append(output_fas) + return output_fastas + + +def legend_figure( + colors, + lcs_shading_num, + type_nuc=True, + unit="%", + filetype="png", + max_len=None, + min_len=0, + bins=[], + alphas=[], + gff_legend=False, + prefix="", +): + """ + create figure color legend + """ + max_legend_length_row = 8 + max_legend_length_col = 4 + + # define output file + if filetype not in ["png", "pdf", "svg"]: + text = "Provide valid file type - png, pdf, or svg" + logging.info(text) + filetype = "png" + + # check if length of information fit + if not gff_legend and ( + (bins != [] and len(colors) != lcs_shading_num + 1) + or (bins != [] and len(colors) != len(bins) + 1) + ): + if bins != [] and len(colors) != lcs_shading_num + 1: + text = ( + "**Attention**\nlcs_shading_num (%d) does not match number of colors (%d)!\n" + % (lcs_shading_num, len(bins)) + ) + elif bins != [] and len(colors) != len(bins) + 1: + text = ( + "**Attention**\nnumber of LCS length bins (%d) does not match number of colors (%d)!\n" + % (len(colors), len(bins)) + ) + logging.info(text) + elif gff_legend and len(bins) != len(colors): + text = ( + "**Attention**\nnumber of GFF Feature Types (%d) does not match number of colors (%d)!\n" + % (len(colors), len(bins)) + ) + logging.info(text) + + # set alpha values to opaque if none are provided + if alphas == []: + for item in colors: + alphas.append(1) + + # legend data points + data_points = 
list(range(len(colors))) + if not gff_legend: + # specify intervals, if max_len provided + if max_len is not None: + multi_factor = 100 # one digit + if max_len <= 1: + multi_factor = 1000 # two digits + # len_interval_size = (max_len-min_len) * multi_factor *1. // lcs_shading_num * (1./ multi_factor) + len_interval_size = (max_len - min_len) * 1.0 / lcs_shading_num + len_pos = [float("%.2f" % (min_len))] + # calculate interval positions + for idx in range(lcs_shading_num): + len_pos.append(float("%.2f" % (len_pos[-1] + len_interval_size))) + + if prefix.startswith("custom-matrix") and ( + 0 <= max_len <= 100 and 0 <= min_len <= 100 + ): + unit = "%" + elif prefix.startswith("custom-matrix"): + unit = "" + + text = ( + "\n%d Legend intervals from %.2f to %.2f: \n\t%s - number: %d, step: %.2f, unit: %s\n" + % ( + lcs_shading_num + 1, + min_len, + max_len, + str(len_pos), + len(len_pos), + len_interval_size, + unit, + ) + ) + logging.info(text) + pos = len_pos + interval_size = len_interval_size + # generate legend labels acc. to standard interval notation + else: + # use default max_len = 100 and min_len = 0 + len_interval_size = 100.0 / lcs_shading_num + pos = [float("%.2f" % (0))] + # calculate interval positions + for idx in range(lcs_shading_num): + pos.append(float("%.2f" % (pos[-1] + len_interval_size))) + + # interval_size = 100 // lcs_shading_num + # pos = range(interval_size, 101+interval_size, interval_size) + + # remove unneccessary zeros in decimal places (i.e. if x.x00 in all entries) + while True: + last_digit_all_zero = True + no_delim = False + for idx in range(len(pos)): + # only process if fraction with decimal places + if "." 
not in str(pos[idx]): + no_delim = True + break + # only process when all entries end in zero + elif str(pos[idx])[-1] != "0": + last_digit_all_zero = False + break + if not last_digit_all_zero or no_delim: + break + # remove last decimal place (== 0) from all entries + else: + temp_pos = pos[:] + for idx in range(len(pos)): + if not str(pos[idx])[-2] == ".": + pos[idx] = float(str(pos[idx])[:-1]) + else: + pos[idx] = int(str(pos[idx])[:-2]) + logging.info("Shortening legend entries: %s - %s" % (temp_pos, pos)) + + # eliminate fractions if unit == bp/aa + if unit in ["aa", "bp"]: + for idx in range(len(pos)): + temp_pos = pos[:] + rounded_unit = False + if "." in str(pos[idx]): + rounded_unit = True + # round values up to next integer (keep integer, if not a fraction) + pos[idx] = int(pos[idx] / 1) + int(pos[idx] % 1 > 0) + if idx == len(pos) - 1 and pos[idx] == 101: + pos[idx] = 100 + if rounded_unit: + logging.info( + "Fractions not permitted for unit '%s': %s -> %s" + % (unit, temp_pos, pos) + ) + + if bins != []: # labels provided + legend_labels = bins[:] + legend_labels.append("max") + legend_labels_lengths = [] + for item in bins: + legend_labels_lengths.append( + "[%d %s, %d %s)" % (item - min(bins), unit, item, unit) + ) + if len(bins) == len(colors) - 1: + legend_labels_lengths.append( + "[%d %s, %s]" % (max(bins), unit, "\u221e") + ) # infinite + + else: + legend_labels = [] + legend_labels_lengths = [] + for idx in range(len(pos)): + num = pos[idx] + try: + legend_labels.append("[%d%%, %d%%)" % (num - interval_size, num)) + except: + legend_labels.append( + "[%d%%, %d%%)" % (num - len_interval_size, num) + ) + if max_len is not None: + num = len_pos[idx] + # as int or float + if num == int(num) and int(len_interval_size) == len_interval_size: + legend_labels_lengths.append( + "[%d %s, %d %s)" + % (num, unit, num + len_interval_size, unit) + ) + else: + legend_labels_lengths.append( + "[%.2f %s, %.2f %s)" + % (num, unit, num + len_interval_size, unit) + 
) + legend_labels[-1] = "100" + unit + if max_len is not None: + if num == int(num) and int(len_interval_size) == len_interval_size: + legend_labels_lengths[-1] = "[%d %s, \u221e]" % (max_len, unit) + else: + legend_labels_lengths[-1] = "[%.2f %s, \u221e]" % (max_len, unit) + + # set labels and choose file name + if gff_legend: + label_text = bins[:] + edge_col = None + legend_file_name = "GFF_Shading_Legend_n%d." % lcs_shading_num + filetype + elif max_len is not None: + label_text = legend_labels_lengths[:] + edge_col = "black" + legend_file_name = ( + "Polydotplot_LCS_Shading_Legend_max%d%s_n%d." + % (max_len, unit, lcs_shading_num + 1) + + filetype + ) + elif bins != []: + label_text = legend_labels_lengths[:] + edge_col = "black" + legend_file_name = ( + "Polydotplot_LCS_Shading_Legend_%d%s_n%d." + % (bins[0], unit, lcs_shading_num + 1) + + filetype + ) + else: + label_text = legend_labels[:] + edge_col = "black" + legend_file_name = ( + "Polydotplot_LCS_Shading_Legend_%%len_n%d." % (lcs_shading_num + 1) + + filetype + ) + + if prefix is not None and prefix != "": + if not prefix.endswith("-"): + prefix = prefix + "-" + legend_type = "LCS" + if prefix.startswith("custom-matrix"): + prefix = prefix.replace("custom-matrix", "")[1:] + legend_type = "CustomMatrix" + legend_file_name = prefix + legend_file_name.replace("LCS", legend_type) + + # plot legend figure + fig, ax = P.subplots(3, 1, figsize=(len(colors) * 2, len(colors) * 2)) + for idx in range(len(colors)): + ax[0].bar( + data_points[idx] + 1, + data_points[idx] + 1, + color=colors[idx], + label=label_text[idx], + alpha=alphas[idx], + edgecolor=edge_col, + ) + ax[1].bar( + data_points[idx] + 1, + 0, + color=colors[idx], + label=label_text[idx], + alpha=alphas[idx], + edgecolor=edge_col, + ) + ax[2].bar( + data_points[idx] + 1, + 0, + color=colors[idx], + label=label_text[idx], + alpha=alphas[idx], + edgecolor=edge_col, + ) + ax[1].set_ylim(0, 1) + ax[2].set_ylim(0, 1) + ax[1].legend( + ncol=((len(colors) 
- 1) // max_legend_length_row) + 1, framealpha=1 + ) # vertical legend + col_num = len(colors) + if len(colors) > max_legend_length_col: + remainder = 0 + if len(colors) % max_legend_length_col != 0: + remainder = 1 + row_num = len(colors) // max_legend_length_col + remainder + remainder = 0 + if len(colors) % row_num != 0: + remainder = 1 + col_num = len(colors) // row_num + remainder + ax[2].legend(ncol=col_num, framealpha=1) # horizontal legend + + P.savefig(legend_file_name) + + return legend_file_name diff --git a/src/flexidot/utils/logs.py b/src/flexidot/utils/logs.py new file mode 100644 index 0000000..d743e4a --- /dev/null +++ b/src/flexidot/utils/logs.py @@ -0,0 +1,84 @@ +import logging +import sys + + +class CustomFormatter(logging.Formatter): + """ + Custom logging formatter to add color to log messages based on their severity level. + """ + + # ANSI escape codes for colors + grey = "\x1b[38;21m" + blue = "\x1b[38;5;39m" + yellow = "\x1b[38;5;226m" + red = "\x1b[38;5;196m" + bold_red = "\x1b[31;1m" + reset = "\x1b[0m" + + def __init__(self, fmt): + """ + Initializes the CustomFormatter with a specified format string. + + Parameters: + fmt (str): The format string for log messages. + """ + super().__init__() + self.fmt = fmt + self.FORMATS = { + logging.DEBUG: self.grey + self.fmt + self.reset, + logging.INFO: self.blue + self.fmt + self.reset, + logging.WARNING: self.yellow + self.fmt + self.reset, + logging.ERROR: self.red + self.fmt + self.reset, + logging.CRITICAL: self.bold_red + self.fmt + self.reset, + } + + def format(self, record): + """ + Formats a log record with the appropriate color based on its severity level. + + Parameters: + record (logging.LogRecord): The log record to format. + + Returns: + str: The formatted log message. 
+ """ + log_fmt = self.FORMATS.get(record.levelno) + formatter = logging.Formatter(log_fmt) + return formatter.format(record) + + +def init_logging(loglevel="DEBUG", logfile=None): + """ + Initializes the logging system with a specified log level and custom formatter. + + Parameters: + loglevel (str): The log level to use (e.g., "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"). + Defaults to "DEBUG". + logfile (str, optional): The file to which log messages should be written. If None, log messages + will only be output to stderr. Defaults to None. + + Raises: + ValueError: If the provided log level is invalid. + """ + # Convert log level string to numeric value + numeric_level = getattr(logging, loglevel.upper(), None) + if not isinstance(numeric_level, int): + raise ValueError(f"Invalid log level: {loglevel}") + + # Define log message format + fmt = "%(asctime)s | %(levelname)s | %(module)s | %(funcName)s | %(lineno)d | %(message)s" + + # Create a StreamHandler to output log messages to stderr + handler_sh = logging.StreamHandler(sys.stderr) + handler_sh.setFormatter(CustomFormatter(fmt)) + + # Configure the logging system + if logfile is not None: + # Create a FileHandler to output log messages to a file + handler_fh = logging.FileHandler(logfile) + handler_fh.setFormatter(logging.Formatter(fmt)) + logging.basicConfig( + format=fmt, level=numeric_level, handlers=[handler_sh, handler_fh] + ) + else: + logging.basicConfig(format=fmt, level=numeric_level, handlers=[handler_sh]) diff --git a/src/flexidot/utils/matching.py b/src/flexidot/utils/matching.py new file mode 100644 index 0000000..0d44f9d --- /dev/null +++ b/src/flexidot/utils/matching.py @@ -0,0 +1,386 @@ +############################### +# Matching Functions # +############################### + +import logging + +import regex +import numpy as np +from typing import Tuple, Union +from Bio.Seq import Seq + +# from flexidot.utils.utils import time_track +from flexidot.utils.alphabets import alphabets +from 
collections import defaultdict +from flexidot.utils.analysis import ( + wobble_replacement, + split_diagonals, + lcs_from_x_values, +) + + +def find_match_pos_diag( + seq1: Union[str, Seq], + seq2: Union[str, Seq], + wordsize: int, + report_lcs: bool = False, + rc_option: bool = True, + convert_wobbles: bool = False, + max_N_percentage: float = 0, + type_nuc: bool = True, +) -> Union[ + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], +]: + """ + Find all matching positions with matches >= wordsize + convert matching points into lines of the length of the match + (+ optional handling of ambiguities) + """ + # TODO: check if seq1 and se2 are the same sequence (e.g. in case of self-alignment) + # If so, then can skip counting the fwd kmers for seq2 and recycle the results from seq1 + if seq1 == seq2: + logging.debug("Self-alignment detected. Recycling results from seq1 for seq2.") + self_alignment = True + else: + self_alignment = False + + # Look for Ns in DNA or Xs in proeins (minimum word size) + if type_nuc: + unknown_residue = "N" + else: + unknown_residue = "X" + + # Calculate the maximum number of Ns allowed in a kmer + max_N_count = (max_N_percentage / 100.0) * wordsize + + # read sequences + seq_one = seq1.upper() + len_one = len(seq_one) + seq_two = seq2.upper() + len_two = len(seq_two) + + # Set ambiguity code for wobble replacement + general_ambiguity_code = alphabets(type_nuc)[ + 2 + ] # nucleotide_ambiguity_code or aminoacid_ambiguity_code + + # Forward + ################################# + kmer_pos_dict_one = defaultdict(list) # Seq1 + kmer_pos_dict_two = defaultdict(list) # Seq2 + + # Reverse complement + ################################# + kmer_pos_dict_three = defaultdict(list) # Seq1 + kmer_pos_dict_four = defaultdict(list) # Seq2 + + # Create dictionaries to index kmer (wordsize) positions in the sequence + if self_alignment and rc_option: + # Compare seq1 forward to self and 
self reverse + data_list = [ + (str(seq_one), kmer_pos_dict_one), + (str(seq_one.reverse_complement()), kmer_pos_dict_three), + ] + elif self_alignment and not rc_option: + # Compare seq1 forward to self only + data_list = [ + (str(seq_one), kmer_pos_dict_one), + ] + elif rc_option: + # Compare seq1 forward to seq2 forward and reverse + data_list = [ + (str(seq_one), kmer_pos_dict_one), + (str(seq_two), kmer_pos_dict_two), + # (str(seq_one), kmer_pos_dict_three), #TODO: Check if this is needed + (str(seq_two.reverse_complement()), kmer_pos_dict_four), + ] + else: + # Compare seq1 and seq2 forward only + data_list = [ + (str(seq_one), kmer_pos_dict_one), + (str(seq_two), kmer_pos_dict_two), + ] + + # Step through each sequence and add kmers to dictionary + for seq, kmer_pos_dict in data_list: + # Track number of kmers skipped due to Ns > max_N_count + skipped_Ns = 0 + # Step through sequence and add kmer positions to dictionary + for i in range(len(seq) - wordsize + 1): + # Extract kmer + kmer = seq[i : i + wordsize] + # Count Ns in kmer + Ns_in_kmer = kmer.count(unknown_residue) + # Discard kmer, if too many Ns included + if Ns_in_kmer <= max_N_count: + if not convert_wobbles: + if Ns_in_kmer == 0: + # Add kmer to dictionary if no Ns. 
+ kmer_pos_dict[kmer].append(i) + else: + # Skip kmers with Ns + skipped_Ns += 1 + else: # Deal with ambiguous characters + # Set as True if any wobble characters are present in the kmer + has_wobbles = any( + item in kmer for item in general_ambiguity_code.keys() + ) + if not has_wobbles: + kmer_pos_dict[kmer].append(i) + else: + # Replace wobble characters with all possible variants + kmer_variants = wobble_replacement(kmer, general_ambiguity_code) + for new_kmer in kmer_variants: + kmer_pos_dict[new_kmer].append(i) + else: + skipped_Ns += 1 + + # Log number of skipped kmers + if skipped_Ns > 0: + if convert_wobbles: + logging.debug( + "Skipped %i kmers due to {unknown_residue}s > %i" + % (skipped_Ns, max_N_count) + ) + else: + logging.debug( + "Skipped %i kmers containing {unknown_residue}s" % (skipped_Ns) + ) + + # If self alignment, duplicate self fwd and rev dictionaries + if self_alignment: + # If self alignment, copy kmer_pos_dict_one to kmer_pos_dict_two + kmer_pos_dict_two = kmer_pos_dict_one.copy() + # Copy kmer_pos_dict_three to kmer_pos_dict_four + kmer_pos_dict_four = kmer_pos_dict_three.copy() + + # Find kmers shared between both sequences in forward orientation + matches_for = set(kmer_pos_dict_one).intersection(kmer_pos_dict_two) + + # Find kmers shared between Seq1 forward and Seq2 reverse orientation + matches_rc = set(kmer_pos_dict_one).intersection(kmer_pos_dict_four) + + # TODO: Check if above is correct. Do we gain any extra kmers over set(matches_for, matches_rc) if we also check the seq1_rc? 
+ # print("matches_for: ", type(matches_for) ,matches_for) + # print("matches_rc: ", matches_rc) + + logging.debug( + "[matches: %i forward; %i reverse]" % (len(matches_for), len(matches_rc)) + ) + + # Create lists of x and y coordinates for scatter plot + # Keep all coordinates of all shared kmers (may match multiple times) + diag_dict_for = defaultdict(set) + diag_dict_rc = defaultdict(set) + + for match_list, pos_dict1, pos_dict2, diag_dict in [ + (matches_for, kmer_pos_dict_one, kmer_pos_dict_two, diag_dict_for), + (matches_rc, kmer_pos_dict_one, kmer_pos_dict_four, diag_dict_rc), + ]: + for kmer in match_list: + for i in pos_dict1[kmer]: + for j in pos_dict2[kmer]: + diag = i - j + points = set(range(i + 1, i + wordsize + 1)) + diag_dict[diag].update(points) + + # Convert coordinate points to line start and stop positions + x1 = [] # x values reverse + y1 = [] # y values forward + for diag in list(diag_dict_for.keys()): + x_values = np.array(sorted(diag_dict_for[diag])) + x1.extend(split_diagonals(x_values)) + y_values = split_diagonals(x_values - diag) + y1.extend(y_values) + + x2 = [] # x values rc + y2 = [] # y values rc + if rc_option: + for diag in list(diag_dict_rc.keys()): + factor = len_two + diag + 1 + x_values = np.array(sorted(diag_dict_rc[diag])) + x2.extend(split_diagonals(x_values)) + y_values = split_diagonals(factor - x_values, -1) + y2.extend(y_values) + + if not report_lcs: + return ( + # x values forward matches + np.array([np.array(x) for x in x1], dtype=object), + # y values forward matches + np.array([np.array(y) for y in y1], dtype=object), + # x values rc (ascending) + np.array([np.array(x) for x in x2], dtype=object), + # y values rc (descending) + np.array([np.array(y) for y in y2], dtype=object), + ) + else: + # Get length of longest common substring based on match lengths + lcs_for = lcs_from_x_values(x1) + lcs_rev = lcs_from_x_values(x2) + return ( + np.array([np.array(x) for x in x1], dtype=object), + np.array([np.array(y) for y 
in y1], dtype=object), + np.array([np.array(x) for x in x2], dtype=object), + np.array([np.array(y) for y in y2], dtype=object), + lcs_for, + lcs_rev, + ) + + +def find_match_pos_regex( + seq1, + seq2, + wordsize, + substitution_count=0, + report_lcs=False, + rc_option=True, + convert_wobbles=False, + max_N_percentage=0, + type_nuc=True, +): + """ + find all matching positions with matches >= wordsize via regular expression search + fuzzy matching - allow up to substitution_count substitutions + convert matching points into lines of the length of the match + (+ optional handling of ambiguities) + """ + + # read sequences + seq_one = seq1.upper() + len_one = len(seq_one) + seq_two = seq2.upper() + len_two = len(seq_two) + + # set ambiguity code for wobble replacement + general_ambiguity_code = alphabets(type_nuc)[ + 2 + ] # nucleotide_ambiguity_code or aminoacid_ambiguity_code + ambiguity_match_dict = alphabets(type_nuc)[3] + + ambiq_residues = "[%s]" % "".join(list(general_ambiguity_code.keys())) + + # look for Ns in DNA or Xs in proeins (minimum word size) + if type_nuc: + any_residue = "N" + else: + any_residue = "X" + + # check for wobble presence + if not ( + regex.search(ambiq_residues, str(seq_one)) == None + and regex.search(ambiq_residues, str(seq_two)) == None + ): + wobble_found = True + else: + wobble_found = False + + # dictionary for matches + diag_dict_for = {} + diag_dict_rc = {} + counter = [0, 0] + + # one-way matching + if rc_option: + data_list = [ + (str(seq_one), str(seq_two), diag_dict_for, 0), + (str(seq_one), str(seq_two.reverse_complement()), diag_dict_rc, 1), + ] + else: + data_list = [(str(seq_one), str(seq_two), diag_dict_for, 0)] + + for seq_query, seq_target, diag_dict, counter_pos in data_list: + # split query sequence into kmers + if not rc_option and counter_pos == 1: + break + + for idx in range(len(str(seq_query)) - wordsize + 1): + kmer = str(seq_query)[idx : idx + wordsize] + + # skip excessive N/X stretches (big black areas) + 
if kmer.count(any_residue) * 100.0 / wordsize <= max_N_percentage: + # convert kmer to regular expression for wobble_matching + if convert_wobbles and wobble_found: + kmer_string = "" + # replace each residue with matching residues or wobbles + for jdx in range(len(kmer)): + kmer_string += ambiguity_match_dict[kmer[jdx]] + else: + kmer_string = kmer + + # convert to regular expression tolerating substitution errors + if type(substitution_count) is int and substitution_count != 0: + kmer_string = "(%s){s<=%d}" % (kmer_string, substitution_count) + + # search for regular expression in target sequence + kdx = 0 + start = True + if regex.search(kmer_string, seq_target[kdx:]) != None: + counter[counter_pos] += 1 + while regex.search(kmer_string, seq_target[kdx:]) != None: + # search for regular expression pattern in target sequence + result = regex.search(kmer_string, seq_target[kdx:]) + + kmer2 = seq_target[kdx:][result.start() : result.end()] + + # skip excessive N/X stretches (big black areas) + if ( + kmer2.count(any_residue) * 100.0 / wordsize + <= max_N_percentage + ): + diag = idx - (kdx + result.start()) + points = set(range(idx + 1, idx + wordsize + 1)) + if diag not in list(diag_dict.keys()): + diag_dict[diag] = points + else: + diag_dict[diag].update(points) + + kdx += result.start() + 1 + if kdx >= len(seq_target): + break + elif regex.search(kmer_string, seq_target[kdx:]) != None: + counter[counter_pos] += 1 + + text = "%5.i \tforward matches" % counter[0] + text += "\n%5.i \treverse complementary matches" % counter[1] + logging.debug(text) + + # convert coordinate points to line start and stop positions + x1 = [] # x values reverse + y1 = [] # y values forward + for diag in list(diag_dict_for.keys()): + x_values = np.array(sorted(diag_dict_for[diag])) + x1.extend(split_diagonals(x_values)) + y_values = split_diagonals(x_values - diag) + y1.extend(y_values) + + x2 = [] # x values rc + y2 = [] # y values rc + if rc_option: + for diag in 
list(diag_dict_rc.keys()): + factor = len_two + diag + 1 + x_values = np.array(sorted(diag_dict_rc[diag])) + x2.extend(split_diagonals(x_values)) + y_values = split_diagonals(factor - x_values, -1) + y2.extend(y_values) + + if not report_lcs: + return ( + np.array([np.array(x) for x in x1], dtype=object), + np.array([np.array(y) for y in y1], dtype=object), + np.array([np.array(x) for x in x2], dtype=object), + np.array([np.array(y) for y in y2], dtype=object), + ) + else: + # get length of longest common substring based on match lengths + lcs_for = lcs_from_x_values(x1) + lcs_rev = lcs_from_x_values(x2) + return ( + np.array([np.array(x) for x in x1], dtype=object), + np.array([np.array(y) for y in y1], dtype=object), + np.array([np.array(x) for x in x2], dtype=object), + np.array([np.array(y) for y in y2], dtype=object), + lcs_for, + lcs_rev, + ) diff --git a/src/flexidot/utils/utils.py b/src/flexidot/utils/utils.py new file mode 100644 index 0000000..26de624 --- /dev/null +++ b/src/flexidot/utils/utils.py @@ -0,0 +1,139 @@ +import sys +import logging +import time +import pylab as P +from matplotlib import colormaps + +from colormap import rgb2hex +from colour import Color + + +# TODO: Remove internal logging message. Return delta time instead of printing it. +# Check if "now" value is used elsewhere. 
+def time_track(starting_time, show=True): + """ + calculate time passed since last time measurement + """ + now = time.time() + delta = now - starting_time + if show: + logging.info(f"{delta} seconds") + return delta + + +def calc_fig_ratio(ncols, nrows, plot_size): + """ + Calculate size ratio for given number of columns (ncols) and rows (nrows) + with plot_size as maximum width and length + """ + ratio = ncols * 1.0 / nrows + logging.debug(" ".join([str(ncols), str(nrows), str(ratio)])) + if ncols >= nrows: + figsize_x = plot_size + figsize_y = plot_size / ratio + else: + figsize_x = plot_size * ratio + figsize_y = plot_size + return figsize_x, figsize_y + + +def shorten_name(seq_name, max_len=20, title_clip_pos="B"): # , delim="_"): + """ + shorten sequence names (for diagram titles) + """ + + if len(seq_name) <= max_len: + return seq_name + + # take last characters + if title_clip_pos == "E": + name = seq_name[len(seq_name) - max_len :] + + # take first characters + else: + name = seq_name[:max_len] + + """# keep first and last part if multiple parts separated by delimiter (e.g. species_prefix + sequence_id) + if delim in seq_name: + if seq_name.count(delim) >= 2: + name = "%s..." % delim.join(seq_name.split(delim)[:1]) + seq_name.split(delim)[-1] # .replace("_000", "-") + else: + name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] + + if len(name) > max_len: + name = name[:((max_len-2)//2)] + "..." + name[((max_len-2)//2):] + else: + name = seq_name[:((max_len-2)//2)] + "..." + seq_name[((max_len-2)//2):] + """ + + return name + + +def unicode_name(name): + """ + replace non-ascii characters in string (e.g. 
for use in matplotlib) + """ + unicode_string = eval('u"%s"' % name) + return "".join(char for char in unicode_string if ord(char) < 128) + # return unicodedata.normalize("NFKD", unicode_string).encode("ascii", "ignore") + + +def create_color_list( + number: int, color_map: str = "Greys", max_grey: str = "#595959" +) -> list: + """ + Create color list with given number of entries + grey by default, matplotlib color_map can be provided + """ + if color_map not in list(colormaps): + logging.warning( + f"Invalid color_map {color_map} provided! - Examples: {list(colormaps)}." + ) + logging.warning("See https://matplotlib.org/users/colormaps.html\n") + + try: + # create pylab colormap + cmap = eval("P.cm." + color_map) + + # get descrete color list from pylab + cmaplist = [cmap(i) for i in range(cmap.N)] # extract colors from map + + # determine positions for number of colors required + steps = round((len(cmaplist) - 1) / (number)) + numbers = list(range(0, len(cmaplist), int(steps))) + + # extract RGB color and convert to hex code + colors = [] + for idx in numbers[:-1]: + rgba_color = cmaplist[idx] + rgb_color = rgba_color[:3] + col = rgb2hex( + int(rgb_color[0] * 255), + int(rgb_color[1] * 255), + int(rgb_color[2] * 255), + ) + colors.append(col) + + # Default to Greys color scheme if an error occurs + except Exception as e: + logging.warning(f"An error occurred: {e}") + logging.warning("Using grey color scheme instead.") + + old_max_grey = "#373737" + old_max_grey = "#444444" + colors = list(Color("#FFFFFF").range_to(Color(max_grey), number)) # grey + for idx in range(len(colors)): + colors[idx] = str(colors[idx]).replace("Color ", "") + if "#" in colors[idx] and len(colors[idx]) != 7: + # print colors[idx] + colors[idx] = colors[idx] + colors[idx][-(7 - len(colors[idx])) :] + + logging.info("%d Colors: %s" % (len(colors), ", ".join(colors))) + + if len(colors) < number: + logging.info( + "\nError in color range definition! 
%d colors missing\n" + % (number - len(colors)) + ) + + return colors diff --git a/tests/test-data/Seq1.fasta b/tests/test-data/Seq1.fasta new file mode 100644 index 0000000..343864c --- /dev/null +++ b/tests/test-data/Seq1.fasta @@ -0,0 +1,3 @@ +>Seq1 +CGAACCGATATAATCAGTGACAGGATATCGAGTAAAGTGACAGGATATCGAGTAAAGTGA +CAGGATATCGAGTAAAATGAGCTATAGGACAGTGACGTGACTTAGAGG \ No newline at end of file diff --git a/tests/test-data/Seq2.fasta b/tests/test-data/Seq2.fasta new file mode 100644 index 0000000..522e6d4 --- /dev/null +++ b/tests/test-data/Seq2.fasta @@ -0,0 +1,5 @@ +>Seq2 +CGAACCGATATAATCAGTGACAGGATATCGAGTAAAGTGACAGGATATCGAGTAAAGTGA +CAGGATATCGAGTAAAATGAGCTATAGGACAGTGATTACTCGATATCCTGTCACTTTACT +CGATATCCTGTCACTTTACTCGATATCCTGTCACTTTACTCGATATCCTGTCACTTTACT +CGATATCCTGTCACTTTCGTGACTTAGAGG \ No newline at end of file diff --git a/tests/test-data/Seq4.fasta b/tests/test-data/Seq4.fasta new file mode 100644 index 0000000..b8014af --- /dev/null +++ b/tests/test-data/Seq4.fasta @@ -0,0 +1,5 @@ +>Seq4 +CCTCTAAGTCACGDAAGTGACSGGATATCGAGTAAAGTGNCAGGATATNGAGYAAAGVGA +CAGGAKATCGAGTHAAGTGACAGGATATCGAGTAAAGTGACAGGATBSCGAHTAATCAVT +GVCCTATAGRRCATTTTACTCGATATCCVGTYACTTTBSTCGATATCCTGKCACTTMACT +CGATATCCTGTCACTGATTATATCGGTTCG \ No newline at end of file diff --git a/test-data/custom_matrix.txt b/tests/test-data/custom_matrix.txt similarity index 95% rename from test-data/custom_matrix.txt rename to tests/test-data/custom_matrix.txt index 8f1a34b..f6039e0 100644 --- a/test-data/custom_matrix.txt +++ b/tests/test-data/custom_matrix.txt @@ -1,7 +1,7 @@ - Seq1 Seq2 Seq3 Seq4 Seq5 Seq6 -Seq1 100 -Seq2 50 100 -Seq3 40 60 100 -Seq4 70 80 70 100 -Seq5 30 20 60 10 100 -Seq6 20 30 50 10 10 100 + Seq1 Seq2 Seq3 Seq4 Seq5 Seq6 +Seq1 100 +Seq2 50 100 +Seq3 40 60 100 +Seq4 70 80 70 100 +Seq5 30 20 60 10 100 +Seq6 20 30 50 10 10 100 diff --git a/test-data/example.gff3 b/tests/test-data/example.gff3 similarity index 98% rename from test-data/example.gff3 rename to tests/test-data/example.gff3 index 
2bbb1c2..2d556b3 100644 --- a/test-data/example.gff3 +++ b/tests/test-data/example.gff3 @@ -1,6 +1,6 @@ -Seq2 manual_annotations Spacer1 1 15 . - . ID=0001 -Seq2 manual_annotations repeat_region 16 76 . + . ID=0001 -Seq2 manual_annotations SpacerZoom 77 95 . + . ID=0001 -Seq2 manual_annotations Spacer2 77 95 . + . ID=0001 -Seq2 manual_annotations repeat_region 96 197 . - . ID=0002 -Seq2 manual_annotations Spacer3 198 210 . + . ID=0001 +Seq2 manual_annotations Spacer1 1 15 . - . ID=0001 +Seq2 manual_annotations repeat_region 16 76 . + . ID=0001 +Seq2 manual_annotations SpacerZoom 77 95 . + . ID=0001 +Seq2 manual_annotations Spacer2 77 95 . + . ID=0001 +Seq2 manual_annotations repeat_region 96 197 . - . ID=0002 +Seq2 manual_annotations Spacer3 198 210 . + . ID=0001 diff --git a/test-data/example2.gff3 b/tests/test-data/example2.gff3 similarity index 98% rename from test-data/example2.gff3 rename to tests/test-data/example2.gff3 index 9e04b81..1552c62 100644 --- a/test-data/example2.gff3 +++ b/tests/test-data/example2.gff3 @@ -1,21 +1,21 @@ -Seq1 manual_annotations Spacer1 1 15 . - . ID=0001 -Seq1 manual_annotations repeat_region 16 76 . + . ID=0001 -Seq1 manual_annotations Spacer2 77 96 . - . ID=0001 -Seq1 manual_annotations Spacer3 97 109 . - . ID=0001 -Seq2 manual_annotations Spacer1 1 15 . - . ID=0001 -Seq2 manual_annotations repeat_region 16 76 . + . ID=0001 -Seq2 manual_annotations SpacerZoom 77 95 . + . ID=0001 -Seq2 manual_annotations Spacer2 77 95 . + . ID=0001 -Seq2 manual_annotations repeat_region 96 197 . - . ID=0002 -Seq2 manual_annotations Spacer3 198 210 . + . ID=0001 -Seq3 manual_annotations Spacer3 1 13 . - . ID=0001 -Seq3 manual_annotations repeat_region 14 115 . + . ID=0001 -Seq3 manual_annotations Spacer2 116 135 . + . ID=0001 -Seq3 manual_annotations repeat_region 136 195 . - . ID=0002 -Seq3 manual_annotations Spacer1 196 210 . + . ID=0001 -Seq4 manual_annotations Spacer3 1 13 . - . ID=0001 -Seq4 manual_annotations repeat_region 14 115 . + . 
ID=0001 -Seq4 manual_annotations Spacer2 116 135 . + . ID=0001 -Seq4 manual_annotations repeat_region 136 195 . - . ID=0002 -Seq4 manual_annotations Spacer1 196 210 . + . ID=0001 +Seq1 manual_annotations Spacer1 1 15 . - . ID=0001 +Seq1 manual_annotations repeat_region 16 76 . + . ID=0001 +Seq1 manual_annotations Spacer2 77 96 . - . ID=0001 +Seq1 manual_annotations Spacer3 97 109 . - . ID=0001 +Seq2 manual_annotations Spacer1 1 15 . - . ID=0001 +Seq2 manual_annotations repeat_region 16 76 . + . ID=0001 +Seq2 manual_annotations SpacerZoom 77 95 . + . ID=0001 +Seq2 manual_annotations Spacer2 77 95 . + . ID=0001 +Seq2 manual_annotations repeat_region 96 197 . - . ID=0002 +Seq2 manual_annotations Spacer3 198 210 . + . ID=0001 +Seq3 manual_annotations Spacer3 1 13 . - . ID=0001 +Seq3 manual_annotations repeat_region 14 115 . + . ID=0001 +Seq3 manual_annotations Spacer2 116 135 . + . ID=0001 +Seq3 manual_annotations repeat_region 136 195 . - . ID=0002 +Seq3 manual_annotations Spacer1 196 210 . + . ID=0001 +Seq4 manual_annotations Spacer3 1 13 . - . ID=0001 +Seq4 manual_annotations repeat_region 14 115 . + . ID=0001 +Seq4 manual_annotations Spacer2 116 135 . + . ID=0001 +Seq4 manual_annotations repeat_region 136 195 . - . ID=0002 +Seq4 manual_annotations Spacer1 196 210 . + . ID=0001 Seq5 manual_annotations repeat_region 3 22 . + . 
ID=0001 \ No newline at end of file diff --git a/test-data/gff_color.config b/tests/test-data/gff_color.config similarity index 96% rename from test-data/gff_color.config rename to tests/test-data/gff_color.config index aaedbdf..a635c3e 100644 --- a/test-data/gff_color.config +++ b/tests/test-data/gff_color.config @@ -1,7 +1,7 @@ -#annotation_type color transparency(alpha) zoom(linewidth_adjustment) -repeat_region #2dd0f0 1 0 -repeat_region_rev #2E8B57 0.6 0 -spacer1 black 0.15 0 -Spacer2 grey 1 0 -SPACER3 black 0.7 0 -SpacerZoom #b41a31 0.0 8 +#annotation_type color transparency(alpha) zoom(linewidth_adjustment) +repeat_region #2dd0f0 1 0 +repeat_region_rev #2E8B57 0.6 0 +spacer1 black 0.15 0 +Spacer2 grey 1 0 +SPACER3 black 0.7 0 +SpacerZoom #b41a31 0.0 8 diff --git a/tests/test-data/mintir.fasta b/tests/test-data/mintir.fasta new file mode 100644 index 0000000..369c770 --- /dev/null +++ b/tests/test-data/mintir.fasta @@ -0,0 +1,8 @@ +>min_tir +ATGGCTCAAGCCCATGGTTCCTAY-CATAACGAACCATGGGCTTGAGCCAT +>min_tir_2 +ATGGCTCAAGCCCATGGTTCCTAYCATAACGAACCATGGGCTTGAGCCAT +>left_tandem +ATGGCTCAAGCCCATGGTTCGCGCGCGGCGATGGCTCAAGCCCATGGTTC +>right_tandem +GAACCATGGGCTTGAGCCATATATGATATAGAACCATGGGCTTGAGCCAT \ No newline at end of file diff --git a/tests/test-data/mintir3.fasta b/tests/test-data/mintir3.fasta new file mode 100644 index 0000000..7daa47f --- /dev/null +++ b/tests/test-data/mintir3.fasta @@ -0,0 +1,2 @@ +>min_tir_3 +ATGGCTCAAGCCCATGGTTCCTAYN-CATAACGAACCATGGGCTTGAGCCAT \ No newline at end of file diff --git a/test-data/sSaTar_example/foo.txt b/tests/test-data/sSaTar_example/foo.txt similarity index 100% rename from test-data/sSaTar_example/foo.txt rename to tests/test-data/sSaTar_example/foo.txt diff --git a/test-data/sSaTar_example/sSaTar.config b/tests/test-data/sSaTar_example/sSaTar.config similarity index 96% rename from test-data/sSaTar_example/sSaTar.config rename to tests/test-data/sSaTar_example/sSaTar.config index 7e39b6e..38e44a1 100644 --- 
a/test-data/sSaTar_example/sSaTar.config +++ b/tests/test-data/sSaTar_example/sSaTar.config @@ -1,6 +1,6 @@ -#annotation_type color transparency(alpha) zoom(linewidth_adjustment) -micro #e41a1c 0.7 10 -sSaTar1 #09602d 0.7 0 -sSaTar2 #7fff99 0.7 0 -sSaTar3 #629cd7 0.7 0 - +#annotation_type color transparency(alpha) zoom(linewidth_adjustment) +micro #e41a1c 0.7 10 +sSaTar1 #09602d 0.7 0 +sSaTar2 #7fff99 0.7 0 +sSaTar3 #629cd7 0.7 0 + diff --git a/test-data/sSaTar_example/sSaTar.fas b/tests/test-data/sSaTar_example/sSaTar.fas similarity index 100% rename from test-data/sSaTar_example/sSaTar.fas rename to tests/test-data/sSaTar_example/sSaTar.fas diff --git a/test-data/sSaTar_example/sSaTar.gff3 b/tests/test-data/sSaTar_example/sSaTar.gff3 similarity index 98% rename from test-data/sSaTar_example/sSaTar.gff3 rename to tests/test-data/sSaTar_example/sSaTar.gff3 index 219ba11..ba5846e 100644 --- a/test-data/sSaTar_example/sSaTar.gff3 +++ b/tests/test-data/sSaTar_example/sSaTar.gff3 @@ -1,17 +1,17 @@ -sSaTar_cluster_chr1_sorghum manual_annotations micro 41 51 . + . ID=0001 -sSaTar_cluster_chr1_sorghum manual_annotations sSaTar1 52 254 . + . ID=0002 -sSaTar_cluster_chr1_sorghum manual_annotations micro 255 277 . + . ID=0003 -sSaTar_cluster_chr1_sorghum manual_annotations sSaTar3 278 526 . + . ID=0004 -sSaTar_cluster_chr1_sorghum manual_annotations micro 527 538 . + . ID=0005 -sSaTar_cluster_chr1_sorghum manual_annotations sSaTar1 539 771 . + . ID=0006 -sSaTar_cluster_chr1_sorghum manual_annotations micro 772 783 . + . ID=0007 -sSaTar_cluster_chr1_sorghum manual_annotations sSaTar2 784 1141 . + . ID=0008 -sSaTar_cluster_chr1_sorghum manual_annotations micro 1142 1151 . + . ID=0009 -sSaTar_cluster_chr1_sorghum manual_annotations sSaTar2 1152 1520 . + . ID=0010 -sSaTar_cluster_chr1_sorghum manual_annotations micro 1521 1591 . + . ID=0011 -sSaTar_cluster_chr1_sorghum manual_annotations sSaTar3 1592 1846 . + . 
ID=0012 -sSaTar_cluster_chr1_sorghum manual_annotations micro 1847 1856 . + . ID=0013 -sSaTar_cluster_chr1_sorghum manual_annotations sSaTar1 1857 2051 . + . ID=0014 -sSaTar_cluster_chr1_sorghum manual_annotations micro 2052 2056 . + . ID=0015 -sSaTar_cluster_chr1_sorghum manual_annotations sSaTar1 2057 2251 . + . ID=0016 +sSaTar_cluster_chr1_sorghum manual_annotations micro 41 51 . + . ID=0001 +sSaTar_cluster_chr1_sorghum manual_annotations sSaTar1 52 254 . + . ID=0002 +sSaTar_cluster_chr1_sorghum manual_annotations micro 255 277 . + . ID=0003 +sSaTar_cluster_chr1_sorghum manual_annotations sSaTar3 278 526 . + . ID=0004 +sSaTar_cluster_chr1_sorghum manual_annotations micro 527 538 . + . ID=0005 +sSaTar_cluster_chr1_sorghum manual_annotations sSaTar1 539 771 . + . ID=0006 +sSaTar_cluster_chr1_sorghum manual_annotations micro 772 783 . + . ID=0007 +sSaTar_cluster_chr1_sorghum manual_annotations sSaTar2 784 1141 . + . ID=0008 +sSaTar_cluster_chr1_sorghum manual_annotations micro 1142 1151 . + . ID=0009 +sSaTar_cluster_chr1_sorghum manual_annotations sSaTar2 1152 1520 . + . ID=0010 +sSaTar_cluster_chr1_sorghum manual_annotations micro 1521 1591 . + . ID=0011 +sSaTar_cluster_chr1_sorghum manual_annotations sSaTar3 1592 1846 . + . ID=0012 +sSaTar_cluster_chr1_sorghum manual_annotations micro 1847 1856 . + . ID=0013 +sSaTar_cluster_chr1_sorghum manual_annotations sSaTar1 1857 2051 . + . ID=0014 +sSaTar_cluster_chr1_sorghum manual_annotations micro 2052 2056 . + . ID=0015 +sSaTar_cluster_chr1_sorghum manual_annotations sSaTar1 2057 2251 . + . ID=0016 sSaTar_cluster_chr1_sorghum manual_annotations micro 2252 2265 . + . 
ID=0017 \ No newline at end of file diff --git a/test-data/sSaTar_example/sSaTar_cluster_flexi_300b.png b/tests/test-data/sSaTar_example/sSaTar_cluster_flexi_300b.png similarity index 100% rename from test-data/sSaTar_example/sSaTar_cluster_flexi_300b.png rename to tests/test-data/sSaTar_example/sSaTar_cluster_flexi_300b.png diff --git a/test-data/sSaTar_example/sSaTar_paper.png b/tests/test-data/sSaTar_example/sSaTar_paper.png similarity index 100% rename from test-data/sSaTar_example/sSaTar_paper.png rename to tests/test-data/sSaTar_example/sSaTar_paper.png diff --git a/test-data/test-seqs.fas b/tests/test-data/test-seqs.fasta similarity index 97% rename from test-data/test-seqs.fas rename to tests/test-data/test-seqs.fasta index 11a0b51..c27e32c 100644 --- a/test-data/test-seqs.fas +++ b/tests/test-data/test-seqs.fasta @@ -1,22 +1,22 @@ ->Seq1 -CGAACCGATATAATCAGTGACAGGATATCGAGTAAAGTGACAGGATATCGAGTAAAGTGA -CAGGATATCGAGTAAAATGAGCTATAGGACAGTGACGTGACTTAGAGG ->Seq2 -CGAACCGATATAATCAGTGACAGGATATCGAGTAAAGTGACAGGATATCGAGTAAAGTGA -CAGGATATCGAGTAAAATGAGCTATAGGACAGTGATTACTCGATATCCTGTCACTTTACT -CGATATCCTGTCACTTTACTCGATATCCTGTCACTTTACTCGATATCCTGTCACTTTACT -CGATATCCTGTCACTTTCGTGACTTAGAGG ->Seq3 -CCTCTAAGTCACGAAAGTGACAGGATATCGAGTAAAGTGACAGGATATCGAGTAAAGTGA -CAGGATATCGAGTAAAGTGACAGGATATCGAGTAAAGTGACAGGATATCGAGTAATCACT -GTCCTATAGCTCATTTTACTCGATATCCTGTCACTTTACTCGATATCCTGTCACTTTACT -CGATATCCTGTCACTGATTATATCGGTTCG ->Seq4 -CCTCTAAGTCACGDAAGTGACSGGATATCGAGTAAAGTGNCAGGATATNGAGYAAAGVGA -CAGGAKATCGAGTHAAGTGACAGGATATCGAGTAAAGTGACAGGATBSCGAHTAATCAVT -GVCCTATAGRRCATTTTACTCGATATCCVGTYACTTTBSTCGATATCCTGKCACTTMACT -CGATATCCTGTCACTGATTATATCGGTTCG ->Seq5 -CCAAAGTGACAGGATATCGAGTCCGG ->Seq6 -AATCAGTGACAGGATATCCTGTCACTGATT +>Seq1 +CGAACCGATATAATCAGTGACAGGATATCGAGTAAAGTGACAGGATATCGAGTAAAGTGA +CAGGATATCGAGTAAAATGAGCTATAGGACAGTGACGTGACTTAGAGG +>Seq2 +CGAACCGATATAATCAGTGACAGGATATCGAGTAAAGTGACAGGATATCGAGTAAAGTGA +CAGGATATCGAGTAAAATGAGCTATAGGACAGTGATTACTCGATATCCTGTCACTTTACT 
"""
Tests for `find_match_pos_diag`.

As exercised here, the function returns four position collections
(x1, y1, x2, y2) and, with `report_lcs=True`, two additional values
(lcs_for, lcs_rev). x2/y2 are expected to stay empty whenever
`rc_option=False`.
"""
import pytest
import numpy as np
from Bio.Seq import Seq

# Import the function to be tested
from flexidot.utils.matching import find_match_pos_diag


def _assert_all_populated(*position_lists):
    """Assert that every given position collection has at least one entry."""
    for positions in position_lists:
        assert len(positions) > 0


def _assert_all_empty(*position_lists):
    """Assert that every given position collection has no entries."""
    for positions in position_lists:
        assert len(positions) == 0


def test_basic_functionality_nucleotide():
    """Shifted nucleotide sequences match on the first pair only (rc off)."""
    query, subject = Seq("ATCGATCG"), Seq("GATCGATC")
    x1, y1, x2, y2 = find_match_pos_diag(query, subject, 4, rc_option=False)
    _assert_all_populated(x1, y1)
    _assert_all_empty(x2, y2)


def test_basic_functionality_protein():
    """Shifted protein sequences match on the first pair only (rc off)."""
    query, subject = Seq("MKVLY"), Seq("KVLYM")
    x1, y1, x2, y2 = find_match_pos_diag(
        query, subject, 3, type_nuc=False, rc_option=False
    )
    _assert_all_populated(x1, y1)
    _assert_all_empty(x2, y2)


def test_reverse_complement_enabled():
    """With rc_option=True both position pairs are populated."""
    query, subject = Seq("ATCGATCG"), Seq("CGATCGAT")
    x1, y1, x2, y2 = find_match_pos_diag(query, subject, 4, rc_option=True)
    _assert_all_populated(x1, y1, x2, y2)


def test_reverse_complement_disabled():
    """With rc_option=False the second position pair stays empty."""
    query, subject = Seq("ATCGATCG"), Seq("CGATCGAT")
    x1, y1, x2, y2 = find_match_pos_diag(query, subject, 4, rc_option=False)
    _assert_all_populated(x1, y1)
    _assert_all_empty(x2, y2)


def test_ambiguous_residues_no_conversion():
    """Without wobble conversion, kmers with ambiguous residues are skipped."""
    query, subject = Seq("ATCGNNNN"), Seq("NNNNGATC")
    # When convert_wobbles is False, the function should skip any kmers with ambiguous residues
    x1, y1, x2, y2 = find_match_pos_diag(
        query, subject, 4, convert_wobbles=False, max_N_percentage=60
    )
    _assert_all_empty(x1, y1, x2, y2)


def test_ambiguous_residues_with_conversion():
    """With wobble conversion exactly one match lands in the second pair."""
    query, subject = Seq("ATCGNNNN"), Seq("NNNNGATC")
    x1, y1, x2, y2 = find_match_pos_diag(
        query, subject, 4, convert_wobbles=True, max_N_percentage=26
    )
    # One rev match ATCG -> NGAT
    _assert_all_empty(x1, y1)
    for positions in (x2, y2):
        assert len(positions) == 1


def test_report_lcs_nucleotide():
    """report_lcs=True appends both LCS values for nucleotide input."""
    query, subject = Seq("ATCGATCG"), Seq("GATCGATC")
    x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(
        query, subject, 4, report_lcs=True
    )
    assert lcs_for == 7
    assert lcs_rev == 7


def test_report_lcs_protein():
    """For protein input the second LCS value is zero."""
    query, subject = Seq("MKVLY"), Seq("KVLYM")
    x1, y1, x2, y2, lcs_for, lcs_rev = find_match_pos_diag(
        query, subject, 3, report_lcs=True, type_nuc=False
    )
    assert lcs_for > 0
    assert lcs_rev == 0


def test_short_sequences():
    """Sequences shorter than the word size produce no matches."""
    query, subject = Seq("ATC"), Seq("GAT")
    x1, y1, x2, y2 = find_match_pos_diag(query, subject, 4)
    _assert_all_empty(x1, y1, x2, y2)


def test_all_ambiguous_residues():
    """Sequences made only of N produce no matches."""
    query, subject = Seq("NNNN"), Seq("NNNN")
    x1, y1, x2, y2 = find_match_pos_diag(query, subject, 4)
    _assert_all_empty(x1, y1, x2, y2)


def test_empty_sequences():
    """Empty sequences produce no matches."""
    query, subject = Seq(""), Seq("")
    x1, y1, x2, y2 = find_match_pos_diag(query, subject, 4)
    _assert_all_empty(x1, y1, x2, y2)
of file