forked from galaxyproject/tools-iuc
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add colabfold * missed shed in git add * tar file * Update tools/colabfold/.shed.yml Co-authored-by: Wolfgang Maier <maierw@posteo.de> * update with most of the suggestion * add tests for msa, change minor params * fix shed file to make suite * fix file name problem, add pre-set param for naming files to prevent problems, update docker image version * add num_outputs test to alphafold tool * fix test because inputs removed conditionals * fix missing text modifier * Add hardcoded file names to prevent file headers from breaking dataset history names, modify help text, put params into advanced section * fix typo from creating an advanced section * fix msa filenames * update archive member paths, assert expect_error * test update * print dir content test * try as just tar * add back ls * wrong place for && * quotes and another archive test * make tests match now that one passed * switch back to colab.tar * test fail state of alphafold tool * Failed properly, tool ready * Try expect code * Expect both failure and exit code * Expect fail, exit code, and num outputs * remove num_outputs to fix lint * matthias final pass * better descrption for alphafold * small changes * another round --------- Co-authored-by: Alexander OSTROVSKY <alexanderostrovsky@Chell.local> Co-authored-by: Wolfgang Maier <maierw@posteo.de> Co-authored-by: Björn Grüning <bjoern@gruenings.eu>
- Loading branch information
1 parent
c7f54e6
commit a95dcf3
Showing
9 changed files
with
3,243 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Tool Shed metadata for the ColabFold Galaxy wrappers.
name: colabfold
owner: iuc
categories:
  - Proteomics
  - Graphics
description: "Protein prediction based on AlphaFold2"
# Upstream project the wrapped software comes from.
homepage_url: https://github.com/sokrypton/ColabFold
long_description: |
  ColabFold offers accelerated (40-60x faster) prediction of protein structures
  and complexes by combining the fast homology search of MMseqs2
  with AlphaFold2 or RoseTTAFold.
# Location of the wrapper sources themselves (not the upstream tool).
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/colabfold
type: unrestricted
# One Tool Shed repository is generated per tool XML found in this directory.
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Wrapper for the colabfold tool suite: {{ tool_name }}"
# Suite repository that groups the generated per-tool repositories.
suite:
  name: "suite_colabfold"
  description: "Protein prediction based on AlphaFold2"
  long_description: |
    ColabFold offers accelerated (40-60x faster) prediction of protein structures
    and complexes by combining the fast homology search of MMseqs2
    with AlphaFold2 or RoseTTAFold.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
<tool id="colabfold_alphafold" name="Colabfold Alphafold" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>Predict protein structures with Colabfold</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Unpack the archive produced by the Colabfold MSA tool. The archive's
        ## top-level directory is dropped so the files land directly in input_data.
        mkdir input_data &&
        tar -xmf '$input' --strip-components 1 -C input_data &&
        mkdir output &&
        colabfold_batch
            #if str($advanced.num_recycles)!="":
                --num-recycle $advanced.num_recycles
            #end if
            #if str($advanced.recycle_early_stop_tolerance)!="":
                --recycle-early-stop-tolerance $advanced.recycle_early_stop_tolerance
            #end if
            #if $advanced.num_ensemble:
                --num-ensemble $advanced.num_ensemble
            #end if
            #if str($advanced.random_seed)!="":
                --random-seed $advanced.random_seed
            #end if
            #if str($advanced.num_seeds)!="":
                --num-seeds $advanced.num_seeds
            #end if
            #if $advanced.num_models:
                --num-models $advanced.num_models
            #end if
            $advanced.use_dropout
            --max-msa $advanced.max_msa
            #if $advanced.amber.use_amber == "yes":
                --amber
                --num-relax $advanced.amber.num_relaxed
            #end if
            $output_options.save_all
            $output_options.save_recycles
            $output_options.save_single_representations
            $output_options.save_pair_representations
            --jobname-prefix "galaxy"
            input_data
            output
        ## Sort the result files by extension into one directory per output
        ## collection so that discover_datasets can pick them up.
        && cd output
        && mv ./*.a3m output.a3m
        && mkdir png_out
        && mkdir json_out
        && mkdir pdb_out
        && mv ./*.png png_out
        && mv ./*.json json_out
        && mv ./*.pdb pdb_out
        ## config.json is moved back out so it does not end up in the JSON collection.
        && mv json_out/config.json .
        #if $output_options.save_all:
            && mkdir pickle_out
            && mv ./*.pickle pickle_out
        #end if
        #if $output_options.save_pair_representations or $output_options.save_single_representations:
            && mkdir npy_out
            && mv ./*.npy npy_out
        #end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="colab.tar" label="Tar file output from colabfold MSA tool"/>
        <section name="advanced" title="Advanced options">
            <param argument="--num-recycles" label="How many recycles to run?" type="integer" optional="true" help="Number of prediction recycles. Increasing recycles can improve the prediction quality but slows down the prediction."/>
            <param argument="--recycle-early-stop-tolerance" type="float" optional="true" min="0.0" max="1.0" help="Specify convergence criteria. Run recycles until the distance between recycles is within the given tolerance value."/>
            <param argument="--num-ensemble" label="Number of ensembles" type="integer" min="1" optional="true" help="Number of ensembles. The trunk of the network is run multiple times with different random choices for the MSA cluster centers. This can result in a better prediction at the cost of longer runtime."/>
            <param argument="--random-seed" label="Set seed" type="integer" min="0" optional="true"/>
            <param argument="--num-seeds" label="Number of seeds" type="integer" min="0" optional="true" help="Number of seeds to try iterated based on random seed"/>
            <!-- Optional like the other numeric settings: the command only passes the flag when a value is given. -->
            <param argument="--num-models" label="Number of models to use for structure prediction" type="integer" min="1" max="5" optional="true" help="Reducing the number of models speeds up the prediction but results in lower quality"/>
            <param name="max_msa" label="Max msa" type="select" help="Defines the ratio of max-seq to max-extra-seq for one run. Enable dropouts and increase the number of seeds to sample predictions from uncertainty of the model. Decrease to increase uncertainty">
                <!-- <option value="auto">auto</option> -->
                <option value="512:1024">512:1024</option>
                <option value="256:512">256:512</option>
                <option value="64:128">64:128</option>
                <option value="32:64">32:64</option>
                <option value="16:32">16:32</option>
            </param>
            <param argument="--use-dropout" label="Use dropouts" type="boolean" truevalue="--use-dropout" falsevalue="" help="Activate dropouts during inference to sample from the uncertainty of the models."/>
            <conditional name="amber">
                <param name="use_amber" label="Use AMBER" type="select" help="Use AMBER force field for structure refinement and side chain optimization">
                    <option value="yes">Use AMBER</option>
                    <option value="no">Don't use AMBER</option>
                </param>
                <when value="no"/>
                <when value="yes">
                    <param argument="--num-relaxed" label="How many top-ranked structures to relax using AMBER?" type="integer" min="0" value="0" help="Increased values may increase runtime"/>
                </when>
            </conditional>
        </section>
        <!-- Add for second version of tool for batch jobs -->
        <!-- <param name="stop_at" label="Stop score" type="float" min="0.0" optional="true" help="Compute models until pLDDT (single chain) or pTM-score (multimer) > threshold is reached. This speeds up prediction by running less models for easier queries."/> -->
        <section name="output_options" title="Output Options">
            <param argument="--save-all" type="boolean" label="Save raw outputs from model to a pickle file" truevalue="--save-all" falsevalue=""/>
            <param argument="--save-recycles" type="boolean" label="Save all intermediate predictions at each recycle iteration" truevalue="--save-recycles" falsevalue=""/>
            <param argument="--save-single-representations" type="boolean" label="Save the single representation embeddings of all models." truevalue="--save-single-representations" falsevalue=""/>
            <param argument="--save-pair-representations" type="boolean" label="Save the pair representation embeddings of all models." truevalue="--save-pair-representations" falsevalue=""/>
        </section>
    </inputs>
    <outputs>
        <!-- Each collection is filled from the per-extension directory created at the end of the command. -->
        <collection name="png_files" type="list" format="png" label="${tool.name} on ${on_string}: Figures">
            <discover_datasets format="png" pattern="__name_and_ext__" directory="output/png_out"/>
        </collection>
        <collection name="json_files" type="list" format="json" label="${tool.name} on ${on_string}: JSON predictions">
            <discover_datasets format="json" pattern="__name_and_ext__" directory="output/json_out"/>
        </collection>
        <collection name="pdb" type="list" format="pdb" label="${tool.name} on ${on_string}: PDB predictions">
            <discover_datasets format="pdb" pattern="__name_and_ext__" directory="output/pdb_out"/>
        </collection>
        <collection name="pickle" type="list" format="pickle" label="${tool.name} on ${on_string}: Pickle file outputs">
            <discover_datasets format="pickle" pattern="__name_and_ext__" directory="output/pickle_out"/>
            <filter>output_options['save_all']</filter>
        </collection>
        <collection name="npy" type="list" format="npy" label="${tool.name} on ${on_string}: Numpy embeddings">
            <discover_datasets format="npy" pattern="__name_and_ext__" directory="output/npy_out"/>
            <filter>output_options['save_single_representations'] or output_options['save_pair_representations']</filter>
        </collection>
        <data name="a3m_out" format="a3m" from_work_dir="output/output.a3m" label="${tool.name} on ${on_string}: a3m file"/>
    </outputs>
    <tests>
        <!-- The job is expected to fail with exit code 1, so only the generated
             command line is checked. NOTE(review): presumably the prediction
             cannot run in the test environment; confirm the reason. -->
        <test expect_failure="true" expect_exit_code="1">
            <param name="input" value="input.tar"/>
            <section name="advanced">
                <param name="num_recycles" value="4"/>
                <param name="recycle_early_stop_tolerance" value="0.4"/>
                <param name="num_ensemble" value="1"/>
                <param name="random_seed" value="43"/>
                <param name="num_seeds" value="2"/>
                <param name="num_models" value="2"/>
                <param name="max_msa" value="64:128"/>
                <param name="use_dropout" value="true"/>
                <conditional name="amber">
                    <param name="use_amber" value="yes"/>
                    <param name="num_relaxed" value="0"/>
                </conditional>
            </section>
            <section name="output_options">
                <param name="save_all" value="true"/>
                <param name="save_recycles" value="true"/>
                <param name="save_single_representations" value="true"/>
                <param name="save_pair_representations" value="true"/>
            </section>
            <assert_command>
                <has_text text="colabfold_batch --num-recycle 4 --recycle-early-stop-tolerance 0.4 --num-ensemble 1 --random-seed 43"/>
                <has_text text="--num-seeds 2 --num-models 2 --use-dropout --max-msa 64:128 --amber --num-relax 0"/>
                <has_text text="--save-all --save-recycles --save-single-representations --save-pair-representations"/>
            </assert_command>
        </test>
    </tests>
    <help><![CDATA[
Run the structure prediction (folding) step of Colabfold on the tar archive produced by the Colabfold MSA tool.
    ]]></help>
    <expand macro="citations"/>
</tool>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
<tool id="colabfold_msa" name="Colabfold MSA" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>Generate MSAs for the Alphafold step of Colabfold</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        #import re
        ## Symlinking and formatting
        #if $custom_template:
            mkdir template_dir &&
            #for $file in $custom_template:
                ## Only the sanitised identifier is used as a file name, so
                ## quotes, slashes or spaces in a dataset name cannot break
                ## the shell command or escape template_dir.
                #set template_file = re.sub('[^\w\-_\.]', '_', str($file.element_identifier)) + '.pdb'
                ln -s '$file' 'template_dir/$template_file' &&
            #end for
        #end if
        ## For single file runs. Will need to be updated for multiple file calls
        #set input_file = re.sub('[^\w\-_\.]', '_', str($query_type.input.element_identifier)) + "." + str($query_type.input.ext)
        ln -s '$query_type.input' '$input_file' &&
        colabfold_batch --msa-only
            #if $query_type.select_query_type == "fasta":
                #if $query_type.select_db.use_db == "yes":
                    --msa-mode $query_type.select_db.msa_mode
                #end if
            #end if
            --pair-mode $pair_mode
            ## --pair-strategy $pairing_strategy
            $templates
            #if $custom_template:
                --custom-template-path template_dir
            #end if
            --jobname-prefix 'galaxy'
            '$input_file'
            output.colab &&
        ## Bundle the whole result directory; the Alphafold tool unpacks this archive.
        tar -cf output.colab.tar output.colab
    ]]></command>
    <inputs>
        <conditional name="query_type">
            <param name="select_query_type" label="Data input method" type="select">
                <option value="fasta">FASTA file</option>
                <option value="a3m">a3m file</option>
            </param>
            <when value="fasta">
                <param name="input" type="data" format="fasta" label="Query sequence fasta"/>
                <expand macro="db_selector"/>
            </when>
            <when value="a3m">
                <param name="input" type="data" format="a3m" label="Query sequence a3m file"/>
            </when>
        </conditional>
        <param name="pair_mode" label="Pair mode" type="select">
            <option value="unpaired_paired">Attempt to pair sequences from the same operon within the genome</option>
            <option value="paired">Only used sequences that were successfully paired</option>
            <option value="unpaired">Generate separate MSA for each protein</option>
        </param>
        <!-- Non functional in current release of colabfold, planned to expose in next one -->
        <!-- <param name="pairing_strategy" label="Pairing strategy" type="select">
            <option value="greedy">Greedy: MSA sequences should only be paired if the same species exist in at least two MSAs </option>
            <option value="complete">Complete: MSA sequences should only be paired if the same species exist in all MSAs</option>
        </param> -->
        <param argument="--templates" type="boolean" truevalue="--templates" falsevalue="" label="Query PDB templates from the MSA server"/>
        <param name="custom_template" type="data" format="pdb" multiple="true" optional="true" label="List of PDB files to provide the prediction as custom templates" />
    </inputs>
    <outputs>
        <data name="output" format="colab.tar" from_work_dir="output.colab.tar" label="${tool.name} on ${on_string}: tar file"/>
    </outputs>
    <tests>
        <!-- fasta -->
        <test expect_num_outputs="1">
            <conditional name="query_type">
                <param name="select_query_type" value="fasta"/>
                <param name="input" value="test.fasta"/>
                <conditional name="select_db">
                    <param name="use_db" value="yes"/>
                    <param name="msa_mode" value="mmseqs2_uniref"/>
                </conditional>
            </conditional>
            <assert_command>
                <has_text text="colabfold_batch --msa-only --msa-mode mmseqs2_uniref --pair-mode unpaired_paired"/>
            </assert_command>
            <output name="output">
                <assert_contents>
                    <has_archive_member path=".*\/galaxy_0_all/msa.sh"/>
                </assert_contents>
            </output>
        </test>
        <!-- a3m -->
        <test expect_num_outputs="1">
            <conditional name="query_type">
                <param name="select_query_type" value="a3m"/>
                <param name="input" value="test.a3m" ftype="a3m"/>
            </conditional>
            <param name="pair_mode" value="paired"/>
            <param name="templates" value="true"/>
            <param name="custom_template" value="test.pdb,test_2.pdb"/>
            <assert_command>
                <has_text text="colabfold_batch --msa-only --pair-mode paired --templates --custom-template-path template_dir"/>
            </assert_command>
            <output name="output">
                <assert_contents>
                    <has_archive_member path=".*\/galaxy_0.pickle"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Generate MSAs for the alphafold step of Colabfold
    ]]></help>
    <expand macro="citations"/>
</tool>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
<macros>
    <!-- Version tokens: the tool version and the CUDA version together select the container tag. -->
    <token name="@VERSION@">1.5.5</token>
    <token name="@CUDA_VERSION@">12.2.2</token>
    <token name="@VERSION_SUFFIX@">0</token>

    <!-- Container providing colabfold_batch. -->
    <xml name="requirements">
        <requirements>
            <container type="docker">ghcr.io/sokrypton/colabfold:@VERSION@-cuda@CUDA_VERSION@</container>
        </requirements>
    </xml>

    <!-- bio.tools cross-reference. -->
    <xml name="biotools">
        <xrefs>
            <xref type="bio.tools">Colabfold</xref>
        </xrefs>
    </xml>

    <!-- Publication cited by the tools. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1038/s41592-022-01488-1</citation>
        </citations>
    </xml>

    <!-- Optional choice of the MSA mode; only offered when the user opts to set the database manually. -->
    <xml name="db_selector">
        <conditional name="select_db">
            <param name="use_db" type="select" label="Manually set database?">
                <option value="yes">Yes</option>
                <option value="no">No</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param name="msa_mode" type="select" label="MSA mode">
                    <option value="mmseqs2_uniref_env">mmseqs2_uniref_env</option>
                    <option value="mmseqs2_uniref">mmseqs2_uniref</option>
                    <option value="single_sequence">Use single sequence input</option>
                    <!-- <option value="custom">custom</option> -->
                </param>
            </when>
        </conditional>
    </xml>
</macros>
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#38 1 | ||
>101 | ||
MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS | ||
>UniRef100_N6VR80 62 0.971 1.153E-09 3 37 38 17 51 52 | ||
---IKRSSRRWKKKGRMRWKWYKKRLRRLKRERRRARS | ||
>UniRef100_A0A534JJZ5 55 0.722 4.940E-07 3 37 38 0 35 80 | ||
---MKRSSRAWKKRGKMRWKWRKKRMRRRKREQKlRART | ||
>UniRef100_A0A8T5HQN6 54 0.647 9.355E-07 3 36 38 14 47 48 | ||
---MKRSSRRWKKKGQMRWKWQRKRMKKEKRKRAKSR- | ||
>UniRef100_A0A2E4RG04 53 0.666 1.772E-06 3 35 38 36 68 71 | ||
---MKRGSRAWKKQGNQRWKWRKKKLRRRKASRKRA-- | ||
>UniRef100_R1E4G0 53 0.617 2.438E-06 3 36 38 0 33 38 | ||
---MRRSSRRWKKYLRSRWKWQRRRIREEKRLRKIAR- | ||
>UniRef100_A0A397WLW3 53 0.676 2.438E-06 3 36 38 0 33 38 | ||
---MKRSSRRWKKYKRSRWKWQKKRMKEEKRLRKLAR- | ||
>UniRef100_A0A2K3J9R6 52 0.700 4.619E-06 3 32 38 0 29 32 | ||
---MKRSSRVWKKRHKMRWKWRKKRMRREKRSR----- | ||
>UniRef100_A6VJM7 51 0.862 8.751E-06 3 31 38 5 33 39 | ||
---IKRSSRRWKKKGQMRWKHYKKRIRRMKRE------ | ||
>UniRef100_A0A7M3WK46 51 0.666 8.751E-06 3 35 38 30 62 66 | ||
---MKRGSRAWKKQGKQRWKWRKKKLRRRKAARKRA-- | ||
>UniRef100_A0A915SG42 51 0.617 1.205E-05 3 36 38 0 33 38 | ||
---MKRSSRRWKKYLRSRWKWQRRRIREEKRLRKVTR- | ||
>UniRef100_A0A510BD48 51 0.900 1.205E-05 3 32 38 31 60 64 | ||
---IKRSSRRWKKKGRMRWRHYKKRLRRRKRER----- | ||
>UniRef100_A0A075M0T1 50 0.638 1.658E-05 1 36 38 22 57 61 | ||
-VIMKRRPRKWKKKGRMRWKWLKKRIRRLKRQHRKER- | ||
>UniRef100_A4FYQ5 48 0.851 8.201E-05 3 29 38 5 31 39 | ||
---IKRSSRRWKKKGQMRWKHYKKRIRRMK-------- | ||
>UniRef100_A0A8J7USD9 48 0.888 8.201E-05 3 29 38 13 39 47 | ||
---IKRSSRRWKKKGQMRWKHYKKRLRRMK-------- | ||
>UniRef100_A0A2K3JJ52 48 0.700 1.129E-04 3 32 38 0 29 34 | ||
---MKRSSRVWKKRRKMRWKWRKKRMRREKRMR----- | ||
>UniRef100_A6UVG5 48 0.821 1.129E-04 3 30 38 9 36 42 | ||
---IKRSSRRWKKKGQMRWSHYKKRIRRMKR------- | ||
>UniRef100_A0A5E4HZQ2 43 0.750 7.251E-03 7 34 38 25 52 58 | ||
-------PRKWKKKGRMRWKWVKKRRKRLKRKIKR--- | ||
>UniRef100_A0A2H6JYE4 36 0.566 1.234E+00 4 33 38 3 32 37 | ||
----KHSSRKWKKRGKCRWKTRKKKLKERRRQRK---- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
>testing | ||
MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS |
Oops, something went wrong.