Skip to content

Commit

Permalink
Add colabfold (galaxyproject#5785)
Browse files Browse the repository at this point in the history
* add colabfold

* missed shed in git add

* tar file

* Update tools/colabfold/.shed.yml

Co-authored-by: Wolfgang Maier <maierw@posteo.de>

* update with most of the suggestion

* add tests for msa, change minor params

* fix shed file to make suite

* fix file name problem, add pre-set param for naming files to prevent problems, update docker image version

* add num_outputs test to alphafold tool

* fix test because inputs removed conditionals

* fix missing text modifier

* Add hardcoded file names to prevent file headers from breaking dataset history names, modify help text, put params into advanced section

* fix typo from creating an advanced section

* fix msa filenames

* update archive member paths, assert expect_error

* test update

* print dir content test

* try as just tar

* add back ls

* wrong place for &&

* quotes and another archive test

* make tests match now that one passed

* switch back to colab.tar

* test fail state of alphafold tool

* Failed properly, tool ready

* Try expect code

* Expect both failure and exit code

* Expect fail, exit code, and num outputs

* remove num_outputs to fix lint

* matthias final pass

* better description for alphafold

* small changes

* another round

---------

Co-authored-by: Alexander OSTROVSKY <alexanderostrovsky@Chell.local>
Co-authored-by: Wolfgang Maier <maierw@posteo.de>
Co-authored-by: Björn Grüning <bjoern@gruenings.eu>
  • Loading branch information
4 people authored Mar 24, 2024
1 parent c7f54e6 commit a95dcf3
Show file tree
Hide file tree
Showing 9 changed files with 3,243 additions and 0 deletions.
23 changes: 23 additions & 0 deletions tools/colabfold/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Tool Shed metadata for the colabfold wrappers (consumed by planemo shed_* commands).
name: colabfold
owner: iuc
categories:
  - Proteomics
  - Graphics
description: "Protein prediction based on AlphaFold2"
# Upstream project page.
homepage_url: https://github.com/sokrypton/ColabFold
long_description: |
  ColabFold offers accelerated (40-60x faster) prediction of protein structures
  and complexes by combining the fast homology search of MMseqs2
  with AlphaFold2 or RoseTTAFold.
# Location of the wrapper sources (not the upstream tool).
# NOTE(review): inferred from owner "iuc" and the tools/colabfold path - confirm.
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/colabfold
type: unrestricted
# One Tool Shed repository is generated per tool XML in this directory.
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Wrapper for the colabfold tool suite: {{ tool_name }}"
# Suite repository that bundles all generated tool repositories.
suite:
  name: "suite_colabfold"
  description: "Protein prediction based on AlphaFold2"
  long_description: |
    ColabFold offers accelerated (40-60x faster) prediction of protein structures
    and complexes by combining the fast homology search of MMseqs2
    with AlphaFold2 or RoseTTAFold.
156 changes: 156 additions & 0 deletions tools/colabfold/colabfold_alphafold.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
<tool id="colabfold_alphafold" name="Colabfold Alphafold" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>Predict protein structures with Colabfold</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <!--
        Command outline:
          1. unpack the tar archive produced by the Colabfold MSA tool into input_data
          2. run colabfold_batch on input_data, writing into output
          3. sort the result files by extension into sub-directories that the
             discover_datasets elements below pick up
    -->
    <command detect_errors="exit_code"><![CDATA[
        ## Unpack the MSA archive; the top-level archive directory is stripped
        mkdir input_data &&
        tar -xmf '$input' --strip-components 1 -C input_data &&
        mkdir output &&
        colabfold_batch
            ## Optional numeric parameters are only passed when the user set a value
            #if str($advanced.num_recycles) != "":
                --num-recycle $advanced.num_recycles
            #end if
            #if str($advanced.recycle_early_stop_tolerance) != "":
                --recycle-early-stop-tolerance $advanced.recycle_early_stop_tolerance
            #end if
            #if str($advanced.num_ensemble) != "":
                --num-ensemble $advanced.num_ensemble
            #end if
            #if str($advanced.random_seed) != "":
                --random-seed $advanced.random_seed
            #end if
            #if str($advanced.num_seeds) != "":
                --num-seeds $advanced.num_seeds
            #end if
            #if str($advanced.num_models) != "":
                --num-models $advanced.num_models
            #end if
            ## Boolean parameters expand to their flag or to an empty string
            $advanced.use_dropout
            --max-msa $advanced.max_msa
            #if $advanced.amber.use_amber == "yes":
                --amber
                --num-relax $advanced.amber.num_relaxed
            #end if
            $output_options.save_all
            $output_options.save_recycles
            $output_options.save_single_representations
            $output_options.save_pair_representations
            ## Fixed prefix so output file names do not depend on sequence headers
            --jobname-prefix "galaxy"
            input_data
            output
        ## Sort the results by type for collection discovery
        && cd output
        && mv *.a3m output.a3m
        && mkdir png_out
        && mkdir json_out
        && mkdir pdb_out
        && mv ./*.png png_out
        && mv ./*.json json_out
        && mv ./*.pdb pdb_out
        ## config.json is moved back out so it is not collected with the JSON outputs
        && mv json_out/config.json .
        #if $output_options.save_all:
            && mkdir pickle_out
            && mv ./*.pickle pickle_out
        #end if
        #if $output_options.save_pair_representations or $output_options.save_single_representations:
            && mkdir npy_out
            && mv ./*.npy npy_out
        #end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="colab.tar" label="Tar file output from colabfold MSA tool"/>
        <section name="advanced" title="Advanced options">
            <!-- The command line flag is singular (num-recycle); the explicit name keeps the parameter name num_recycles. -->
            <param name="num_recycles" argument="--num-recycle" label="How many recycles to run?" type="integer" optional="true" help="Number of prediction recycles. Increasing recycles can improve the prediction quality but slows down the prediction."/>
            <param argument="--recycle-early-stop-tolerance" type="float" optional="true" min="0.0" max="1.0" help="Specify convergence criteria. Run recycles until the distance between recycles is within the given tolerance value."/>
            <param argument="--num-ensemble" label="Number of ensembles" type="integer" min="1" optional="true" help="Number of ensembles. The trunk of the network is run multiple times with different random choices for the MSA cluster centers. This can result in a better prediction at the cost of longer runtime."/>
            <param argument="--random-seed" label="Set seed" type="integer" min="0" optional="true"/>
            <param argument="--num-seeds" label="Number of seeds" type="integer" min="0" optional="true" help="Number of seeds to try iterated based on random seed"/>
            <!-- Declared optional because the command only passes the flag when a value is set. -->
            <param argument="--num-models" label="Number of models to use for structure prediction" type="integer" min="1" max="5" optional="true" help="Reducing the number of models speeds up the prediction but results in lower quality"/>
            <param name="max_msa" label="Max msa" type="select" help="Defines the ratio of max-seq to max-extra-seq for one run. Enable dropouts and increase the number of seeds to sample predictions from uncertainty of the model. Decrease to increase uncertainty">
                <!-- <option value="auto">auto</option> -->
                <option value="512:1024">512:1024</option>
                <option value="256:512">256:512</option>
                <option value="64:128">64:128</option>
                <option value="32:64">32:64</option>
                <option value="16:32">16:32</option>
            </param>
            <param argument="--use-dropout" label="Use dropouts" type="boolean" truevalue="--use-dropout" falsevalue="" help="Activate dropouts during inference to sample from the uncertainty of the models."/>
            <conditional name="amber">
                <param name="use_amber" label="Use AMBER" type="select" help="Use AMBER force field for structure refinement and side chain optimization">
                    <option value="yes">Use AMBER</option>
                    <option value="no">Don't use AMBER</option>
                </param>
                <when value="no"/>
                <when value="yes">
                    <!-- The command line flag is num-relax; the explicit name keeps the parameter name num_relaxed. -->
                    <param name="num_relaxed" argument="--num-relax" label="How many top-ranked structures to relax using AMBER?" type="integer" min="0" value="0" help="Increased values may increase runtime"/>
                </when>
            </conditional>
        </section>
        <!-- Add for second version of tool for batch jobs -->
        <!-- <param name="stop_at" label="Stop score" type="float" min="0.0" optional="true" help="Compute models until pLDDT (single chain) or pTM-score (multimer) > threshold is reached. This speeds up prediction by running less models for easier queries."/> -->
        <section name="output_options" title="Output Options">
            <param argument="--save-all" type="boolean" label="Save raw outputs from model to a pickle file" truevalue="--save-all" falsevalue=""/>
            <param argument="--save-recycles" type="boolean" label="Save all intermediate predictions at each recycle iteration" truevalue="--save-recycles" falsevalue=""/>
            <param argument="--save-single-representations" type="boolean" label="Save the single representation embeddings of all models." truevalue="--save-single-representations" falsevalue=""/>
            <param argument="--save-pair-representations" type="boolean" label="Save the pair representation embeddings of all models." truevalue="--save-pair-representations" falsevalue=""/>
        </section>
    </inputs>
    <outputs>
        <!-- Each collection is discovered from the sub-directory the command sorted the files into. -->
        <collection name="png_files" type="list" format="png" label="${tool.name} on ${on_string}: Figures">
            <discover_datasets format="png" pattern="__name_and_ext__" directory="output/png_out"/>
        </collection>
        <collection name="json_files" type="list" format="json" label="${tool.name} on ${on_string}: JSON predictions">
            <discover_datasets format="json" pattern="__name_and_ext__" directory="output/json_out"/>
        </collection>
        <collection name="pdb" type="list" format="pdb" label="${tool.name} on ${on_string}: PDB predictions">
            <discover_datasets format="pdb" pattern="__name_and_ext__" directory="output/pdb_out"/>
        </collection>
        <collection name="pickle" type="list" format="pickle" label="${tool.name} on ${on_string}: Pickle file outputs">
            <discover_datasets format="pickle" pattern="__name_and_ext__" directory="output/pickle_out"/>
            <filter>output_options['save_all']</filter>
        </collection>
        <collection name="npy" type="list" format="npy" label="${tool.name} on ${on_string}: Numpy embeddings">
            <discover_datasets format="npy" pattern="__name_and_ext__" directory="output/npy_out"/>
            <filter>output_options['save_single_representations'] or output_options['save_pair_representations']</filter>
        </collection>
        <data name="a3m_out" format="a3m" from_work_dir="output/output.a3m" label="${tool.name} on ${on_string}: a3m file"/>
    </outputs>
    <tests>
        <!-- The test is declared to fail with exit code 1; it only verifies the generated command line. -->
        <test expect_failure="true" expect_exit_code="1">
            <param name="input" value="input.tar"/>
            <section name="advanced">
                <param name="num_recycles" value="4"/>
                <param name="recycle_early_stop_tolerance" value="0.4"/>
                <param name="num_ensemble" value="1"/>
                <param name="random_seed" value="43"/>
                <param name="num_seeds" value="2"/>
                <param name="num_models" value="2"/>
                <param name="max_msa" value="64:128"/>
                <param name="use_dropout" value="true"/>
                <conditional name="amber">
                    <param name="use_amber" value="yes"/>
                    <param name="num_relaxed" value="0"/>
                </conditional>
            </section>
            <section name="output_options">
                <param name="save_all" value="true"/>
                <param name="save_recycles" value="true"/>
                <param name="save_single_representations" value="true"/>
                <param name="save_pair_representations" value="true"/>
            </section>
            <assert_command>
                <has_text text="colabfold_batch --num-recycle 4 --recycle-early-stop-tolerance 0.4 --num-ensemble 1 --random-seed 43"/>
                <has_text text="--num-seeds 2 --num-models 2 --use-dropout --max-msa 64:128 --amber --num-relax 0"/>
                <has_text text="--save-all --save-recycles --save-single-representations --save-pair-representations"/>
            </assert_command>
        </test>
    </tests>
    <help><![CDATA[
Run the structure prediction (folding) step of Colabfold on the tar archive produced by the Colabfold MSA tool.
    ]]></help>
    <expand macro="citations"/>
</tool>
114 changes: 114 additions & 0 deletions tools/colabfold/colabfold_msa.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
<tool id="colabfold_msa" name="Colabfold MSA" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>Generate MSAs for the Alphafold step of Colabfold</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <!--
        Command outline:
          1. symlink optional custom PDB templates into template_dir
          2. symlink the query under a sanitised file name
          3. run colabfold_batch in MSA-only mode, writing into output.colab
          4. pack output.colab into a tar archive for the Alphafold tool
    -->
    <command detect_errors="exit_code"><![CDATA[
        #import re
        ## Symlinking and formatting
        #if $custom_template:
            mkdir template_dir &&
            #for $file in $custom_template:
                ## Replace every character outside [A-Za-z0-9_.-] so the identifier is a safe file name
                #set template_name = re.sub('[^\w\-_\.]', '_', str($file.element_identifier)) + '.pdb'
                ln -s '$file' 'template_dir/$template_name' &&
            #end for
        #end if
        ## For single file runs. Will need to be updated for multiple file calls
        #set input_file = re.sub('[^\w\-_\.]', '_', str($query_type.input.element_identifier)) + "." + str($query_type.input.ext)
        ln -s '$query_type.input' '$input_file' &&
        mkdir output &&
        colabfold_batch --msa-only
            ## The MSA mode is only selectable for FASTA queries
            #if $query_type.select_query_type == "fasta":
                #if $query_type.select_db.use_db == "yes":
                    --msa-mode $query_type.select_db.msa_mode
                #end if
            #end if
            --pair-mode $pair_mode
            ## --pair-strategy $pairing_strategy
            $templates
            #if $custom_template:
                --custom-template-path template_dir
            #end if
            ## Fixed prefix so output file names do not depend on sequence headers
            --jobname-prefix 'galaxy'
            '$input_file'
            output.colab &&
        tar -cf output.colab.tar output.colab
    ]]></command>
    <inputs>
        <conditional name="query_type">
            <param name="select_query_type" label="Data input method" type="select">
                <option value="fasta">FASTA file</option>
                <option value="a3m">a3m file</option>
            </param>
            <when value="fasta">
                <param name="input" type="data" format="fasta" label="Query sequence fasta"/>
                <expand macro="db_selector"/>
            </when>
            <when value="a3m">
                <param name="input" type="data" format="a3m" label="Query sequence a3m file"/>
            </when>
        </conditional>
        <param name="pair_mode" label="Pair mode" type="select">
            <option value="unpaired_paired">Attempt to pair sequences from the same operon within the genome</option>
            <option value="paired">Only use sequences that were successfully paired</option>
            <option value="unpaired">Generate separate MSA for each protein</option>
        </param>
        <!-- Non functional in current release of colabfold, planned to expose in next one -->
        <!-- <param name="pairing_strategy" label="Pairing strategy" type="select">
            <option value="greedy">Greedy: MSA sequences should only be paired if the same species exist in at least two MSAs </option>
            <option value="complete">Complete: MSA sequences should only be paired if the same species exist in all MSAs</option>
        </param> -->
        <param argument="--templates" type="boolean" truevalue="--templates" falsevalue="" label="Query PDB templates from the MSA server"/>
        <param name="custom_template" type="data" format="pdb" multiple="true" optional="true" label="List of PDB files to provide the prediction as custom templates"/>
    </inputs>
    <outputs>
        <data name="output" format="colab.tar" from_work_dir="output.colab.tar" label="${tool.name} on ${on_string}: tar file"/>
    </outputs>
    <tests>
        <!-- fasta -->
        <test expect_num_outputs="1">
            <conditional name="query_type">
                <param name="select_query_type" value="fasta"/>
                <param name="input" value="test.fasta"/>
                <conditional name="select_db">
                    <param name="use_db" value="yes"/>
                    <param name="msa_mode" value="mmseqs2_uniref"/>
                </conditional>
            </conditional>
            <assert_command>
                <has_text text="colabfold_batch --msa-only --msa-mode mmseqs2_uniref --pair-mode unpaired_paired"/>
            </assert_command>
            <output name="output">
                <assert_contents>
                    <has_archive_member path=".*\/galaxy_0_all/msa.sh"/>
                </assert_contents>
            </output>
        </test>
        <!-- a3m -->
        <test expect_num_outputs="1">
            <conditional name="query_type">
                <param name="select_query_type" value="a3m"/>
                <param name="input" value="test.a3m" ftype="a3m"/>
            </conditional>
            <param name="pair_mode" value="paired"/>
            <param name="templates" value="true"/>
            <param name="custom_template" value="test.pdb,test_2.pdb"/>
            <assert_command>
                <has_text text="colabfold_batch --msa-only --pair-mode paired --templates --custom-template-path template_dir"/>
            </assert_command>
            <output name="output">
                <assert_contents>
                    <has_archive_member path=".*\/galaxy_0.pickle"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Generate MSAs for the alphafold step of Colabfold
    ]]></help>
    <expand macro="citations"/>
</tool>
37 changes: 37 additions & 0 deletions tools/colabfold/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<macros>
    <!-- Version tokens shared by both colabfold tool wrappers. -->
    <token name="@VERSION@">1.5.5</token>
    <token name="@CUDA_VERSION@">12.2.2</token>
    <token name="@VERSION_SUFFIX@">0</token>

    <!-- Container image; its tag is assembled from the version tokens above. -->
    <xml name="requirements">
        <requirements>
            <container type="docker">ghcr.io/sokrypton/colabfold:@VERSION@-cuda@CUDA_VERSION@</container>
        </requirements>
    </xml>

    <!-- bio.tools cross reference. -->
    <xml name="biotools">
        <xrefs>
            <xref type="bio.tools">Colabfold</xref>
        </xrefs>
    </xml>

    <!-- Citation emitted by both tools. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1038/s41592-022-01488-1</citation>
        </citations>
    </xml>

    <!-- Conditional that lets the user pick the MSA mode explicitly. -->
    <xml name="db_selector">
        <conditional name="select_db">
            <param name="use_db" label="Manually set database?" type="select">
                <option value="yes">Yes</option>
                <option value="no">No</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param name="msa_mode" type="select" label="MSA mode">
                    <option value="mmseqs2_uniref_env">mmseqs2_uniref_env</option>
                    <option value="mmseqs2_uniref">mmseqs2_uniref</option>
                    <option value="single_sequence">Use single sequence input</option>
                    <!-- <option value="custom">custom</option> -->
                </param>
            </when>
        </conditional>
    </xml>
</macros>
Binary file added tools/colabfold/test-data/input.tar
Binary file not shown.
39 changes: 39 additions & 0 deletions tools/colabfold/test-data/test.a3m
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#38 1
>101
MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS
>UniRef100_N6VR80 62 0.971 1.153E-09 3 37 38 17 51 52
---IKRSSRRWKKKGRMRWKWYKKRLRRLKRERRRARS
>UniRef100_A0A534JJZ5 55 0.722 4.940E-07 3 37 38 0 35 80
---MKRSSRAWKKRGKMRWKWRKKRMRRRKREQKlRART
>UniRef100_A0A8T5HQN6 54 0.647 9.355E-07 3 36 38 14 47 48
---MKRSSRRWKKKGQMRWKWQRKRMKKEKRKRAKSR-
>UniRef100_A0A2E4RG04 53 0.666 1.772E-06 3 35 38 36 68 71
---MKRGSRAWKKQGNQRWKWRKKKLRRRKASRKRA--
>UniRef100_R1E4G0 53 0.617 2.438E-06 3 36 38 0 33 38
---MRRSSRRWKKYLRSRWKWQRRRIREEKRLRKIAR-
>UniRef100_A0A397WLW3 53 0.676 2.438E-06 3 36 38 0 33 38
---MKRSSRRWKKYKRSRWKWQKKRMKEEKRLRKLAR-
>UniRef100_A0A2K3J9R6 52 0.700 4.619E-06 3 32 38 0 29 32
---MKRSSRVWKKRHKMRWKWRKKRMRREKRSR-----
>UniRef100_A6VJM7 51 0.862 8.751E-06 3 31 38 5 33 39
---IKRSSRRWKKKGQMRWKHYKKRIRRMKRE------
>UniRef100_A0A7M3WK46 51 0.666 8.751E-06 3 35 38 30 62 66
---MKRGSRAWKKQGKQRWKWRKKKLRRRKAARKRA--
>UniRef100_A0A915SG42 51 0.617 1.205E-05 3 36 38 0 33 38
---MKRSSRRWKKYLRSRWKWQRRRIREEKRLRKVTR-
>UniRef100_A0A510BD48 51 0.900 1.205E-05 3 32 38 31 60 64
---IKRSSRRWKKKGRMRWRHYKKRLRRRKRER-----
>UniRef100_A0A075M0T1 50 0.638 1.658E-05 1 36 38 22 57 61
-VIMKRRPRKWKKKGRMRWKWLKKRIRRLKRQHRKER-
>UniRef100_A4FYQ5 48 0.851 8.201E-05 3 29 38 5 31 39
---IKRSSRRWKKKGQMRWKHYKKRIRRMK--------
>UniRef100_A0A8J7USD9 48 0.888 8.201E-05 3 29 38 13 39 47
---IKRSSRRWKKKGQMRWKHYKKRLRRMK--------
>UniRef100_A0A2K3JJ52 48 0.700 1.129E-04 3 32 38 0 29 34
---MKRSSRVWKKRRKMRWKWRKKRMRREKRMR-----
>UniRef100_A6UVG5 48 0.821 1.129E-04 3 30 38 9 36 42
---IKRSSRRWKKKGQMRWSHYKKRIRRMKR-------
>UniRef100_A0A5E4HZQ2 43 0.750 7.251E-03 7 34 38 25 52 58
-------PRKWKKKGRMRWKWVKKRRKRLKRKIKR---
>UniRef100_A0A2H6JYE4 36 0.566 1.234E+00 4 33 38 3 32 37
----KHSSRKWKKRGKCRWKTRKKKLKERRRQRK----
2 changes: 2 additions & 0 deletions tools/colabfold/test-data/test.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>testing
MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS
Loading

0 comments on commit a95dcf3

Please sign in to comment.