Add colabfold (galaxyproject#5785)

* add colabfold * missed shed in git add * tar file * Update tools/colabfold/.shed.yml Co-authored-by: Wolfgang Maier <maierw@posteo.de> * update with most of the suggestion * add tests for msa, change minor params * fix shed file to make suite * fix file name problem, add pre-set param for naming files to prevent problems, update docker image version * add num_outputs test to alphafold tool * fix test because inputs removed conditionals * fix missing text modifier * Add hardcoded file names to prevent file headers from breaking dataset history names, modify help text, put params into advanced section * fix typo from creating an advanced section * fix msa filenames * update archive member paths, assert expect_error * test update * print dir content test * try as just tar * add back ls * wrong place for && * quotes and another archive test * make tests match now that one passed * switch back to colab.tar * test fail state of alphafold tool * Failed properly, tool ready * Try expect code * Expect both failure and exit code * Expect fail, exit code, and num outputs * remove num_outputs to fix lint * matthias final pass * better descrption for alphafold * small changes * another round --------- Co-authored-by: Alexander OSTROVSKY <alexanderostrovsky@Chell.local> Co-authored-by: Wolfgang Maier <maierw@posteo.de> Co-authored-by: Björn Grüning <bjoern@gruenings.eu>
mvdbeek · Mar 24, 2024 · a95dcf3 · a95dcf3
1 parent c7f54e6
commit a95dcf3
Show file tree

Hide file tree

Showing 9 changed files with 3,243 additions and 0 deletions.
diff --git a/tools/colabfold/.shed.yml b/tools/colabfold/.shed.yml
@@ -0,0 +1,23 @@
+# Tool Shed metadata for the colabfold wrappers.
+name: colabfold
+owner: iuc
+categories:
+- Proteomics
+- Graphics
+description: "Protein prediction based on AlphaFold2"
+homepage_url: https://github.com/sokrypton/ColabFold
+long_description: |
+  ColabFold offers accelerated (40-60x faster) prediction of protein structures 
+  and complexes by combining the fast homology search of MMseqs2 
+  with AlphaFold2 or RoseTTAFold. 
+# NOTE(review): this is the same upstream URL as homepage_url; confirm whether it
+# should instead point at the repository directory that hosts these wrappers.
+remote_repository_url: https://github.com/sokrypton/ColabFold
+type: unrestricted
+# One repository per tool, named after the tool id.
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "Wrapper for the colabfold tool suite: {{ tool_name }}"
+# Suite repository definition; description texts mirror the top-level ones above.
+suite:
+  name: "suite_colabfold"
+  description: "Protein prediction based on AlphaFold2"
+  long_description: |
+    ColabFold offers accelerated (40-60x faster) prediction of protein structures 
+    and complexes by combining the fast homology search of MMseqs2 
+    with AlphaFold2 or RoseTTAFold. 
diff --git a/tools/colabfold/colabfold_alphafold.xml b/tools/colabfold/colabfold_alphafold.xml
@@ -0,0 +1,156 @@
+<tool id="colabfold_alphafold" name="Colabfold Alphafold" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
+    <description>Predict protein structures with Colabfold</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="biotools"/>
+    <expand macro="requirements" />
+    <command detect_errors="exit_code"><![CDATA[
+    ## Unpack the tar archive from the Colabfold MSA tool. The leading path
+    ## component is stripped so the archive contents land directly in input_data.
+    mkdir input_data &&
+    tar -xmf '$input' --strip-components 1 -C input_data &&
+    mkdir output &&
+    ## Optional settings are only added to the command line when a value was given.
+    colabfold_batch
+    #if str($advanced.num_recycles)!="":
+        --num-recycle $advanced.num_recycles
+    #end if
+    #if str($advanced.recycle_early_stop_tolerance)!="":
+        --recycle-early-stop-tolerance $advanced.recycle_early_stop_tolerance
+    #end if
+    #if $advanced.num_ensemble:
+        --num-ensemble $advanced.num_ensemble
+    #end if
+    #if str($advanced.random_seed)!="":
+        --random-seed $advanced.random_seed
+    #end if
+    #if str($advanced.num_seeds)!="":
+        --num-seeds $advanced.num_seeds
+    #end if
+    #if $advanced.num_models:
+        --num-models $advanced.num_models
+    #end if
+    $advanced.use_dropout
+    --max-msa $advanced.max_msa
+    #if $advanced.amber.use_amber == "yes":
+        --amber
+        --num-relax $advanced.amber.num_relaxed
+    #end if
+    $output_options.save_all
+    $output_options.save_recycles
+    $output_options.save_single_representations
+    $output_options.save_pair_representations
+    --jobname-prefix "galaxy"
+    input_data
+    output
+    ## Sort the results by file type into one sub-directory per output collection.
+    && cd output
+    && mv *.a3m output.a3m
+    && mkdir png_out
+    && mkdir json_out
+    && mkdir pdb_out
+    && mv ./*.png png_out
+    && mv ./*.json json_out
+    && mv ./*.pdb pdb_out
+    ## config.json is moved back out so it is not collected with the JSON predictions.
+    && mv json_out/config.json .
+    #if  $output_options.save_all:
+        && mkdir pickle_out
+        && mv ./*.pickle pickle_out
+    #end if
+    #if  $output_options.save_pair_representations or $output_options.save_single_representations:
+        && mkdir npy_out
+        && mv ./*.npy npy_out
+    #end if
+
+    ]]></command>
+    <inputs>
+        <!-- Archive written by the Colabfold MSA tool; it is unpacked by the command. -->
+        <param name="input" type="data" format="colab.tar" label="Tar file output from colabfold MSA tool"/>
+        <section name="advanced" title="Advanced options">
+            <!-- Numeric options left empty are not passed to colabfold_batch at all. -->
+            <param argument="--num-recycles" label="How many recycles to run?" type="integer" optional="true" help="Number of prediction recycles. Increasing recycles can improve the prediction quality but slows down the prediction."/>
+            <param argument="--recycle-early-stop-tolerance" type="float" optional="true" min="0.0" max="1.0" help="Specify convergence criteria. Run recycles until the distance between recycles is within the given tolerance value."/>
+            <param argument="--num-ensemble" label="Number of ensembles" type="integer" min="1" optional="true" help="Number of ensembles. The trunk of the network is run multiple times with different random choices for the MSA cluster centers. This can result in a better prediction at the cost of longer runtime."/>
+            <param argument="--random-seed" label="Set seed" type="integer" min="0" optional="true"/>
+            <param argument="--num-seeds" label="Number of seeds" type="integer" min="0" optional="true" help="Number of seeds to try iterated based on random seed"/>
+            <!-- Declared optional like its siblings: it has no default value and the
+                 command only emits the flag when a value is present. -->
+            <param argument="--num-models" label="Number of models to use for structure prediction" type="integer" min="1" max="5" optional="true" help="Reducing the number of models speeds up the prediction but results in lower quality"/>
+            <param name="max_msa" label="Max msa" type="select" help="Defines the ratio of max-seq to max-extra-seq for one run. Enable dropouts and increase the number of seeds to sample predictions from uncertainty of the model. Decrease to increase uncertainty">
+                <!-- <option value="auto">auto</option> -->
+                <option value="512:1024">512:1024</option>
+                <option value="256:512">256:512</option>
+                <option value="64:128">64:128</option>
+                <option value="32:64">32:64</option>
+                <option value="16:32">16:32</option>
+            </param>
+            <param argument="--use-dropout" label="Use dropouts" type="boolean" truevalue="--use-dropout" falsevalue="" help="Activate dropouts during inference to sample from the uncertainty of the models."/>
+            <!-- The number of structures to relax is only asked for when AMBER is enabled. -->
+            <conditional name="amber">
+                <param name="use_amber" label="Use AMBER" type="select" help="Use AMBER force field for structure refinement and side chain optimization">
+                    <option value="yes">Use AMBER</option>
+                    <option value="no">Don't use AMBER</option>
+                </param>
+                <when value="no"/>
+                <when value="yes">
+                    <param argument="--num-relaxed" label="How many top-ranked structures to relax using AMBER?" type="integer" min="0" value="0" help="Increased values may increase runtime"/>
+                </when>
+            </conditional>
+        </section>
+        <!-- Add for second version of tool for batch jobs -->
+        <!-- <param name="stop_at" label="Stop score" type="float" min="0.0" optional="true" help="Compute models until pLDDT (single chain) or pTM-score (multimer) > threshold is reached. This speeds up prediction by running less models for easier queries."/> -->
+        <!-- Each boolean expands to its command line flag when checked and to nothing otherwise. -->
+        <section name="output_options" title="Output Options">
+            <param argument="--save-all" type="boolean" label="Save raw outputs from model to a pickle file" truevalue="--save-all" falsevalue=""/>
+            <param argument="--save-recycles" type="boolean" label="Save all intermediate predictions at each recycle iteration" truevalue="--save-recycles" falsevalue=""/>
+            <param argument="--save-single-representations" type="boolean" label="Save the single representation embeddings of all models." truevalue="--save-single-representations" falsevalue=""/>
+            <param argument="--save-pair-representations" type="boolean" label="Save the pair representation embeddings of all models." truevalue="--save-pair-representations" falsevalue=""/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Each collection is discovered from the sub-directory of output/ into which
+             the command moves the files of the matching type. -->
+        <collection name="png_files" type="list" format="png" label="${tool.name} on ${on_string}: Figures">
+            <discover_datasets format="png" pattern="__name_and_ext__" directory="output/png_out"/>
+        </collection>
+        <collection name="json_files" type="list" format="json" label="${tool.name} on ${on_string}: JSON predictions">
+            <discover_datasets format="json" pattern="__name_and_ext__" directory="output/json_out"/>
+        </collection>
+        <collection name="pdb" type="list" format="pdb" label="${tool.name} on ${on_string}: PDB predictions">
+            <discover_datasets format="pdb" pattern="__name_and_ext__" directory="output/pdb_out"/>
+        </collection>
+        <!-- Only present when saving the raw model outputs was requested. -->
+        <collection name="pickle" type="list" format="pickle" label="${tool.name} on ${on_string}: Pickle file outputs">
+            <discover_datasets format="pickle" pattern="__name_and_ext__" directory="output/pickle_out"/>
+            <filter>output_options['save_all']</filter>
+        </collection>
+        <!-- Only present when single or pair representation embeddings were requested. -->
+        <collection name="npy" type="list" format="npy" label="${tool.name} on ${on_string}: Numpy embeddings">
+            <discover_datasets format="npy" pattern="__name_and_ext__" directory="output/npy_out"/>
+            <filter>output_options['save_single_representations'] or output_options['save_pair_representations']</filter>
+        </collection>
+        <!-- The command renames the a3m file in output/ to this fixed name. -->
+        <data name="a3m_out" format="a3m" from_work_dir="output/output.a3m" label="${tool.name} on ${on_string}: a3m file"/>
+    </outputs>
+    <tests>
+        <!-- Sets every advanced and output option and checks the rendered command line.
+             The job is declared to fail with exit code 1, so no output contents are
+             asserted. NOTE(review): presumably the prediction itself cannot run in the
+             test environment; confirm the reason for the expected failure. -->
+        <test expect_failure="true" expect_exit_code="1">
+            <param name="input" value="input.tar"/>
+            <section name="advanced">
+                <param name="num_recycles" value="4"/>
+                <param name="recycle_early_stop_tolerance" value="0.4"/>
+                <param name="num_ensemble" value="1"/>
+                <param name="random_seed" value="43"/>
+                <param name="num_seeds" value="2"/>
+                <param name="num_models" value="2"/>
+                <param name="max_msa" value="64:128"/>
+                <param name="use_dropout" value="--use-dropout"/>
+                <conditional name="amber">
+                    <param name="use_amber" value="yes"/>
+                    <param name="num_relaxed" value="0"/>
+                </conditional>
+            </section>
+            <section name="output_options">
+                <param name="save_all" value="--save-all"/>
+                <param name="save_recycles" value="--save-recycles"/>
+                <param name="save_single_representations" value="--save-single-representations"/>
+                <param name="save_pair_representations" value="--save-pair-representations"/>
+            </section>
+            <!-- The three fragments together cover the options in the order the command emits them. -->
+            <assert_command>
+                <has_text text="colabfold_batch --num-recycle 4 --recycle-early-stop-tolerance 0.4 --num-ensemble 1 --random-seed 43"/>
+                <has_text text="--num-seeds 2 --num-models 2 --use-dropout --max-msa 64:128 --amber --num-relax 0"/>
+                <has_text text="--save-all --save-recycles --save-single-representations --save-pair-representations"/>
+            </assert_command>
+        </test>
+    </tests>
+    <help><![CDATA[
+        Run the folding step on the tar file produced by the Colabfold MSA tool
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/colabfold/colabfold_msa.xml b/tools/colabfold/colabfold_msa.xml
@@ -0,0 +1,114 @@
+<tool id="colabfold_msa" name="Colabfold MSA" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
+    <description>Generate MSAs for the Alphafold step of Colabfold</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="biotools"/>
+    <expand macro="requirements" />
+    <command detect_errors="exit_code"><![CDATA[
+    #import re
+
+    ## Symlink the custom PDB templates (if any) into template_dir.
+    #if $custom_template:
+        mkdir template_dir &&
+        #for $file in $custom_template:
+            ## Every character other than word characters, '-', '_' and '.' is replaced
+            ## by '_' so the element identifier cannot break out of the quoted path.
+            #set template_name = re.sub('[^\w\-_\.]', '_', str($file.element_identifier)) + '.pdb'
+            ln -s '$file' 'template_dir/${template_name}' &&
+        #end for
+    #end if
+    ## For single file runs. Will need to be updated for multiple file calls
+    ## The query is linked under a sanitised name that ends in the dataset extension.
+    #set input_file = re.sub('[^\w\-_\.]', '_', str($query_type.input.element_identifier)) + "." + str($query_type.input.ext)
+    ln -s '$query_type.input' '$input_file' &&
+
+    mkdir output &&
+    colabfold_batch --msa-only
+    ## The MSA mode is only set for FASTA queries when the user chose a database manually.
+    #if $query_type.select_query_type == "fasta":
+        #if $query_type.select_db.use_db == "yes":
+            --msa-mode $query_type.select_db.msa_mode
+        #end if
+    #end if
+    --pair-mode $pair_mode
+    ## --pair-strategy $pairing_strategy
+    $templates
+    #if $custom_template:
+        --custom-template-path template_dir
+    #end if
+    --jobname-prefix 'galaxy'
+    '$input_file'
+    output.colab &&
+    ## Bundle the result directory into the single archive declared as tool output.
+    tar -cf output.colab.tar output.colab
+    ]]></command>
+    <inputs>
+        <!-- The query is either a FASTA file or an existing a3m alignment; the
+             database selector is only offered for FASTA queries. -->
+        <conditional name="query_type">
+            <param name="select_query_type" label="Data input method" type="select">
+                <option value="fasta">FASTA file</option>
+                <option value="a3m">a3m file</option>
+            </param>
+            <when value="fasta">
+                <param name="input" type="data" format="fasta" label="Query sequence fasta"/>
+                <expand macro="db_selector"/>
+            </when>
+            <when value="a3m">
+                <param name="input" type="data" format="a3m" label="Query sequence a3m file"/>
+            </when>
+        </conditional>
+        <!-- The selected value is passed verbatim as the pair mode of colabfold_batch. -->
+        <param name="pair_mode" label="Pair mode" type="select">
+            <option value="unpaired_paired">Attempt to pair sequences from the same operon within the genome</option>
+            <option value="paired">Only use sequences that were successfully paired</option>
+            <option value="unpaired">Generate separate MSA for each protein</option>
+        </param>
+        <!-- Non functional in current release of colabfold, planned to expose in next one -->
+        <!-- <param name="pairing_strategy" label="Pairing strategy" type="select">
+            <option value="greedy">Greedy: MSA sequences should only be paired if the same species exist in at least two MSAs </option>
+            <option value="complete">Complete: MSA sequences should only be paired if the same species exist in all MSAs</option>
+        </param> -->
+        <param argument="--templates" type="boolean" truevalue="--templates" falsevalue="" label="Query PDB templates from the MSA server"/>
+        <!-- When given, the command links these files into template_dir and passes that directory on. -->
+        <param name="custom_template" type="data" format="pdb" multiple="true" optional="true" label="List of PDB files to provide the prediction as custom templates" />
+    </inputs>
+    <outputs>
+        <!-- The tar archive of the output.colab directory created at the end of the command. -->
+        <data name="output" format="colab.tar" from_work_dir="output.colab.tar" label="${tool.name} on ${on_string}: tar file"/>
+    </outputs>
+    <tests>
+        <!-- FASTA query with a manually selected MSA mode; pair mode is left at its default. -->
+        <test expect_num_outputs="1">
+            <conditional name="query_type">
+                <param name="select_query_type" value="fasta"/>
+                <param name="input" value="test.fasta"/>
+                <conditional name="select_db">
+                    <param name="use_db" value="yes"/>
+                    <param name="msa_mode" value="mmseqs2_uniref"/>
+                </conditional>
+            </conditional>
+            <assert_command>
+                <has_text text="colabfold_batch --msa-only --msa-mode mmseqs2_uniref --pair-mode unpaired_paired"/>
+            </assert_command>
+            <output name="output">
+                <assert_contents>
+                    <has_archive_member path=".*\/galaxy_0_all/msa.sh"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- a3m query with server templates enabled and two custom PDB templates;
+             no MSA mode appears on the command line for this query type. -->
+        <test expect_num_outputs="1">
+            <conditional name="query_type">
+                <param name="select_query_type" value="a3m"/>
+                <param name="input" value="test.a3m" ftype="a3m"/>
+            </conditional>
+            <param name="pair_mode" value="paired"/>
+            <param name="templates" value="--templates"/>
+            <param name="custom_template" value="test.pdb,test_2.pdb"/>
+            <assert_command>
+                <has_text text="colabfold_batch --msa-only --pair-mode paired --templates --custom-template-path template_dir"/>
+            </assert_command>
+            <output name="output">
+                <assert_contents>
+                    <has_archive_member path=".*\/galaxy_0.pickle"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+        Generate MSAs for the alphafold step of Colabfold
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/colabfold/macros.xml b/tools/colabfold/macros.xml
@@ -0,0 +1,37 @@
+<macros>
+    <!-- Wrapper version: tool version plus Galaxy suffix. -->
+    <token name="@VERSION@">1.5.5</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <!-- CUDA version; together with the tool version it forms the container tag. -->
+    <token name="@CUDA_VERSION@">12.2.2</token>
+
+    <!-- Both tools run in the same docker container. -->
+    <xml name="requirements">
+        <requirements>
+            <container type="docker">ghcr.io/sokrypton/colabfold:@VERSION@-cuda@CUDA_VERSION@</container>
+        </requirements>
+    </xml>
+
+    <!-- Cross reference to the bio.tools entry. -->
+    <xml name="biotools">
+        <xrefs>
+            <xref type="bio.tools">Colabfold</xref>
+        </xrefs>
+    </xml>
+
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1038/s41592-022-01488-1</citation>
+        </citations>
+    </xml>
+
+    <!-- Optional manual choice of the MSA mode; "Yes" is the first option and so the default. -->
+    <xml name="db_selector">
+        <conditional name="select_db">
+            <param name="use_db" label="Manually set database?" type="select">
+                <option value="yes">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="no"/>
+            <when value="yes">
+                <param name="msa_mode" type="select" label="MSA mode">
+                    <option value="mmseqs2_uniref_env">mmseqs2_uniref_env</option>
+                    <option value="mmseqs2_uniref">mmseqs2_uniref</option>
+                    <option value="single_sequence">Use single sequence input</option>
+                    <!-- <option value="custom">custom</option> -->
+                </param>
+            </when>
+        </conditional>
+    </xml>
+</macros>
diff --git a/tools/colabfold/test-data/input.tar b/tools/colabfold/test-data/input.tar
diff --git a/tools/colabfold/test-data/test.a3m b/tools/colabfold/test-data/test.a3m
@@ -0,0 +1,39 @@
+#38	1
+>101
+MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS
+>UniRef100_N6VR80	62	0.971	1.153E-09	3	37	38	17	51	52
+---IKRSSRRWKKKGRMRWKWYKKRLRRLKRERRRARS
+>UniRef100_A0A534JJZ5	55	0.722	4.940E-07	3	37	38	0	35	80
+---MKRSSRAWKKRGKMRWKWRKKRMRRRKREQKlRART
+>UniRef100_A0A8T5HQN6	54	0.647	9.355E-07	3	36	38	14	47	48
+---MKRSSRRWKKKGQMRWKWQRKRMKKEKRKRAKSR-
+>UniRef100_A0A2E4RG04	53	0.666	1.772E-06	3	35	38	36	68	71
+---MKRGSRAWKKQGNQRWKWRKKKLRRRKASRKRA--
+>UniRef100_R1E4G0	53	0.617	2.438E-06	3	36	38	0	33	38
+---MRRSSRRWKKYLRSRWKWQRRRIREEKRLRKIAR-
+>UniRef100_A0A397WLW3	53	0.676	2.438E-06	3	36	38	0	33	38
+---MKRSSRRWKKYKRSRWKWQKKRMKEEKRLRKLAR-
+>UniRef100_A0A2K3J9R6	52	0.700	4.619E-06	3	32	38	0	29	32
+---MKRSSRVWKKRHKMRWKWRKKRMRREKRSR-----
+>UniRef100_A6VJM7	51	0.862	8.751E-06	3	31	38	5	33	39
+---IKRSSRRWKKKGQMRWKHYKKRIRRMKRE------
+>UniRef100_A0A7M3WK46	51	0.666	8.751E-06	3	35	38	30	62	66
+---MKRGSRAWKKQGKQRWKWRKKKLRRRKAARKRA--
+>UniRef100_A0A915SG42	51	0.617	1.205E-05	3	36	38	0	33	38
+---MKRSSRRWKKYLRSRWKWQRRRIREEKRLRKVTR-
+>UniRef100_A0A510BD48	51	0.900	1.205E-05	3	32	38	31	60	64
+---IKRSSRRWKKKGRMRWRHYKKRLRRRKRER-----
+>UniRef100_A0A075M0T1	50	0.638	1.658E-05	1	36	38	22	57	61
+-VIMKRRPRKWKKKGRMRWKWLKKRIRRLKRQHRKER-
+>UniRef100_A4FYQ5	48	0.851	8.201E-05	3	29	38	5	31	39
+---IKRSSRRWKKKGQMRWKHYKKRIRRMK--------
+>UniRef100_A0A8J7USD9	48	0.888	8.201E-05	3	29	38	13	39	47
+---IKRSSRRWKKKGQMRWKHYKKRLRRMK--------
+>UniRef100_A0A2K3JJ52	48	0.700	1.129E-04	3	32	38	0	29	34
+---MKRSSRVWKKRRKMRWKWRKKRMRREKRMR-----
+>UniRef100_A6UVG5	48	0.821	1.129E-04	3	30	38	9	36	42
+---IKRSSRRWKKKGQMRWSHYKKRIRRMKR-------
+>UniRef100_A0A5E4HZQ2	43	0.750	7.251E-03	7	34	38	25	52	58
+-------PRKWKKKGRMRWKWVKKRRKRLKRKIKR---
+>UniRef100_A0A2H6JYE4	36	0.566	1.234E+00	4	33	38	3	32	37
+----KHSSRKWKKRGKCRWKTRKKKLKERRRQRK----
diff --git a/tools/colabfold/test-data/test.fasta b/tools/colabfold/test-data/test.fasta
@@ -0,0 +1,2 @@
+>testing
+MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS