Merge pull request #412 from Ensembl/lcampbell/dump_container_tweak
Update/Fixes to container Dumper pipeline
ens-LCampbell authored Jul 31, 2024
2 parents e61721a + d73d791 commit ac54171
Showing 8 changed files with 128 additions and 50 deletions.
35 changes: 21 additions & 14 deletions pipelines/nextflow/modules/database/dump_db.nf
@@ -14,9 +14,9 @@
// limitations under the License.

process DUMP_DB {
publishDir "$out_dir/build_$db.release/coredb/$db.division", mode: 'copy'
tag "$db.species"
label "variable_2_8_32"
publishDir "$out_dir/$release_dir/coredb/$db.division", mode: 'copy'
maxForks params.max_database_forks

input:
@@ -26,21 +26,28 @@ process DUMP_DB {
output:
path "*.sql.gz"

script:
shell:
output_file = "${db.species}.sql.gz"
"""
db_pass=""
if [ "${db.server.password}" != "" ]; then
db_pass="--password '${db.server.password}'"
fi

mysqldump '${db.server.database}' \
--host '${db.server.host}' \
--port '${db.server.port}' \
--user '${db.server.user}' \
\$db_pass \
| gzip > $output_file
"""
// check if the core DB had an expected release version or not (internal/unformatted db name)
if ( "${db.release}".isEmpty() ) {
release_dir = "unreleased"
}
else{
release_dir = "build_${db.release}"
}

// formatted_db_pass = validate_db_password(${db.server.password})
db_pass = db.server.password ? "--password $db.server.password" : ""

'''
mysqldump '!{db.server.database}' \
--host '!{db.server.host}' \
--port '!{db.server.port}' \
--user '!{db.server.user}' \
!{db_pass} \
| gzip > !{output_file}
'''

stub:
output_file = "${db.species}.sql.gz"
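For reference, a minimal Groovy sketch (not part of the diff) of how the new release-directory selection resolves; the db map and the release value are hypothetical stand-ins for the pipeline's database object:

// Hypothetical stand-in for the pipeline's db object (illustration only).
def db = [species: 'homo_sapiens', release: '']

// Internal/unformatted db name: no release version recorded.
def release_dir = db.release ? "build_${db.release}".toString() : 'unreleased'
assert release_dir == 'unreleased'

// Formatted db name carrying a release version.
db.release = '110'
release_dir = db.release ? "build_${db.release}".toString() : 'unreleased'
assert release_dir == 'build_110'

The same check is duplicated in publish_output_dump.nf further down, so the SQL dump (coredb/) and the metadata files end up under the same build_<release> or unreleased prefix.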
pipelines/nextflow/modules/download/datasets_genome_meta_from_acc.nf (renamed from pipelines/nextflow/modules/genome_metadata/datasets_metadata.nf)
@@ -13,10 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

process DATASETS_METADATA {
process DOWNLOAD_GENOME_META_FROM_ACC {
tag "$accession"
label 'local'
label 'cached'
label 'datasets_container'

input:
val(accession)
@@ -25,14 +26,24 @@ process DATASETS_METADATA {
tuple val(accession), path("ncbi_meta.json")

shell:
output = "ncbi_meta.json"
'''
datasets summary genome accession !{accession} > ncbi_meta.json
echo "Calling datasets-cli.... datasets 'summary' 'genome' 'accession' [!{accession}]'"
# Pipe datasets to jq instead of '--as-json-lines' to
# obtain a total_count of reports returned.
datasets summary genome accession !{accession} | jq '.' > !{output}
if [ "$?" -ne 0 ]; then
echo "Invalid or unsupported assembly accession: !{accession}"
exit 1
elif [[ $(jq -r '.total_count' ncbi_meta.json) -eq 0 ]]; then
echo "No metadata returned for !{accession}"
exit 1
fi
# Check if it should maybe be using RefSeq?
if [[ $(jq '.total_count' !{output}) -eq 0 ]] && [[ !{accession} =~ "GCA_" ]]; then
accession=$(echo !{accession} | sed 's/^GCA_/GCF_/')
echo "Trying again with RefSeq accession: $accession"
datasets summary genome accession !{accession} | jq '.' > !{output}
fi
'''

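As an aside, a minimal bash sketch (not part of the diff) of the GenBank-to-RefSeq fallback used above; it assumes the NCBI datasets CLI and jq are on PATH, and the accession is a placeholder borrowed from the stub further down:

#!/usr/bin/env bash
# Illustration only: re-query with the RefSeq (GCF_) accession when the
# GenBank (GCA_) record returns no reports.
set -euo pipefail

acc="GCA_015245375.1"    # placeholder accession
datasets summary genome accession "$acc" | jq '.' > ncbi_meta.json

if [[ $(jq -r '.total_count' ncbi_meta.json) -eq 0 && "$acc" == GCA_* ]]; then
    acc="${acc/GCA_/GCF_}"
    echo "Trying again with RefSeq accession: $acc"
    datasets summary genome accession "$acc" | jq '.' > ncbi_meta.json
fi

Piping through jq rather than running datasets with --as-json-lines keeps the top-level total_count field available for this check, as the comment above notes.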
pipelines/nextflow/modules/download/datasets_genome_meta_from_db.nf (new file)
@@ -0,0 +1,59 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


process DOWNLOAD_GENOME_META_FROM_DB {
tag "${db.species}"
label 'local'
label 'cached'
label 'datasets_container'

input:
tuple val(db), val(accession)

output:
tuple val(db), path("ncbi_stats.json")

shell:
output = "ncbi_stats.json"
password_arg = db.server.password ? "--password $db.server.password" : ""
'''
echo "Calling datasets-cli.... datasets 'summary' 'genome' 'accession' [!{accession}]'"
# Pipe datasets to jq instead of '--as-json-lines' to
# obtain a total_count of reports returned.
datasets summary genome accession !{accession} | jq '.' > !{output}
if [ "$?" -ne 0 ]; then
echo "Invalid or unsupported assembly accession: !{accession}"
exit 1
fi
# Check if it should maybe be using RefSeq?
if [[ $(jq '.total_count' !{output}) -eq 0 ]] && [[ !{accession} =~ "GCA_" ]]; then
accession=$(echo !{accession} | sed 's/^GCA_/GCF_/')
echo "Trying again with RefSeq accession: $accession"
datasets summary genome accession !{accession} | jq '.' > !{output}
fi
'''

stub:
output_file = "ncbi_stats.json"
dump_dir = "$workflow.projectDir/../../../../data/test/pipelines/dumper/dump_files"
dump_file = "downloaded_ncbi_stats.json"
"""
cp $dump_dir/$dump_file $output_file
"""
}
12 changes: 11 additions & 1 deletion pipelines/nextflow/modules/files/publish_output_dump.nf
@@ -17,7 +17,7 @@
process PUBLISH_DIR {
tag "${db.species}"
label 'default'
publishDir "$out_dir/build_$db.release/metadata/$db.division/$db.species", mode: 'copy'
publishDir "$out_dir/$release_dir/metadata/$db.division/$db.species", mode: 'copy'
time '5min'

input:
@@ -26,7 +26,17 @@ process PUBLISH_DIR {

output:
tuple val(db), path(data_dir, includeInputs: true)

script:

// check if the core DB had an expected release version or not (internal/unformatted db name)
if ( "${db.release}".isEmpty() ) {
release_dir = "unreleased"
}
else{
release_dir = "build_${db.release}"
}

"""
echo "Just copy over the finished files"
"""
pipelines/nextflow/modules/genome_metadata/core_to_asm_meta.nf (renamed from pipelines/nextflow/modules/genome_metadata/dump_ncbi_stats.nf)
@@ -14,19 +14,17 @@
// limitations under the License.


process DUMP_NCBI_STATS {
process CORE_TO_ASM_META {
tag "${db.species}"
label 'local'
label 'cached'

input:
val db

output:
tuple val(db), path("ncbi_stats.json")
tuple val(db), env(accession)

shell:
output = "ncbi_stats.json"
password_arg = db.server.password ? "--password $db.server.password" : ""
'''
function get_meta_value {
@@ -41,31 +39,20 @@ process DUMP_NCBI_STATS {
-N -e "SELECT meta_value FROM meta WHERE meta_key='$meta_key'"
}
touch !{output}
# Get the INSDC accession to use
accession=$(get_meta_value "assembly.accession")
provider=$(get_meta_value "assembly.provider_url" | sed -r 's%^.+/%%g')
if [ $provider == "refseq" ]; then
accession=$(echo $accession | sed 's/^GCA_/GCF_/')
fi
echo "Provider is $provider"
echo "Accession is $accession"
datasets summary genome accession $accession | jq '.' > !{output}
# Check if it should maybe be using RefSeq?
if [ "$(jq '.total_count' !{output})" == "0" ]; then
accession=$(echo $accession | sed 's/^GCA_/GCF_/')
echo "Trying again with accession $accession"
datasets summary genome accession $accession | jq '.' > !{output}
fi
echo -e -n "Provider is $provider\nAccession is $accession\n"
'''


stub:
output_file = "ncbi_stats.json"
dump_dir = "$workflow.projectDir/../../../../data/test/pipelines/dumper/dump_files"
dump_file = "downloaded_ncbi_stats.json"
"""
cp $dump_dir/$dump_file $output_file
accession="GCA_015245375.1"
echo -e -n "Provider is The University of Georgia\nAccession is GCA_015245375.1\n"
"""
}
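For clarity, a minimal Nextflow sketch (not part of the diff) of the env(accession) output qualifier used above, i.e. how the accession derived in the shell block is handed back to the workflow; the process name and value are placeholders:

// Illustration only: an env output captures a variable set inside the task script.
process GET_ACCESSION_EXAMPLE {
    output:
    env(accession)

    shell:
    '''
    accession="GCA_015245375.1"    # placeholder value
    '''
}

workflow {
    GET_ACCESSION_EXAMPLE().view()    // emits: GCA_015245375.1
}

Emitting tuple val(db), env(accession) therefore pairs each core database with its INSDC accession, which is what DOWNLOAD_GENOME_META_FROM_DB consumes in the dump_files subworkflow below.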
19 changes: 10 additions & 9 deletions pipelines/nextflow/subworkflows/dump_files/main.nf
@@ -13,21 +13,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.

include { DUMP_SEQ_REGIONS } from '../../modules/seq_region/dump_seq_regions.nf'
include { DOWNLOAD_GENOME_META_FROM_DB } from '../../modules/download/datasets_genome_meta_from_db.nf'
include { DUMP_AGP } from '../../modules/seq_region/dump_agp.nf'
include { DUMP_SEQ_ATTRIB } from '../../modules/seq_region/dump_seq_attrib.nf'
include { DUMP_FASTA_DNA } from '../../modules/fasta/dump_fasta_dna.nf'
include { DUMP_FASTA_PEPTIDES } from '../../modules/fasta/dump_fasta_peptides.nf'
include { DUMP_GFF3 } from '../../modules/gff3/dump_gff3.nf'
include { DUMP_ANNOTATION } from '../../modules/annotation/dump_annotation.nf'
include { DUMP_EVENTS } from '../../modules/events/dump_events.nf'
include { DUMP_FASTA_DNA } from '../../modules/fasta/dump_fasta_dna.nf'
include { DUMP_FASTA_PEPTIDES } from '../../modules/fasta/dump_fasta_peptides.nf'
include { DUMP_GENOME_META } from '../../modules/genome_metadata/dump_genome_meta.nf'
include { DUMP_GENOME_STATS } from '../../modules/genome_stats/dump_genome_stats.nf'
include { DUMP_GFF3 } from '../../modules/gff3/dump_gff3.nf'
include { DUMP_SEQ_ATTRIB } from '../../modules/seq_region/dump_seq_attrib.nf'
include { DUMP_SEQ_REGIONS } from '../../modules/seq_region/dump_seq_regions.nf'
include { CHECK_INTEGRITY } from '../../modules/manifest/integrity.nf'
include { COMPARE_GENOME_STATS } from '../../modules/genome_stats/compare_genome_stats.nf'
include { DUMP_NCBI_STATS } from '../../modules/genome_metadata/dump_ncbi_stats.nf'

include { CORE_TO_ASM_META } from '../../modules/genome_metadata/core_to_asm_meta.nf'
include { MANIFEST } from '../../modules/manifest/manifest_maker.nf'
include { CHECK_INTEGRITY } from '../../modules/manifest/integrity.nf'
include { PUBLISH_DIR } from '../../modules/files/publish_output_dump.nf'

workflow DUMP_FILES {
@@ -99,7 +99,8 @@ workflow DUMP_FILES {
// Genome stats
if ("stats" in selection) {
genome_stats = DUMP_GENOME_STATS(db)
ncbi_stats = DUMP_NCBI_STATS(db)
assembly_acc_meta = CORE_TO_ASM_META(db)
ncbi_stats = DOWNLOAD_GENOME_META_FROM_DB(assembly_acc_meta)
stats = ncbi_stats.join(genome_stats)
stats_files = COMPARE_GENOME_STATS(stats).transpose()
db_files = db_files.mix(stats_files)
4 changes: 2 additions & 2 deletions pipelines/nextflow/workflows/genome_prepare/main.nf
@@ -32,7 +32,7 @@ if (params.brc_mode) {
include { GENOME_PREPARE } from '../../subworkflows/genome_prepare/main.nf'
// Import module
include { PREPARE_GENOME_METADATA } from '../../modules/genome_metadata/prepare_genome_metadata.nf'
include { DATASETS_METADATA } from '../../modules/genome_metadata/datasets_metadata.nf'
include { DOWNLOAD_GENOME_META_FROM_ACC } from '../../modules/download/datasets_genome_meta_from_acc.nf'
include { ACCESSION_METADATA } from '../../modules/genome_metadata/accession_metadata.nf'
// Utilities
include { read_json } from '../../modules/utils/utils.nf'
@@ -70,7 +70,7 @@ workflow {
ch_genome_json = Channel.fromPath("${params.input_dir}/*.json", checkIfExists: true)
accession_meta = ACCESSION_METADATA(ch_genome_json)
accession_val = accession_meta.map{ accession, meta_file -> accession }
dataset_report = DATASETS_METADATA(accession_val)
dataset_report = DOWNLOAD_GENOME_META_FROM_ACC(accession_val)
PREPARE_GENOME_METADATA(accession_meta.join(dataset_report))

PREPARE_GENOME_METADATA.out.genomic_dataset
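For context, a minimal Nextflow sketch (not part of the diff) of how the join above pairs the accession metadata with the downloaded dataset report on their shared accession key; the values are placeholders:

// Illustration only: join matches tuples on their first element (the accession).
workflow {
    accession_meta = Channel.of(['GCA_015245375.1', 'genome_meta.json'])
    dataset_report = Channel.of(['GCA_015245375.1', 'ncbi_meta.json'])

    accession_meta.join(dataset_report).view()
    // prints: [GCA_015245375.1, genome_meta.json, ncbi_meta.json]
}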
3 changes: 3 additions & 0 deletions pipelines/nextflow/workflows/nextflow.config
@@ -127,4 +127,7 @@ process {
withLabel: 'ensembl_scripts_container' {
container = "matthieubarba/ensembl-scripts:0.8"
}
withLabel: 'datasets_container' {
container = "ensemblorg/datasets-cli:latest"
}
}
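Finally, a minimal sketch (not part of the diff) of how a process opts into the new image via that label selector; the process name is hypothetical and assumes the run profile has Docker or Singularity enabled:

// Illustration only: any process carrying this label runs inside the datasets-cli image.
process DATASETS_VERSION_EXAMPLE {
    label 'datasets_container'

    output:
    stdout

    script:
    """
    datasets --version
    """
}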
