Merge pull request #412 from Ensembl/lcampbell/dump_container_tweak
Update/Fixes to container Dumper pipeline
ens-LCampbell authored Jul 31, 2024
2 parents e61721a + d73d791 commit ac54171
Showing 8 changed files with 128 additions and 50 deletions.
35 changes: 21 additions & 14 deletions pipelines/nextflow/modules/database/dump_db.nf
@@ -14,9 +14,9 @@
// limitations under the License.

process DUMP_DB {
publishDir "$out_dir/build_$db.release/coredb/$db.division", mode: 'copy'
tag "$db.species"
label "variable_2_8_32"
publishDir "$out_dir/$release_dir/coredb/$db.division", mode: 'copy'
maxForks params.max_database_forks

input:
@@ -26,21 +26,28 @@ process DUMP_DB {
output:
path "*.sql.gz"

script:
shell:
output_file = "${db.species}.sql.gz"
"""
db_pass=""
if [ "${db.server.password}" != "" ]; then
db_pass="--password '${db.server.password}'"
fi

mysqldump '${db.server.database}' \
--host '${db.server.host}' \
--port '${db.server.port}' \
--user '${db.server.user}' \
\$db_pass \
| gzip > $output_file
"""
// check if the core DB had an expected release version or not (internal/unformatted db name)
if ( "${db.release}".isEmpty() ) {
release_dir = "unreleased"
}
else{
release_dir = "build_${db.release}"
}

// formatted_db_pass = validate_db_password(${db.server.password})
db_pass = db.server.password ? "--password $db.server.password" : ""

'''
mysqldump '!{db.server.database}' \
--host '!{db.server.host}' \
--port '!{db.server.port}' \
--user '!{db.server.user}' \
!{db_pass} \
| gzip > !{output_file}
'''

stub:
output_file = "${db.species}.sql.gz"
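For reference, a minimal Groovy sketch (not part of the diff) of how the new release-directory selection resolves; the db map and the release value are hypothetical stand-ins for the pipeline's database object:

// Hypothetical stand-in for the pipeline's db object (illustration only).
def db = [species: 'homo_sapiens', release: '']

// Internal/unformatted db name: no release version recorded.
def release_dir = db.release ? "build_${db.release}".toString() : 'unreleased'
assert release_dir == 'unreleased'

// Formatted db name carrying a release version.
db.release = '110'
release_dir = db.release ? "build_${db.release}".toString() : 'unreleased'
assert release_dir == 'build_110'

The same check is duplicated in publish_output_dump.nf further down, so the SQL dump (coredb/) and the metadata files end up under the same build_<release> or unreleased prefix.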
pipelines/nextflow/modules/download/datasets_genome_meta_from_acc.nf (renamed from pipelines/nextflow/modules/genome_metadata/datasets_metadata.nf)
@@ -13,10 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

process DATASETS_METADATA {
process DOWNLOAD_GENOME_META_FROM_ACC {
tag "$accession"
label 'local'
label 'cached'
label 'datasets_container'

input:
val(accession)
@@ -25,14 +26,24 @@ process DATASETS_METADATA {
tuple val(accession), path("ncbi_meta.json")

shell:
output = "ncbi_meta.json"
'''
datasets summary genome accession !{accession} > ncbi_meta.json
echo "Calling datasets-cli.... datasets 'summary' 'genome' 'accession' [!{accession}]'"
# Pipe datasets to jq instead of '--as-json-lines' to
# obtain a total_count of reports returned.
datasets summary genome accession !{accession} | jq '.' > !{output}
if [ "$?" -ne 0 ]; then
echo "Invalid or unsupported assembly accession: !{accession}"
exit 1
elif [[ $(jq -r '.total_count' ncbi_meta.json) -eq 0 ]]; then
echo "No metadata returned for !{accession}"
exit 1
fi
# Check if it should maybe be using RefSeq?
if [[ $(jq '.total_count' !{output}) -eq 0 ]] && [[ !{accession} =~ "GCA_" ]]; then
accession=$(echo !{accession} | sed 's/^GCA_/GCF_/')
echo "Trying again with RefSeq accession: $accession"
datasets summary genome accession !{accession} | jq '.' > !{output}
fi
'''

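As an aside, a minimal bash sketch (not part of the diff) of the GenBank-to-RefSeq fallback used above; it assumes the NCBI datasets CLI and jq are on PATH, and the accession is a placeholder borrowed from the stub further down:

#!/usr/bin/env bash
# Illustration only: re-query with the RefSeq (GCF_) accession when the
# GenBank (GCA_) record returns no reports.
set -euo pipefail

acc="GCA_015245375.1"    # placeholder accession
datasets summary genome accession "$acc" | jq '.' > ncbi_meta.json

if [[ $(jq -r '.total_count' ncbi_meta.json) -eq 0 && "$acc" == GCA_* ]]; then
    acc="${acc/GCA_/GCF_}"
    echo "Trying again with RefSeq accession: $acc"
    datasets summary genome accession "$acc" | jq '.' > ncbi_meta.json
fi

Piping through jq rather than running datasets with --as-json-lines keeps the top-level total_count field available for this check, as the comment above notes.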
pipelines/nextflow/modules/download/datasets_genome_meta_from_db.nf (new file)
@@ -0,0 +1,59 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


process DOWNLOAD_GENOME_META_FROM_DB {
tag "${db.species}"
label 'local'
label 'cached'
label 'datasets_container'

input:
tuple val(db), val(accession)

output:
tuple val(db), path("ncbi_stats.json")

shell:
output = "ncbi_stats.json"
password_arg = db.server.password ? "--password $db.server.password" : ""
'''
echo "Calling datasets-cli.... datasets 'summary' 'genome' 'accession' [!{accession}]'"
# Pipe datasets to jq instead of '--as-json-lines' to
# obtain a total_count of reports returned.
datasets summary genome accession !{accession} | jq '.' > !{output}
if [ "$?" -ne 0 ]; then
echo "Invalid or unsupported assembly accession: !{accession}"
exit 1
fi
# Check if it should maybe be using RefSeq?
if [[ $(jq '.total_count' !{output}) -eq 0 ]] && [[ !{accession} =~ "GCA_" ]]; then
accession=$(echo !{accession} | sed 's/^GCA_/GCF_/')
echo "Trying again with RefSeq accession: $accession"
datasets summary genome accession !{accession} | jq '.' > !{output}
fi
'''

stub:
output_file = "ncbi_stats.json"
dump_dir = "$workflow.projectDir/../../../../data/test/pipelines/dumper/dump_files"
dump_file = "downloaded_ncbi_stats.json"
"""
cp $dump_dir/$dump_file $output_file
"""
}
12 changes: 11 additions & 1 deletion pipelines/nextflow/modules/files/publish_output_dump.nf
@@ -17,7 +17,7 @@
process PUBLISH_DIR {
tag "${db.species}"
label 'default'
publishDir "$out_dir/build_$db.release/metadata/$db.division/$db.species", mode: 'copy'
publishDir "$out_dir/$release_dir/metadata/$db.division/$db.species", mode: 'copy'
time '5min'

input:
@@ -26,7 +26,17 @@ process PUBLISH_DIR {

output:
tuple val(db), path(data_dir, includeInputs: true)

script:

// check if the core DB had an expected release version or not (internal/unformatted db name)
if ( "${db.release}".isEmpty() ) {
release_dir = "unreleased"
}
else{
release_dir = "build_${db.release}"
}

"""
echo "Just copy over the finished files"
"""
pipelines/nextflow/modules/genome_metadata/core_to_asm_meta.nf (renamed from pipelines/nextflow/modules/genome_metadata/dump_ncbi_stats.nf)
@@ -14,19 +14,17 @@
// limitations under the License.


process DUMP_NCBI_STATS {
process CORE_TO_ASM_META {
tag "${db.species}"
label 'local'
label 'cached'

input:
val db

output:
tuple val(db), path("ncbi_stats.json")
tuple val(db), env(accession)

shell:
output = "ncbi_stats.json"
password_arg = db.server.password ? "--password $db.server.password" : ""
'''
function get_meta_value {
@@ -41,31 +39,20 @@ process DUMP_NCBI_STATS {
-N -e "SELECT meta_value FROM meta WHERE meta_key='$meta_key'"
}
touch !{output}
# Get the INSDC accession to use
accession=$(get_meta_value "assembly.accession")
provider=$(get_meta_value "assembly.provider_url" | sed -r 's%^.+/%%g')
if [ $provider == "refseq" ]; then
accession=$(echo $accession | sed 's/^GCA_/GCF_/')
fi
echo "Provider is $provider"
echo "Accession is $accession"
datasets summary genome accession $accession | jq '.' > !{output}
# Check if it should maybe be using RefSeq?
if [ "$(jq '.total_count' !{output})" == "0" ]; then
accession=$(echo $accession | sed 's/^GCA_/GCF_/')
echo "Trying again with accession $accession"
datasets summary genome accession $accession | jq '.' > !{output}
fi
echo -e -n "Provider is $provider\nAccession is $accession\n"
'''


stub:
output_file = "ncbi_stats.json"
dump_dir = "$workflow.projectDir/../../../../data/test/pipelines/dumper/dump_files"
dump_file = "downloaded_ncbi_stats.json"
"""
cp $dump_dir/$dump_file $output_file
accession="GCA_015245375.1"
echo -e -n "Provider is The University of Georgia\nAccession is GCA_015245375.1\n"
"""
}
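For clarity, a minimal Nextflow sketch (not part of the diff) of the env(accession) output qualifier used above, i.e. how the accession derived in the shell block is handed back to the workflow; the process name and value are placeholders:

// Illustration only: an env output captures a variable set inside the task script.
process GET_ACCESSION_EXAMPLE {
    output:
    env(accession)

    shell:
    '''
    accession="GCA_015245375.1"    # placeholder value
    '''
}

workflow {
    GET_ACCESSION_EXAMPLE().view()    // emits: GCA_015245375.1
}

Emitting tuple val(db), env(accession) therefore pairs each core database with its INSDC accession, which is what DOWNLOAD_GENOME_META_FROM_DB consumes in the dump_files subworkflow below.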
19 changes: 10 additions & 9 deletions pipelines/nextflow/subworkflows/dump_files/main.nf
@@ -13,21 +13,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.

include { DUMP_SEQ_REGIONS } from '../../modules/seq_region/dump_seq_regions.nf'
include { DOWNLOAD_GENOME_META_FROM_DB } from '../../modules/download/datasets_genome_meta_from_db.nf'
include { DUMP_AGP } from '../../modules/seq_region/dump_agp.nf'
include { DUMP_SEQ_ATTRIB } from '../../modules/seq_region/dump_seq_attrib.nf'
include { DUMP_FASTA_DNA } from '../../modules/fasta/dump_fasta_dna.nf'
include { DUMP_FASTA_PEPTIDES } from '../../modules/fasta/dump_fasta_peptides.nf'
include { DUMP_GFF3 } from '../../modules/gff3/dump_gff3.nf'
include { DUMP_ANNOTATION } from '../../modules/annotation/dump_annotation.nf'
include { DUMP_EVENTS } from '../../modules/events/dump_events.nf'
include { DUMP_FASTA_DNA } from '../../modules/fasta/dump_fasta_dna.nf'
include { DUMP_FASTA_PEPTIDES } from '../../modules/fasta/dump_fasta_peptides.nf'
include { DUMP_GENOME_META } from '../../modules/genome_metadata/dump_genome_meta.nf'
include { DUMP_GENOME_STATS } from '../../modules/genome_stats/dump_genome_stats.nf'
include { DUMP_GFF3 } from '../../modules/gff3/dump_gff3.nf'
include { DUMP_SEQ_ATTRIB } from '../../modules/seq_region/dump_seq_attrib.nf'
include { DUMP_SEQ_REGIONS } from '../../modules/seq_region/dump_seq_regions.nf'
include { CHECK_INTEGRITY } from '../../modules/manifest/integrity.nf'
include { COMPARE_GENOME_STATS } from '../../modules/genome_stats/compare_genome_stats.nf'
include { DUMP_NCBI_STATS } from '../../modules/genome_metadata/dump_ncbi_stats.nf'

include { CORE_TO_ASM_META } from '../../modules/genome_metadata/core_to_asm_meta.nf'
include { MANIFEST } from '../../modules/manifest/manifest_maker.nf'
include { CHECK_INTEGRITY } from '../../modules/manifest/integrity.nf'
include { PUBLISH_DIR } from '../../modules/files/publish_output_dump.nf'

workflow DUMP_FILES {
@@ -99,7 +99,8 @@ workflow DUMP_FILES {
// Genome stats
if ("stats" in selection) {
genome_stats = DUMP_GENOME_STATS(db)
ncbi_stats = DUMP_NCBI_STATS(db)
assembly_acc_meta = CORE_TO_ASM_META(db)
ncbi_stats = DOWNLOAD_GENOME_META_FROM_DB(assembly_acc_meta)
stats = ncbi_stats.join(genome_stats)
stats_files = COMPARE_GENOME_STATS(stats).transpose()
db_files = db_files.mix(stats_files)
4 changes: 2 additions & 2 deletions pipelines/nextflow/workflows/genome_prepare/main.nf
@@ -32,7 +32,7 @@ if (params.brc_mode) {
include { GENOME_PREPARE } from '../../subworkflows/genome_prepare/main.nf'
// Import module
include { PREPARE_GENOME_METADATA } from '../../modules/genome_metadata/prepare_genome_metadata.nf'
include { DATASETS_METADATA } from '../../modules/genome_metadata/datasets_metadata.nf'
include { DOWNLOAD_GENOME_META_FROM_ACC } from '../../modules/download/datasets_genome_meta_from_acc.nf'
include { ACCESSION_METADATA } from '../../modules/genome_metadata/accession_metadata.nf'
// Utilities
include { read_json } from '../../modules/utils/utils.nf'
@@ -70,7 +70,7 @@ workflow {
ch_genome_json = Channel.fromPath("${params.input_dir}/*.json", checkIfExists: true)
accession_meta = ACCESSION_METADATA(ch_genome_json)
accession_val = accession_meta.map{ accession, meta_file -> accession }
dataset_report = DATASETS_METADATA(accession_val)
dataset_report = DOWNLOAD_GENOME_META_FROM_ACC(accession_val)
PREPARE_GENOME_METADATA(accession_meta.join(dataset_report))

PREPARE_GENOME_METADATA.out.genomic_dataset
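For context, a minimal Nextflow sketch (not part of the diff) of how the join above pairs the accession metadata with the downloaded dataset report on their shared accession key; the values are placeholders:

// Illustration only: join matches tuples on their first element (the accession).
workflow {
    accession_meta = Channel.of(['GCA_015245375.1', 'genome_meta.json'])
    dataset_report = Channel.of(['GCA_015245375.1', 'ncbi_meta.json'])

    accession_meta.join(dataset_report).view()
    // prints: [GCA_015245375.1, genome_meta.json, ncbi_meta.json]
}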
3 changes: 3 additions & 0 deletions pipelines/nextflow/workflows/nextflow.config
@@ -127,4 +127,7 @@ process {
withLabel: 'ensembl_scripts_container' {
container = "matthieubarba/ensembl-scripts:0.8"
}
withLabel: 'datasets_container' {
container = "ensemblorg/datasets-cli:latest"
}
}
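Finally, a minimal sketch (not part of the diff) of how a process opts into the new image via that label selector; the process name is hypothetical and assumes the run profile has Docker or Singularity enabled:

// Illustration only: any process carrying this label runs inside the datasets-cli image.
process DATASETS_VERSION_EXAMPLE {
    label 'datasets_container'

    output:
    stdout

    script:
    """
    datasets --version
    """
}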
