Calculate output file md5 hashes and file sizes. #20

Merged · 8 commits · Sep 19, 2024
23 changes: 23 additions & 0 deletions conf/output_directories.config
@@ -0,0 +1,23 @@

params {
output_directories = [
panorama: "${params.result_dir}/panorama",
aws: "${params.result_dir}/aws",
msconvert: "${params.result_dir}/msconvert",
diann: "${params.result_dir}/diann",
qc_report: "${params.result_dir}/qc_report",
qc_report_tables: "${params.result_dir}/qc_report/tables",
gene_reports: "${params.result_dir}/gene_reports",
encyclopedia: [
convert_blib: "${params.result_dir}/encyclopedia/convert-blib",
search_file: "${params.result_dir}/encyclopedia/search-file",
create_elib: "${params.result_dir}/encyclopedia/create-elib"
],
skyline: [
add_lib: "${params.result_dir}/skyline/add-lib",
import_spectra: "${params.result_dir}/skyline/import-spectra",
minimize: "${params.result_dir}/skyline/minimize",
reports: "${params.result_dir}/skyline/reports"
]
]
}
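
Centralizing every publish path in a single `params.output_directories` map gives each process one authoritative key to reference instead of re-interpolating `${params.result_dir}` strings. The module changes below are exactly that substitution; as a minimal before/after sketch (both lines taken from this diff):

// Before: path assembled inline in every process
publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'

// After: one shared definition, referenced by key
publishDir params.output_directories.diann, failOnError: true, mode: 'copy'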
23 changes: 23 additions & 0 deletions main.nf
@@ -16,6 +16,7 @@ include { panorama_upload_results } from "./workflows/panorama_upload"
include { panorama_upload_mzmls } from "./workflows/panorama_upload"
include { save_run_details } from "./workflows/save_run_details"
include { get_pdc_files } from "./workflows/get_pdc_files"
include { combine_file_hashes } from "./workflows/combine_file_hashes"

// modules
include { ENCYCLOPEDIA_BLIB_TO_DLIB } from "./modules/encyclopedia"
@@ -197,12 +198,14 @@ workflow {
)

quant_library = encyclopeda_export_elib.out.elib
spec_lib_hashes = encyclopeda_export_elib.out.output_file_stats

all_elib_ch = encyclopeda_export_elib.out.elib.concat(
encyclopeda_export_elib.out.individual_elibs
)
} else {
quant_library = spectral_library_to_use
spec_lib_hashes = Channel.empty()
all_mzml_ch = wide_mzml_ch
all_elib_ch = Channel.empty()
}
@@ -219,6 +222,7 @@
)

encyclopedia_version = encyclopedia_quant.out.encyclopedia_version
search_file_stats = encyclopedia_quant.out.output_file_stats.concat(spec_lib_hashes)

final_elib = encyclopedia_quant.out.elib
all_elib_ch = all_elib_ch.concat(
@@ -284,6 +288,7 @@
)

diann_version = diann_search.out.diann_version
search_file_stats = diann_search.out.output_file_stats

// create compatible spectral library for Skyline, if needed
if(!params.skyline.skip) {
@@ -329,11 +334,17 @@
}

final_skyline_file = skyline_import.out.skyline_results
final_skyline_hash = skyline_import.out.skyline_results_hash

// generate QC report
if(!params.qc_report.skip) {
generate_dia_qc_report(final_skyline_file, replicate_metadata)
dia_qc_version = generate_dia_qc_report.out.dia_qc_version
qc_report_files = generate_dia_qc_report.out.qc_reports.concat(
generate_dia_qc_report.out.qc_report_qmd,
generate_dia_qc_report.out.qc_report_db,
generate_dia_qc_report.out.qc_tables
)

// Export PDC gene tables
if(params.pdc.gene_level_data != null) {
@@ -346,6 +357,8 @@
}
} else {
dia_qc_version = Channel.empty()
qc_report_files = Channel.empty()
gene_reports = Channel.empty()
}

// run reports if requested
@@ -367,6 +380,7 @@
final_skyline_file = Channel.empty()
qc_report_files = Channel.empty()
proteowizard_version = Channel.empty()
final_skyline_hash = Channel.empty()
dia_qc_version = Channel.empty()
gene_reports = Channel.empty()
}
@@ -382,6 +396,15 @@
save_run_details(input_files.collect(), version_files.collect())
run_details_file = save_run_details.out.run_details

combine_file_hashes(fasta, spectral_library,
search_file_stats,
final_skyline_file,
final_skyline_hash,
skyline_reports_ch,
qc_report_files,
gene_reports,
run_details_file)

// upload results to Panorama
if(params.panorama.upload) {

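
A note on the channel plumbing above: every conditional branch binds the same names (`search_file_stats`, `final_skyline_hash`, `qc_report_files`, `gene_reports`), falling back to `Channel.empty()` when a step is skipped, so the single `combine_file_hashes(...)` call can consume them unconditionally. A condensed sketch of the pattern, simplified from the workflow body above:

if(!params.qc_report.skip) {
    generate_dia_qc_report(final_skyline_file, replicate_metadata)
    qc_report_files = generate_dia_qc_report.out.qc_reports
} else {
    // A skipped step still binds the name, so the downstream call never breaks.
    qc_report_files = Channel.empty()
}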
4 changes: 2 additions & 2 deletions modules/aws.nf
@@ -36,7 +36,7 @@ process BUILD_AWS_SECRETS {
label 'process_low_constant'
secret 'PANORAMA_API_KEY'
executor 'local' // always run this locally
publishDir "${params.result_dir}/aws", failOnError: true, mode: 'copy'
publishDir params.output_directories.aws, failOnError: true, mode: 'copy'
cache false // never cache

input:
@@ -123,4 +123,4 @@ process BUILD_AWS_SECRETS {
// touch aws-destroy-secrets.stderr
// touch aws-destroy-secrets.stdout
// """
// }
// }
24 changes: 21 additions & 3 deletions modules/diann.nf
@@ -1,5 +1,5 @@
process DIANN_SEARCH {
publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
label 'process_high_constant'
container params.images.diann

@@ -16,6 +16,7 @@ process DIANN_SEARCH {
path("report.tsv"), emit: precursor_tsv
path("*.quant"), emit: quant_files
path("diann_version.txt"), emit: version
path("output_file_stats.txt"), emit: output_file_stats

script:

@@ -37,18 +38,26 @@
mv -v lib.tsv.speclib report.tsv.speclib

head -n 1 diann.stdout | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt

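# Build a tab-separated manifest of the outputs: reorder md5sum's "<hash>  <file>"
# into "<file>\t<hash>", pair it with stat's "<file>\t<size>", and join the two
# sorted lists on file name to get "<file>\t<hash>\t<size>" rows.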
md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""

stub:
"""
touch report.tsv.speclib report.tsv stub.quant
touch stub.stderr stub.stdout
diann | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt

md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""
}

process DIANN_SEARCH_LIB_FREE {
publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
label 'process_high_constant'
container params.images.diann

@@ -65,6 +74,7 @@ process DIANN_SEARCH_LIB_FREE {
path("*.quant"), emit: quant_files
path("lib.predicted.speclib"), emit: predicted_speclib
path("diann_version.txt"), emit: version
path("output_file_stats.txt"), emit: output_file_stats

script:

@@ -87,19 +97,27 @@
mv -v lib.tsv.speclib report.tsv.speclib

head -n 1 diann.stdout | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt

md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""

stub:
"""
touch lib.predicted.speclib report.tsv.speclib report.tsv stub.quant
touch stub.stderr stub.stdout
diann | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt

md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""
}


process BLIB_BUILD_LIBRARY {
publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
label 'process_medium'
container params.images.bibliospec

28 changes: 18 additions & 10 deletions modules/encyclopedia.nf
@@ -4,13 +4,12 @@ def exec_java_command(mem) {
}

process ENCYCLOPEDIA_SEARCH_FILE {
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.stderr", failOnError: true, mode: 'copy'
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.stdout", failOnError: true, mode: 'copy'
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.elib", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.dia", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.features.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.encyclopedia.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.encyclopedia.decoy.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.stderr", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.stdout", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.elib", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.features.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.encyclopedia.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.encyclopedia.decoy.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
label 'process_high_constant'
container params.images.encyclopedia

@@ -28,6 +27,7 @@
path("${mzml_file}.features.txt"), emit: features
path("${mzml_file}.encyclopedia.txt"), emit: results_targets
path("${mzml_file}.encyclopedia.decoy.txt"), emit: results_decoys
path("output_file_stats.txt"), emit: output_file_stats


script:
@@ -40,6 +40,10 @@
-percolatorVersion /usr/local/bin/percolator \\
${encyclopedia_params} \\
> >(tee "encyclopedia-${mzml_file.baseName}.stdout") 2> >(tee "encyclopedia-${mzml_file.baseName}.stderr" >&2)

md5sum *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""

stub:
@@ -50,11 +54,15 @@
touch "${mzml_file}.features.txt"
touch "${mzml_file}.encyclopedia.txt"
touch "${mzml_file}.encyclopedia.decoy.txt"

md5sum *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""
}

process ENCYCLOPEDIA_CREATE_ELIB {
publishDir "${params.result_dir}/encyclopedia/create-elib", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.create_elib, failOnError: true, mode: 'copy'
label 'process_memory_high_constant'
container params.images.encyclopedia

@@ -113,7 +121,7 @@ process ENCYCLOPEDIA_CREATE_ELIB {
}

process ENCYCLOPEDIA_BLIB_TO_DLIB {
publishDir "${params.result_dir}/encyclopedia/convert-blib", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.convert_blib, failOnError: true, mode: 'copy'
label 'process_medium'
label 'process_high_memory'
container params.images.encyclopedia
@@ -147,7 +155,7 @@ process ENCYCLOPEDIA_BLIB_TO_DLIB {
}

process ENCYCLOPEDIA_DLIB_TO_TSV {
publishDir "${params.result_dir}/encyclopedia/convert-blib", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.convert_blib, failOnError: true, mode: 'copy'
label 'process_medium'
label 'process_high_memory'
container params.images.encyclopedia3_mriffle
37 changes: 37 additions & 0 deletions modules/file_stats.nf
@@ -0,0 +1,37 @@

process CALCULATE_MD5 {
label 'process_low'
container params.images.ubuntu

input:
path(file_to_check)

output:
tuple val("${file_to_check.name}"), env(md5_sum)

shell:
'''
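# md5sum prints "<hash>  <file>"; keep only the hash field for the output tuple.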
md5_sum=$( md5sum !{file_to_check} | awk '{print $1}' )
'''
}

process WRITE_FILE_STATS {
label 'process_low'
container params.images.ubuntu
publishDir "${params.result_dir}", failOnError: true, mode: 'copy'

input:
val file_stats

output:
path("file_checksums.tsv")

script:
data = file_stats.join('\\n')
"""
text="${data}"

echo -e 'file\\tpath\\tmd5_hash\\tsize' > file_checksums.tsv
echo -e "\$text" >> file_checksums.tsv
"""
}
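
`WRITE_FILE_STATS` assumes each element of `file_stats` is already a complete tab-separated row matching its `file\tpath\tmd5_hash\tsize` header; building those rows presumably happens in the `combine_file_hashes` workflow, which this diff does not show. Under that assumption, the published `file_checksums.tsv` would look something like this (file names and values purely illustrative):

file	path	md5_hash	size
report.tsv	diann/report.tsv	9e107d9d372bb6826bd81d3542a419d6	10342
qc_report.html	qc_report/qc_report.html	e4d909c290d0fb1ca068ffaddf22cbd0	204800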
2 changes: 1 addition & 1 deletion modules/msconvert.nf
@@ -1,6 +1,6 @@
process MSCONVERT {
storeDir "${params.mzml_cache_directory}/${workflow.commitId}/${params.msconvert.do_demultiplex}/${params.msconvert.do_simasspectra}"
publishDir "${params.result_dir}/msconvert", pattern: "*.mzML", failOnError: true, mode: 'copy', enabled: params.msconvert_only && !params.panorama.upload
publishDir params.output_directories.msconvert, pattern: "*.mzML", failOnError: true, mode: 'copy', enabled: params.msconvert_only && !params.panorama.upload
label 'process_medium'
label 'process_high_memory'
label 'error_retry'
Expand Down
18 changes: 9 additions & 9 deletions modules/panorama.nf
@@ -55,7 +55,7 @@ process PANORAMA_GET_RAW_FILE_LIST {
label 'process_low_constant'
label 'error_retry'
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy'
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy'
secret 'PANORAMA_API_KEY'

input:
@@ -93,8 +93,8 @@ process PANORAMA_GET_FILE {
label 'process_low_constant'
label 'error_retry'
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
secret 'PANORAMA_API_KEY'

input:
@@ -169,8 +169,8 @@ process PANORAMA_GET_SKYR_FILE {
label 'process_low_constant'
label 'error_retry'
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
secret 'PANORAMA_API_KEY'

input:
@@ -202,8 +202,8 @@ process UPLOAD_FILE {
label 'error_retry'
maxForks 2
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
secret 'PANORAMA_API_KEY'

input:
@@ -239,8 +239,8 @@ process UPLOAD_FILE {
process IMPORT_SKYLINE {
label 'process_low_constant'
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
secret 'PANORAMA_API_KEY'

input: