From e610ab6f3667bcd26890240c0ec9fb20f5e61756 Mon Sep 17 00:00:00 2001
From: mauraisa
Date: Fri, 13 Sep 2024 15:28:00 -0700
Subject: [PATCH 1/8] Calculate output file stats during DiaNN search.

---
 modules/diann.nf          | 18 ++++++++++++++++++
 workflows/diann_search.nf |  5 ++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/modules/diann.nf b/modules/diann.nf
index 95a7e44..ad00a03 100644
--- a/modules/diann.nf
+++ b/modules/diann.nf
@@ -16,6 +16,7 @@ process DIANN_SEARCH {
         path("report.tsv"), emit: precursor_tsv
         path("*.quant"), emit: quant_files
         path("diann_version.txt"), emit: version
+        path("output_file_stats.txt"), emit: output_file_stats

     script:
@@ -37,6 +38,10 @@ process DIANN_SEARCH {
     mv -v lib.tsv.speclib report.tsv.speclib

     head -n 1 diann.stdout | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt
+
+    md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
+    stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
+    join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
     """

     stub:
     """
     touch report.tsv.speclib report.tsv stub.quant
     touch stub.stderr stub.stdout
     diann | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt
+
+    md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
+    stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
+    join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
     """
 }
@@ -65,6 +74,7 @@ process DIANN_SEARCH_LIB_FREE {
         path("*.quant"), emit: quant_files
         path("lib.predicted.speclib"), emit: predicted_speclib
         path("diann_version.txt"), emit: version
+        path("output_file_stats.txt"), emit: output_file_stats

     script:
@@ -87,6 +97,10 @@ process DIANN_SEARCH_LIB_FREE {
     mv -v lib.tsv.speclib report.tsv.speclib

     head -n 1 diann.stdout | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt
+
+    md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
+    stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
+    join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
     """

     stub:
     """
     touch lib.predicted.speclib report.tsv.speclib report.tsv stub.quant
     touch stub.stderr stub.stdout
     diann | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt
+
+    md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
+    stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
+    join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
     """
 }

diff --git a/workflows/diann_search.nf b/workflows/diann_search.nf
index 2af93ae..64c954c 100644
--- a/workflows/diann_search.nf
+++ b/workflows/diann_search.nf
@@ -4,7 +4,7 @@ include { DIANN_SEARCH } from "../modules/diann"
 include { DIANN_SEARCH_LIB_FREE } from "../modules/diann"

 workflow diann_search {
-    
+
     take:
         ms_file_ch
         fasta
@@ -18,6 +18,7 @@ workflow diann_search {
         stderr
         predicted_speclib
         diann_version
+        output_file_stats

     main:
@@ -30,6 +31,7 @@ workflow diann_search {
             params.diann.params
         )
         diann_version = DIANN_SEARCH.out.version
+        output_file_stats = DIANN_SEARCH.out.output_file_stats

         predicted_speclib = Channel.empty()
     } else {
@@ -41,6 +43,7 @@ workflow diann_search {

         diann_version = DIANN_SEARCH_LIB_FREE.out.version
         predicted_speclib = diann_results.predicted_speclib
+        output_file_stats = DIANN_SEARCH_LIB_FREE.out.output_file_stats
     }

     quant_files = diann_results.quant_files

From 8dba9e1982ef117b6e6c60b212581102a9cd0132 Mon Sep 17 00:00:00 2001
From: mauraisa
Date: Fri, 13 Sep 2024 15:28:49 -0700
Subject: [PATCH 2/8] Don't publish EncyclopeDIA .dia files.

---
 modules/encyclopedia.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/encyclopedia.nf b/modules/encyclopedia.nf
index b9edaaa..a42e791 100644
--- a/modules/encyclopedia.nf
+++ b/modules/encyclopedia.nf
@@ -7,7 +7,6 @@ process ENCYCLOPEDIA_SEARCH_FILE {
     publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.stderr", failOnError: true, mode: 'copy'
     publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.stdout", failOnError: true, mode: 'copy'
     publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.elib", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
-    publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.dia", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
     publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.features.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
     publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.encyclopedia.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
     publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.encyclopedia.decoy.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output

From c775bd1a6618d2b387c4ed2eb11843514777f6bc Mon Sep 17 00:00:00 2001
From: mauraisa
Date: Fri, 13 Sep 2024 16:44:17 -0700
Subject: [PATCH 3/8] Move process publishDir paths to conf/output_directories.config

---
 conf/output_directories.config | 23 +++++++++++++++++++++++
 modules/aws.nf                 |  4 ++--
 modules/diann.nf               |  6 +++---
 modules/encyclopedia.nf        | 18 +++++++++---------
 modules/msconvert.nf           |  2 +-
 modules/panorama.nf            | 18 +++++++++---------
 modules/qc_report.nf           | 16 ++++++++--------
 modules/skyline.nf             | 14 +++++++-------
 nextflow.config                |  3 +++
 9 files changed, 65 insertions(+), 39 deletions(-)
 create mode 100644 conf/output_directories.config

diff --git a/conf/output_directories.config b/conf/output_directories.config
new file mode 100644
index 0000000..57d2270
--- /dev/null
+++ b/conf/output_directories.config
@@ -0,0 +1,23 @@
+
+params {
+    output_directories = [
+        panorama: "${params.result_dir}/panorama",
+        aws: "${params.result_dir}/aws",
+        msconvert: "${params.result_dir}/msconvert",
+        diann: "${params.result_dir}/diann",
+        qc_report: "${params.result_dir}/qc_report",
+        qc_report_tables: "${params.result_dir}/qc_report/tables",
+        gene_reports: "${params.result_dir}/gene_reports",
+        encyclopedia: [
+            convert_blib: "${params.result_dir}/encyclopedia/convert-blib",
+            search_file: "${params.result_dir}/encyclopedia/search-file",
+            create_elib: "${params.result_dir}/encyclopedia/create-elib"
+        ],
+        skyline: [
+            add_lib: "${params.result_dir}/skyline/add-lib",
+            import_spectra: "${params.result_dir}/skyline/import-spectra",
+            minimize: "${params.result_dir}/skyline/minimize",
+            reports: "${params.result_dir}/skyline/reports"
+        ]
+    ]
+}

diff --git a/modules/aws.nf b/modules/aws.nf
index 53509fe..0928586 100644
--- a/modules/aws.nf
+++ b/modules/aws.nf
@@ -36,7 +36,7 @@ process BUILD_AWS_SECRETS {
     label 'process_low_constant'
     secret 'PANORAMA_API_KEY'
     executor 'local' // always run this locally
-    publishDir "${params.result_dir}/aws", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.aws, failOnError: true, mode: 'copy'
     cache false // never cache

     input:
@@ -123,4 +123,4 @@ process BUILD_AWS_SECRETS {
 //     touch aws-destroy-secrets.stderr
 //     touch aws-destroy-secrets.stdout
 //     """
-// }
\ No newline at end of file
+// }

diff --git a/modules/diann.nf b/modules/diann.nf
index ad00a03..a7c8cfe 100644
--- a/modules/diann.nf
+++ b/modules/diann.nf
@@ -1,5 +1,5 @@
 process DIANN_SEARCH {
-    publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
     label 'process_high_constant'
     container params.images.diann
@@ -57,7 +57,7 @@ process DIANN_SEARCH {
 }

 process DIANN_SEARCH_LIB_FREE {
-    publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
     label 'process_high_constant'
     container params.images.diann
@@ -117,7 +117,7 @@ process DIANN_SEARCH_LIB_FREE {


 process BLIB_BUILD_LIBRARY {
-    publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
     label 'process_medium'
     container params.images.bibliospec

diff --git a/modules/encyclopedia.nf b/modules/encyclopedia.nf
index a42e791..06cc187 100644
--- a/modules/encyclopedia.nf
+++ b/modules/encyclopedia.nf
@@ -4,12 +4,12 @@ def exec_java_command(mem) {
 }

 process ENCYCLOPEDIA_SEARCH_FILE {
-    publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.stderr", failOnError: true, mode: 'copy'
-    publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.stdout", failOnError: true, mode: 'copy'
-    publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.elib", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
-    publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.features.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
-    publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.encyclopedia.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
-    publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.encyclopedia.decoy.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
+    publishDir params.output_directories.encyclopedia.search_file, pattern: "*.stderr", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.encyclopedia.search_file, pattern: "*.stdout", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.encyclopedia.search_file, pattern: "*.elib", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
+    publishDir params.output_directories.encyclopedia.search_file, pattern: "*.features.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
+    publishDir params.output_directories.encyclopedia.search_file, pattern: "*.encyclopedia.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
+    publishDir params.output_directories.encyclopedia.search_file, pattern: "*.encyclopedia.decoy.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output

     label 'process_high_constant'
     container params.images.encyclopedia
@@ -53,7 +53,7 @@ process ENCYCLOPEDIA_CREATE_ELIB {
-    publishDir "${params.result_dir}/encyclopedia/create-elib", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.encyclopedia.create_elib, failOnError: true, mode: 'copy'
     label 'process_memory_high_constant'
     container params.images.encyclopedia
@@ -112,7 +112,7 @@ process ENCYCLOPEDIA_BLIB_TO_DLIB {
-    publishDir "${params.result_dir}/encyclopedia/convert-blib", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.encyclopedia.convert_blib, failOnError: true, mode: 'copy'
     label 'process_medium'
     label 'process_high_memory'
     container params.images.encyclopedia
@@ -146,7 +146,7 @@ process ENCYCLOPEDIA_DLIB_TO_TSV {
-    publishDir "${params.result_dir}/encyclopedia/convert-blib", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.encyclopedia.convert_blib, failOnError: true, mode: 'copy'
     label 'process_medium'
     label 'process_high_memory'
     container params.images.encyclopedia3_mriffle

diff --git a/modules/msconvert.nf b/modules/msconvert.nf
index 1861e77..5f2dd27 100644
--- a/modules/msconvert.nf
+++ b/modules/msconvert.nf
@@ -1,6 +1,6 @@
 process MSCONVERT {
     storeDir "${params.mzml_cache_directory}/${workflow.commitId}/${params.msconvert.do_demultiplex}/${params.msconvert.do_simasspectra}"
-    publishDir "${params.result_dir}/msconvert", pattern: "*.mzML", failOnError: true, mode: 'copy', enabled: params.msconvert_only && !params.panorama.upload
+    publishDir params.output_directories.msconvert, pattern: "*.mzML", failOnError: true, mode: 'copy', enabled: params.msconvert_only && !params.panorama.upload
     label 'process_medium'
     label 'process_high_memory'
     label 'error_retry'

diff --git a/modules/panorama.nf b/modules/panorama.nf
index e5a90e3..07b86e5 100644
--- a/modules/panorama.nf
+++ b/modules/panorama.nf
@@ -55,7 +55,7 @@ process PANORAMA_GET_RAW_FILE_LIST {
     label 'process_low_constant'
     label 'error_retry'
     container params.images.panorama_client
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy'
     secret 'PANORAMA_API_KEY'

     input:
@@ -93,8 +93,8 @@ process PANORAMA_GET_FILE {
     label 'process_low_constant'
     label 'error_retry'
     container params.images.panorama_client
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
     secret 'PANORAMA_API_KEY'

     input:
@@ -169,8 +169,8 @@ process PANORAMA_GET_SKYR_FILE {
     label 'process_low_constant'
     label 'error_retry'
     container params.images.panorama_client
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
     secret 'PANORAMA_API_KEY'

     input:
@@ -202,8 +202,8 @@ process UPLOAD_FILE {
     label 'error_retry'
     maxForks 2
     container params.images.panorama_client
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
     secret 'PANORAMA_API_KEY'

     input:
@@ -239,8 +239,8 @@ process IMPORT_SKYLINE {
     label 'process_low_constant'
     container params.images.panorama_client
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
-    publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
+    publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
     secret 'PANORAMA_API_KEY'

     input:

diff --git a/modules/qc_report.nf b/modules/qc_report.nf
index f93c9d8..427aaf4 100644
--- a/modules/qc_report.nf
+++ b/modules/qc_report.nf
@@ -28,7 +28,7 @@ process MAKE_EMPTY_FILE {
 }

 process PARSE_REPORTS {
-    publishDir "${params.result_dir}/qc_report", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.qc_report, failOnError: true, mode: 'copy'
     label 'process_high_memory'
     container params.images.qc_pipeline
@@ -79,9 +79,9 @@ process EXPORT_TABLES {
-    publishDir "${params.result_dir}/qc_report/tables", pattern: '*.tsv', failOnError: true, mode: 'copy'
-    publishDir "${params.result_dir}/qc_report", pattern: '*.stdout', failOnError: true, mode: 'copy'
-    publishDir "${params.result_dir}/qc_report", pattern: '*.stderr', failOnError: true, mode: 'copy'
+    publishDir params.output_directories.qc_report_tables, pattern: '*.tsv', failOnError: true, mode: 'copy'
+    publishDir params.output_directories.qc_report, pattern: '*.stdout', failOnError: true, mode: 'copy'
+    publishDir params.output_directories.qc_report, pattern: '*.stderr', failOnError: true, mode: 'copy'
     label 'process_high_memory'
     container params.images.qc_pipeline
@@ -106,9 +106,9 @@ process RENDER_QC_REPORT {
-    publishDir "${params.result_dir}/qc_report", pattern: 'qc_report.*', failOnError: true, mode: 'copy'
-    publishDir "${params.result_dir}/qc_report", pattern: '*.stdout', failOnError: true, mode: 'copy'
-    publishDir "${params.result_dir}/qc_report", pattern: '*.stderr', failOnError: true, mode: 'copy'
+    publishDir params.output_directories.qc_report, pattern: 'qc_report.*', failOnError: true, mode: 'copy'
+    publishDir params.output_directories.qc_report, pattern: '*.stdout', failOnError: true, mode: 'copy'
+    publishDir params.output_directories.qc_report, pattern: '*.stderr', failOnError: true, mode: 'copy'
     label 'process_high_memory'
     container params.images.qc_pipeline
@@ -136,7 +136,7 @@ process EXPORT_GENE_REPORTS {
-    publishDir "${params.result_dir}/gene_reports", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.gene_reports, failOnError: true, mode: 'copy'
     label 'process_high_memory'
     container params.images.qc_pipeline

diff --git a/modules/skyline.nf b/modules/skyline.nf
index 780b60b..a70b565 100644
--- a/modules/skyline.nf
+++ b/modules/skyline.nf
@@ -4,7 +4,7 @@ def sky_basename(path) {
 }

 process SKYLINE_ADD_LIB {
-    publishDir "${params.result_dir}/skyline/add-lib", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.skyline.add_lib, failOnError: true, mode: 'copy'
     label 'process_medium'
     label 'process_short'
     label 'error_retry'
@@ -86,7 +86,7 @@ process SKYLINE_ADD_LIB {
 }

 process SKYLINE_IMPORT_MZML {
-    publishDir "${params.result_dir}/skyline/import-spectra", pattern: '*.std[oe][ur][tr]', failOnError: true, mode: 'copy'
+    publishDir params.output_directories.skyline.import_spectra, pattern: '*.std[oe][ur][tr]', failOnError: true, mode: 'copy'
     label 'process_medium'
     label 'process_high_memory'
     label 'process_short'
@@ -123,7 +123,7 @@ process SKYLINE_MERGE_RESULTS {
-    publishDir "${params.result_dir}/skyline/import-spectra", enabled: params.replicate_metadata == null && params.pdc.study_id == null, failOnError: true, mode: 'copy'
+    publishDir params.output_directories.skyline.import_spectra, enabled: params.replicate_metadata == null && params.pdc.study_id == null, failOnError: true, mode: 'copy'
     label 'process_high'
     label 'error_retry'
     container params.images.proteowizard
@@ -175,7 +175,7 @@ process ANNOTATION_TSV_TO_CSV {
-    publishDir "${params.result_dir}/skyline/import-spectra", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.skyline.import_spectra, failOnError: true, mode: 'copy'
     label 'process_low'
     label 'error_retry'
     container params.images.qc_pipeline
@@ -199,7 +199,7 @@ process SKYLINE_MINIMIZE_DOCUMENT {
-    publishDir "${params.result_dir}/skyline/minimize", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.skyline.minimize, failOnError: true, mode: 'copy'
     label 'error_retry'
     label 'process_high'
     container params.images.proteowizard
@@ -239,7 +239,7 @@ process SKYLINE_ANNOTATE_DOCUMENT {
-    publishDir "${params.result_dir}/skyline/import-spectra", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.skyline.import_spectra, failOnError: true, mode: 'copy'
     label 'process_memory_high_constant'
     container params.images.proteowizard
@@ -280,7 +280,7 @@ process SKYLINE_RUN_REPORTS {
-    publishDir "${params.result_dir}/skyline/reports", failOnError: true, mode: 'copy'
+    publishDir params.output_directories.skyline.reports, failOnError: true, mode: 'copy'
     label 'process_high'
     label 'error_retry'
     container params.images.proteowizard

diff --git a/nextflow.config b/nextflow.config
index 2f80c0d..82abcec 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -237,6 +237,9 @@ includeConfig 'conf/base.config'
 // Load the images to use for all processes
 includeConfig 'container_images.config'

+// Load the output file directories
+includeConfig 'conf/output_directories.config'
+
 // Function to ensure that resource requirements don't go beyond
 // a maximum limit. Copied from the nf-core template.
 def check_max(obj, type) {

From ae159c8828aadabbbdf43ae9e5b6e01eb6cf09a4 Mon Sep 17 00:00:00 2001
From: mauraisa
Date: Mon, 16 Sep 2024 10:56:33 -0700
Subject: [PATCH 4/8] Calculate output file stats during EncyclopeDIA search.
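
The stats block this patch adds to ENCYCLOPEDIA_SEARCH_FILE is the same
three-step pipeline PATCH 1 added to the DiaNN processes. A minimal
standalone sketch of that pattern, with placeholder file names in place of
the process outputs:

    # md5sum prints "<hash>  <name>"; the sed swaps the columns so the file
    # name becomes the join key ("[ \*]" also tolerates binary-mode "*").
    md5sum report.tsv report.tsv.speclib \
        | sed -E 's/([a-f0-9]{32}) [ \*](.*)/\2\t\1/' | sort > hashes.txt

    # stat -L follows the symlinks Nextflow stages inputs through and
    # prints "<name>\t<size>" for each file.
    stat -L --printf='%n\t%s\n' report.tsv report.tsv.speclib | sort > sizes.txt

    # join matches the two sorted files on the name column, yielding
    # "<name>\t<hash>\t<size>" per output file.
    join -t$'\t' hashes.txt sizes.txt > output_file_stats.txt

Both join inputs are sorted first because join requires its inputs ordered
on the join field. The escaping in the .nf sources differs from plain shell:
inside a Groovy triple-quoted script block, \\t reaches the shell as \t
(which GNU sed interprets as a tab), while the single \t in the stat and
join arguments is expanded to a literal tab by Groovy before the shell runs.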
---
 modules/encyclopedia.nf          | 9 +++++++++
 workflows/encyclopedia_search.nf | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/modules/encyclopedia.nf b/modules/encyclopedia.nf
index 06cc187..07f9048 100644
--- a/modules/encyclopedia.nf
+++ b/modules/encyclopedia.nf
@@ -27,6 +27,7 @@ process ENCYCLOPEDIA_SEARCH_FILE {
         path("${mzml_file}.features.txt"), emit: features
         path("${mzml_file}.encyclopedia.txt"), emit: results_targets
         path("${mzml_file}.encyclopedia.decoy.txt"), emit: results_decoys
+        path("output_file_stats.txt"), emit: output_file_stats

     script:
@@ -39,6 +40,10 @@ process ENCYCLOPEDIA_SEARCH_FILE {
         -percolatorVersion /usr/local/bin/percolator \\
         ${encyclopedia_params} \\
         > >(tee "encyclopedia-${mzml_file.baseName}.stdout") 2> >(tee "encyclopedia-${mzml_file.baseName}.stderr" >&2)
+
+    md5sum *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
+    stat -L --printf='%n\t%s\n' *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sort > sizes.txt
+    join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
     """

     stub:
@@ -49,6 +54,10 @@ process ENCYCLOPEDIA_SEARCH_FILE {
     touch "${mzml_file}.features.txt"
     touch "${mzml_file}.encyclopedia.txt"
     touch "${mzml_file}.encyclopedia.decoy.txt"
+
+    md5sum *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
+    stat -L --printf='%n\t%s\n' *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sort > sizes.txt
+    join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
     """
 }

diff --git a/workflows/encyclopedia_search.nf b/workflows/encyclopedia_search.nf
index f62cdd2..4c0271a 100644
--- a/workflows/encyclopedia_search.nf
+++ b/workflows/encyclopedia_search.nf
@@ -18,6 +18,7 @@ workflow encyclopedia_search {
         peptide_quant
         protein_quant
         encyclopedia_version
+        output_file_stats

     main:
@@ -49,4 +50,5 @@ workflow encyclopedia_search {
     peptide_quant = ENCYCLOPEDIA_CREATE_ELIB.out.peptide_quant
     protein_quant = ENCYCLOPEDIA_CREATE_ELIB.out.protein_quant
     encyclopedia_version = ENCYCLOPEDIA_CREATE_ELIB.out.version
+    output_file_stats = ENCYCLOPEDIA_SEARCH_FILE.out.output_file_stats
 }

From 38793463cbc3d75ef8d5feadc8e2518de220c190 Mon Sep 17 00:00:00 2001
From: mauraisa
Date: Mon, 16 Sep 2024 15:09:27 -0700
Subject: [PATCH 5/8] Write skyline hashes to file instead of environment variable.
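
The practical difference behind this change: an env() output carries only
the captured value through the channel, with no record of which file it
belongs to, and is never published with the results. A path() output such
as output_file_hashes.txt is a regular artifact that downstream code can
parse the same way as the search stats files. A sketch of the consuming
side (the channel and file names come from this patch series; the parsing
itself is illustrative, not code from the repo):

    // Each line of output_file_hashes.txt is "<hash>\t<name>".
    SKYLINE_MERGE_RESULTS.out.output_file_hashes
        .splitText()                           // one emitted item per line
        .map { line ->
            def (hash, name) = line.trim().split('\t')
            tuple(name, hash)                  // key by file name
        }
        .view()

This hash-first column order is what the combine_file_hashes workflow added
in PATCH 6 expects when it splits final_skyline_hash.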
---
 modules/skyline.nf          | 18 +++++++++---------
 workflows/skyline_import.nf |  6 +++---
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/modules/skyline.nf b/modules/skyline.nf
index a70b565..467c50f 100644
--- a/modules/skyline.nf
+++ b/modules/skyline.nf
@@ -139,7 +139,7 @@ process SKYLINE_MERGE_RESULTS {
         path("${params.skyline.document_name}.sky.zip"), emit: final_skyline_zipfile
         path("skyline-merge.stdout"), emit: stdout
         path("skyline-merge.stderr"), emit: stderr
-        env(sky_zip_hash), emit: file_hash
+        path('output_file_hashes.txt'), emit: output_file_hashes

     script:
     import_files_params = "--import-file=${(mzml_files as List).collect{ "/tmp/" + file(it).name }.join(' --import-file=')}"
@@ -163,14 +163,14 @@ process SKYLINE_MERGE_RESULTS {
         --share-type="complete" \
         > >(tee 'skyline-merge.stdout') 2> >(tee 'skyline-merge.stderr' >&2)

-    sky_zip_hash=\$( md5sum ${params.skyline.document_name}.sky.zip |awk '{print \$1}' )
+    md5sum ${params.skyline.document_name}.sky.zip | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\1\\t\\2/' > output_file_hashes.txt
     """

     stub:
     """
     touch "${params.skyline.document_name}.sky.zip"
     touch "skyline-merge.stderr" "skyline-merge.stdout"
-    sky_zip_hash=\$( md5sum ${params.skyline.document_name}.sky.zip |awk '{print \$1}' )
+    md5sum ${params.skyline.document_name}.sky.zip | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\1\\t\\2/' > output_file_hashes.txt
     """
 }
@@ -211,7 +211,7 @@ process SKYLINE_MINIMIZE_DOCUMENT {
         path("${sky_basename(skyline_zipfile)}_minimized.sky.zip"), emit: final_skyline_zipfile
         path("*.stdout"), emit: stdout
         path("*.stderr"), emit: stderr
-        env(sky_zip_hash), emit: file_hash
+        path('output_file_hashes.txt'), emit: output_file_hashes

     script:
     """
@@ -227,14 +227,14 @@ process SKYLINE_MINIMIZE_DOCUMENT {
         --share-type="minimal" \
         > >(tee 'minimize_skyline.stdout') 2> >(tee 'minimize_skyline.stderr' >&2)

-    sky_zip_hash=\$( md5sum ${sky_basename(skyline_zipfile)}_minimized.sky.zip |awk '{print \$1}' )
+    md5sum ${sky_basename(skyline_zipfile)}_minimized.sky.zip | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\1\\t\\2/' > output_file_hashes.txt
     """

     stub:
     """
     touch ${sky_basename(skyline_zipfile)}_minimized.sky.zip
     touch stub.stdout stub.stderr
-    sky_zip_hash=\$( md5sum ${sky_basename(skyline_zipfile)}_minimized.sky.zip |awk '{print \$1}' )
+    md5sum ${sky_basename(skyline_zipfile)}_minimized.sky.zip | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\1\\t\\2/' > output_file_hashes.txt
     """
 }
@@ -252,7 +252,7 @@ process SKYLINE_ANNOTATE_DOCUMENT {
         path("${sky_basename(skyline_zipfile)}_annotated.sky.zip"), emit: final_skyline_zipfile
         path("*.stdout"), emit: stdout
         path("*.stderr"), emit: stderr
-        env(sky_zip_hash), emit: file_hash
+        path('output_file_hashes.txt'), emit: output_file_hashes

     shell:
     """
@@ -268,14 +268,14 @@ process SKYLINE_ANNOTATE_DOCUMENT {
     wine SkylineCmd --batch-commands=add_annotations.bat \
         > >(tee 'annotate_doc.stdout') 2> >(tee 'annotate_doc.stderr' >&2)

-    sky_zip_hash=\$( md5sum ${sky_basename(skyline_zipfile)}_annotated.sky.zip |awk '{print \$1}' )
+    md5sum ${sky_basename(skyline_zipfile)}_annotated.sky.zip | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\1\\t\\2/' > output_file_hashes.txt
     """

     stub:
     """
     touch "${sky_basename(skyline_zipfile)}_annotated.sky.zip"
     touch stub.stdout stub.stderr
-    sky_zip_hash=\$( md5sum ${sky_basename(skyline_zipfile)}_annotated.sky.zip |awk '{print \$1}' )
+    md5sum ${sky_basename(skyline_zipfile)}_annotated.sky.zip | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\1\\t\\2/' > output_file_hashes.txt
     """
 }

diff --git a/workflows/skyline_import.nf b/workflows/skyline_import.nf
index ec15da7..d999ff3 100644
--- a/workflows/skyline_import.nf
+++ b/workflows/skyline_import.nf
@@ -47,16 +47,16 @@ workflow skyline_import {
                                 ANNOTATION_TSV_TO_CSV.out.annotation_definitions)

         skyline_results = SKYLINE_ANNOTATE_DOCUMENT.out.final_skyline_zipfile
-        skyline_results_hash = SKYLINE_ANNOTATE_DOCUMENT.out.file_hash
+        skyline_results_hash = SKYLINE_ANNOTATE_DOCUMENT.out.output_file_hashes
     } else {
         skyline_results = SKYLINE_MERGE_RESULTS.out.final_skyline_zipfile
-        skyline_results_hash = SKYLINE_MERGE_RESULTS.out.file_hash
+        skyline_results_hash = SKYLINE_MERGE_RESULTS.out.output_file_hashes
     }

     if(params.skyline.minimize) {
         SKYLINE_MINIMIZE_DOCUMENT(skyline_results)
         skyline_minimized_results = SKYLINE_MINIMIZE_DOCUMENT.out.final_skyline_zipfile
-        skyline_minimized_results_hash = SKYLINE_MINIMIZE_DOCUMENT.out.file_hash
+        skyline_minimized_results_hash = SKYLINE_MINIMIZE_DOCUMENT.out.output_file_hashes
     } else {
         skyline_minimized_results = Channel.empty()
         skyline_minimized_results_hash = Channel.empty()

From 282f5f9ef7bfdff9c7be67b1f9326eca866721a8 Mon Sep 17 00:00:00 2001
From: mauraisa
Date: Mon, 16 Sep 2024 11:18:17 -0700
Subject: [PATCH 6/8] Add initial combine_file_hashes sub-workflow and file_stats module.

---
 main.nf                          | 15 ++++++++
 modules/file_stats.nf            | 37 ++++++++++++++
 workflows/combine_file_hashes.nf | 83 ++++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+)
 create mode 100644 modules/file_stats.nf
 create mode 100644 workflows/combine_file_hashes.nf

diff --git a/main.nf b/main.nf
index db0064d..8b28762 100644
--- a/main.nf
+++ b/main.nf
@@ -16,6 +16,7 @@ include { panorama_upload_results } from "./workflows/panorama_upload"
 include { panorama_upload_mzmls } from "./workflows/panorama_upload"
 include { save_run_details } from "./workflows/save_run_details"
 include { get_pdc_files } from "./workflows/get_pdc_files"
+include { combine_file_hashes } from "./workflows/combine_file_hashes"

 // modules
 include { ENCYCLOPEDIA_BLIB_TO_DLIB } from "./modules/encyclopedia"
@@ -197,12 +198,14 @@ workflow {
         )

         quant_library = encyclopeda_export_elib.out.elib
+        spec_lib_hashes = encyclopeda_export_elib.out.output_file_hashes

         all_elib_ch = encyclopeda_export_elib.out.elib.concat(
             encyclopeda_export_elib.out.individual_elibs
         )
     } else {
         quant_library = spectral_library_to_use
+        spec_lib_hashes = Channel.empty()
         all_mzml_ch = wide_mzml_ch
         all_elib_ch = Channel.empty()
     }
@@ -219,6 +222,7 @@ workflow {
         )

         encyclopedia_version = encyclopedia_quant.out.encyclopedia_version
+        search_file_hashes = encyclopedia_quant.out.output_file_hashes.concat(spec_lib_hashes)

         final_elib = encyclopedia_quant.out.elib
         all_elib_ch = all_elib_ch.concat(
@@ -284,6 +288,7 @@ workflow {
         )

         diann_version = diann_search.out.diann_version
+        search_file_hashes = diann_search.out.output_file_hashes

         // create compatible spectral library for Skyline, if needed
         if(!params.skyline.skip) {
@@ -367,6 +372,7 @@ workflow {
         final_skyline_file = Channel.empty()
         qc_report_files = Channel.empty()
         proteowizard_version = Channel.empty()
+        final_skyline_hash = Channel.empty()
         dia_qc_version = Channel.empty()
         gene_reports = Channel.empty()
     }
@@ -382,6 +388,15 @@ workflow {
     save_run_details(input_files.collect(), version_files.collect())
     run_details_file = save_run_details.out.run_details

+    combine_file_hashes(fasta, spectral_library,
+                        search_file_hashes,
+                        final_skyline_file,
+                        final_skyline_hash,
+                        skyline_reports_ch,
+                        qc_report_files,
+                        gene_reports,
+                        run_details_file)
+
     // upload results to Panorama
     if(params.panorama.upload) {

diff --git a/modules/file_stats.nf b/modules/file_stats.nf
new file mode 100644
index 0000000..196b2e2
--- /dev/null
+++ b/modules/file_stats.nf
@@ -0,0 +1,37 @@
+
+process CALCULATE_MD5 {
+    label 'process_low'
+    container params.images.ubuntu
+
+    input:
+        path(file_to_check)
+
+    output:
+        tuple val("${file_to_check.name}"), env(md5_sum)
+
+    shell:
+    '''
+    md5_sum=$( md5sum !{file_to_check} |awk '{print $1}' )
+    '''
+}
+
+process WRITE_FILE_STATS {
+    label 'process_low'
+    container params.images.ubuntu
+    publishDir "${params.result_dir}", failOnError: true, mode: 'copy'
+
+    input:
+        val file_stats
+
+    output:
+        path("file_checksums.tsv")
+
+    script:
+    data = file_stats.join('\\n')
+    """
+    text="${data}"
+
+    echo -e 'file\\tpath\\tmd5_hash\\tsize' > file_checksums.tsv
+    echo -e \$text >> file_checksums.tsv
+    """
+}

diff --git a/workflows/combine_file_hashes.nf b/workflows/combine_file_hashes.nf
new file mode 100644
index 0000000..df1d6d3
--- /dev/null
+++ b/workflows/combine_file_hashes.nf
@@ -0,0 +1,83 @@
+
+include { CALCULATE_MD5 } from "../modules/file_stats"
+include { WRITE_FILE_STATS } from "../modules/file_stats"
+
+def get_search_file_dir() {
+    if(params.search_engine.toLowerCase() == 'encyclopedia') {
+        return params.output_directories.encyclopedia.search_file
+    }
+    if(params.search_engine.toLowerCase() == 'diann') {
+        return params.output_directories.diann
+    }
+    return 'UNKNOWN_SEARCH_ENGINE'
+}
+
+
+workflow combine_file_hashes {
+    take:
+        fasta
+        spectral_library
+
+        search_file_stats
+
+        final_skyline_file
+        final_skyline_hash
+        skyline_reports
+
+        qc_report_files
+        gene_reports
+
+        workflow_versions
+
+    emit:
+        output_file_hashes
+
+    main:
+
+        // process hash text files produced by search
+        search_file_data = search_file_stats.splitText().map{
+            it -> tuple(it.split('\\t'))
+        }.branch{
+            mzml_files: it[0].endsWith("mzML")
+                tuple(it[0], "${params.mzml_cache_directory}", it[1], it[2])
+            search_files: true
+                tuple(it[0], get_search_file_dir(), it[1], it[2])
+        }
+
+        // process skyline hash text files
+        skyline_doc_data = final_skyline_file.map{
+            it -> tuple(it.name, params.output_directories.skyline.import_spectra, it.size())
+        }.join(
+            final_skyline_hash.splitText().map{ it ->
+                elems = it.split('\t')
+                tuple(elems[1], elems[0])
+            }
+        ).map{ it -> tuple(it[0], it[1], it[3], it[2])}
+
+        // Combine files we need to calculate the hash of into a single channel
+        file_stat_files = fasta.concat(spectral_library).map{
+            it -> tuple(it.name, it, params.result_dir, it.size())
+        }.concat(
+            skyline_reports.map{ tuple(it.name, it, params.output_directories.skyline.reports, it.size()) },
+            qc_report_files.map{ tuple(it.name, it, params.output_directories.qc_report, it.size()) },
+            gene_reports.map{ tuple(it.name, it, params.output_directories.gene_reports, it.size()) },
+            workflow_versions.map{ tuple(it.name, it, params.result_dir, it.size()) }
+        )
+
+        md5_input = file_stat_files.map{ it -> it[1] }
+        CALCULATE_MD5(md5_input)
+
+        // Combine all file hashes into a single channel
+        output_file_hashes = search_file_data.mzml_files.concat(
+            file_stat_files.join(CALCULATE_MD5.out).map{
+                it -> tuple(it[0], it[2], it[4], it[3])
+            }
+        ).concat(search_file_data.search_files, skyline_doc_data).map{
+            it -> it.join('\\t')
+        }
+
+        // output_file_hashes.view()
+
+        WRITE_FILE_STATS(output_file_hashes.collect())
+}

From 67cef1a746070beca803fe7a7b4c589cf733b2a8 Mon Sep 17 00:00:00 2001
From: mauraisa
Date: Mon, 16 Sep 2024 14:09:55 -0700
Subject: [PATCH 7/8] Define required variables for combine_file_hashes workflow.

---
 main.nf | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 8b28762..ed641cf 100644
--- a/main.nf
+++ b/main.nf
@@ -198,7 +198,7 @@ workflow {
         )

         quant_library = encyclopeda_export_elib.out.elib
-        spec_lib_hashes = encyclopeda_export_elib.out.output_file_hashes
+        spec_lib_hashes = encyclopeda_export_elib.out.output_file_stats

         all_elib_ch = encyclopeda_export_elib.out.elib.concat(
             encyclopeda_export_elib.out.individual_elibs
@@ -222,7 +222,7 @@ workflow {
         )

         encyclopedia_version = encyclopedia_quant.out.encyclopedia_version
-        search_file_hashes = encyclopedia_quant.out.output_file_hashes.concat(spec_lib_hashes)
+        search_file_stats = encyclopedia_quant.out.output_file_stats.concat(spec_lib_hashes)

         final_elib = encyclopedia_quant.out.elib

         all_elib_ch = all_elib_ch.concat(
@@ -288,7 +288,7 @@ workflow {
         )

         diann_version = diann_search.out.diann_version
-        search_file_hashes = diann_search.out.output_file_hashes
+        search_file_stats = diann_search.out.output_file_stats

         // create compatible spectral library for Skyline, if needed
         if(!params.skyline.skip) {
@@ -334,11 +334,17 @@ workflow {
     }

     final_skyline_file = skyline_import.out.skyline_results
+    final_skyline_hash = skyline_import.out.skyline_results_hash

     // generate QC report
     if(!params.qc_report.skip) {
         generate_dia_qc_report(final_skyline_file, replicate_metadata)
         dia_qc_version = generate_dia_qc_report.out.dia_qc_version
+        qc_report_files = generate_dia_qc_report.out.qc_reports.concat(
+            generate_dia_qc_report.out.qc_report_qmd,
+            generate_dia_qc_report.out.qc_report_db,
+            generate_dia_qc_report.out.qc_tables
+        )

         // Export PDC gene tables
         if(params.pdc.gene_level_data != null) {
@@ -351,6 +357,8 @@ workflow {
         }
     } else {
         dia_qc_version = Channel.empty()
+        qc_report_files = Channel.empty()
+        gene_reports = Channel.empty()
     }

     // run reports if requested
@@ -389,7 +397,7 @@ workflow {
     run_details_file = save_run_details.out.run_details

     combine_file_hashes(fasta, spectral_library,
-                        search_file_hashes,
+                        search_file_stats,
                         final_skyline_file,
                         final_skyline_hash,
                         skyline_reports_ch,

From 80b21195f2bf75558e48a52d40e0d78a6afbb466 Mon Sep 17 00:00:00 2001
From: mauraisa
Date: Mon, 16 Sep 2024 14:06:04 -0700
Subject: [PATCH 8/8] Make sure fasta and spectral_library are always a Channel

When these files are stored locally they are initialized as a file system
object, but when they are downloaded from Panorama they are a channel. This
inconsistency was causing problems with the combine_file_hashes sub-workflow.
---
 workflows/get_input_files.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/get_input_files.nf b/workflows/get_input_files.nf
index 22b500a..f938ec5 100644
--- a/workflows/get_input_files.nf
+++ b/workflows/get_input_files.nf
@@ -46,7 +46,7 @@ workflow get_input_files {
             PANORAMA_GET_FASTA(params.fasta, aws_secret_id)
             fasta = PANORAMA_GET_FASTA.out.panorama_file
         } else {
-            fasta = file(params.fasta, checkIfExists: true)
+            fasta = Channel.value(file(params.fasta, checkIfExists: true))
         }

         if(params.spectral_library) {
@@ -54,7 +54,7 @@ workflow get_input_files {
             PANORAMA_GET_SPECTRAL_LIBRARY(params.spectral_library, aws_secret_id)
             spectral_library = PANORAMA_GET_SPECTRAL_LIBRARY.out.panorama_file
         } else {
-            spectral_library = file(params.spectral_library, checkIfExists: true)
+            spectral_library = Channel.value(file(params.spectral_library, checkIfExists: true))
         }
     } else {
         spectral_library = null
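
A sketch of the distinction this last patch addresses (the path below is
hypothetical): file() returns a Path object rather than a channel. Nextflow
implicitly wraps a bare Path in a value channel when it is passed to a
process, which is why the old code worked until PATCH 6, but channel
operators such as the concat() and map() that combine_file_hashes applies
to fasta and spectral_library do not exist on a Path:

    fasta = file('refs/human.fasta', checkIfExists: true)  // a Path; has no concat()
    fasta_ch = Channel.value(fasta)                        // a value channel

    // Now channel operators work, and the value can be read repeatedly:
    fasta_ch.map { f -> tuple(f.name, f.size()) }.view()

Channel.value() rather than Channel.of() preserves the old semantics: a
value channel can be consumed any number of times, while a queue channel is
exhausted after a single read.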