From 53a732aa94811d87adfe8a81a2bf3af8d68c343d Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Fri, 22 Mar 2024 16:24:43 +0000
Subject: [PATCH 1/5] :tada: new pbta data config :hammer: minor refactor to handle multiple maf locations

---
 .../pbta_all_data_processing_config.json    | 43 +++++++++++++++
 scripts/genomics_file_cbio_package_build.py |  3 +
 scripts/maf_merge.py                        | 55 ++++++++-----------
 3 files changed, 68 insertions(+), 33 deletions(-)
 create mode 100644 STUDY_CONFIGS/pbta_all_data_processing_config.json

diff --git a/STUDY_CONFIGS/pbta_all_data_processing_config.json b/STUDY_CONFIGS/pbta_all_data_processing_config.json
new file mode 100644
index 0000000..44ee4fc
--- /dev/null
+++ b/STUDY_CONFIGS/pbta_all_data_processing_config.json
@@ -0,0 +1,43 @@
+{
+    "bedtools": "bedtools",
+    "cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
+    "bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
+    "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
+    "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
+    "rna_ext_list": {
+        "expression": "rsem.genes.results.gz",
+        "fusion": "annoFuse_filter.tsv"
+    },
+    "dna_ext_list": {
+        "mutation": "consensus_somatic.norm.annot.public.maf",
+        "copy_number": "controlfreec.CNVs.p.value.txt",
+        "seg": "controlfreec.seg"
+    },
+    "file_loc_defs": {
+        "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
+        "mafs": {
+            "kf": ["annotated_public_outputs", "consensus_public_outputs"],
+            "dgd": "annotated_public",
+            "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
+        },
+        "cnvs": {
+            "pval": "ctrlfreec_pval",
+            "info": "ctrlfreec_info",
+            "seg": "ctrlfreec_bam_seg"
+        },
+        "rsem": "RSEM_gene",
+        "fusion": "annofuse_filtered_fusions_tsv",
+        "dgd_fusion": "fusion-dgd.tsv.gz"
+    },
+    "dl_file_type_list": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs",
+    "ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg", "DGD_MAF"],
+    "ens_gene_list":"/home/ubuntu/tools/kf-cbioportal-etl/REFS/gencode27_gene_list.txt",
+    "script_dir": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/",
+    "cbioportal_validator": "/home/ubuntu/tools/cbioportal/core/src/main/scripts/importer/validateData.py",
+    "cna_flag": 1,
+    "cnv_high_gain": 4,
+    "cnv_min_len": 50000,
+    "rna_flag": 1,
+    "cpus": 8,
+    "threads": 40
+}
diff --git a/scripts/genomics_file_cbio_package_build.py b/scripts/genomics_file_cbio_package_build.py
index 64831bc..4054bf0 100755
--- a/scripts/genomics_file_cbio_package_build.py
+++ b/scripts/genomics_file_cbio_package_build.py
@@ -45,6 +45,9 @@ def process_maf(maf_loc_dict, cbio_id_table, data_config_file, dgd_status):
     maf_dir = maf_loc_dict["kf"]
     if args.dgd_status == "dgd":
         maf_dir = maf_loc_dict["dgd"]
+    else:
+        # KF can be in multiple places
+        maf_dir = ",".join(maf_dir)
     maf_header = maf_loc_dict["header"]
     maf_cmd = "{}maf_merge.py -t {} -i {} -m {} -j {} -f {} 2> collate_mafs.log".format(
         script_dir, cbio_id_table, maf_header, maf_dir, data_config_file, dgd_status
diff --git a/scripts/maf_merge.py b/scripts/maf_merge.py
index 01fb471..b245c1e 100755
--- a/scripts/maf_merge.py
+++ b/scripts/maf_merge.py
@@ -24,9 +24,7 @@ def filter_entry(entry, tum_id, norm_id, tid_idx, nid_idx, v_idx, h_idx, maf_exc
     return None
 
 
-def process_maf(
-    maf_fn, new_maf, maf_exc, tum_id, norm_id
-):
+def process_maf(maf_fn, new_maf, maf_exc, tum_id, norm_id):
     """
     Iterate over maf file, skipping header lines since the files are
     being merged. With possiblility of mixed source, search headers
@@ -69,9 +67,7 @@ def process_maf(
     cur_maf.close()
 
 
-def process_tbl(
-    cbio_dx, file_meta_dict, print_head
-):
+def process_tbl(cbio_dx, file_meta_dict, print_head):
     """
     Probaby a less likely scenario, but can split out into multiple projects based on dict
     """
@@ -84,28 +80,10 @@
     for cbio_tum_id in file_meta_dict[cbio_dx]:
         cbio_norm_id = file_meta_dict[cbio_dx][cbio_tum_id]["cbio_norm_id"]
         fname = file_meta_dict[cbio_dx][cbio_tum_id]["fname"]
-        sys.stderr.write(
-            "Found relevant maf to process for "
-            + " "
-            + cbio_tum_id
-            + " "
-            + cbio_norm_id
-            + " "
-            + file_meta_dict[cbio_dx][cbio_tum_id]["kf_tum_id"]
-            + " "
-            + file_meta_dict[cbio_dx][cbio_tum_id]["kf_norm_id"]
-            + " "
-            + fname
-            + "\n"
-        )
-        sys.stderr.flush()
-        process_maf(
-            maf_dir + fname,
-            new_maf,
-            maf_exc,
-            cbio_tum_id,
-            cbio_norm_id,
-        )
+        print("Found relevant maf to process for {} {} {} {} {}".format(
+            cbio_tum_id, cbio_norm_id, file_meta_dict[cbio_dx][cbio_tum_id]["kf_tum_id"], file_meta_dict[cbio_dx][cbio_tum_id]["kf_norm_id"], fname),
+            file=sys.stderr)
+        process_maf(maf_dir + fname, new_maf, maf_exc, cbio_tum_id, cbio_norm_id)
         x += 1
     sys.stderr.write(
         "Completed processing " + str(x) + " entries in " + cbio_dx + "\n"
@@ -130,7 +108,7 @@
     "-i", "--header", action="store", dest="header", help="File with maf header only"
 )
 parser.add_argument(
-    "-m", "--maf-dir", action="store", dest="maf_dir", help="maf file directory"
+    "-m", "--maf-dirs", action="store", dest="maf_dirs", help="csv of maf file directories"
 )
 parser.add_argument(
     "-j",
@@ -155,10 +133,21 @@
 args = parser.parse_args()
 with open(args.config_file) as f:
     config_data = json.load(f)
-# get maf file ext
-maf_dir = args.maf_dir
-if maf_dir[-1] != "/":
-    maf_dir += "/"
+# Create symlinks to mafs in one place for ease of processing
+maf_dir = "MAFS/"
+maf_dirs_in = args.maf_dirs
+print("Symlinking maf files from {} to {}".format(maf_dirs_in, maf_dir), file=sys.stderr)
+os.makedirs("MAFS", exist_ok=True)
+for dirname in maf_dirs_in.split(","):
+    abs_path = os.path.abspath(dirname)
+    for fname in os.listdir(dirname):
+        try:
+            src = os.path.join(abs_path, fname)
+            dest = os.path.join(maf_dir, fname)
+            os.symlink(src, dest)
+        except Exception as e:
+            print(e, file=sys.stderr)
+            print("Could not symlink {} in {}".format(fname, dirname), file=sys.stderr)
 
 # If DGD maf only, else if both, dgd maf wil be handled separately, or not at all if no dgd and kf only
 file_meta_dict = get_file_metadata(args.table, "DGD_MAF")

From 845b42e624c84940cb75a149f16563a4fe3764e2 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Fri, 22 Mar 2024 14:16:11 -0400
Subject: [PATCH 2/5] :pencil: add missing manifests

---
 STUDY_CONFIGS/pbta_all_case_meta_config.json | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json
index 35da286..822db1f 100644
--- a/STUDY_CONFIGS/pbta_all_case_meta_config.json
+++ b/STUDY_CONFIGS/pbta_all_case_meta_config.json
@@ -248,8 +248,17 @@
             "table": "bix_genomics_file.sd_bhjxbdqk_mioncoseq-genomics_file_manifest",
             "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
             "out_file": "mioncoseq_genomics_file_manifest.txt"
+        },
+        "dgd_cbtn_panel": {
+            "table": "bix_genomics_file.sd_bhjxbdqk_dgd_panel-genomics_file_manifest",
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "out_file": "dgd_cbtn_panel_genomics_file_manifest.txt"
+        },
+        "dgd__panel": {
+            "table": "bix_genomics_file.sd_6g58hhsx_dgd_panel-genomics_file_manifest",
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "out_file": "dgd_panel_genomics_file_manifest.txt"
         }
-
     },
     "sample_head": {
         "table": "template_sample_header.txt"

From c66288d41bd2aa8b60f9b9b75ce4984fa0a2706d Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Fri, 22 Mar 2024 19:06:35 +0000
Subject: [PATCH 3/5] :hammer: improve error capture in cnv script

---
 scripts/cnv_2_merge.py        | 18 +++++++++---------
 scripts/cnv_3_gistic_style.py |  8 +++++++-
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/scripts/cnv_2_merge.py b/scripts/cnv_2_merge.py
index 1e75766..4d248e8 100755
--- a/scripts/cnv_2_merge.py
+++ b/scripts/cnv_2_merge.py
@@ -3,12 +3,10 @@
 import os
 import argparse
 import json
-import subprocess
 import concurrent.futures
 import pandas as pd
 import re
 from get_file_metadata_helper import get_file_metadata
-import pdb
 
 
 def process_cnv(cnv_fn, cur_cnv_dict, samp_id):
@@ -43,7 +41,6 @@ def get_ploidy(obj):
 
 def process_table(cbio_dx, file_meta_dict):
     try:
-        # project/disease name should be name of directory hosting datasheet
         sys.stderr.write("Processing " + cbio_dx + " project" + "\n")
         new_cnv = open(out_dir + cbio_dx + ".predicted_cnv.txt", "w")
 
@@ -86,9 +83,10 @@
                 new_cnv.write("\t" + ploidy_dict[samp])
             new_cnv.write("\n")
         new_cnv.close()
+        return 0, cbio_tum_id
     except Exception as e:
-        print(e)
-        exit(1)
+        print(e, file=sys.stderr)
+        return 1, cbio_tum_id
 
 
 
@@ -149,10 +147,12 @@
     sys.stderr.write("output dir already exists\n")
 file_meta_dict = get_file_metadata(args.table, "cnv")
 with concurrent.futures.ProcessPoolExecutor(config_data["cpus"]) as executor:
-    results = {
-        executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx
-        for cbio_dx in file_meta_dict
-    }
+    results = { executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx for cbio_dx in file_meta_dict }
+    for result in concurrent.futures.as_completed(results):
+        if result.result()[0]:
+            print("Failed processing " + result.result()[1], file=sys.stderr)
+            exit(1)
+
 # for cbio_dx in file_meta_dict:
 #     process_table(cbio_dx, file_meta_dict)
 # sys.stderr.write("Done, check logs\n")
diff --git a/scripts/cnv_3_gistic_style.py b/scripts/cnv_3_gistic_style.py
index ad28e41..01d9523 100755
--- a/scripts/cnv_3_gistic_style.py
+++ b/scripts/cnv_3_gistic_style.py
@@ -8,6 +8,7 @@
 from get_file_metadata_helper import get_file_metadata
 import pandas as pd
 import numpy as np
+import pdb
 
 parser = argparse.ArgumentParser(
     description="Convert merged cnv values to discrete coded values."
@@ -107,7 +108,12 @@ def mt_adjust_cn(obj):
 for fname in fname_list:
     parts = re.search("^" + args.merged_cnv_dir + "/(.*).predicted_cnv.txt", fname)
     cbio_dx = parts.group(1)
-    data = pd.read_csv(fname, sep="\t")
+    try:
+        data = pd.read_csv(fname, sep="\t")
+    except Exception as e:
+        print(e, file=sys.stderr)
+        pdb.set_trace()
+        hold=1
     data.set_index("Hugo_Symbol")
     # sample list would be cbio ids
     samp_list = list(data.columns)[1:]

From 3644424a63971f40bbb7cddc85161f075eea9011 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Thu, 28 Mar 2024 14:07:34 +0000
Subject: [PATCH 4/5] :pencil: change all maf fields to match maf processing in configs

---
 STUDY_CONFIGS/aml_sd_pet7q6f2_2018_data_processing_config.json | 2 +-
 .../bllnos_sd_z6mwd3h0_2018_data_processing_config.json        | 2 +-
 STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json     | 2 +-
 STUDY_CONFIGS/data_processing_config.json                      | 2 +-
 STUDY_CONFIGS/os_sd_zxjffmef_2015_data_processing_config.json  | 2 +-
 STUDY_CONFIGS/pbta_all_treatment_data_processing_config.json   | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/STUDY_CONFIGS/aml_sd_pet7q6f2_2018_data_processing_config.json b/STUDY_CONFIGS/aml_sd_pet7q6f2_2018_data_processing_config.json
index 2fefe9a..5f06def 100644
--- a/STUDY_CONFIGS/aml_sd_pet7q6f2_2018_data_processing_config.json
+++ b/STUDY_CONFIGS/aml_sd_pet7q6f2_2018_data_processing_config.json
@@ -11,7 +11,7 @@
     "file_loc_defs": {
         "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
         "mafs": {
-            "kf": "annotated_public_outputs",
+            "kf": ["annotated_public_outputs"],
             "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS.txt"
         },
         "rsem": "RSEM_gene",
diff --git a/STUDY_CONFIGS/bllnos_sd_z6mwd3h0_2018_data_processing_config.json b/STUDY_CONFIGS/bllnos_sd_z6mwd3h0_2018_data_processing_config.json
index ca495d9..a1be0b4 100644
--- a/STUDY_CONFIGS/bllnos_sd_z6mwd3h0_2018_data_processing_config.json
+++ b/STUDY_CONFIGS/bllnos_sd_z6mwd3h0_2018_data_processing_config.json
@@ -16,7 +16,7 @@
     "file_loc_defs": {
         "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
         "mafs": {
-            "kf": "annotated_public_outputs",
+            "kf": ["annotated_public_outputs"],
             "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
         },
         "cnvs": {
diff --git a/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json b/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json
index 24b7415..0842f55 100644
--- a/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json
+++ b/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json
@@ -16,7 +16,7 @@
     "file_loc_defs": {
         "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
         "mafs": {
-            "kf": "annotated_public_outputs",
+            "kf": ["annotated_public_outputs"],
             "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
         },
         "cnvs": {
diff --git a/STUDY_CONFIGS/data_processing_config.json b/STUDY_CONFIGS/data_processing_config.json
index 7d7a438..e72c2e5 100644
--- a/STUDY_CONFIGS/data_processing_config.json
+++ b/STUDY_CONFIGS/data_processing_config.json
@@ -16,7 +16,7 @@
     "file_loc_defs": {
         "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
         "mafs": {
-            "kf": "annotated_public_outputs",
+            "kf": ["annotated_public_outputs"],
             "dgd": "annotated_public",
             "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
         },
diff --git a/STUDY_CONFIGS/os_sd_zxjffmef_2015_data_processing_config.json b/STUDY_CONFIGS/os_sd_zxjffmef_2015_data_processing_config.json
index b0545f1..846fd20 100644
--- a/STUDY_CONFIGS/os_sd_zxjffmef_2015_data_processing_config.json
+++ b/STUDY_CONFIGS/os_sd_zxjffmef_2015_data_processing_config.json
@@ -12,7 +12,7 @@
     "file_loc_defs": {
         "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
         "mafs": {
-            "kf": "annotated_public_outputs",
+            "kf": ["annotated_public_outputs"],
             "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
         },
         "cnvs": {
diff --git a/STUDY_CONFIGS/pbta_all_treatment_data_processing_config.json b/STUDY_CONFIGS/pbta_all_treatment_data_processing_config.json
index 3e908cd..a453d18 100644
--- a/STUDY_CONFIGS/pbta_all_treatment_data_processing_config.json
+++ b/STUDY_CONFIGS/pbta_all_treatment_data_processing_config.json
@@ -16,7 +16,7 @@
     "file_loc_defs": {
         "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
         "mafs": {
-            "kf": "annotated_public_outputs",
+            "kf": ["annotated_public_outputs"],
             "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS.txt"
         },
         "cnvs": {

From 5a8fcdd4d9312f6f8eb41512fc9fdababaeae6e2 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Thu, 28 Mar 2024 11:28:44 -0400
Subject: [PATCH 5/5] :pencil: incorporated some of the PR suggestions

---
 scripts/cnv_3_gistic_style.py | 2 --
 scripts/maf_merge.py          | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/cnv_3_gistic_style.py b/scripts/cnv_3_gistic_style.py
index 01d9523..bff8ed5 100755
--- a/scripts/cnv_3_gistic_style.py
+++ b/scripts/cnv_3_gistic_style.py
@@ -112,8 +112,6 @@ def mt_adjust_cn(obj):
         data = pd.read_csv(fname, sep="\t")
     except Exception as e:
         print(e, file=sys.stderr)
-        pdb.set_trace()
-        hold=1
     data.set_index("Hugo_Symbol")
     # sample list would be cbio ids
     samp_list = list(data.columns)[1:]
diff --git a/scripts/maf_merge.py b/scripts/maf_merge.py
index b245c1e..bf3b089 100755
--- a/scripts/maf_merge.py
+++ b/scripts/maf_merge.py
@@ -108,7 +108,7 @@
     "-i", "--header", action="store", dest="header", help="File with maf header only"
 )
 parser.add_argument(
-    "-m", "--maf-dirs", action="store", dest="maf_dirs", help="csv of maf file directories"
+    "-m", "--maf-dirs", action="store", dest="maf_dirs", help="comma-separated list of maf file directories"
 )
 parser.add_argument(
     "-j",