Merge pull request #60 from kids-first/feature/mb-update-pbta-20240322
🔨 Update PBTA 20240322
migbro committed Mar 28, 2024
2 parents c03c5d2 + 5a8fcdd commit 49157bb
Showing 12 changed files with 98 additions and 50 deletions.
```diff
@@ -11,7 +11,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS.txt"
     },
     "rsem": "RSEM_gene",
```
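This string-to-list change recurs across the study config diffs in this commit: `mafs.kf` now holds a list of MAF source directories rather than a single directory name. A minimal sketch of how a consumer could accept either form during the transition (the `normalize_maf_dirs` helper is hypothetical, not part of the repo):

```python
def normalize_maf_dirs(kf_value):
    """Accept the legacy string form or the new list form and
    return a list of MAF source directories either way.
    Hypothetical helper, not part of the repo."""
    return [kf_value] if isinstance(kf_value, str) else list(kf_value)

# Old-style and new-style config values both normalize cleanly:
print(normalize_maf_dirs("annotated_public_outputs"))
print(normalize_maf_dirs(["annotated_public_outputs", "consensus_public_outputs"]))
```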
```diff
@@ -16,7 +16,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
     },
     "cnvs": {
```
2 changes: 1 addition & 1 deletion STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json
```diff
@@ -16,7 +16,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
     },
     "cnvs": {
```
2 changes: 1 addition & 1 deletion STUDY_CONFIGS/data_processing_config.json
```diff
@@ -16,7 +16,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "dgd": "annotated_public",
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
     },
```
```diff
@@ -12,7 +12,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
     },
     "cnvs": {
```
11 changes: 10 additions & 1 deletion STUDY_CONFIGS/pbta_all_case_meta_config.json
```diff
@@ -248,8 +248,17 @@
       "table": "bix_genomics_file.sd_bhjxbdqk_mioncoseq-genomics_file_manifest",
       "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
       "out_file": "mioncoseq_genomics_file_manifest.txt"
     },
+    "dgd_cbtn_panel": {
+      "table": "bix_genomics_file.sd_bhjxbdqk_dgd_panel-genomics_file_manifest",
+      "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+      "out_file": "dgd_cbtn_panel_genomics_file_manifest.txt"
+    },
+    "dgd__panel": {
+      "table": "bix_genomics_file.sd_6g58hhsx_dgd_panel-genomics_file_manifest",
+      "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+      "out_file": "dgd_panel_genomics_file_manifest.txt"
+    }
-
   },
   "sample_head": {
     "table": "template_sample_header.txt"
```
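Each entry in this manifest section pairs a warehouse table with an output manifest file. A rough sketch of the consumption pattern, assuming a `query_table` helper that stands in for the repo's actual database step (hypothetical, shown only to illustrate the table-to-manifest mapping):

```python
import csv

def query_table(table_name, file_types):
    """Hypothetical stand-in for the ETL's warehouse query: return
    manifest rows for the given table, filtered to file_types."""
    return []  # real code would query the bix_genomics_file schema here

manifests = {
    "dgd_cbtn_panel": {
        "table": "bix_genomics_file.sd_bhjxbdqk_dgd_panel-genomics_file_manifest",
        "file_type": ["RSEM_gene", "annofuse_filtered_fusions_tsv"],
        "out_file": "dgd_cbtn_panel_genomics_file_manifest.txt",
    },
}

for name, entry in manifests.items():
    rows = query_table(entry["table"], entry["file_type"])
    with open(entry["out_file"], "w", newline="") as out:
        csv.writer(out, delimiter="\t").writerows(rows)
```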
43 changes: 43 additions & 0 deletions STUDY_CONFIGS/pbta_all_data_processing_config.json
```diff
@@ -0,0 +1,43 @@
+{
+  "bedtools": "bedtools",
+  "cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
+  "bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
+  "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
+  "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
+  "rna_ext_list": {
+    "expression": "rsem.genes.results.gz",
+    "fusion": "annoFuse_filter.tsv"
+  },
+  "dna_ext_list": {
+    "mutation": "consensus_somatic.norm.annot.public.maf",
+    "copy_number": "controlfreec.CNVs.p.value.txt",
+    "seg": "controlfreec.seg"
+  },
+  "file_loc_defs": {
+    "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
+    "mafs": {
+      "kf": ["annotated_public_outputs", "consensus_public_outputs"],
+      "dgd": "annotated_public",
+      "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
+    },
+    "cnvs": {
+      "pval": "ctrlfreec_pval",
+      "info": "ctrlfreec_info",
+      "seg": "ctrlfreec_bam_seg"
+    },
+    "rsem": "RSEM_gene",
+    "fusion": "annofuse_filtered_fusions_tsv",
+    "dgd_fusion": "fusion-dgd.tsv.gz"
+  },
+  "dl_file_type_list": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs",
+    "ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg", "DGD_MAF"],
+  "ens_gene_list":"/home/ubuntu/tools/kf-cbioportal-etl/REFS/gencode27_gene_list.txt",
+  "script_dir": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/",
+  "cbioportal_validator": "/home/ubuntu/tools/cbioportal/core/src/main/scripts/importer/validateData.py",
+  "cna_flag": 1,
+  "cnv_high_gain": 4,
+  "cnv_min_len": 50000,
+  "rna_flag": 1,
+  "cpus": 8,
+  "threads": 40
+}
```
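A short sketch of loading this new study config and sanity-checking the keys the downstream scripts read; the `REQUIRED_KEYS` set is an assumption drawn from the file contents above, not a check the repo performs:

```python
import json

# Keys the downstream ETL scripts appear to rely on (assumed from the
# file above; the repo itself does no such validation step).
REQUIRED_KEYS = {"file_loc_defs", "dl_file_type_list", "script_dir", "cpus", "threads"}

with open("STUDY_CONFIGS/pbta_all_data_processing_config.json") as f:
    config = json.load(f)

missing = REQUIRED_KEYS - config.keys()
if missing:
    raise SystemExit("config missing keys: {}".format(sorted(missing)))

# This study config lists two KF MAF sources instead of one:
print(config["file_loc_defs"]["mafs"]["kf"])
# ['annotated_public_outputs', 'consensus_public_outputs']
```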
```diff
@@ -16,7 +16,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS.txt"
     },
     "cnvs": {
```
18 changes: 9 additions & 9 deletions scripts/cnv_2_merge.py
```diff
@@ -3,12 +3,10 @@
 import os
 import argparse
 import json
-import subprocess
 import concurrent.futures
 import pandas as pd
 import re
 from get_file_metadata_helper import get_file_metadata
-import pdb


 def process_cnv(cnv_fn, cur_cnv_dict, samp_id):
@@ -43,7 +41,6 @@ def get_ploidy(obj):

 def process_table(cbio_dx, file_meta_dict):
     try:
-
         # project/disease name should be name of directory hosting datasheet
         sys.stderr.write("Processing " + cbio_dx + " project" + "\n")
         new_cnv = open(out_dir + cbio_dx + ".predicted_cnv.txt", "w")
@@ -86,9 +83,10 @@ def process_table(cbio_dx, file_meta_dict):
             new_cnv.write("\t" + ploidy_dict[samp])
         new_cnv.write("\n")
         new_cnv.close()
+        return 0, cbio_tum_id
     except Exception as e:
-        print(e)
-        exit(1)
+        print(e, file=sys.stderr)
+        return 1, cbio_tum_id


@@ -149,10 +147,12 @@ def process_table(cbio_dx, file_meta_dict):
         sys.stderr.write("output dir already exists\n")
     file_meta_dict = get_file_metadata(args.table, "cnv")
     with concurrent.futures.ProcessPoolExecutor(config_data["cpus"]) as executor:
-        results = {
-            executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx
-            for cbio_dx in file_meta_dict
-        }
+        results = { executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx for cbio_dx in file_meta_dict }
+        for result in concurrent.futures.as_completed(results):
+            if result.result()[0]:
+                print("Failed processing " + result.result()[1], file=sys.stderr)
+                exit(1)
+
     # for cbio_dx in file_meta_dict:
     #     process_table(cbio_dx, file_meta_dict)
     # sys.stderr.write("Done, check logs\n")
```
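The change above stops workers from calling `exit(1)` inside the process pool; each worker now returns a `(status, cbio_tum_id)` tuple and the parent decides whether to abort. A self-contained sketch of that pattern, with a placeholder worker standing in for `process_table`:

```python
import concurrent.futures
import sys

def process_table(name):
    """Placeholder worker: returns (status, name) instead of exiting,
    leaving the abort decision to the parent process."""
    try:
        if name == "bad_project":
            raise ValueError("simulated failure")
        return 0, name
    except Exception as e:
        print(e, file=sys.stderr)
        return 1, name

if __name__ == "__main__":
    projects = ["proj_a", "proj_b", "bad_project"]
    with concurrent.futures.ProcessPoolExecutor(2) as executor:
        futures = {executor.submit(process_table, p): p for p in projects}
        for fut in concurrent.futures.as_completed(futures):
            status, name = fut.result()
            if status:
                sys.exit("Failed processing " + name)
```

Returning a status instead of exiting matters here because `exit(1)` in a pool worker only kills that child; the parent would otherwise keep waiting on the remaining futures with no signal that anything failed.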
6 changes: 5 additions & 1 deletion scripts/cnv_3_gistic_style.py
```diff
@@ -8,6 +8,7 @@
 from get_file_metadata_helper import get_file_metadata
 import pandas as pd
 import numpy as np
+import pdb

 parser = argparse.ArgumentParser(
     description="Convert merged cnv values to discrete coded values."
@@ -107,7 +108,10 @@ def mt_adjust_cn(obj):
 for fname in fname_list:
     parts = re.search("^" + args.merged_cnv_dir + "/(.*).predicted_cnv.txt", fname)
     cbio_dx = parts.group(1)
-    data = pd.read_csv(fname, sep="\t")
+    try:
+        data = pd.read_csv(fname, sep="\t")
+    except Exception as e:
+        print(e, file=sys.stderr)
     data.set_index("Hugo_Symbol")
     # sample list would be cbio ids
     samp_list = list(data.columns)[1:]
```
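One caveat with the hunk above: if `pd.read_csv` raises, the script logs the error but continues with `data` unbound, and `data.set_index("Hugo_Symbol")` discards its result unless reassigned. A hedged sketch of a stricter variant (not what the commit does):

```python
import sys
import pandas as pd

def load_cnv_table(fname):
    """Read a merged CNV table, failing loudly instead of continuing
    with an undefined frame, and keep the indexed result."""
    try:
        data = pd.read_csv(fname, sep="\t")
    except Exception as e:
        sys.exit("Could not read {}: {}".format(fname, e))
    return data.set_index("Hugo_Symbol")
```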
3 changes: 3 additions & 0 deletions scripts/genomics_file_cbio_package_build.py
```diff
@@ -45,6 +45,9 @@ def process_maf(maf_loc_dict, cbio_id_table, data_config_file, dgd_status):
     maf_dir = maf_loc_dict["kf"]
     if args.dgd_status == "dgd":
         maf_dir = maf_loc_dict["dgd"]
+    else:
+        # KF can be in multiple palces
+        maf_dir = ",".join(maf_dir)
     maf_header = maf_loc_dict["header"]
     maf_cmd = "{}maf_merge.py -t {} -i {} -m {} -j {} -f {} 2> collate_mafs.log".format(
         script_dir, cbio_id_table, maf_header, maf_dir, data_config_file, dgd_status
```
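Since `maf_loc_dict["kf"]` is now a list, the non-DGD branch joins it with commas before shelling out to `maf_merge.py -m`. A toy illustration of the hand-off (simplified from the repo script; the values are made up):

```python
def build_maf_dir_arg(maf_loc_dict, dgd_status):
    """Mirror the hand-off above: KF MAF locations are a list, joined
    with commas so maf_merge.py can split them back apart. Simplified
    sketch; not the repo function."""
    if dgd_status == "dgd":
        return maf_loc_dict["dgd"]  # DGD source is still a single string
    return ",".join(maf_loc_dict["kf"])

# Made-up values:
print(build_maf_dir_arg(
    {"kf": ["annotated_public_outputs", "consensus_public_outputs"],
     "dgd": "annotated_public"},
    "kf"))
# annotated_public_outputs,consensus_public_outputs
```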
55 changes: 22 additions & 33 deletions scripts/maf_merge.py
```diff
@@ -24,9 +24,7 @@ def filter_entry(entry, tum_id, norm_id, tid_idx, nid_idx, v_idx, h_idx, maf_exc
     return None


-def process_maf(
-    maf_fn, new_maf, maf_exc, tum_id, norm_id
-):
+def process_maf(maf_fn, new_maf, maf_exc, tum_id, norm_id):
     """
     Iterate over maf file, skipping header lines since the files are being merged.
     With possiblility of mixed source, search headers
@@ -69,9 +67,7 @@ def process_maf(
     cur_maf.close()


-def process_tbl(
-    cbio_dx, file_meta_dict, print_head
-):
+def process_tbl(cbio_dx, file_meta_dict, print_head):
     """
     Probaby a less likely scenario, but can split out into multiple projects based on dict
     """
@@ -84,28 +80,10 @@ def process_tbl(cbio_dx, file_meta_dict, print_head):
     for cbio_tum_id in file_meta_dict[cbio_dx]:
         cbio_norm_id = file_meta_dict[cbio_dx][cbio_tum_id]["cbio_norm_id"]
         fname = file_meta_dict[cbio_dx][cbio_tum_id]["fname"]
-        sys.stderr.write(
-            "Found relevant maf to process for "
-            + " "
-            + cbio_tum_id
-            + " "
-            + cbio_norm_id
-            + " "
-            + file_meta_dict[cbio_dx][cbio_tum_id]["kf_tum_id"]
-            + " "
-            + file_meta_dict[cbio_dx][cbio_tum_id]["kf_norm_id"]
-            + " "
-            + fname
-            + "\n"
-        )
-        sys.stderr.flush()
-        process_maf(
-            maf_dir + fname,
-            new_maf,
-            maf_exc,
-            cbio_tum_id,
-            cbio_norm_id,
-        )
+        print("Found relevant maf to process for {} {} {} {} {}".format(
+            cbio_tum_id, cbio_norm_id, file_meta_dict[cbio_dx][cbio_tum_id]["kf_tum_id"], file_meta_dict[cbio_dx][cbio_tum_id]["kf_norm_id"], fname),
+            file=sys.stderr)
+        process_maf(maf_dir + fname, new_maf, maf_exc, cbio_tum_id, cbio_norm_id)
         x += 1
     sys.stderr.write(
         "Completed processing " + str(x) + " entries in " + cbio_dx + "\n"
@@ -130,7 +108,7 @@ def process_tbl(
     "-i", "--header", action="store", dest="header", help="File with maf header only"
 )
 parser.add_argument(
-    "-m", "--maf-dir", action="store", dest="maf_dir", help="maf file directory"
+    "-m", "--maf-dirs", action="store", dest="maf_dirs", help="comma-separated list of maf file directories"
 )
 parser.add_argument(
     "-j",
@@ -155,10 +133,21 @@ def process_tbl(
 args = parser.parse_args()
 with open(args.config_file) as f:
     config_data = json.load(f)
-# get maf file ext
-maf_dir = args.maf_dir
-if maf_dir[-1] != "/":
-    maf_dir += "/"
+# Create symlinks to mafs in one place for ease of processing
+maf_dir = "MAFS/"
+maf_dirs_in = args.maf_dirs
+print("Symlinking maf files from {} to {}".format(maf_dirs_in, maf_dir), file=sys.stderr)
+os.makedirs("MAFS", exist_ok=True)
+for dirname in maf_dirs_in.split(","):
+    abs_path = os.path.abspath(dirname)
+    for fname in os.listdir(dirname):
+        try:
+            src = os.path.join(abs_path, fname)
+            dest = os.path.join(maf_dir, fname)
+            os.symlink(src, dest)
+        except Exception as e:
+            print(e, file=sys.stderr)
+            print("Could not sym link {} in {}".format(fname, dirname))
 # If DGD maf only, else if both, dgd maf wil be handled separately, or not at all if no dgd and kf only

 file_meta_dict = get_file_metadata(args.table, "DGD_MAF")
```
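The new block replaces the single `--maf-dir` argument with a comma-separated `--maf-dirs` and symlinks every input MAF into one `MAFS/` working directory, logging rather than raising on collisions. A condensed, standalone sketch of the same consolidation pattern (the directory names in the usage line are illustrative):

```python
import os
import sys

def consolidate_mafs(src_dirs, dest_dir="MAFS"):
    """Symlink every file from each source directory into dest_dir,
    logging (not raising) on collisions or unreadable directories."""
    os.makedirs(dest_dir, exist_ok=True)
    for dirname in src_dirs:
        abs_path = os.path.abspath(dirname)
        try:
            fnames = os.listdir(dirname)
        except OSError as e:
            print("Skipping {}: {}".format(dirname, e), file=sys.stderr)
            continue
        for fname in fnames:
            try:
                os.symlink(os.path.join(abs_path, fname), os.path.join(dest_dir, fname))
            except OSError as e:
                print("Could not symlink {} from {}: {}".format(fname, dirname, e), file=sys.stderr)

# Illustrative directory names, mirroring the comma-split in maf_merge.py:
consolidate_mafs("annotated_public_outputs,consensus_public_outputs".split(","))
```

Using symlinks rather than copies keeps the working directory cheap to build, and a name collision between source directories surfaces as a logged `FileExistsError` instead of silently overwriting a file.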
