From ca1a5e06736b737a848f11bc19b77e6526b7bee6 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Thu, 22 Jun 2023 11:56:53 -0400
Subject: [PATCH 1/6] :construction: began work of refactoring how data clinical file headers are appended

---
 REFS/template_patient_header.txt             |  5 +++
 REFS/template_sample_header.txt              |  5 +++
 STUDY_CONFIGS/pbta_all_case_meta_config.json |  4 +-
 scripts/get_study_metadata.py                | 39 ++++++++++++--------
 4 files changed, 35 insertions(+), 18 deletions(-)
 create mode 100644 REFS/template_patient_header.txt
 create mode 100644 REFS/template_sample_header.txt

diff --git a/REFS/template_patient_header.txt b/REFS/template_patient_header.txt
new file mode 100644
index 0000000..3a69d04
--- /dev/null
+++ b/REFS/template_patient_header.txt
@@ -0,0 +1,5 @@
+#PATIENT_ID	GENDER	RACE	ETHNICITY	AGE	OS_MONTHS	OS_STATUS	cancer_predispositions	EXTERNAL_PATIENT_ID	germline_sex_estimate
+#Patient identifier	reported gender	race	ethnicity	Age at which the condition or disease was first diagnosed, in years	Overall survival in months since initial diagnosis	Overall patient survival status	cancer predispositions	External Patient Identifier	germline sex estimate
+#STRING	STRING	STRING	STRING	NUMBER	NUMBER	STRING	STRING	STRING	STRING
+#99	10	7	6	9	3	4	2	11	1
+PATIENT_ID	GENDER	RACE	ETHNICITY	AGE	OS_MONTHS	OS_STATUS	CANCER_PREDISPOSITIONS	EXTERNAL_PATIENT_ID	GERMLINE_SEX_ESTIMATE
diff --git a/REFS/template_sample_header.txt b/REFS/template_sample_header.txt
new file mode 100644
index 0000000..c586dc0
--- /dev/null
+++ b/REFS/template_sample_header.txt
@@ -0,0 +1,5 @@
+#Patient Identifier	COLLECTION_EVENT_ID	Sample Identifier	SPECIMEN_ID	CANCER_TYPE	CANCER_TYPE_DETAILED	ONCOTREE_CODE	TUMOR_TISSUE_SITE	TUMOR_TYPE	SAMPLE_TYPE	MATCHED_NORMAL_SAMPLE_ID	MATCHED_NORMAL_SPECIMEN_ID	CBTN_TUMOR_TYPE	MOLECULAR_SUBTYPE	HARMONIZED_DIAGNOSIS	BROAD_HISTOLOGY	CANCER_GROUP	EXPERIMENT_STRATEGY	pathology_free_text_diagnosis	tumor_fraction	tumor_ploidy	CNS_region
+#Patient identifier	Unifying ID of child sequencing events from a single biological sample	Sample Identifier using external_sample_id	kfdrc tumor biospecimen ID	Study-defined cancer type	Study-defined cancer type detail	OncoTree alphanumeric code value for CANCER_TYPE_DETAILED	tumor tissue location	primary v metastatic tumor designation	patient tissue sample or cell line	matched normal external_sample_id	kfdrc matched normal biospecimen ID	CBTN-assigned tumor type	contains molecular subtypes for tumor types selected from pathology_diagnosis and pathology_free_text_diagnosis fields, following World Health Organization 2016 classification criteria	Disease with molecular subtype and grade	broad histology	cancer group	sequencing strategies of sample represented by this SAMPLE_ID	pathology free text diagnosis	tumor fraction	tumor ploidy	CNS region
+#STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	NUMBER	NUMBER	STRING
+#14	3	13	12	11	10	9	8	7	6	2	1	7	5	5	5	5	6	1	3	4	5
+PATIENT_ID	COLLECTION_EVENT_ID	SAMPLE_ID	SPECIMEN_ID	CANCER_TYPE	CANCER_TYPE_DETAILED	ONCOTREE_CODE	TUMOR_TISSUE_TYPE	TUMOR_TYPE	SAMPLE_TYPE	MATCHED_NORMAL_SAMPLE_ID	MATCHED_NORMAL_SPECIMEN_ID	CBTN_TUMOR_TYPE	MOLECULAR_SUBTYPE	HARMONIZED_DIAGNOSIS	BROAD_HISTOLOGY	CANCER_GROUP	EXPERIMENT_STRATEGY	PATHOLOGY_FREE_TEXT_DIAGNOSIS	TUMOR_FRACTION	TUMOR_PLOIDY	CNS_REGION
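These templates mirror cBioPortal's five-row clinical header convention: display names, descriptions, STRING/NUMBER datatypes, display priorities, and finally the attribute IDs that have to line up with the columns of the warehouse table being exported. A minimal sketch of how such a template can be trimmed to the columns actually present in a given study table, assuming tab-delimited files with the attribute IDs on the last row (the helper name and example arguments are illustrative, not code from this patch):

```python
def subset_template_header(template_path, table_columns):
    """Return the 5 template rows restricted to the attributes found in table_columns."""
    with open(template_path) as f:
        rows = [line.rstrip('\n').split('\t') for line in f]
    attr_row = rows[-1]  # attribute IDs, e.g. PATIENT_ID, GENDER, ...
    keep = [attr_row.index(col) for col in table_columns if col in attr_row]
    return ["\t".join(row[i] for i in keep) for row in rows]

# e.g. subset_template_header("REFS/template_patient_header.txt", ["PATIENT_ID", "AGE", "OS_STATUS"])
```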
diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json
index ff39f76..8dedcc9 100644
--- a/STUDY_CONFIGS/pbta_all_case_meta_config.json
+++ b/STUDY_CONFIGS/pbta_all_case_meta_config.json
@@ -236,14 +236,14 @@
             }
         },
         "sample_head": {
-            "table": "bix_workflows.data_clinical_sample_header"
+            "table": "template_sample_header.txt"
         },
         "sample_file": {
             "table": "prod_cbio.pbta_all_data_clinical_sample",
             "out_file": "data_clinical_sample.txt"
         },
         "patient_head": {
-            "table": "bix_workflows.data_clinical_patient_header"
+            "table": "template_patient_header.txt"
         },
         "patient_file": {
             "table": "prod_cbio.pbta_all_data_clinical_patient",
diff --git a/scripts/get_study_metadata.py b/scripts/get_study_metadata.py
index dfa0828..1585187 100644
--- a/scripts/get_study_metadata.py
+++ b/scripts/get_study_metadata.py
@@ -8,6 +8,7 @@
 from configparser import ConfigParser
 import argparse
 import json
+import sys
 import pdb
 
 
@@ -59,7 +60,7 @@ def generic_print(out_file, rows, colnames):
 
     return 0
 
-def get_data_clinical(db_cur, config_dict, prefix):
+def get_data_clinical(db_cur, config_dict, prefix, ref_dir):
     """
     Depending on the prefix of patient or sample, will pull from related tables,
     only use related header info present in table, and print the combined results.
     """
@@ -74,19 +75,22 @@
     (rows, colnames) = generic_pull(db_cur, tbl_name)
 
     # use table header from colnames, and use to select file header
-    head_name = config_dict['database_pulls'][prefix + '_head']['table']
-    # get sample table contents, have to split if format schema.table
-    if '.' not in head_name:
-        head_sql = sql.SQL('SELECT {} FROM {};').format(sql.SQL(',').join(map(sql.Identifier, colnames)), sql.Identifier(head_name))
-    else:
-        (schema, table) = head_name.split('.')
-        head_sql = sql.SQL('SELECT {} FROM {}.{};').format(sql.SQL(',').join(map(sql.Identifier, colnames)), sql.Identifier(schema), sql.Identifier(table))
-    db_cur.execute(head_sql)
-    head = db_cur.fetchall()
+    head_file = open(ref_dir + config_dict['database_pulls'][prefix + '_head']['table'])
+    # get and read head file
+    head_lines = head_file.readlines()
     # create output file and combine results for final product
     out_file = open(datasheet_dir + "/" + config_data['database_pulls'][prefix + '_file']['out_file'], 'w')
-    for row in head:
-        out_file.write("\t".join(row) + "\n")
+    # get indices of matching head lines, then print corresponding cBio header values
+    col_i = []
+    head_search = head_lines[-1].rstrip('\n').split('\t')
+    for col in colnames:
+        col_i.append(head_search.index(col))
+    if len(col_i) != len(colnames):
+        print("ERROR! Number header columns found {} != {} number of columns in {} data clinical sheet. Check your header file!".format(len(col_i), len(colnames), prefix), file=sys.stderr)
+    for i in range(0, len(head_lines) -1, 1):
+        head = [head_lines[i].rstrip('\n').split('\t')[j] for j in col_i]
+        out_file.write("\t".join(head) + "\n")
+    pdb.set_trace()
     generic_print(out_file, rows, colnames)
 
     return 0
@@ -122,7 +126,8 @@ def get_manifests(db_cur, config_dict):
 
 parser.add_argument("-d", "--db-ini", action="store", dest="db_ini", help="Database config file - formatting like aws or sbg creds")
 parser.add_argument("-p", "--profile", action="store", dest="profile", help="ini profile name", default="postgresql")
-parser.add_argument("-c", "--config", action="store", dest="config_file", help="json config file with meta information; see REFS/pbta_all_case_meta_config.json example",)
+parser.add_argument("-c", "--config", action="store", dest="config_file", help="json config file with meta information; see REFS/pbta_all_case_meta_config.json example")
+parser.add_argument("-r", "--ref-dir", action="store", dest="ref_dir", help="dir name containing template data_clinical* header files")
 
 args = parser.parse_args()
 # Load database login info
@@ -137,13 +142,15 @@ def get_manifests(db_cur, config_dict):
 
 # dict to track keys with specific database calls
 special_keys = {"sample_head": 0, "sample_file": 0, "patient_head": 0, "patient_file": 0, "manifests": 0}
-
+ref_dir = args.ref_dir
+if ref_dir[-1] != '/':
+    ref_dir += '/'
 try:
     os.mkdir(datasheet_dir)
 except Exception as e:
     print(str(e) + ' IGNORE!')
-get_data_clinical(cur, config_data, 'sample')
-get_data_clinical(cur, config_data, 'patient')
+get_data_clinical(cur, config_data, 'sample', ref_dir)
+get_data_clinical(cur, config_data, 'patient', ref_dir)
 get_manifests(cur, config_data)
 
 # For all other tables to be printed simply, not in special_keys

From 749a384343ec23ee8815e83c7291803caec0f2d6 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Fri, 23 Jun 2023 12:06:47 -0400
Subject: [PATCH 2/6] :pencil: update template and docs

---
 README.md                                    |  4 +++-
 REFS/template_patient_header.txt             | 10 +++++-----
 STUDY_CONFIGS/pbta_all_case_meta_config.json |  2 +-
 scripts/get_study_metadata.py                |  4 +---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index dbdf4e3..7565dbd 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Further general loading notes can be found in this [Notion page](https://www.not
 See [below](#collaborative-and-publication-workflows) for special cases like publications or collaborative efforts
 ## I have everything and I know I am doing
 Below assumes you have already created the necessary tables from dbt
-1. Run commands as outlined in `scripts/get_study_metadata.py`. Copy/move those files to the cBio loader ec2 instance
+1. Run commands as outlined in [scripts/get_study_metadata.py](#scriptsget_study_metadatapy). Copy/move those files to the cBio loader ec2 instance
 1. Copy over the appropriate aws account key and download files.
 Example using `pbta_all` study:
 ```sh
@@ -107,6 +107,8 @@ optional arguments:
                         ini profile name
   -c CONFIG_FILE, --config CONFIG_FILE
                         json config file with meta information; see REFS/pbta_all_case_meta_config.json example
+  -r REF_DIR, --ref-dir REF_DIR
+                        dir name containing template data_clinical* header files
 ```
 
 ### From D3b Warehouse
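For reference, a typical invocation with the new --ref-dir option might look like the following; the database ini file name is a placeholder, and -p can be omitted since postgresql is the default profile:

```sh
python3 scripts/get_study_metadata.py \
  -d database.ini \
  -c STUDY_CONFIGS/pbta_all_case_meta_config.json \
  -r REFS/
```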
diff --git a/REFS/template_patient_header.txt b/REFS/template_patient_header.txt
index 3a69d04..3f9e8d7 100644
--- a/REFS/template_patient_header.txt
+++ b/REFS/template_patient_header.txt
@@ -1,5 +1,5 @@
-#PATIENT_ID	GENDER	RACE	ETHNICITY	AGE	OS_MONTHS	OS_STATUS	cancer_predispositions	EXTERNAL_PATIENT_ID	germline_sex_estimate
-#Patient identifier	reported gender	race	ethnicity	Age at which the condition or disease was first diagnosed, in years	Overall survival in months since initial diagnosis	Overall patient survival status	cancer predispositions	External Patient Identifier	germline sex estimate
-#STRING	STRING	STRING	STRING	NUMBER	NUMBER	STRING	STRING	STRING	STRING
-#99	10	7	6	9	3	4	2	11	1
-PATIENT_ID	GENDER	RACE	ETHNICITY	AGE	OS_MONTHS	OS_STATUS	CANCER_PREDISPOSITIONS	EXTERNAL_PATIENT_ID	GERMLINE_SEX_ESTIMATE
+#Patient Identifier	External Patient Identifier	SEX	RACE	ETHNICITY	AGE	AGE_IN_DAYS	OS_STATUS	OS_MONTHS	EFS_MONTHS	EFS_STATUS	germline_sex_estimate	cancer_predispositions
+#Patient identifier	Patient ID used by generator of data	Sex of the patient	racial demographic	ethnic demographic	Age at which the condition or disease was first diagnosed, in years	Patient age in days at initial diagnosis	Overall patient survival status	Overall survival in months since initial diagnosis	Event free (months) since initial treatment	Event free status	germline sex estimate	cancer predispositions
+#STRING	STRING	STRING	STRING	STRING	NUMBER	NUMBER	STRING	NUMBER	NUMBER	STRING	STRING	STRING
+#11	10	9	5	4	8	7	3	2	1	1	1	2
+PATIENT_ID	EXTERNAL_PATIENT_ID	SEX	RACE	ETHNICITY	AGE	AGE_IN_DAYS	OS_STATUS	OS_MONTHS	EFS_MONTHS	EFS_STATUS	GERMLINE_SEX_ESTIMATE	CANCER_PREDISPOSITIONS
diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json
index 8dedcc9..9566450 100644
--- a/STUDY_CONFIGS/pbta_all_case_meta_config.json
+++ b/STUDY_CONFIGS/pbta_all_case_meta_config.json
@@ -25,7 +25,7 @@
             "out_file": "Desired output file name"
         }
     },
-    "x_head": "Special header file table for data_clinical(sample/patient). cBio data_clinical headers have 5 header rows, and which columns are used are determined by the x_file table",
+    "x_head": "Special header tsv file for data_clinical(sample/patient). cBio data_clinical headers have 5 header rows, and which columns are used are determined by the x_file table. Should be just the file name",
     "x_file": "sample or patient tables with corresponding metadata at the sample and patient levels",
     "genomics_etl": "a helper file with relevant cBio sample names and individual genomic files names for ETL merging",
     "seq_center": "only if project has RNA data, a helper file to fill in missing sequencing center information for genomics etl",
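Since `x_head` now holds only a file name, the script resolves it against the directory passed with -r by simple string concatenation, appending a trailing slash when missing. A sketch of an equivalent using os.path.join, which tolerates either form of the directory argument (the helper name is hypothetical, not part of this series):

```python
import os

def resolve_header_template(ref_dir, config_dict, prefix):
    """Full path to the template header file named under <prefix>_head in the config."""
    name = config_dict['database_pulls'][prefix + '_head']['table']
    return os.path.join(ref_dir, name)  # works with or without a trailing '/' on ref_dir
```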
diff --git a/scripts/get_study_metadata.py b/scripts/get_study_metadata.py
index 1585187..9deb47b 100644
--- a/scripts/get_study_metadata.py
+++ b/scripts/get_study_metadata.py
@@ -82,15 +82,13 @@ def get_data_clinical(db_cur, config_dict, prefix, ref_dir):
     out_file = open(datasheet_dir + "/" + config_data['database_pulls'][prefix + '_file']['out_file'], 'w')
     # get indices of matching head lines, then print corresponding cBio header values
     col_i = []
+    # the last row, and the header of the data clinical table should have overlapping values
     head_search = head_lines[-1].rstrip('\n').split('\t')
     for col in colnames:
         col_i.append(head_search.index(col))
-    if len(col_i) != len(colnames):
-        print("ERROR! Number header columns found {} != {} number of columns in {} data clinical sheet. Check your header file!".format(len(col_i), len(colnames), prefix), file=sys.stderr)
     for i in range(0, len(head_lines) -1, 1):
         head = [head_lines[i].rstrip('\n').split('\t')[j] for j in col_i]
         out_file.write("\t".join(head) + "\n")
-    pdb.set_trace()
     generic_print(out_file, rows, colnames)
 
     return 0
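Dropping the length check is safe because it could never trigger: `head_search.index(col)` raises ValueError at the first column missing from the template, before the comparison is ever reached. If a friendlier message is wanted, a pre-check along these lines (a sketch using the variables from the hunk above, not code from this series) would do it:

```python
missing = [col for col in colnames if col not in head_search]
if missing:
    raise ValueError("Columns {} from the {} table are not in the template header file".format(missing, prefix))
```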
From 73a198f309dad54b4e8c570f8c5923bb0ae85a49 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Thu, 29 Jun 2023 11:20:22 -0400
Subject: [PATCH 3/6] :pencil: update configs/refs :hammer: update get files to get active default

---
 REFS/aws_bucket_key_pairs.txt                |  1 +
 STUDY_CONFIGS/pbta_all_case_meta_config.json |  5 +++++
 scripts/get_study_metadata.py                | 20 ++++++++++++++------
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/REFS/aws_bucket_key_pairs.txt b/REFS/aws_bucket_key_pairs.txt
index 82ac5d6..151467f 100644
--- a/REFS/aws_bucket_key_pairs.txt
+++ b/REFS/aws_bucket_key_pairs.txt
@@ -2,4 +2,5 @@ s3://cds-246-phs002517-p30-fy20	NCI-AR
 s3://cds-246-phs002517-sequencefiles-p30-fy20	NCI-AR
 s3://cds-306-phs002517-x01	NCI-X01
 s3://d3b-cds-working-bucket	d3b
+s3://kf-strides-study-us-east-1-prd-sd-bhjxbdqk kf
 s3://kf-study-us-east-1-prd-sd-8y99qzjj	saml
\ No newline at end of file
diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json
index 9566450..7638316 100644
--- a/STUDY_CONFIGS/pbta_all_case_meta_config.json
+++ b/STUDY_CONFIGS/pbta_all_case_meta_config.json
@@ -229,6 +229,11 @@
             "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
             "out_file": "x01_genomics_file_manifest.txt"
         },
+        "cbtn_extra": {
+            "table": "bix_genomics_file.sd_bhjxbdqk_x01_extra-genomics_file_manifest",
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "out_file": "cbtn_extra_genomics_file_manifest.txt"
+        },
         "pnoc": {
             "table": "bix_genomics_file.sd_8y99qzjj-genomics_file_manifest",
             "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
diff --git a/scripts/get_study_metadata.py b/scripts/get_study_metadata.py
index 9deb47b..e48f22c 100644
--- a/scripts/get_study_metadata.py
+++ b/scripts/get_study_metadata.py
@@ -8,8 +8,6 @@
 from configparser import ConfigParser
 import argparse
 import json
-import sys
-import pdb
 
 
 def config(filename='database.ini', section='postgresql'):
@@ -103,11 +101,19 @@ def get_manifests(db_cur, config_dict):
         try:
             tbl_name = manifests[manifest]['table']
             file_types = manifests[manifest]['file_type']
-            if '.' not in tbl_name:
-                manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({});').format(sql.Identifier(tbl_name), sql.SQL(',').join(map(sql.Literal, file_types)))
+            if args.all:
+                if '.' not in tbl_name:
+                    manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({});').format(sql.Identifier(tbl_name), sql.SQL(',').join(map(sql.Literal, file_types)))
+                else:
+                    (schema, table) = tbl_name.split('.')
+                    manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({});').format(sql.Identifier(schema), sql.Identifier(table), sql.SQL(',').join(map(sql.Literal, file_types)))
             else:
-                (schema, table) = tbl_name.split('.')
-                manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({});').format(sql.Identifier(schema), sql.Identifier(table), sql.SQL(',').join(map(sql.Literal, file_types)))
+                if '.' not in tbl_name:
+                    manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({}) and status={};').format(sql.Identifier(tbl_name), sql.SQL(',').join(map(sql.Literal, file_types)), sql.Literal("active"))
+                else:
+                    (schema, table) = tbl_name.split('.')
+                    manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({}) and status={};').format(sql.Identifier(schema), sql.Identifier(table), sql.SQL(',').join(map(sql.Literal, file_types)), sql.Literal("active"))
+
             db_cur.execute(manifest_sql)
             rows = db_cur.fetchall()
             colnames = [desc[0] for desc in db_cur.description]
@@ -126,10 +132,12 @@ def get_manifests(db_cur, config_dict):
 parser.add_argument("-p", "--profile", action="store", dest="profile", help="ini profile name", default="postgresql")
 parser.add_argument("-c", "--config", action="store", dest="config_file", help="json config file with meta information; see REFS/pbta_all_case_meta_config.json example")
 parser.add_argument("-r", "--ref-dir", action="store", dest="ref_dir", help="dir name containing template data_clinical* header files")
+parser.add_argument("-a", "--all", action="store_true", dest="all", help="flag to include all relevant files, not just status=active files, NOT RECOMMENDED")
 
 args = parser.parse_args()
 # Load database login info
 params = config(filename=args.db_ini, section=args.profile)
+print(args.all)
 datasheet_dir = 'datasheets'
 # Load json config file with database pull info
 with open(args.config_file) as f:
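The four query branches above differ only in how the table identifier is built and whether the status filter is applied. With psycopg2 2.8+, sql.Identifier accepts a qualified name as separate parts and composed statements can be concatenated, so the same behavior could be sketched in a single path (variable names follow the hunk above; this is an alternative sketch, not the patch's code):

```python
from psycopg2 import sql

# handles both "table" and "schema.table"
ident = sql.Identifier(*tbl_name.split('.'))
manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({})').format(
    ident, sql.SQL(',').join(map(sql.Literal, file_types)))
if not args.all:
    # default behavior: only files flagged as active
    manifest_sql = manifest_sql + sql.SQL(' and status = {}').format(sql.Literal('active'))
db_cur.execute(manifest_sql)
```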
From 9b54eaeac1652583ff7964f7e3907fd5def89740 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Thu, 29 Jun 2023 11:22:41 -0400
Subject: [PATCH 4/6] :wrench: fixed space/tab

---
 REFS/aws_bucket_key_pairs.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/REFS/aws_bucket_key_pairs.txt b/REFS/aws_bucket_key_pairs.txt
index 151467f..ebd78a8 100644
--- a/REFS/aws_bucket_key_pairs.txt
+++ b/REFS/aws_bucket_key_pairs.txt
@@ -2,5 +2,5 @@ s3://cds-246-phs002517-p30-fy20	NCI-AR
 s3://cds-246-phs002517-sequencefiles-p30-fy20	NCI-AR
 s3://cds-306-phs002517-x01	NCI-X01
 s3://d3b-cds-working-bucket	d3b
-s3://kf-strides-study-us-east-1-prd-sd-bhjxbdqk kf
+s3://kf-strides-study-us-east-1-prd-sd-bhjxbdqk	kf
 s3://kf-study-us-east-1-prd-sd-8y99qzjj	saml
\ No newline at end of file

From c7153f4d949e832d023ac467054f75036920ff5f Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Thu, 29 Jun 2023 15:53:54 -0400
Subject: [PATCH 5/6] :wrench: catch and fix fusion ext

---
 scripts/convert_fusion_as_sv.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/convert_fusion_as_sv.py b/scripts/convert_fusion_as_sv.py
index de8df24..0b7c35c 100755
--- a/scripts/convert_fusion_as_sv.py
+++ b/scripts/convert_fusion_as_sv.py
@@ -197,11 +197,14 @@ def init_cbio_master(fusion_results, mode, rna_metadata):
 if args.mode == 'openX':
     r_ext = "rsem"
 elif args.mode == 'dgd':
-    r_ext = "rsem"
+    r_ext = "DGD_FUSION"
 # ensure sample name is imported as str
 all_file_meta = pd.read_csv(args.table, sep="\t", dtype={'Cbio_Tumor_Name': str})
-
+# ext used in pbta vs openpedcan varies
 rna_subset = all_file_meta.loc[all_file_meta["File_Type"] == r_ext]
+if rna_subset.empty:
+    r_ext = 'rsem'
+    rna_subset = all_file_meta.loc[all_file_meta["File_Type"] == r_ext]
 # reset index so that references work later while iterating
 rna_subset = rna_subset.reset_index(drop=True)
 project_list = rna_subset.Cbio_project.unique()

From e0f0fe7e8b5d8d80d238a92792bba8de64cb2903 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Wed, 19 Jul 2023 13:12:29 -0400
Subject: [PATCH 6/6] :broom: rm likely debug statement

---
 scripts/get_study_metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/get_study_metadata.py b/scripts/get_study_metadata.py
index e48f22c..7c82f9b 100644
--- a/scripts/get_study_metadata.py
+++ b/scripts/get_study_metadata.py
@@ -137,7 +137,7 @@ def get_manifests(db_cur, config_dict):
 args = parser.parse_args()
 # Load database login info
 params = config(filename=args.db_ini, section=args.profile)
-print(args.all)
+
 datasheet_dir = 'datasheets'
 # Load json config file with database pull info
 with open(args.config_file) as f: