diff --git a/README.md b/README.md index dbdf4e3..7565dbd 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Further general loading notes can be found in this [Notion page](https://www.not See [below](#collaborative-and-publication-workflows) for special cases like publications or collaborative efforts ## I have everything and I know I am doing Below assumes you have already created the necessary tables from dbt -1. Run commands as outlined in `scripts/get_study_metadata.py`. Copy/move those files to the cBio loader ec2 instance +1. Run commands as outlined in [scripts/get_study_metadata.py](#scriptsget_study_metadatapy). Copy/move those files to the cBio loader ec2 instance 1. Copy over the appropriate aws account key and download files. Example using `pbta_all` study: ```sh @@ -107,6 +107,8 @@ optional arguments: ini profile name -c CONFIG_FILE, --config CONFIG_FILE json config file with meta information; see REFS/pbta_all_case_meta_config.json example + -r REF_DIR, --ref-dir REF_DIR + dir name containing template data_clinical* header files ``` ### From D3b Warehouse diff --git a/REFS/aws_bucket_key_pairs.txt b/REFS/aws_bucket_key_pairs.txt index 82ac5d6..ebd78a8 100644 --- a/REFS/aws_bucket_key_pairs.txt +++ b/REFS/aws_bucket_key_pairs.txt @@ -2,4 +2,5 @@ s3://cds-246-phs002517-p30-fy20 NCI-AR s3://cds-246-phs002517-sequencefiles-p30-fy20 NCI-AR s3://cds-306-phs002517-x01 NCI-X01 s3://d3b-cds-working-bucket d3b +s3://kf-strides-study-us-east-1-prd-sd-bhjxbdqk kf s3://kf-study-us-east-1-prd-sd-8y99qzjj saml \ No newline at end of file diff --git a/REFS/template_patient_header.txt b/REFS/template_patient_header.txt new file mode 100644 index 0000000..3f9e8d7 --- /dev/null +++ b/REFS/template_patient_header.txt @@ -0,0 +1,5 @@ +#Patient Identifier External Patient Identifier SEX RACE ETHNICITY AGE AGE_IN_DAYS OS_STATUS OS_MONTHS EFS_MONTHS EFS_STATUS germline_sex_estimate cancer_predispositions +#Patient identifier Patient ID used by generator of data Sex of 
the patient racial demographic ethnic demographic Age at which the condition or disease was first diagnosed, in years Patient age in days at initial diagnosis Overall patient survival status Overall survival in months since initial diagnosis Event free (months) since initial treatment Event free status germline sex estimate cancer predispositions +#STRING STRING STRING STRING STRING NUMBER NUMBER STRING NUMBER NUMBER STRING STRING STRING +#11 10 9 5 4 8 7 3 2 1 1 1 2 +PATIENT_ID EXTERNAL_PATIENT_ID SEX RACE ETHNICITY AGE AGE_IN_DAYS OS_STATUS OS_MONTHS EFS_MONTHS EFS_STATUS GERMLINE_SEX_ESTIMATE CANCER_PREDISPOSITIONS diff --git a/REFS/template_sample_header.txt b/REFS/template_sample_header.txt new file mode 100644 index 0000000..c586dc0 --- /dev/null +++ b/REFS/template_sample_header.txt @@ -0,0 +1,5 @@ +#Patient Identifier COLLECTION_EVENT_ID Sample Identifier SPECIMEN_ID CANCER_TYPE CANCER_TYPE_DETAILED ONCOTREE_CODE TUMOR_TISSUE_SITE TUMOR_TYPE SAMPLE_TYPE MATCHED_NORMAL_SAMPLE_ID MATCHED_NORMAL_SPECIMEN_ID CBTN_TUMOR_TYPE MOLECULAR_SUBTYPE HARMONIZED_DIAGNOSIS BROAD_HISTOLOGY CANCER_GROUP EXPERIMENT_STRATEGY pathology_free_text_diagnosis tumor_fraction tumor_ploidy CNS_region +#Patient identifier Unifying ID of child sequencing events from a single biological sample Sample Identifier using external_sample_id kfdrc tumor biospecimen ID Study-defined cancer type Study-defined cancer type detail OncoTree alphanumeric code value for CANCER_TYPE_DETAILED tumor tissue location primary v metastatic tumor designation patient tissue sample or cell line matched normal external_sample_id kfdrc matched normal biospecimen ID CBTN-assigned tumor type contains molecular subtypes for tumor types selected from pathology_diagnosis and pathology_free_text_diagnosis fields, following World Health Organization 2016 classification criteria Disease with molecular subtype and grade broad histology cancer group sequencing strategies of sample represented by this SAMPLE_ID pathology
free text diagnosis tumor fraction tumor ploidy CNS region +#STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING NUMBER NUMBER STRING +#14 3 13 12 11 10 9 8 7 6 2 1 7 5 5 5 5 6 1 3 4 5 +PATIENT_ID COLLECTION_EVENT_ID SAMPLE_ID SPECIMEN_ID CANCER_TYPE CANCER_TYPE_DETAILED ONCOTREE_CODE TUMOR_TISSUE_TYPE TUMOR_TYPE SAMPLE_TYPE MATCHED_NORMAL_SAMPLE_ID MATCHED_NORMAL_SPECIMEN_ID CBTN_TUMOR_TYPE MOLECULAR_SUBTYPE HARMONIZED_DIAGNOSIS BROAD_HISTOLOGY CANCER_GROUP EXPERIMENT_STRATEGY PATHOLOGY_FREE_TEXT_DIAGNOSIS TUMOR_FRACTION TUMOR_PLOIDY CNS_REGION diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json index ff39f76..7638316 100644 --- a/STUDY_CONFIGS/pbta_all_case_meta_config.json +++ b/STUDY_CONFIGS/pbta_all_case_meta_config.json @@ -25,7 +25,7 @@ "out_file": "Desired output file name" } }, - "x_head": "Special header file table for data_clinical(sample/patient). cBio data_clinical headers have 5 header rows, and which columns are used are determined by the x_file table", + "x_head": "Special header tsv file for data_clinical(sample/patient). cBio data_clinical headers have 5 header rows, and which columns are used are determined by the x_file table. 
Should be just the file name", "x_file": "sample or patient tables with corresponding metadata at the sample and patient levels", "genomics_etl": "a helper file with relevant cBio sample names and individual genomic files names for ETL merging", "seq_center": "only if project has RNA data, a helper file to fill in missing sequencing center information for genomics etl", @@ -229,6 +229,11 @@ "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"], "out_file": "x01_genomics_file_manifest.txt" }, + "cbtn_extra": { + "table": "bix_genomics_file.sd_bhjxbdqk_x01_extra-genomics_file_manifest", + "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"], + "out_file": "cbtn_extra_genomics_file_manifest.txt" + }, "pnoc": { "table": "bix_genomics_file.sd_8y99qzjj-genomics_file_manifest", "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"], @@ -236,14 +241,14 @@ } }, "sample_head": { - "table": "bix_workflows.data_clinical_sample_header" + "table": "template_sample_header.txt" }, "sample_file": { "table": "prod_cbio.pbta_all_data_clinical_sample", "out_file": "data_clinical_sample.txt" }, "patient_head": { - "table": "bix_workflows.data_clinical_patient_header" + "table": "template_patient_header.txt" }, "patient_file": { "table": "prod_cbio.pbta_all_data_clinical_patient", diff --git a/scripts/convert_fusion_as_sv.py b/scripts/convert_fusion_as_sv.py index de8df24..0b7c35c 100755 --- a/scripts/convert_fusion_as_sv.py +++ b/scripts/convert_fusion_as_sv.py @@ -197,11 +197,14 @@ def init_cbio_master(fusion_results, mode, rna_metadata): if args.mode == 'openX': r_ext = "rsem" elif args.mode == 'dgd': - r_ext = "rsem" + r_ext = "DGD_FUSION" # ensure sample name is imported as str all_file_meta = pd.read_csv(args.table, sep="\t", 
dtype={'Cbio_Tumor_Name': str}) - + # ext used in pbta vs openpedcan varies rna_subset = all_file_meta.loc[all_file_meta["File_Type"] == r_ext] + if rna_subset is None: + r_ext = 'rsem' + rna_subset = all_file_meta.loc[all_file_meta["File_Type"] == r_ext] # reset index so that references work later while iterating rna_subset = rna_subset.reset_index(drop=True) project_list = rna_subset.Cbio_project.unique() diff --git a/scripts/get_study_metadata.py b/scripts/get_study_metadata.py index dfa0828..7c82f9b 100644 --- a/scripts/get_study_metadata.py +++ b/scripts/get_study_metadata.py @@ -8,7 +8,6 @@ from configparser import ConfigParser import argparse import json -import pdb def config(filename='database.ini', section='postgresql'): @@ -59,7 +58,7 @@ def generic_print(out_file, rows, colnames): return 0 -def get_data_clinical(db_cur, config_dict, prefix): +def get_data_clinical(db_cur, config_dict, prefix, ref_dir): """ Depending on the prefix of patient or sample, will pull from related tables, only use related header info present in table, and print the combined results. @@ -74,19 +73,20 @@ def get_data_clinical(db_cur, config_dict, prefix): (rows, colnames) = generic_pull(db_cur, tbl_name) # use table header from colnames, and use to select file header - head_name = config_dict['database_pulls'][prefix + '_head']['table'] - # get sample table contents, have to split if format schema.table - if '.' 
not in head_name: - head_sql = sql.SQL('SELECT {} FROM {};').format(sql.SQL(',').join(map(sql.Identifier, colnames)), sql.Identifier(head_name)) - else: - (schema, table) = head_name.split('.') - head_sql = sql.SQL('SELECT {} FROM {}.{};').format(sql.SQL(',').join(map(sql.Identifier, colnames)), sql.Identifier(schema), sql.Identifier(table)) - db_cur.execute(head_sql) - head = db_cur.fetchall() + head_file = open(ref_dir + config_dict['database_pulls'][prefix + '_head']['table']) + # get and read head file + head_lines = head_file.readlines() # create output file and combine results for final product out_file = open(datasheet_dir + "/" + config_data['database_pulls'][prefix + '_file']['out_file'], 'w') - for row in head: - out_file.write("\t".join(row) + "\n") + # get indices of matching head lines, then print corresponding cBio header values + col_i = [] + # the last row, and the header of the data clinical table should have overlapping values + head_search = head_lines[-1].rstrip('\n').split('\t') + for col in colnames: + col_i.append(head_search.index(col)) + for i in range(0, len(head_lines) -1, 1): + head = [head_lines[i].rstrip('\n').split('\t')[j] for j in col_i] + out_file.write("\t".join(head) + "\n") generic_print(out_file, rows, colnames) return 0 @@ -101,11 +101,19 @@ def get_manifests(db_cur, config_dict): try: tbl_name = manifests[manifest]['table'] file_types = manifests[manifest]['file_type'] - if '.' not in tbl_name: - manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({});').format(sql.Identifier(tbl_name), sql.SQL(',').join(map(sql.Literal, file_types))) + if args.all: + if '.' 
not in tbl_name: + manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({});').format(sql.Identifier(tbl_name), sql.SQL(',').join(map(sql.Literal, file_types))) + else: + (schema, table) = tbl_name.split('.') + manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({});').format(sql.Identifier(schema), sql.Identifier(table), sql.SQL(',').join(map(sql.Literal, file_types)), sql.Literal("active")) else: - (schema, table) = tbl_name.split('.') - manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({});').format(sql.Identifier(schema), sql.Identifier(table), sql.SQL(',').join(map(sql.Literal, file_types))) + if '.' not in tbl_name: + manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({}) and status={};').format(sql.Identifier(tbl_name), sql.SQL(',').join(map(sql.Literal, file_types))) + else: + (schema, table) = tbl_name.split('.') + manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({}) and status={};').format(sql.Identifier(schema), sql.Identifier(table), sql.SQL(',').join(map(sql.Literal, file_types)), sql.Literal("active")) + db_cur.execute(manifest_sql) rows = db_cur.fetchall() colnames = [desc[0] for desc in db_cur.description] @@ -122,11 +130,14 @@ def get_manifests(db_cur, config_dict): parser.add_argument("-d", "--db-ini", action="store", dest="db_ini", help="Database config file - formatting like aws or sbg creds") parser.add_argument("-p", "--profile", action="store", dest="profile", help="ini profile name", default="postgresql") -parser.add_argument("-c", "--config", action="store", dest="config_file", help="json config file with meta information; see REFS/pbta_all_case_meta_config.json example",) +parser.add_argument("-c", "--config", action="store", dest="config_file", help="json config file with meta information; see REFS/pbta_all_case_meta_config.json example") +parser.add_argument("-r", "--ref-dir", action="store", dest="ref_dir", help="dir name containing template data_clinical* header files") 
+parser.add_argument("-a", "--all", action="store_true", dest="all", help="flag to include all relevant files, not just status=active files, NOT RECOMMENDED") args = parser.parse_args() # Load database login info params = config(filename=args.db_ini, section=args.profile) + datasheet_dir = 'datasheets' # Load json config file with database pull info with open(args.config_file) as f: @@ -137,13 +148,15 @@ def get_manifests(db_cur, config_dict): # dict to track keys with specific database calls special_keys = {"sample_head": 0, "sample_file": 0, "patient_head": 0, "patient_file": 0, "manifests": 0} - + ref_dir = args.ref_dir + if ref_dir[-1] != '/': + ref_dir += '/' try: os.mkdir(datasheet_dir) except Exception as e: print(str(e) + ' IGNORE!') - get_data_clinical(cur, config_data, 'sample') - get_data_clinical(cur, config_data, 'patient') + get_data_clinical(cur, config_data, 'sample', ref_dir) + get_data_clinical(cur, config_data, 'patient', ref_dir) get_manifests(cur, config_data) # For all other tables to be printed simply, not in special_keys