Skip to content

Commit

Permalink
Merge pull request #50 from kids-first/feature/mb-clin-data-refactor
Browse files Browse the repository at this point in the history
🛠️ clin data refactor
  • Loading branch information
migbro committed Jul 19, 2023
2 parents 838ee3c + e0f0fe7 commit bc9c9b5
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 27 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Further general loading notes can be found in this [Notion page](https://www.not
See [below](#collaborative-and-publication-workflows) for special cases like publications or collaborative efforts
## I have everything and I know what I am doing
Below assumes you have already created the necessary tables from dbt
1. Run commands as outlined in `scripts/get_study_metadata.py`. Copy/move those files to the cBio loader ec2 instance
1. Run commands as outlined in [scripts/get_study_metadata.py](#scriptsget_study_metadatapy). Copy/move those files to the cBio loader ec2 instance
1. Copy over the appropriate aws account key and download files. Example using `pbta_all` study:

```sh
Expand Down Expand Up @@ -107,6 +107,8 @@ optional arguments:
ini profile name
-c CONFIG_FILE, --config CONFIG_FILE
json config file with meta information; see STUDY_CONFIGS/pbta_all_case_meta_config.json example
-r REF_DIR, --ref-dir REF_DIR
dir name containing template data_clinical* header files
```
### From D3b Warehouse
Expand Down
1 change: 1 addition & 0 deletions REFS/aws_bucket_key_pairs.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ s3://cds-246-phs002517-p30-fy20 NCI-AR
s3://cds-246-phs002517-sequencefiles-p30-fy20 NCI-AR
s3://cds-306-phs002517-x01 NCI-X01
s3://d3b-cds-working-bucket d3b
s3://kf-strides-study-us-east-1-prd-sd-bhjxbdqk kf
s3://kf-study-us-east-1-prd-sd-8y99qzjj saml
5 changes: 5 additions & 0 deletions REFS/template_patient_header.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#Patient Identifier External Patient Identifier SEX RACE ETHNICITY AGE AGE_IN_DAYS OS_STATUS OS_MONTHS EFS_MONTHS EFS_STATUS germline_sex_estimate cancer_predispositions
#Patient identifier Patient ID used by generator of data Sex of the patient racial demographic ethnic demographic Age at which the condition or disease was first diagnosed, in years Patient age in days at initial diagnosis Overall patient survival status Overall survival in months since initial diagnosis Event free (months) since initial treatment Event free status germline sex estimate cancer predispositions
#STRING STRING STRING STRING STRING NUMBER NUMBER STRING NUMBER NUMBER STRING STRING STRING
#11 10 9 5 4 8 7 3 2 1 1 1 2
PATIENT_ID EXTERNAL_PATIENT_ID SEX RACE ETHNICITY AGE AGE_IN_DAYS OS_STATUS OS_MONTHS EFS_MONTHS EFS_STATUS GERMLINE_SEX_ESTIMATE CANCER_PREDISPOSITIONS
5 changes: 5 additions & 0 deletions REFS/template_sample_header.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#Patient Identifier COLLECTION_EVENT_ID Sample Identifier SPECIMEN_ID CANCER_TYPE CANCER_TYPE_DETAILED ONCOTREE_CODE TUMOR_TISSUE_SITE TUMOR_TYPE SAMPLE_TYPE MATCHED_NORMAL_SAMPLE_ID MATCHED_NORMAL_SPECIMEN_ID CBTN_TUMOR_TYPE MOLECULAR_SUBTYPE HARMONIZED_DIAGNOSIS BROAD_HISTOLOGY CANCER_GROUP EXPERIMENT_STRATEGY pathology_free_text_diagnosis tumor_fraction tumor_ploidy CNS_region
#Patient identifier Unifying ID of child sequencing events from a single biological sample Sample Identifier using external_sample_id kfdrc tumor biopsecimen ID Study-defined cancer type Study-defined cancer type detail OncoTree alphanumeric code value for CANCER_TYPE_DETAILED tumor tissue location primary v metastatic tumor designation patient tissue sample or cell line matched normal external_sample_id kfdrc matched normal biospecimen ID CBTN-assigned tumor type contains molecular subtypes for tumor types selected from pathology_diagnosis and pathology_free_text_diagnosis fields, following World Health Organization 2016 classification criteria Disease with molecular subtype and grade broad histology cancer group sequencing strategies of sample represented by this SAMPLE_ID pathology free text diagnosis tumor fraction tumor ploidy CNS region
#STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING STRING NUMBER NUMBER STRING
#14 3 13 12 11 10 9 8 7 6 2 1 7 5 5 5 5 6 1 3 4 5
PATIENT_ID COLLECTION_EVENT_ID SAMPLE_ID SPECIMEN_ID CANCER_TYPE CANCER_TYPE_DETAILED ONCOTREE_CODE TUMOR_TISSUE_TYPE TUMOR_TYPE SAMPLE_TYPE MATCHED_NORMAL_SAMPLE_ID MATCHED_NORMAL_SPECIMEN_ID CBTN_TUMOR_TYPE MOLECULAR_SUBTYPE HARMONIZED_DIAGNOSIS BROAD_HISTOLOGY CANCER_GROUP EXPERIMENT_STRATEGY PATHOLOGY_FREE_TEXT_DIAGNOSIS TUMOR_FRACTION TUMOR_PLOIDY CNS_REGION
11 changes: 8 additions & 3 deletions STUDY_CONFIGS/pbta_all_case_meta_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"out_file": "Desired output file name"
}
},
"x_head": "Special header file table for data_clinical(sample/patient). cBio data_clinical headers have 5 header rows, and which columns are used are determined by the x_file table",
"x_head": "Special header tsv file for data_clinical(sample/patient). cBio data_clinical headers have 5 header rows, and which columns are used are determined by the x_file table. Should be just the file name",
"x_file": "sample or patient tables with corresponding metadata at the sample and patient levels",
"genomics_etl": "a helper file with relevant cBio sample names and individual genomic files names for ETL merging",
"seq_center": "only if project has RNA data, a helper file to fill in missing sequencing center information for genomics etl",
Expand Down Expand Up @@ -229,21 +229,26 @@
"file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
"out_file": "x01_genomics_file_manifest.txt"
},
"cbtn_extra": {
"table": "bix_genomics_file.sd_bhjxbdqk_x01_extra-genomics_file_manifest",
"file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
"out_file": "cbtn_extra_genomics_file_manifest.txt"
},
"pnoc": {
"table": "bix_genomics_file.sd_8y99qzjj-genomics_file_manifest",
"file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
"out_file": "pnoc_genomics_file_manifest.txt"
}
},
"sample_head": {
"table": "bix_workflows.data_clinical_sample_header"
"table": "template_sample_header.txt"
},
"sample_file": {
"table": "prod_cbio.pbta_all_data_clinical_sample",
"out_file": "data_clinical_sample.txt"
},
"patient_head": {
"table": "bix_workflows.data_clinical_patient_header"
"table": "template_patient_header.txt"
},
"patient_file": {
"table": "prod_cbio.pbta_all_data_clinical_patient",
Expand Down
7 changes: 5 additions & 2 deletions scripts/convert_fusion_as_sv.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,14 @@ def init_cbio_master(fusion_results, mode, rna_metadata):
if args.mode == 'openX':
r_ext = "rsem"
elif args.mode == 'dgd':
r_ext = "rsem"
r_ext = "DGD_FUSION"
# ensure sample name is imported as str
all_file_meta = pd.read_csv(args.table, sep="\t", dtype={'Cbio_Tumor_Name': str})
# ext used in pbta vs openpedcan varies
rna_subset = all_file_meta.loc[all_file_meta["File_Type"] == r_ext]
# A boolean-mask .loc selection always yields a DataFrame (possibly with zero
# rows), never None, so emptiness is the condition that must trigger the
# fallback to the 'rsem' file type.
if rna_subset.empty:
    r_ext = 'rsem'
    rna_subset = all_file_meta.loc[all_file_meta["File_Type"] == r_ext]
# reset index so that references work later while iterating
rna_subset = rna_subset.reset_index(drop=True)
project_list = rna_subset.Cbio_project.unique()
Expand Down
55 changes: 34 additions & 21 deletions scripts/get_study_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from configparser import ConfigParser
import argparse
import json
import pdb


def config(filename='database.ini', section='postgresql'):
Expand Down Expand Up @@ -59,7 +58,7 @@ def generic_print(out_file, rows, colnames):
return 0


def get_data_clinical(db_cur, config_dict, prefix):
def get_data_clinical(db_cur, config_dict, prefix, ref_dir):
"""
Depending on the prefix of patient or sample, will pull from related tables,
only use related header info present in table, and print the combined results.
Expand All @@ -74,19 +73,20 @@ def get_data_clinical(db_cur, config_dict, prefix):
(rows, colnames) = generic_pull(db_cur, tbl_name)

# use table header from colnames, and use to select file header
head_name = config_dict['database_pulls'][prefix + '_head']['table']
# get sample table contents, have to split if format schema.table
if '.' not in head_name:
head_sql = sql.SQL('SELECT {} FROM {};').format(sql.SQL(',').join(map(sql.Identifier, colnames)), sql.Identifier(head_name))
else:
(schema, table) = head_name.split('.')
head_sql = sql.SQL('SELECT {} FROM {}.{};').format(sql.SQL(',').join(map(sql.Identifier, colnames)), sql.Identifier(schema), sql.Identifier(table))
db_cur.execute(head_sql)
head = db_cur.fetchall()
# get and read head file; the context manager closes the handle as soon as
# the template lines are loaded instead of leaking it for the rest of the run
with open(ref_dir + config_dict['database_pulls'][prefix + '_head']['table']) as head_file:
    head_lines = head_file.readlines()
# create output file and combine results for final product
out_file = open(datasheet_dir + "/" + config_data['database_pulls'][prefix + '_file']['out_file'], 'w')
for row in head:
out_file.write("\t".join(row) + "\n")
# get indices of matching head lines, then print corresponding cBio header values
col_i = []
# the last row, and the header of the data clinical table should have overlapping values
head_search = head_lines[-1].rstrip('\n').split('\t')
for col in colnames:
col_i.append(head_search.index(col))
for i in range(0, len(head_lines) -1, 1):
head = [head_lines[i].rstrip('\n').split('\t')[j] for j in col_i]
out_file.write("\t".join(head) + "\n")
generic_print(out_file, rows, colnames)
return 0

Expand All @@ -101,11 +101,19 @@ def get_manifests(db_cur, config_dict):
try:
tbl_name = manifests[manifest]['table']
file_types = manifests[manifest]['file_type']
if '.' not in tbl_name:
manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({});').format(sql.Identifier(tbl_name), sql.SQL(',').join(map(sql.Literal, file_types)))
if args.all:
if '.' not in tbl_name:
manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({});').format(sql.Identifier(tbl_name), sql.SQL(',').join(map(sql.Literal, file_types)))
else:
(schema, table) = tbl_name.split('.')
manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({});').format(sql.Identifier(schema), sql.Identifier(table), sql.SQL(',').join(map(sql.Literal, file_types)), sql.Literal("active"))
else:
(schema, table) = tbl_name.split('.')
manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({});').format(sql.Identifier(schema), sql.Identifier(table), sql.SQL(',').join(map(sql.Literal, file_types)))
# Default path: only pull files whose status is "active".
# Both queries carry a trailing status={} placeholder, so both format() calls
# must supply the literal; omitting it leaves the placeholder without a value.
active_status = sql.Literal("active")
file_type_list = sql.SQL(',').join(map(sql.Literal, file_types))
if '.' not in tbl_name:
    manifest_sql = sql.SQL('SELECT * FROM {} WHERE file_type in ({}) and status={};').format(sql.Identifier(tbl_name), file_type_list, active_status)
else:
    # table given as schema.table, each part must be quoted as its own identifier
    (schema, table) = tbl_name.split('.')
    manifest_sql = sql.SQL('SELECT * FROM {}.{} WHERE file_type in ({}) and status={};').format(sql.Identifier(schema), sql.Identifier(table), file_type_list, active_status)

db_cur.execute(manifest_sql)
rows = db_cur.fetchall()
colnames = [desc[0] for desc in db_cur.description]
Expand All @@ -122,11 +130,14 @@ def get_manifests(db_cur, config_dict):

parser.add_argument("-d", "--db-ini", action="store", dest="db_ini", help="Database config file - formatting like aws or sbg creds")
parser.add_argument("-p", "--profile", action="store", dest="profile", help="ini profile name", default="postgresql")
parser.add_argument("-c", "--config", action="store", dest="config_file", help="json config file with meta information; see REFS/pbta_all_case_meta_config.json example",)
# the example config lives under STUDY_CONFIGS/, not REFS/
parser.add_argument("-c", "--config", action="store", dest="config_file", help="json config file with meta information; see STUDY_CONFIGS/pbta_all_case_meta_config.json example")
parser.add_argument("-r", "--ref-dir", action="store", dest="ref_dir", help="dir name containing template data_clinical* header files")
parser.add_argument("-a", "--all", action="store_true", dest="all", help="flag to include all relevant files, not just status=active files, NOT RECOMMENDED")

args = parser.parse_args()
# Load database login info
params = config(filename=args.db_ini, section=args.profile)

datasheet_dir = 'datasheets'
# Load json config file with database pull info
with open(args.config_file) as f:
Expand All @@ -137,13 +148,15 @@ def get_manifests(db_cur, config_dict):

# dict to track keys with specific database calls
special_keys = {"sample_head": 0, "sample_file": 0, "patient_head": 0, "patient_file": 0, "manifests": 0}

# -r/--ref-dir has no default, so args.ref_dir is None when the flag is omitted;
# indexing None (or an empty string) would die with an unhelpful
# TypeError/IndexError, so report the missing argument explicitly instead.
if not args.ref_dir:
    parser.error("-r/--ref-dir is required: dir name containing template data_clinical* header files")
ref_dir = args.ref_dir
# header file paths are built by plain string concatenation, so guarantee a trailing slash
if not ref_dir.endswith('/'):
    ref_dir += '/'
try:
os.mkdir(datasheet_dir)
except Exception as e:
print(str(e) + ' IGNORE!')
get_data_clinical(cur, config_data, 'sample')
get_data_clinical(cur, config_data, 'patient')
get_data_clinical(cur, config_data, 'sample', ref_dir)
get_data_clinical(cur, config_data, 'patient', ref_dir)
get_manifests(cur, config_data)

# For all other tables to be printed simply, not in special_keys
Expand Down

0 comments on commit bc9c9b5

Please sign in to comment.