diff --git a/README.md b/README.md index 7565dbd..8997838 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ -# Outline on ETL for converting data from cavatica and data service to pedcbioportal format +# Outline on ETL for converting data from CAVATICA and Data Warehouse to PedcBioportal format In general, we are creating upload packages converting our data and metadata to satisfy the requirements outlined [here](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats). Further general loading notes can be found in this [Notion page](https://www.notion.so/d3b/Cbioportal-Study-Load-SOP-58812479fabe4d2fa9f72242e331b5ee). See [below](#collaborative-and-publication-workflows) for special cases like publications or collaborative efforts ## I have everything and I know I am doing Below assumes you have already created the necessary tables from dbt 1. Run commands as outlined in [scripts/get_study_metadata.py](#scriptsget_study_metadatapy). Copy/move those files to the cBio loader ec2 instance +1. Recommended, but not required: run [scripts/diff_studies.py](docs/DIFF_STUDY_CLINICAL.md). It will give a summary of metadata changes between what is currently loaded and what you plan to load, to potentially flag any suspicious changes 1. Copy over the appropriate aws account key and download files. Example using `pbta_all` study: ```sh diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md new file mode 100644 index 0000000..595b1b1 --- /dev/null +++ b/docs/DIFF_STUDY_CLINICAL.md @@ -0,0 +1,115 @@ +# Compare current versus build +This documentation addresses a [QC script](../scripts/diff_studies.py) for clinical metadata. It streamlines the process of identifying and summarizing changes slated to be made. 
+ +```sh +python3 scripts/diff_studies.py --help +usage: diff_studies.py [-h] [-u URL] [-c STUDY] [-t TOKEN] [-s DATA_SAMPLE] [-p DATA_PATIENT] + +Compare local clinical data to server + +options: + -h, --help show this help message and exit + -u URL, --url URL url to search against + -c STUDY, --study STUDY + Cancer study ID to compare on server + -t TOKEN, --token TOKEN + Token file obtained from Web API + -s DATA_SAMPLE, --datasheet-sample DATA_SAMPLE + File containing cBio-formatted sample metadata, typically named data_clinical_sample.txt + -p DATA_PATIENT, --datasheet-patient DATA_PATIENT + File containing cBio-formatted patient metadata, typically named data_clinical_patient.txt +``` + +## INPUTS: + - `-u, --url`: cBioportal api deployment site. Default: https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs + - `-c, --study`: cBioportal cancer study ID, e.g. `pbta_all` + - `-t, --token`: File obtained from navigating to https://pedcbioportal.kidsfirstdrc.org/webAPI#using-data-access-tokens, then clicking on `Download Token`. File is reusable + - `-s, --datasheet-sample`: Path to the `data_clinical_sample.txt` being vetted for upload + - `-p, --datasheet-patient`: Path to the `data_clinical_patient.txt` being vetted for upload + +## OUTPUTS: +Two change log files, `patient_portal_v_build.txt` and `sample_portal_v_build.txt`, written to the current working directory. 
+For the patient and sample views, each file respectively has: + - A list, one per line, per ID, per attribute, of what would change if the data were loaded + - A list of IDs that would be removed from the portal, if any + - A list of IDs that would be added if any + - A summary of the number of changes of each attribute type printed to STDOUT + +### patient_portal_v_build.txt example: +``` +Patient attribute before after +PT_017WC8PS ETHNICITY NA Not Available +PT_01HNFSBZ CANCER_PREDISPOSITIONS None documented NA +PT_01HNFSBZ ETHNICITY NA Not Available +PT_01HNFSBZ GERMLINE_SEX_ESTIMATE Unknown NA +PT_01SH4F1X AGE_IN_DAYS 3838 NA +PT_01SH4F1X OS_MONTHS 45 54 +PT_0324HWD5 AGE_IN_DAYS 3121 NA +PT_047YGDRW ETHNICITY NA Not Available +PT_04V47WFC AGE_IN_DAYS 5717 NA +PT_08M919BH OS_MONTHS 44 62 +PT_08M919BH EFS_MONTHS 44 62 +PT_0BSG3R3N AGE_IN_DAYS 3431 NA +PT_0BVR16FK ETHNICITY NA Not Available +PT_0CE0HFYB GERMLINE_SEX_ESTIMATE Male NA +PT_0CVRX4SJ OS_MONTHS NA 149 +``` + +### sample_portal_v_build.txt example: +``` +Sample attribute before after +16510-1 TUMOR_FRACTION 0.349951221921 0.34995122192100003 +16510-15 TUMOR_FRACTION 0.892871847605 0.8928718476049999 +16510-2 TUMOR_FRACTION 0.242536563786 0.24253656378600005 +16510-8 TUMOR_FRACTION 0.557284218924 0.5572842189239999 +7316-100 TUMOR_FRACTION 0.270649989118 0.27064998911800003 +7316-1017 TUMOR_FRACTION 0.570184695999 0.559801637737 +7316-104 TUMOR_FRACTION 0.664343255194 0.6643432551940001 +7316-1045 TUMOR_FRACTION 0.477859261757 0.496989582389 +7316-105 CNS_REGION NA Mixed +7316-105 CANCER_TYPE_DETAILED NA Low-grade glioma, BRAF V600E +7316-105 MOLECULAR_SUBTYPE NA LGG, BRAF V600E +7316-105 BROAD_HISTOLOGY NA Low-grade astrocytic tumor +7316-105 CANCER_GROUP NA Low-grade glioma +7316-105 TUMOR_PLOIDY NA 2 +7316-105 PATHOLOGY_FREE_TEXT_DIAGNOSIS NA pilocytic astrocytoma ii +7316-105 TUMOR_FRACTION NA 0.823344460708 +7316-1052 CANCER_TYPE_DETAILED Diffuse midline glioma, H3 K28-mutant Diffuse midline 
glioma, H3 K28-altered +7316-1052 MOLECULAR_SUBTYPE DMG, H3 K28 DMG, H3 K28, TP53 +7316-1062 CANCER_TYPE_DETAILED Diffuse midline glioma, H3 K28-mutant Diffuse midline glioma, H3 K28-altered +7316-1068 CANCER_TYPE_DETAILED Diffuse midline glioma, H3 K28-mutant Diffuse midline glioma, H3 K28-altered +7316-1072 CANCER_TYPE_DETAILED Glial-neuronal tumor NOS Glial-neuronal tumor, To be classified +7316-1072 BROAD_HISTOLOGY Low-grade astrocytic tumor Neuronal and mixed neuronal-glial tumor +7316-1072 CANCER_GROUP Glial-neuronal tumor Glial-neuronal tumor NOS +``` + +### STDOUT: +``` +Sample CHANGE SUMMARY: +27 Samples in build would be added to the portal: 1235928,1235929,1235930,1235931,1235932,1235933,1235934,1235935,1235936,1235937,1235938,1235939,1235940,1235941,1235981,1240110,1240112,1240114,1240116,1242273,1242274,1242276,1250775,1250776,1250777,1250778,1273223 +TUMOR_FRACTION has 1005 change(s) +CNS_REGION has 410 change(s) +CANCER_TYPE_DETAILED has 847 change(s) +MOLECULAR_SUBTYPE has 488 change(s) +BROAD_HISTOLOGY has 390 change(s) +CANCER_GROUP has 517 change(s) +TUMOR_PLOIDY has 403 change(s) +PATHOLOGY_FREE_TEXT_DIAGNOSIS has 507 change(s) +ONCOTREE_CODE has 7 change(s) +CANCER_TYPE has 8 change(s) +TUMOR_TYPE has 8 change(s) +TUMOR_TISSUE_TYPE has 17 change(s) +EXPERIMENT_STRATEGY has 1 change(s) +SPECIMEN_ID has 1 change(s) +CBTN_TUMOR_TYPE has 6 change(s) +SAMPLE_TYPE has 2 change(s) + +Patient CHANGE SUMMARY: +ETHNICITY has 358 change(s) +CANCER_PREDISPOSITIONS has 29 change(s) +GERMLINE_SEX_ESTIMATE has 220 change(s) +AGE_IN_DAYS has 147 change(s) +OS_MONTHS has 99 change(s) +EFS_MONTHS has 60 change(s) +EFS_STATUS has 18 change(s) +SEX has 9 change(s) +AGE has 6 change(s) +OS_STATUS has 4 change(s) +``` \ No newline at end of file diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py new file mode 100644 index 0000000..3356454 --- /dev/null +++ b/scripts/diff_studies.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +""" +Script to check a study 
"""Script to check a study on pedcbioportal for differences against a local build."""

import argparse
from urllib.parse import urlparse


def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out):
    """
    Compare portal clinical data against the build and report differences.

    Per-value changes for IDs and attributes present on both sides are written
    tab-delimited to ``out``. A summary (IDs and attributes found on only one
    side, plus the number of changes per attribute) is printed to STDOUT.

    Args:
        portal: dict of ID -> {attribute: value} as currently on the portal.
        build: dict of ID -> {attribute: value} from the local build.
        portal_attr: set of attribute names known to the portal.
        build_attr: set of attribute names present in the build.
        clin_type: label used in the header and messages, e.g. "Sample".
        out: writable text handle that receives the change log.
    """
    # gross ID diffs
    portal_ids = set(portal)
    build_ids = set(build)
    portal_only = sorted(portal_ids - build_ids)
    build_only = sorted(build_ids - portal_ids)
    common_ids = sorted(portal_ids & build_ids)
    # gross attribute diffs; sets have no stable iteration order, so sort to
    # keep the change log and summary reproducible from run to run
    portal_attr_only = sorted(portal_attr - build_attr)
    build_attr_only = sorted(build_attr - portal_attr)
    common_attr = sorted(portal_attr & build_attr)
    # focus on common IDs and common attributes, as "everything is different
    # for x" is not that useful
    print(clin_type + "\tattribute\tbefore\tafter", file=out)
    attr_cts = {}
    for clinical_id in common_ids:
        portal_entry = portal[clinical_id]
        build_entry = build[clinical_id]
        for attr in common_attr:
            # the portal omits an attribute entirely when it has no value;
            # treat a column missing from a build row the same way
            portal_value = portal_entry.get(attr, "NA")
            build_value = build_entry.get(attr, "NA")
            if portal_value != build_value:
                print("{}\t{}\t{}\t{}".format(clinical_id, attr, portal_value, build_value), file=out)
                attr_cts[attr] = attr_cts.get(attr, 0) + 1

    # print change summary to STDOUT
    print(clin_type + " CHANGE SUMMARY:")
    if portal_only:
        print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)))
    if build_only:
        print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type, ",".join(build_only)))
    if portal_attr_only:
        print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only)))
    if build_attr_only:
        print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only)))
    for attr, count in attr_cts.items():
        print("{} has {} change(s)".format(attr, count))
    # Print extra newline for readability
    print("")


def table_to_dict(in_file, key, aggr_list):
    """
    Convert a tab-delimited text file to a dict keyed on one column.

    Leading lines starting with ``#`` are skipped until the header row is
    reached. Each data row becomes ``{key value: {column: value, ...}}`` with
    the key column itself left out of the inner dict.

    Args:
        in_file: path of the tab-delimited file to read.
        key: header name of the column to use as the primary key.
        aggr_list: names of columns holding ``;``-separated lists; their
            members are sorted so that ordering does not register as a change.

    Returns:
        Tuple of (data dict, set of attribute names excluding ``key``).

    Raises:
        ValueError: if no header row is found, ``key`` is not in the header,
            or a row has more fields than the header.
    """
    header = None
    data_dict = {}
    with open(in_file) as f:
        # skip lines starting with hash until normal header is reached
        for entry in f:
            if not entry.startswith("#"):
                header = entry.rstrip('\n').split('\t')
                break
        if header is None:
            raise ValueError("No header row found in {}".format(in_file))
        if key not in header:
            raise ValueError("Key column {} not found in header of {}".format(key, in_file))
        primary = header.index(key)
        # indices of the aggregate fields that are actually present
        aggr_head = [header.index(aggr) for aggr in aggr_list if aggr in header]
        for entry in f:
            line = entry.rstrip('\n')
            if not line:
                # a stray blank line carries no record
                continue
            # Replace empty string with NA as that is how the portal will return it
            data = ["NA" if d == "" else d for d in line.split('\t')]
            if len(data) > len(header):
                raise ValueError(
                    "Row {} in {} has {} fields but the header has {}".format(
                        data[primary], in_file, len(data), len(header)
                    )
                )
            # a row cut short is missing its trailing values; record them as NA
            data.extend(["NA"] * (len(header) - len(data)))
            # Aggregate attributes joined by a separator may not be in the same
            # order on both sides, but order does not matter - sort them so
            # that no spurious change is reported
            for i in aggr_head:
                data[i] = ';'.join(sorted(data[i].split(';')))
            data_dict[data[primary]] = {
                name: value
                for i, (name, value) in enumerate(zip(header, data))
                if i != primary
            }
    attr_set = set(header)
    # no need for primary key to be reported as an attribute
    attr_set.remove(key)
    return data_dict, attr_set


def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list):
    """
    Get all attribute-value pairs of one data type for a specific study.

    Args:
        cbio_conn: API client exposing
            ``Clinical_Data.getAllClinicalDataInStudyUsingGET``.
        study_id: cancer study ID to query.
        data_type: "SAMPLE" or "PATIENT".
        aggr_list: attribute names holding ``;``-separated lists; their
            members are sorted so that ordering does not register as a change.

    Returns:
        dict of ID -> {attribute: value}; every entry also carries a
        "PATIENT_ID" key.

    Raises:
        KeyError: if ``data_type`` is not "SAMPLE" or "PATIENT".
    """
    # Use sampleId or patientId, clinicalAttributeId (column name) and value
    id_field = {"SAMPLE": "sampleId", "PATIENT": "patientId"}[data_type]
    status = ["OS_STATUS"]
    # result is one flat list of records, one per attribute, per patient/sample -
    # as if a table had been concatenated into a single vector
    data_clinical = cbio_conn.Clinical_Data.getAllClinicalDataInStudyUsingGET(
        studyId=study_id, projection='DETAILED', clinicalDataType=data_type
    ).result()
    data_dict = {}
    for entry in data_clinical:
        clinical_id = getattr(entry, id_field)
        if clinical_id not in data_dict:
            # sample records carry both sampleId and patientId, patient records
            # just patientId. Add key to match the build table
            data_dict[clinical_id] = {"PATIENT_ID": entry.patientId}
        value = entry.value
        attr_id = entry.clinicalAttributeId
        # sort aggregate values so ordering differences are not reported
        if attr_id in aggr_list:
            value = ';'.join(sorted(value.split(';')))
        # "standardize" status field so that 0:LIVING = LIVING and
        # 1:DECEASED = DECEASED; only trim when the digit-colon prefix is
        # really there so an unprefixed value is not truncated
        if attr_id in status and value[:1].isdigit() and value[1:2] == ":":
            value = value[2:]
        data_dict[clinical_id][attr_id] = value
    return data_dict


def _read_token(token_path):
    """Return the value after the first ``': '`` on the first line that has one."""
    with open(token_path, 'r') as token_file:
        for line in token_file.read().splitlines():
            _, sep, value = line.partition(': ')
            if sep and value.strip():
                return value.strip()
    raise ValueError("No 'name: value' token entry found in {}".format(token_path))


def main():
    """Parse arguments, pull the study from the portal and write both change logs."""
    # Imported here rather than at module level so the helpers above stay
    # importable on machines without the bravado client installed
    from bravado.client import SwaggerClient
    from bravado.requests_client import RequestsClient

    parser = argparse.ArgumentParser(
        description="Compare local clinical data to server"
    )
    parser.add_argument(
        "-u", "--url", action="store", dest="url", help="url to search against", default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs"
    )
    parser.add_argument(
        "-c", "--study", action="store", dest="study", required=True, help="Cancer study ID to compare on server"
    )
    parser.add_argument(
        "-t", "--token", action="store", dest="token", required=True, help="Token file obtained from Web API"
    )
    parser.add_argument(
        "-s", "--datasheet-sample", action="store", dest="data_sample", required=True, help="File containing cBio-formatted sample metadata, typically named data_clinical_sample.txt"
    )
    parser.add_argument(
        "-p", "--datasheet-patient", action="store", dest="data_patient", required=True, help="File containing cBio-formatted patient metadata, typically named data_clinical_patient.txt"
    )

    args = parser.parse_args()
    token = _read_token(args.token)

    url_object = urlparse(args.url)

    http_client = RequestsClient()
    http_client.set_api_key(
        url_object.hostname, 'Bearer {}'.format(token),
        param_name='Authorization', param_in='header'
    )

    cbioportal = SwaggerClient.from_url(
        args.url,
        http_client=http_client,
        config={
            "validate_requests": False,
            "validate_responses": False,
            "validate_swagger_spec": False,
        },
    )

    # hardcode for now names of aggregate fields, implicit, and skip fields
    aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"]
    portal_sample_attr_implicit = ['PATIENT_ID']
    portal_patient_attr_skip = ['SAMPLE_COUNT']
    portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT']
    # get attribute keys
    attr_key_obj = cbioportal.Clinical_Attributes.fetchClinicalAttributesUsingPOST(studyIds=[args.study], projection='ID').result()
    # gather sample-level metadata
    portal_sample_data = data_clinical_from_study(cbioportal, args.study, "SAMPLE", aggr_list)
    build_sample_data, build_sample_attr_keys = table_to_dict(args.data_sample, "SAMPLE_ID", aggr_list)
    portal_sample_attr_keys = {x.clinicalAttributeId for x in attr_key_obj if not x.patientAttribute}
    # implicit attributes not returned by function that are required for sample view
    portal_sample_attr_keys.update(portal_sample_attr_implicit)
    # drop attributes that are post-load portal-specific
    portal_sample_attr_keys -= set(portal_sample_attr_skip)
    # sample-level diffs
    with open('sample_portal_v_build.txt', 'w') as sample_diff_out:
        clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out)
    # patient-level diffs
    portal_patient_data = data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list)
    build_patient_data, build_patient_attr_keys = table_to_dict(args.data_patient, "PATIENT_ID", aggr_list)
    portal_patient_attr_keys = {x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute}
    # drop attributes that are post-load portal-specific
    portal_patient_attr_keys -= set(portal_patient_attr_skip)
    with open('patient_portal_v_build.txt', 'w') as patient_diff_out:
        clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out)


if __name__ == '__main__':
    main()