From f23bfb1cff2e5642c64b77eecec3cd6ea3b5607f Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 22 Feb 2024 17:07:47 -0500 Subject: [PATCH 1/8] :construction: WIP cbio server scraper --- scripts/diff_studies.py | 61 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 scripts/diff_studies.py diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py new file mode 100644 index 0000000..df876e9 --- /dev/null +++ b/scripts/diff_studies.py @@ -0,0 +1,61 @@ +""" +Script to check a study on pedcbioportal for differences against a local build +""" + +import argparse +from bravado.client import SwaggerClient +from bravado.requests_client import RequestsClient +from urllib.parse import urlparse +import pdb + + +def main(): + parser = argparse.ArgumentParser( + description="Compare local clinical data to server" + ) + parser.add_argument( + "-u", + "--url", + action="store", + dest="url", + help="url to search against", + default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs" + ) + parser.add_argument( + "-s", "--study", action="store", dest="study", help="Cancer study ID to compare on server" + ) + parser.add_argument( + "-t", "--token", action="store", dest="token", help="Token file obtained from Web API" + ) + parser.add_argument( + "-d", "--datasheet-dir", action="store", dest="data_dir", help="Directory containing data_clinical_*.txt" + ) + + args = parser.parse_args() + + with open(args.token, 'r') as token_file: + token = token_file.read().rstrip().split(': ')[1] + + url_object = urlparse(args.url) + + http_client = RequestsClient() + http_client.set_api_key( + '{}'.format(url_object.hostname), 'Bearer {}'.format(token), + param_name='Authorization', param_in='header' + ) + + cbioportal = SwaggerClient.from_url(args.url, + http_client=http_client, + config={"validate_requests":False, + "validate_responses":False, + "validate_swagger_spec": False} + ) + # object is a big ass array, one entry per attribute, per patient/sample? - so like if a table were just concatenated into a single vector + clinical_data = cbioportal.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=study).result() + # would need to loop through patient IDs get patient-specific fields, like age? + # get_clin_pt = cbioportal.Clinical_Data.getAllClinicalDataOfPatientInStudyUsingGET(studyId=study,patientId='PT_00G007DM' ).result() + pdb.set_trace() + hold=1 + +if __name__ == '__main__': + main() From 616270cff761c9d09ed3410aceb67cd006b4074f Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 23 Feb 2024 16:03:06 -0500 Subject: [PATCH 2/8] :construction: nearly complete with logic, need to wrap that up then test --- scripts/diff_studies.py | 132 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 121 insertions(+), 11 deletions(-) diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py index df876e9..0cab0d1 100644 --- a/scripts/diff_studies.py +++ b/scripts/diff_studies.py @@ -9,17 +9,118 @@ import pdb +def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out): + """ + Compare differences in portal sample data and build. + """ + # gross ID diffs + portal_clinical_ids = set(list(portal.keys()).sort()) + build_clinical_ids = set(list(build.keys()).sort()) + portal_only = list(portal_clinical_ids - build_clinical_ids) + build_only = list(build_clinical_ids - portal_clinical_ids) + common_samp_ids = list(portal_clinical_ids & build_clinical_ids) + # gross attribute diffs + portal_attr_only = list(portal_attr - build_attr) + build_attr_only = list(build_attr - portal_attr) + common_attr = list(portal_attr & build_attr) + + # focus on common samp and common attr, as "everything is different for x" is not that useful + print("Per " + clin_type + " changes:", file=out) + attr_cts = {} + for samp_id in common_samp_ids: + for attr in common_attr: + if portal[samp_id][attr] != build[samp_id][attr]: + print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal[samp_id][attr], build[samp_id][attr]), file=out) + if attr not in attr_cts: + attr_cts[attr] = 0 + attr_cts[attr] += 1 + print("CHANGE SUMMARY:", file=out) + if len(portal_only) > 0: + print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)), file=out) + if len(build_only) > 0: + print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type, ",".join(build_only)), file=out) + if len(portal_attr_only) > 0: + print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only)), file=out) + if len(build_attr_only) > 0: + print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only)), file=out) + for attr in attr_cts: + print("{} has {} change(s)".format(attr, attr_cts[attr]), file=out) + + +def split_sort_field(value, sep): + """ + For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter. + Therefore, sort them so that when compared, no errors are triggered + """ + return sep.join(value.split(sep).sort()) + + +def table_to_dict(in_file, key, aggr_list): + """ + Take a text file and convert to dict with certain row value as primary, all other row values as subkeys. + Also return a set of attribute keys + """ + with open(in_file) as f: + # skip lines starting with hash until normal header is reached + for entry in f: + if entry[0] != "#": + head = next(f) + header = head.rstrip('\n').split('\t') + primary = header.index(key) + # get aggregate field indices + aggr_head = [] + for aggr in aggr_list: + if aggr in header: + aggr_head.append(header.index(aggr)) + + break + data_dict = {} + for entry in f: + data = entry.rstrip('\n').split('\t') + data_dict[data[primary]] = {} + # sort aggr fields + for i in aggr_head: + entry[i] = split_sort_field(entry[i], ";") + # two loops, for up until primary key, then after + for i in range(0, primary, 1): + data_dict[data[primary]][header[i]] = entry[i] + for i in range((primary + 1), len(entry), 1): + data_dict[data[primary]][header[i]] = entry[i] + return data_dict, set(header.sort()) + + +def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list): + """ + Get all the column-value pairs for each data_type(SAMPLE or PATIENT) for a specific study + Convert result to dict + Also return a set of attribute keys + """ + # object is a big ass array of struct, one entry per attribute, per patient/sample - so like if a table were just concatenated into a single vector + data_clinical = cbio_conn.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=study_id, projection='DETAILED', clinicalDataType=data_type).result() + data_dict = {} + attr_keys = [] + # Use sampleId or patientID, clinicalAttributeId (column name) and value + attr_dict = {"SAMPLE": "sampleId", "PATIENT": "patientId" } + for entry in data_clinical: + clinical_id = getattr(entry, attr_dict[data_type]) + if clinical_id not in data_dict: + data_dict[clinical_id] = {} + value = entry.value + attr_id = entry.clinicalAttributeId + if attr_id in aggr_list: + value = split_sort_field(value, ";") + data_dict[clinical_id][attr_id] = value + # need to fix this...can probably just use api to get attribute keys instead + attr_keys.append(attr_id) + return data_dict, set(attr_keys.sort()) + + def main(): parser = argparse.ArgumentParser( description="Compare local clinical data to server" ) parser.add_argument( - "-u", - "--url", - action="store", - dest="url", - help="url to search against", - default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs" + "-u", "--url", action="store", dest="url", help="url to search against", default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs" ) parser.add_argument( "-s", "--study", action="store", dest="study", help="Cancer study ID to compare on server" @@ -32,7 +133,6 @@ def main(): ) args = parser.parse_args() - with open(args.token, 'r') as token_file: token = token_file.read().rstrip().split(': ')[1] @@ -50,10 +150,20 @@ def main(): "validate_responses":False, "validate_swagger_spec": False} ) - # object is a big ass array, one entry per attribute, per patient/sample? - so like if a table were just concatenated into a single vector - clinical_data = cbioportal.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=study).result() - # would need to loop through patient IDs get patient-specific fields, like age? - # get_clin_pt = cbioportal.Clinical_Data.getAllClinicalDataOfPatientInStudyUsingGET(studyId=study,patientId='PT_00G007DM' ).result() + + portal_sample_data, portal_sample_attr_keys = data_clinical_from_study(cbioportal, args.study, "SAMPLE") + build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt") + sample_diff_out = open('sample_portal_v_build.txt', 'w') + clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out) + sample_diff_out.close() + # hardcode for now names of aggregate fields + aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"] + portal_patient_data, portal_patient_attr_keys = data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list) + build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", aggr_list) + patient_diff_out = open('patient_portal_v_build.txt', 'w') + clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out) + patient_diff_out.close() + pdb.set_trace() hold=1 From 9c041d40a2438b2564c1a181a7fb17cef4a54174 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Mon, 26 Feb 2024 15:56:34 -0500 Subject: [PATCH 3/8] :tada: complete working version --- scripts/diff_studies.py | 85 +++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py index 0cab0d1..29b1c7b 100644 --- a/scripts/diff_studies.py +++ b/scripts/diff_studies.py @@ -6,34 +6,38 @@ from bravado.client import SwaggerClient from bravado.requests_client import RequestsClient from urllib.parse import urlparse -import pdb def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out): """ Compare differences in portal sample data and build. """ - # gross ID diffs - portal_clinical_ids = set(list(portal.keys()).sort()) - build_clinical_ids = set(list(build.keys()).sort()) + # gross ID diffs + portal_clinical_ids = set(portal.keys()) + build_clinical_ids = set(build.keys()) portal_only = list(portal_clinical_ids - build_clinical_ids) + portal_only.sort() build_only = list(build_clinical_ids - portal_clinical_ids) + build_only.sort() common_samp_ids = list(portal_clinical_ids & build_clinical_ids) + common_samp_ids.sort() # gross attribute diffs portal_attr_only = list(portal_attr - build_attr) build_attr_only = list(build_attr - portal_attr) common_attr = list(portal_attr & build_attr) - # focus on common samp and common attr, as "everything is different for x" is not that useful print("Per " + clin_type + " changes:", file=out) attr_cts = {} for samp_id in common_samp_ids: for attr in common_attr: - if portal[samp_id][attr] != build[samp_id][attr]: - print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal[samp_id][attr], build[samp_id][attr]), file=out) + # portal will not have a value for that attr in the struct if none + portal_value = portal[samp_id].get(attr, "NA") + if portal_value != build[samp_id][attr]: + print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal_value, build[samp_id][attr]), file=out) if attr not in attr_cts: attr_cts[attr] = 0 attr_cts[attr] += 1 + print("CHANGE SUMMARY:", file=out) if len(portal_only) > 0: print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)), file=out) @@ -52,7 +56,9 @@ def split_sort_field(value, sep): For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter. Therefore, sort them so that when compared, no errors are triggered """ - return sep.join(value.split(sep).sort()) + value_list = value.split(sep) + value_list.sort() + return sep.join(value_list) def table_to_dict(in_file, key, aggr_list): @@ -64,8 +70,7 @@ def table_to_dict(in_file, key, aggr_list): # skip lines starting with hash until normal header is reached for entry in f: if entry[0] != "#": - head = next(f) - header = head.rstrip('\n').split('\t') + header = entry.rstrip('\n').split('\t') primary = header.index(key) # get aggregate field indices aggr_head = [] @@ -77,16 +82,21 @@ def table_to_dict(in_file, key, aggr_list): data_dict = {} for entry in f: data = entry.rstrip('\n').split('\t') + # Replace empty string with NA as that is how the portal will return it + data = ["NA" if d == "" else d for d in data] data_dict[data[primary]] = {} # sort aggr fields for i in aggr_head: - entry[i] = split_sort_field(entry[i], ";") - # two loops, for up until primary key, then after + data[i] = split_sort_field(data[i], ";") + # two loops, for up until primary key, then after. for i in range(0, primary, 1): - data_dict[data[primary]][header[i]] = entry[i] - for i in range((primary + 1), len(entry), 1): - data_dict[data[primary]][header[i]] = entry[i] - return data_dict, set(header.sort()) + data_dict[data[primary]][header[i]] = data[i] + for i in range((primary + 1), len(data), 1): + data_dict[data[primary]][header[i]] = data[i] + attr_set = set(header) + # no need for primary key to be reported as an attribute + attr_set.remove(key) + return data_dict, attr_set def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list): @@ -98,21 +108,24 @@ def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list): # object is a big ass array of struct, one entry per attribute, per patient/sample - so like if a table were just concatenated into a single vector data_clinical = cbio_conn.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=study_id, projection='DETAILED', clinicalDataType=data_type).result() data_dict = {} - attr_keys = [] # Use sampleId or patientID, clinicalAttributeId (column name) and value attr_dict = {"SAMPLE": "sampleId", "PATIENT": "patientId" } + status = ["OS_STATUS"] for entry in data_clinical: clinical_id = getattr(entry, attr_dict[data_type]) if clinical_id not in data_dict: + # every entry per sample as sampleId and patientId, patient just patientId. Add keys to match data_dict[clinical_id] = {} + data_dict[clinical_id]["PATIENT_ID"] = entry.patientId value = entry.value attr_id = entry.clinicalAttributeId if attr_id in aggr_list: value = split_sort_field(value, ";") + # "standardize" status field so that 0:LIVING = LIVING and 1:DECEASED = DECEASED + if attr_id in status: + value = value[2:] data_dict[clinical_id][attr_id] = value - # need to fix this...can probably just use api to get attribute keys instead - attr_keys.append(attr_id) - return data_dict, set(attr_keys.sort()) + return data_dict def main(): @@ -151,21 +164,37 @@ def main(): "validate_swagger_spec": False} ) - portal_sample_data, portal_sample_attr_keys = data_clinical_from_study(cbioportal, args.study, "SAMPLE") - build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt") + # hardcode for now names of aggregate fields + aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"] + # get attribute keys + attr_key_obj = cbioportal.Clinical_Attributes.fetchClinicalAttributesUsingPOST(studyIds=[args.study], projection='ID').result() + # gather sample-level metadata + portal_sample_data = data_clinical_from_study(cbioportal, args.study, "SAMPLE", aggr_list) + build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt", "SAMPLE_ID", aggr_list) sample_diff_out = open('sample_portal_v_build.txt', 'w') + portal_sample_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if not x.patientAttribute]) + # implicit attributes not returned by function that are required for sample view + portal_sample_attr_implicit = ['PATIENT_ID'] + portal_sample_attr_keys.update(portal_sample_attr_implicit) + # drop attributes that are post-load portal-specific + portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT'] + for attr in portal_sample_attr_skip: + portal_sample_attr_keys.remove(attr) + # sample-level diffs clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out) sample_diff_out.close() - # hardcode for now names of aggregate fields - aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"] - portal_patient_data, portal_patient_attr_keys = data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list) - build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", aggr_list) + # patient-level diffs + portal_patient_data = data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list) + build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list) patient_diff_out = open('patient_portal_v_build.txt', 'w') + portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute]) + portal_patient_attr_skip = ['SAMPLE_COUNT'] + for attr in portal_patient_attr_skip: + portal_patient_attr_keys.remove(attr) + clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out) patient_diff_out.close() - pdb.set_trace() - hold=1 if __name__ == '__main__': main() From 6a25883944d4749ab11a7745dc630bdf55ca2db7 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Wed, 28 Feb 2024 16:37:01 -0500 Subject: [PATCH 4/8] :pencil: added documentation for scraper script --- README.md | 3 +- docs/DIFF_STUDY_CLINICAL.md | 106 ++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 docs/DIFF_STUDY_CLINICAL.md diff --git a/README.md b/README.md index 7565dbd..8997838 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ -# Outline on ETL for converting data from cavatica and data service to pedcbioportal format +# Outline on ETL for converting data from CAVATICA and Data Warehouse to PedcBioportal format In general, we are creating upload packages converting our data and metadata to satisfy the requirements outlined [here](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats). Further general loading notes can be found in this [Notion page](https://www.notion.so/d3b/Cbioportal-Study-Load-SOP-58812479fabe4d2fa9f72242e331b5ee). See [below](#collaborative-and-publication-workflows) for special cases like publications or collaborative efforts ## I have everything and I know I am doing Below assumes you have already created the necessary tables from dbt 1. Run commands as outlined in [scripts/get_study_metadata.py](#scriptsget_study_metadatapy). Copy/move those files to the cBio loader ec2 instance +1. Recommended, but not required: run [scripts/diff_studies.py](docs/DIFF_STUDY_CLINICAL.md). It will give a summary of metadata changes between what is currently loaded and what you plan to load, to potentially flag any suspicious changes 1. Copy over the appropriate aws account key and download files. Example using `pbta_all` study: ```sh diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md new file mode 100644 index 0000000..7ba7cd2 --- /dev/null +++ b/docs/DIFF_STUDY_CLINICAL.md @@ -0,0 +1,106 @@ +# Compare current versus build +This documentation addresses a [QC script](../scripts/diff_studies.py) for clinical metadata. It streamlines the process of identifying and summarizing changes slated to be made + +```sh +python3 scripts/diff_studies.py --help +usage: diff_studies.py [-h] [-u URL] [-s STUDY] [-t TOKEN] [-d DATA_DIR] + +Compare local clinical data to server + +options: + -h, --help show this help message and exit + -u URL, --url URL url to search against + -s STUDY, --study STUDY + Cancer study ID to compare on server + -t TOKEN, --token TOKEN + Token file obtained from Web API + -d DATA_DIR, --datasheet-dir DATA_DIR + Directory containing data_clinical_*.txt +``` + +## INPUTS: + - `-u, --url`: cBioportal api deployment site. Default: https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs + - `-s, --study`: cBioportal cancer study ID, i.e. `pbta_all` + - `-t, --token`: File obtained from navigating to https://pedcbioportal.kidsfirstdrc.org/webAPI#using-data-access-tokens, then clicking on `Download Token`. File is reusable + - `-d, --datasheet-dir`: Name of directory containing `data_clinical_patient.txt` and `data_clinical_sample.txt` being vetted for upload + +## OUTPUTS: +Essentially two change log files, `patient_portal_v_build.txt` and `sample_portal_v_build.txt`. +For the patient and sample views, each file respectively has: + - A list, one per line, per ID, per attribute, of what would change if the data were loaded + - A list of IDs that would be removed from the portal, if any + - A list of IDs that would be added in any + - A summary of the number of changes of each attribute type + +### patient_portal_v_build.txt example: +``` +Per Patient changes: +Patient PT_017WC8PS attribute ETHNICITY would change from NA to Not Available +Patient PT_01HNFSBZ attribute ETHNICITY would change from NA to Not Available +Patient PT_01HNFSBZ attribute GERMLINE_SEX_ESTIMATE would change from Unknown to NA +Patient PT_01HNFSBZ attribute CANCER_PREDISPOSITIONS would change from None documented to NA +Patient PT_01SH4F1X attribute AGE_IN_DAYS would change from 3838 to NA +Patient PT_01SH4F1X attribute OS_MONTHS would change from 45 to 54 +Patient PT_0324HWD5 attribute AGE_IN_DAYS would change from 3121 to NA +Patient PT_047YGDRW attribute ETHNICITY would change from NA to Not Available +Patient PT_04V47WFC attribute AGE_IN_DAYS would change from 5717 to NA +Patient PT_08M919BH attribute EFS_MONTHS would change from 44 to 62 +Patient PT_08M919BH attribute OS_MONTHS would change from 44 to 62 +Patient PT_0BSG3R3N attribute AGE_IN_DAYS would change from 3431 to NA +Patient PT_0BVR16FK attribute ETHNICITY would change from NA to Not Available +Patient PT_0CE0HFYB attribute GERMLINE_SEX_ESTIMATE would change from Male to NA + +... + +CHANGE SUMMARY: +ETHNICITY has 358 change(s) +GERMLINE_SEX_ESTIMATE has 220 change(s) +CANCER_PREDISPOSITIONS has 29 change(s) +AGE_IN_DAYS has 147 change(s) +OS_MONTHS has 99 change(s) +EFS_MONTHS has 60 change(s) +EFS_STATUS has 18 change(s) +SEX has 9 change(s) +AGE has 6 change(s) +OS_STATUS has 4 change(s) +``` + +### sample_portal_v_build.txt example: +``` +Per Sample changes: +Sample 16510-1 attribute TUMOR_FRACTION would change from 0.349951221921 to 0.34995122192100003 +Sample 16510-15 attribute TUMOR_FRACTION would change from 0.892871847605 to 0.8928718476049999 +Sample 16510-2 attribute TUMOR_FRACTION would change from 0.242536563786 to 0.24253656378600005 +Sample 16510-8 attribute TUMOR_FRACTION would change from 0.557284218924 to 0.5572842189239999 +Sample 7316-100 attribute TUMOR_FRACTION would change from 0.270649989118 to 0.27064998911800003 +Sample 7316-1017 attribute TUMOR_FRACTION would change from 0.570184695999 to 0.559801637737 +Sample 7316-104 attribute TUMOR_FRACTION would change from 0.664343255194 to 0.6643432551940001 +Sample 7316-1045 attribute TUMOR_FRACTION would change from 0.477859261757 to 0.496989582389 +Sample 7316-105 attribute MOLECULAR_SUBTYPE would change from NA to LGG, BRAF V600E +Sample 7316-105 attribute TUMOR_PLOIDY would change from NA to 2 +Sample 7316-105 attribute CANCER_TYPE_DETAILED would change from NA to Low-grade glioma, BRAF V600E +Sample 7316-105 attribute CANCER_GROUP would change from NA to Low-grade glioma +Sample 7316-105 attribute TUMOR_FRACTION would change from NA to 0.823344460708 +Sample 7316-105 attribute PATHOLOGY_FREE_TEXT_DIAGNOSIS would change from NA to pilocytic astrocytoma ii + +... + +CHANGE SUMMARY: +27 Samples in build would be added to the portal: 1235928,1235929,1235930,1235931,1235932,1235933,1235934,1235935,1235936,1235937,1235938,1235939,1235940,1235941,1235981,1240110,1240112,1240114,1240116,1242273,1242274,1242276,1250775,1250776,1250777,1250778,1273223 +TUMOR_FRACTION has 1005 change(s) +MOLECULAR_SUBTYPE has 488 change(s) +TUMOR_PLOIDY has 403 change(s) +CANCER_TYPE_DETAILED has 847 change(s) +CANCER_GROUP has 517 change(s) +PATHOLOGY_FREE_TEXT_DIAGNOSIS has 507 change(s) +BROAD_HISTOLOGY has 390 change(s) +CNS_REGION has 410 change(s) +CANCER_TYPE has 8 change(s) +ONCOTREE_CODE has 7 change(s) +TUMOR_TYPE has 8 change(s) +TUMOR_TISSUE_TYPE has 17 change(s) +EXPERIMENT_STRATEGY has 1 change(s) +SPECIMEN_ID has 1 change(s) +CBTN_TUMOR_TYPE has 6 change(s) +SAMPLE_TYPE has 2 change(s) +``` \ No newline at end of file From d961379ca8d2019662a2fa7bfd21e50a7494b975 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 29 Feb 2024 11:22:03 -0500 Subject: [PATCH 5/8] :pencil: fixed typo --- docs/DIFF_STUDY_CLINICAL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md index 7ba7cd2..21e3857 100644 --- a/docs/DIFF_STUDY_CLINICAL.md +++ b/docs/DIFF_STUDY_CLINICAL.md @@ -29,7 +29,7 @@ Essentially two change log files, `patient_portal_v_build.txt` and `sample_porta For the patient and sample views, each file respectively has: - A list, one per line, per ID, per attribute, of what would change if the data were loaded - A list of IDs that would be removed from the portal, if any - - A list of IDs that would be added in any + - A list of IDs that would be added if any - A summary of the number of changes of each attribute type ### patient_portal_v_build.txt example: From 590443a6e373e1d775425c35d25389e63ebeb5e8 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 29 Feb 2024 15:32:13 -0500 Subject: [PATCH 6/8] Apply suggestions from code review Co-authored-by: Dan Miller --- docs/DIFF_STUDY_CLINICAL.md | 2 +- scripts/diff_studies.py | 35 ++++++++++++++--------------------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md index 21e3857..fd7d7f0 100644 --- a/docs/DIFF_STUDY_CLINICAL.md +++ b/docs/DIFF_STUDY_CLINICAL.md @@ -1,5 +1,5 @@ # Compare current versus build -This documentation addresses a [QC script](../scripts/diff_studies.py) for clinical metadata. It streamlines the process of identifying and summarizing changes slated to be made +This documentation addresses a [QC script](../scripts/diff_studies.py) for clinical metadata. It streamlines the process of identifying and summarizing changes slated to be made. ```sh python3 scripts/diff_studies.py --help diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py index 29b1c7b..82a618b 100644 --- a/scripts/diff_studies.py +++ b/scripts/diff_studies.py @@ -15,12 +15,9 @@ def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out): # gross ID diffs portal_clinical_ids = set(portal.keys()) build_clinical_ids = set(build.keys()) - portal_only = list(portal_clinical_ids - build_clinical_ids) - portal_only.sort() - build_only = list(build_clinical_ids - portal_clinical_ids) - build_only.sort() - common_samp_ids = list(portal_clinical_ids & build_clinical_ids) - common_samp_ids.sort() + portal_only = sorted(portal_clinical_ids - build_clinical_ids) + build_only = sorted(build_clinical_ids - portal_clinical_ids) + common_samp_ids = sorted(portal_clinical_ids & build_clinical_ids) # gross attribute diffs portal_attr_only = list(portal_attr - build_attr) build_attr_only = list(build_attr - portal_attr) @@ -69,7 +66,7 @@ def table_to_dict(in_file, key, aggr_list): with open(in_file) as f: # skip lines starting with hash until normal header is reached for entry in f: - if entry[0] != "#": + if not entry.startswith("#"): header = entry.rstrip('\n').split('\t') primary = header.index(key) # get aggregate field indices @@ -89,9 +86,8 @@ def table_to_dict(in_file, key, aggr_list): for i in aggr_head: data[i] = split_sort_field(data[i], ";") # two loops, for up until primary key, then after. - for i in range(0, primary, 1): - data_dict[data[primary]][header[i]] = data[i] - for i in range((primary + 1), len(data), 1): + for i in range(len(data)): + if i == primary: continue data_dict[data[primary]][header[i]] = data[i] attr_set = set(header) # no need for primary key to be reported as an attribute @@ -115,12 +111,11 @@ def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list): clinical_id = getattr(entry, attr_dict[data_type]) if clinical_id not in data_dict: # every entry per sample as sampleId and patientId, patient just patientId. Add keys to match - data_dict[clinical_id] = {} - data_dict[clinical_id]["PATIENT_ID"] = entry.patientId + data_dict[clinical_id] = {"PATIENT_ID": entry.patientId} value = entry.value attr_id = entry.clinicalAttributeId if attr_id in aggr_list: - value = split_sort_field(value, ";") + value = ';'.join(sorted(value.split(';'))) # "standardize" status field so that 0:LIVING = LIVING and 1:DECEASED = DECEASED if attr_id in status: value = value[2:] @@ -178,22 +173,20 @@ def main(): portal_sample_attr_keys.update(portal_sample_attr_implicit) # drop attributes that are post-load portal-specific portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT'] - for attr in portal_sample_attr_skip: - portal_sample_attr_keys.remove(attr) + portal_sample_attr_keys -= set(portal_sample_attr_skip) # sample-level diffs - clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out) - sample_diff_out.close() + with open('sample_portal_v_build.txt', 'w') as sample_diff_out: + clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out) # patient-level diffs portal_patient_data = data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list) build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list) patient_diff_out = open('patient_portal_v_build.txt', 'w') portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute]) portal_patient_attr_skip = ['SAMPLE_COUNT'] - for attr in portal_patient_attr_skip: - portal_patient_attr_keys.remove(attr) + portal_patient_attr_keys -= set(portal_patient_attr_skip) - clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out) - patient_diff_out.close() + with open('patient_portal_v_build.txt', 'w') as patient_diff_out: + clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out) if __name__ == '__main__': From 4af1b300a117f772fdb35f355a68adc2672e17bd Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 29 Feb 2024 16:03:44 -0500 Subject: [PATCH 7/8] :hammer: update PR suggestions :pencil: update docs --- docs/DIFF_STUDY_CLINICAL.md | 115 +++++++++++++++++++----------------- scripts/diff_studies.py | 58 ++++++++---------- 2 files changed, 88 insertions(+), 85 deletions(-) diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md index fd7d7f0..595b1b1 100644 --- a/docs/DIFF_STUDY_CLINICAL.md +++ b/docs/DIFF_STUDY_CLINICAL.md @@ -30,77 +30,86 @@ For the patient and sample views, each file respectively has: - A list, one per line, per ID, per attribute, of what would change if the data were loaded - A list of IDs that would be removed from the portal, if any - A list of IDs that would be added if any - - A summary of the number of changes of each attribute type + - A summary of the number of changes of each attribute type printed to STDOUT ### patient_portal_v_build.txt example: ``` -Per Patient changes: -Patient PT_017WC8PS attribute ETHNICITY would change from NA to Not Available -Patient PT_01HNFSBZ attribute ETHNICITY would change from NA to Not Available -Patient PT_01HNFSBZ attribute GERMLINE_SEX_ESTIMATE would change from Unknown to NA -Patient PT_01HNFSBZ attribute CANCER_PREDISPOSITIONS would change from None documented to NA -Patient PT_01SH4F1X attribute AGE_IN_DAYS would change from 3838 to NA -Patient PT_01SH4F1X attribute OS_MONTHS would change from 45 to 54 -Patient PT_0324HWD5 attribute AGE_IN_DAYS would change from 3121 to NA -Patient PT_047YGDRW attribute ETHNICITY would change from NA to Not Available -Patient PT_04V47WFC attribute AGE_IN_DAYS would change from 5717 to NA -Patient PT_08M919BH attribute EFS_MONTHS would change from 44 to 62 -Patient PT_08M919BH attribute OS_MONTHS would change from 44 to 62 -Patient PT_0BSG3R3N attribute AGE_IN_DAYS would change from 3431 to NA -Patient PT_0BVR16FK attribute ETHNICITY would change from NA to Not Available -Patient PT_0CE0HFYB attribute GERMLINE_SEX_ESTIMATE would change from Male to NA - -... - -CHANGE SUMMARY: -ETHNICITY has 358 change(s) -GERMLINE_SEX_ESTIMATE has 220 change(s) -CANCER_PREDISPOSITIONS has 29 change(s) -AGE_IN_DAYS has 147 change(s) -OS_MONTHS has 99 change(s) -EFS_MONTHS has 60 change(s) -EFS_STATUS has 18 change(s) -SEX has 9 change(s) -AGE has 6 change(s) -OS_STATUS has 4 change(s) +Patient attribute before after +PT_017WC8PS ETHNICITY NA Not Available +PT_01HNFSBZ CANCER_PREDISPOSITIONS None documented NA +PT_01HNFSBZ ETHNICITY NA Not Available +PT_01HNFSBZ GERMLINE_SEX_ESTIMATE Unknown NA +PT_01SH4F1X AGE_IN_DAYS 3838 NA +PT_01SH4F1X OS_MONTHS 45 54 +PT_0324HWD5 AGE_IN_DAYS 3121 NA +PT_047YGDRW ETHNICITY NA Not Available +PT_04V47WFC AGE_IN_DAYS 5717 NA +PT_08M919BH OS_MONTHS 44 62 +PT_08M919BH EFS_MONTHS 44 62 +PT_0BSG3R3N AGE_IN_DAYS 3431 NA +PT_0BVR16FK ETHNICITY NA Not Available +PT_0CE0HFYB GERMLINE_SEX_ESTIMATE Male NA +PT_0CVRX4SJ OS_MONTHS NA 149 ``` ### sample_portal_v_build.txt example: ``` -Per Sample changes: -Sample 16510-1 attribute TUMOR_FRACTION would change from 0.349951221921 to 0.34995122192100003 -Sample 16510-15 attribute TUMOR_FRACTION would change from 0.892871847605 to 0.8928718476049999 -Sample 16510-2 attribute TUMOR_FRACTION would change from 0.242536563786 to 0.24253656378600005 -Sample 16510-8 attribute TUMOR_FRACTION would change from 0.557284218924 to 0.5572842189239999 -Sample 7316-100 attribute TUMOR_FRACTION would change from 0.270649989118 to 0.27064998911800003 -Sample 7316-1017 attribute TUMOR_FRACTION would change from 0.570184695999 to 0.559801637737 -Sample 7316-104 attribute TUMOR_FRACTION would change from 0.664343255194 to 0.6643432551940001 -Sample 7316-1045 attribute TUMOR_FRACTION would change from 0.477859261757 to 0.496989582389 -Sample 7316-105 attribute MOLECULAR_SUBTYPE would change from NA to LGG, BRAF V600E -Sample 7316-105 attribute TUMOR_PLOIDY would change from NA to 2 -Sample 7316-105 attribute CANCER_TYPE_DETAILED would change from NA to Low-grade glioma, BRAF V600E -Sample 7316-105 attribute CANCER_GROUP would change from NA to Low-grade glioma -Sample 7316-105 attribute TUMOR_FRACTION would change from NA to 0.823344460708 -Sample 7316-105 attribute PATHOLOGY_FREE_TEXT_DIAGNOSIS would change from NA to pilocytic astrocytoma ii - -... +Sample attribute before after +16510-1 TUMOR_FRACTION 0.349951221921 0.34995122192100003 +16510-15 TUMOR_FRACTION 0.892871847605 0.8928718476049999 +16510-2 TUMOR_FRACTION 0.242536563786 0.24253656378600005 +16510-8 TUMOR_FRACTION 0.557284218924 0.5572842189239999 +7316-100 TUMOR_FRACTION 0.270649989118 0.27064998911800003 +7316-1017 TUMOR_FRACTION 0.570184695999 0.559801637737 +7316-104 TUMOR_FRACTION 0.664343255194 0.6643432551940001 +7316-1045 TUMOR_FRACTION 0.477859261757 0.496989582389 +7316-105 CNS_REGION NA Mixed +7316-105 CANCER_TYPE_DETAILED NA Low-grade glioma, BRAF V600E +7316-105 MOLECULAR_SUBTYPE NA LGG, BRAF V600E +7316-105 BROAD_HISTOLOGY NA Low-grade astrocytic tumor +7316-105 CANCER_GROUP NA Low-grade glioma +7316-105 TUMOR_PLOIDY NA 2 +7316-105 PATHOLOGY_FREE_TEXT_DIAGNOSIS NA pilocytic astrocytoma ii +7316-105 TUMOR_FRACTION NA 0.823344460708 +7316-1052 CANCER_TYPE_DETAILED Diffuse midline glioma, H3 K28-mutant Diffuse midline glioma, H3 K28-altered +7316-1052 MOLECULAR_SUBTYPE DMG, H3 K28 DMG, H3 K28, TP53 +7316-1062 CANCER_TYPE_DETAILED Diffuse midline glioma, H3 K28-mutant Diffuse midline glioma, H3 K28-altered +7316-1068 CANCER_TYPE_DETAILED Diffuse midline glioma, H3 K28-mutant Diffuse midline glioma, H3 K28-altered +7316-1072 CANCER_TYPE_DETAILED Glial-neuronal tumor NOS Glial-neuronal tumor, To be classified +7316-1072 BROAD_HISTOLOGY Low-grade astrocytic tumor Neuronal and mixed neuronal-glial tumor +7316-1072 CANCER_GROUP Glial-neuronal tumor Glial-neuronal tumor NOS +``` -CHANGE SUMMARY: +### STDOUT: +``` +Sample CHANGE SUMMARY: 27 Samples in build would be added to the portal: 1235928,1235929,1235930,1235931,1235932,1235933,1235934,1235935,1235936,1235937,1235938,1235939,1235940,1235941,1235981,1240110,1240112,1240114,1240116,1242273,1242274,1242276,1250775,1250776,1250777,1250778,1273223 TUMOR_FRACTION has 1005 change(s) -MOLECULAR_SUBTYPE has 488 change(s) -TUMOR_PLOIDY has 403 change(s) +CNS_REGION has 410 change(s) CANCER_TYPE_DETAILED has 847 change(s) +MOLECULAR_SUBTYPE has 488 change(s) +BROAD_HISTOLOGY has 390 change(s) CANCER_GROUP has 517 change(s) +TUMOR_PLOIDY has 403 change(s) PATHOLOGY_FREE_TEXT_DIAGNOSIS has 507 change(s) -BROAD_HISTOLOGY has 390 change(s) -CNS_REGION has 410 change(s) -CANCER_TYPE has 8 change(s) ONCOTREE_CODE has 7 change(s) +CANCER_TYPE has 8 change(s) TUMOR_TYPE has 8 change(s) TUMOR_TISSUE_TYPE has 17 change(s) EXPERIMENT_STRATEGY has 1 change(s) SPECIMEN_ID has 1 change(s) CBTN_TUMOR_TYPE has 6 change(s) SAMPLE_TYPE has 2 change(s) + +Patient CHANGE SUMMARY: +ETHNICITY has 358 change(s) +CANCER_PREDISPOSITIONS has 29 change(s) +GERMLINE_SEX_ESTIMATE has 220 change(s) +AGE_IN_DAYS has 147 change(s) +OS_MONTHS has 99 change(s) +EFS_MONTHS has 60 change(s) +EFS_STATUS has 18 change(s) +SEX has 9 change(s) +AGE has 6 change(s) +OS_STATUS has 4 change(s) ``` \ No newline at end of file diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py index 82a618b..8c51b5b 100644 --- a/scripts/diff_studies.py +++ b/scripts/diff_studies.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ Script to check a study on pedcbioportal for differences against a local build """ @@ -17,45 +18,38 @@ def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out): build_clinical_ids = set(build.keys()) portal_only = sorted(portal_clinical_ids - build_clinical_ids) build_only = sorted(build_clinical_ids - portal_clinical_ids) - common_samp_ids = sorted(portal_clinical_ids & build_clinical_ids) + common_clinical_ids = sorted(portal_clinical_ids & build_clinical_ids) # gross attribute diffs portal_attr_only = list(portal_attr - build_attr) build_attr_only = list(build_attr - portal_attr) common_attr = list(portal_attr & build_attr) # focus on common samp and common attr, as "everything is different for x" is not that useful - print("Per " + clin_type + " changes:", file=out) + print(clin_type + "\tattribute\tbefore\tafter", file=out) attr_cts = {} - for samp_id in common_samp_ids: + for clinical_id in common_clinical_ids: for attr in common_attr: # portal will not have a value for that attr in the struct if none - portal_value = portal[samp_id].get(attr, "NA") - if portal_value != build[samp_id][attr]: - print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal_value, build[samp_id][attr]), file=out) + portal_value = portal[clinical_id].get(attr, "NA") + if portal_value != build[clinical_id][attr]: + print("{}\t{}\t{}\t{}".format(clinical_id, attr, portal_value, build[clinical_id][attr]), file=out) if attr not in attr_cts: attr_cts[attr] = 0 attr_cts[attr] += 1 - print("CHANGE SUMMARY:", file=out) + # print change summary to STDOUT + print(clin_type +" CHANGE SUMMARY:") if len(portal_only) > 0: - print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)), file=out) + print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only))) if len(build_only) > 0: - print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type, ",".join(build_only)), file=out) + print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type, ",".join(build_only))) if len(portal_attr_only) > 0: - print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only)), file=out) + print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only))) if len(build_attr_only) > 0: - print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only)), file=out) + print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only))) for attr in attr_cts: - print("{} has {} change(s)".format(attr, attr_cts[attr]), file=out) - - -def split_sort_field(value, sep): - """ - For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter. - Therefore, sort them so that when compared, no errors are triggered - """ - value_list = value.split(sep) - value_list.sort() - return sep.join(value_list) + print("{} has {} change(s)".format(attr, attr_cts[attr])) + # Print extra newline for readability + print ("") def table_to_dict(in_file, key, aggr_list): @@ -74,7 +68,6 @@ def table_to_dict(in_file, key, aggr_list): for aggr in aggr_list: if aggr in header: aggr_head.append(header.index(aggr)) - break data_dict = {} for entry in f: @@ -82,9 +75,10 @@ def table_to_dict(in_file, key, aggr_list): # Replace empty string with NA as that is how the portal will return it data = ["NA" if d == "" else d for d in data] data_dict[data[primary]] = {} - # sort aggr fields + # For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter + # Therefore, sort them so that when compared, no errors are triggered for i in aggr_head: - data[i] = split_sort_field(data[i], ";") + data[i] = ';'.join(sorted(data[i].split(';'))) # two loops, for up until primary key, then after. for i in range(len(data)): if i == primary: continue @@ -114,6 +108,8 @@ def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list): data_dict[clinical_id] = {"PATIENT_ID": entry.patientId} value = entry.value attr_id = entry.clinicalAttributeId + # For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter + # Therefore, sort them so that when compared, no errors are triggered if attr_id in aggr_list: value = ';'.join(sorted(value.split(';'))) # "standardize" status field so that 0:LIVING = LIVING and 1:DECEASED = DECEASED @@ -159,20 +155,20 @@ def main(): "validate_swagger_spec": False} ) - # hardcode for now names of aggregate fields + # hardcode for now names of aggregate fields, implicit, and skip fields aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"] + portal_sample_attr_implicit = ['PATIENT_ID'] + portal_patient_attr_skip = ['SAMPLE_COUNT'] + portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT'] # get attribute keys attr_key_obj = cbioportal.Clinical_Attributes.fetchClinicalAttributesUsingPOST(studyIds=[args.study], projection='ID').result() # gather sample-level metadata portal_sample_data = data_clinical_from_study(cbioportal, args.study, "SAMPLE", aggr_list) build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt", "SAMPLE_ID", aggr_list) - sample_diff_out = open('sample_portal_v_build.txt', 'w') portal_sample_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if not x.patientAttribute]) # implicit attributes not returned by function that are required for sample view - portal_sample_attr_implicit = ['PATIENT_ID'] portal_sample_attr_keys.update(portal_sample_attr_implicit) # drop attributes that are post-load portal-specific - portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT'] portal_sample_attr_keys -= set(portal_sample_attr_skip) # sample-level diffs with open('sample_portal_v_build.txt', 'w') as sample_diff_out: @@ -180,11 +176,9 @@ def main(): # patient-level diffs portal_patient_data = data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list) build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list) - patient_diff_out = open('patient_portal_v_build.txt', 'w') portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute]) - portal_patient_attr_skip = ['SAMPLE_COUNT'] + # drop attributes that are post-load portal-specific portal_patient_attr_keys -= set(portal_patient_attr_skip) - with open('patient_portal_v_build.txt', 'w') as patient_diff_out: clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out) From 92ebd146077bfcf95a4ca8f31e9b2713d3190fab Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Thu, 29 Feb 2024 16:45:12 -0500 Subject: [PATCH 8/8] :hammer: make inputs more explicit --- scripts/diff_studies.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py index 8c51b5b..3356454 100644 --- a/scripts/diff_studies.py +++ b/scripts/diff_studies.py @@ -127,13 +127,16 @@ def main(): "-u", "--url", action="store", dest="url", help="url to search against", default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs" ) parser.add_argument( - "-s", "--study", action="store", dest="study", help="Cancer study ID to compare on server" + "-c", "--study", action="store", dest="study", help="Cancer study ID to compare on server" ) parser.add_argument( "-t", "--token", action="store", dest="token", help="Token file obtained from Web API" ) parser.add_argument( - "-d", "--datasheet-dir", action="store", dest="data_dir", help="Directory containing data_clinical_*.txt" + "-s", "--datasheet-sample", action="store", dest="data_sample", help="File containing cBio-formatted sample metadata, typically named data_clinical_sample.txt" + ) + parser.add_argument( + "-p", "--datasheet-patient", action="store", dest="data_patient", help="File containing cBio-formatted patient metadata, typically named data_clinical_patient.txt" ) args = parser.parse_args() @@ -164,7 +167,7 @@ def main(): attr_key_obj = cbioportal.Clinical_Attributes.fetchClinicalAttributesUsingPOST(studyIds=[args.study], projection='ID').result() # gather sample-level metadata portal_sample_data = data_clinical_from_study(cbioportal, args.study, "SAMPLE", aggr_list) - build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt", "SAMPLE_ID", aggr_list) + build_sample_data, build_sample_attr_keys = table_to_dict(args.data_sample, "SAMPLE_ID", aggr_list) portal_sample_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if not x.patientAttribute]) # implicit attributes not returned by function that are required for sample view portal_sample_attr_keys.update(portal_sample_attr_implicit) @@ -175,7 +178,7 @@ def main(): clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out) # patient-level diffs portal_patient_data = data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list) - build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list) + build_patient_data, build_patient_attr_keys = table_to_dict(args.data_patient, "PATIENT_ID", aggr_list) portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute]) # drop attributes that are post-load portal-specific portal_patient_attr_keys -= set(portal_patient_attr_skip)