From f23bfb1cff2e5642c64b77eecec3cd6ea3b5607f Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 22 Feb 2024 17:07:47 -0500
Subject: [PATCH 1/8] :construction: WIP cbio server scraper

---
 scripts/diff_studies.py | 61 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 scripts/diff_studies.py

diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py
new file mode 100644
index 0000000..df876e9
--- /dev/null
+++ b/scripts/diff_studies.py
@@ -0,0 +1,61 @@
+"""
+Script to check a study on pedcbioportal for differences against a local build
+"""
+
+import argparse
+from bravado.client import SwaggerClient
+from bravado.requests_client import RequestsClient
+from urllib.parse import urlparse
+import pdb
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compare local clinical data to server"
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
+        action="store",
+        dest="url",
+        help="url to search against",
+        default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs"
+    )
+    parser.add_argument(
+        "-s", "--study", action="store", dest="study", help="Cancer study ID to compare on server"
+    )
+    parser.add_argument(
+        "-t", "--token", action="store", dest="token", help="Token file obtained from Web API"
+    )
+    parser.add_argument(
+        "-d", "--datasheet-dir", action="store", dest="data_dir", help="Directory containing data_clinical_*.txt"
+    )
+
+    args = parser.parse_args()
+
+    with open(args.token, 'r') as token_file:
+        token = token_file.read().rstrip().split(': ')[1]
+
+    url_object = urlparse(args.url)
+
+    http_client = RequestsClient()
+    http_client.set_api_key(
+        '{}'.format(url_object.hostname), 'Bearer {}'.format(token),
+        param_name='Authorization', param_in='header'
+    )
+
+    cbioportal = SwaggerClient.from_url(args.url,
+                                        http_client=http_client,
+                                        config={"validate_requests":False,
+                                                "validate_responses":False,
+                                                "validate_swagger_spec": False}
+    )
+    # object is a big ass array, one entry per attribute, per patient/sample? - so like if a table were just concatenated into a single vector 
+    clinical_data = cbioportal.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=study).result()
+    # would need to loop through patient IDs get patient-specific fields, like age?
+    # get_clin_pt = cbioportal.Clinical_Data.getAllClinicalDataOfPatientInStudyUsingGET(studyId=study,patientId='PT_00G007DM' ).result()
+    pdb.set_trace()
+    hold=1
+
+if __name__ == '__main__':
+    main()

From 616270cff761c9d09ed3410aceb67cd006b4074f Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Fri, 23 Feb 2024 16:03:06 -0500
Subject: [PATCH 2/8] :construction: nearly complete with logic, need to wrap
 that up then test

---
 scripts/diff_studies.py | 132 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 121 insertions(+), 11 deletions(-)

diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py
index df876e9..0cab0d1 100644
--- a/scripts/diff_studies.py
+++ b/scripts/diff_studies.py
@@ -9,17 +9,118 @@
 import pdb
 
 
+def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out):
+    """
+    Compare differences in portal sample data and build.
+    """
+    # gross  ID diffs
+    portal_clinical_ids = set(list(portal.keys()).sort())
+    build_clinical_ids = set(list(build.keys()).sort())
+    portal_only = list(portal_clinical_ids - build_clinical_ids)
+    build_only = list(build_clinical_ids - portal_clinical_ids)
+    common_samp_ids = list(portal_clinical_ids & build_clinical_ids)
+    # gross attribute diffs
+    portal_attr_only = list(portal_attr - build_attr)
+    build_attr_only = list(build_attr - portal_attr)
+    common_attr = list(portal_attr & build_attr)
+
+    # focus on common samp and common attr, as "everything is different for x" is not that useful
+    print("Per " + clin_type + " changes:", file=out)
+    attr_cts = {}
+    for samp_id in common_samp_ids:
+        for attr in common_attr:
+            if portal[samp_id][attr] != build[samp_id][attr]:
+                print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal[samp_id][attr], build[samp_id][attr]), file=out)
+                if attr not in attr_cts:
+                    attr_cts[attr] = 0
+                attr_cts[attr] += 1
+    print("CHANGE SUMMARY:", file=out)
+    if len(portal_only) > 0:
+        print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)), file=out)
+    if len(build_only) > 0:
+        print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type,  ",".join(build_only)), file=out)
+    if len(portal_attr_only) > 0:
+        print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only)), file=out)
+    if len(build_attr_only) > 0:
+        print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only)), file=out)
+    for attr in attr_cts:
+        print("{} has {} change(s)".format(attr, attr_cts[attr]), file=out)
+
+
+def split_sort_field(value, sep):
+    """
+    For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter.
+    Therefore, sort them so that when compared, no errors are triggered
+    """
+    return sep.join(value.split(sep).sort())
+
+
+def table_to_dict(in_file, key, aggr_list):
+    """
+    Take a text file and convert to dict with certain row value as primary, all other row values as subkeys.
+    Also return a set of attribute keys
+    """
+    with open(in_file) as f:
+        # skip lines starting with hash until normal header is reached
+        for entry in f:
+            if entry[0] != "#":
+                head = next(f)
+                header = head.rstrip('\n').split('\t')
+                primary = header.index(key)
+                # get aggregate field indices
+                aggr_head = []
+                for aggr in aggr_list:
+                    if aggr in header:
+                        aggr_head.append(header.index(aggr))
+
+                break
+        data_dict = {}
+        for entry in f:
+            data = entry.rstrip('\n').split('\t')
+            data_dict[data[primary]] = {}
+            # sort aggr fields
+            for i in aggr_head:
+                entry[i] = split_sort_field(entry[i], ";")
+            # two loops, for up until primary key, then after
+            for i in range(0, primary, 1):
+                data_dict[data[primary]][header[i]] = entry[i]
+            for i in range((primary + 1), len(entry), 1):
+                data_dict[data[primary]][header[i]] = entry[i]
+    return data_dict, set(header.sort())
+
+
+def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list):
+    """
+    Get all the column-value pairs for each data_type(SAMPLE or PATIENT) for a specific study
+    Convert result to dict
+    Also return a set of attribute keys
+    """
+    # object is a big ass array of struct, one entry per attribute, per patient/sample - so like if a table were just concatenated into a single vector 
+    data_clinical = cbio_conn.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=study_id, projection='DETAILED', clinicalDataType=data_type).result()
+    data_dict = {}
+    attr_keys = []
+    # Use sampleId or patientID, clinicalAttributeId (column name) and value
+    attr_dict = {"SAMPLE": "sampleId", "PATIENT": "patientId" }
+    for entry in data_clinical:
+        clinical_id = getattr(entry, attr_dict[data_type])
+        if clinical_id not in data_dict:
+            data_dict[clinical_id] = {}
+        value = entry.value
+        attr_id = entry.clinicalAttributeId
+        if attr_id in aggr_list:
+            value = split_sort_field(value, ";")
+        data_dict[clinical_id][attr_id] = value
+        # need to fix this...can probably just use api to get attribute keys instead
+        attr_keys.append(attr_id)
+    return data_dict, set(attr_keys.sort())
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Compare local clinical data to server"
     )
     parser.add_argument(
-        "-u",
-        "--url",
-        action="store",
-        dest="url",
-        help="url to search against",
-        default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs"
+        "-u", "--url", action="store", dest="url", help="url to search against", default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs"
     )
     parser.add_argument(
         "-s", "--study", action="store", dest="study", help="Cancer study ID to compare on server"
@@ -32,7 +133,6 @@ def main():
     )
 
     args = parser.parse_args()
-
     with open(args.token, 'r') as token_file:
         token = token_file.read().rstrip().split(': ')[1]
 
@@ -50,10 +150,20 @@ def main():
                                                 "validate_responses":False,
                                                 "validate_swagger_spec": False}
     )
-    # object is a big ass array, one entry per attribute, per patient/sample? - so like if a table were just concatenated into a single vector 
-    clinical_data = cbioportal.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=study).result()
-    # would need to loop through patient IDs get patient-specific fields, like age?
-    # get_clin_pt = cbioportal.Clinical_Data.getAllClinicalDataOfPatientInStudyUsingGET(studyId=study,patientId='PT_00G007DM' ).result()
+    
+    portal_sample_data, portal_sample_attr_keys = data_clinical_from_study(cbioportal, args.study, "SAMPLE")
+    build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt")
+    sample_diff_out = open('sample_portal_v_build.txt', 'w')
+    clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out)
+    sample_diff_out.close()
+    # hardcode for now names of aggregate fields
+    aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"]
+    portal_patient_data, portal_patient_attr_keys =  data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list)
+    build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", aggr_list)
+    patient_diff_out = open('patient_portal_v_build.txt', 'w')
+    clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out)
+    patient_diff_out.close()
+
     pdb.set_trace()
     hold=1
 

From 9c041d40a2438b2564c1a181a7fb17cef4a54174 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Mon, 26 Feb 2024 15:56:34 -0500
Subject: [PATCH 3/8] :tada: complete working version

---
 scripts/diff_studies.py | 85 +++++++++++++++++++++++++++--------------
 1 file changed, 57 insertions(+), 28 deletions(-)

diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py
index 0cab0d1..29b1c7b 100644
--- a/scripts/diff_studies.py
+++ b/scripts/diff_studies.py
@@ -6,34 +6,38 @@
 from bravado.client import SwaggerClient
 from bravado.requests_client import RequestsClient
 from urllib.parse import urlparse
-import pdb
 
 
 def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out):
     """
     Compare differences in portal sample data and build.
     """
-    # gross  ID diffs
-    portal_clinical_ids = set(list(portal.keys()).sort())
-    build_clinical_ids = set(list(build.keys()).sort())
+    # gross ID diffs
+    portal_clinical_ids = set(portal.keys())
+    build_clinical_ids = set(build.keys())
     portal_only = list(portal_clinical_ids - build_clinical_ids)
+    portal_only.sort()
     build_only = list(build_clinical_ids - portal_clinical_ids)
+    build_only.sort()
     common_samp_ids = list(portal_clinical_ids & build_clinical_ids)
+    common_samp_ids.sort()
     # gross attribute diffs
     portal_attr_only = list(portal_attr - build_attr)
     build_attr_only = list(build_attr - portal_attr)
     common_attr = list(portal_attr & build_attr)
-
     # focus on common samp and common attr, as "everything is different for x" is not that useful
     print("Per " + clin_type + " changes:", file=out)
     attr_cts = {}
     for samp_id in common_samp_ids:
         for attr in common_attr:
-            if portal[samp_id][attr] != build[samp_id][attr]:
-                print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal[samp_id][attr], build[samp_id][attr]), file=out)
+            # portal will not have a value for that attr in the struct if none
+            portal_value = portal[samp_id].get(attr, "NA")
+            if portal_value != build[samp_id][attr]:
+                print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal_value, build[samp_id][attr]), file=out)
                 if attr not in attr_cts:
                     attr_cts[attr] = 0
                 attr_cts[attr] += 1
+
     print("CHANGE SUMMARY:", file=out)
     if len(portal_only) > 0:
         print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)), file=out)
@@ -52,7 +56,9 @@ def split_sort_field(value, sep):
     For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter.
     Therefore, sort them so that when compared, no errors are triggered
     """
-    return sep.join(value.split(sep).sort())
+    value_list = value.split(sep)
+    value_list.sort()
+    return sep.join(value_list)
 
 
 def table_to_dict(in_file, key, aggr_list):
@@ -64,8 +70,7 @@ def table_to_dict(in_file, key, aggr_list):
         # skip lines starting with hash until normal header is reached
         for entry in f:
             if entry[0] != "#":
-                head = next(f)
-                header = head.rstrip('\n').split('\t')
+                header = entry.rstrip('\n').split('\t')
                 primary = header.index(key)
                 # get aggregate field indices
                 aggr_head = []
@@ -77,16 +82,21 @@ def table_to_dict(in_file, key, aggr_list):
         data_dict = {}
         for entry in f:
             data = entry.rstrip('\n').split('\t')
+            # Replace empty string with NA as that is how the portal will return it
+            data = ["NA" if d == "" else d for d in data]
             data_dict[data[primary]] = {}
             # sort aggr fields
             for i in aggr_head:
-                entry[i] = split_sort_field(entry[i], ";")
-            # two loops, for up until primary key, then after
+                data[i] = split_sort_field(data[i], ";")
+            # two loops, for up until primary key, then after.
             for i in range(0, primary, 1):
-                data_dict[data[primary]][header[i]] = entry[i]
-            for i in range((primary + 1), len(entry), 1):
-                data_dict[data[primary]][header[i]] = entry[i]
-    return data_dict, set(header.sort())
+                data_dict[data[primary]][header[i]] = data[i]
+            for i in range((primary + 1), len(data), 1):
+                data_dict[data[primary]][header[i]] = data[i]
+    attr_set = set(header)
+    # no need for primary key to be reported as an attribute
+    attr_set.remove(key)
+    return data_dict, attr_set
 
 
 def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list):
@@ -98,21 +108,24 @@ def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list):
     # object is a big ass array of struct, one entry per attribute, per patient/sample - so like if a table were just concatenated into a single vector 
     data_clinical = cbio_conn.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=study_id, projection='DETAILED', clinicalDataType=data_type).result()
     data_dict = {}
-    attr_keys = []
     # Use sampleId or patientID, clinicalAttributeId (column name) and value
     attr_dict = {"SAMPLE": "sampleId", "PATIENT": "patientId" }
+    status = ["OS_STATUS"]
     for entry in data_clinical:
         clinical_id = getattr(entry, attr_dict[data_type])
         if clinical_id not in data_dict:
+            # every entry per sample as sampleId and patientId, patient just patientId. Add keys to match
             data_dict[clinical_id] = {}
+            data_dict[clinical_id]["PATIENT_ID"] = entry.patientId
         value = entry.value
         attr_id = entry.clinicalAttributeId
         if attr_id in aggr_list:
             value = split_sort_field(value, ";")
+        # "standardize" status field so that 0:LIVING = LIVING and 1:DECEASED = DECEASED
+        if attr_id in status:
+            value = value[2:]
         data_dict[clinical_id][attr_id] = value
-        # need to fix this...can probably just use api to get attribute keys instead
-        attr_keys.append(attr_id)
-    return data_dict, set(attr_keys.sort())
+    return data_dict
 
 
 def main():
@@ -151,21 +164,37 @@ def main():
                                                 "validate_swagger_spec": False}
     )
     
-    portal_sample_data, portal_sample_attr_keys = data_clinical_from_study(cbioportal, args.study, "SAMPLE")
-    build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt")
+    # hardcode for now names of aggregate fields
+    aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"]
+    # get attribute keys
+    attr_key_obj = cbioportal.Clinical_Attributes.fetchClinicalAttributesUsingPOST(studyIds=[args.study], projection='ID').result()
+    # gather sample-level metadata
+    portal_sample_data = data_clinical_from_study(cbioportal, args.study, "SAMPLE", aggr_list)
+    build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt", "SAMPLE_ID", aggr_list)
     sample_diff_out = open('sample_portal_v_build.txt', 'w')
+    portal_sample_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if not x.patientAttribute])
+    # implicit attributes not returned by function that are required for sample view
+    portal_sample_attr_implicit = ['PATIENT_ID']
+    portal_sample_attr_keys.update(portal_sample_attr_implicit)
+    # drop attributes that are post-load portal-specific
+    portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT']
+    for attr in portal_sample_attr_skip:
+        portal_sample_attr_keys.remove(attr)
+    # sample-level diffs
     clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out)
     sample_diff_out.close()
-    # hardcode for now names of aggregate fields
-    aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"]
-    portal_patient_data, portal_patient_attr_keys =  data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list)
-    build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", aggr_list)
+    # patient-level diffs
+    portal_patient_data =  data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list)
+    build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list)
     patient_diff_out = open('patient_portal_v_build.txt', 'w')
+    portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute])
+    portal_patient_attr_skip = ['SAMPLE_COUNT']
+    for attr in portal_patient_attr_skip:
+        portal_patient_attr_keys.remove(attr)
+
     clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out)
     patient_diff_out.close()
 
-    pdb.set_trace()
-    hold=1
 
 if __name__ == '__main__':
     main()

From 6a25883944d4749ab11a7745dc630bdf55ca2db7 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Wed, 28 Feb 2024 16:37:01 -0500
Subject: [PATCH 4/8] :pencil: added documentation for scraper script

---
 README.md                   |   3 +-
 docs/DIFF_STUDY_CLINICAL.md | 106 ++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 docs/DIFF_STUDY_CLINICAL.md

diff --git a/README.md b/README.md
index 7565dbd..8997838 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,11 @@
-# Outline on ETL for converting data from cavatica and data service to pedcbioportal format
+# Outline on ETL for converting data from CAVATICA and Data Warehouse to PedcBioportal format
 In general, we are creating upload packages converting our data and metadata to satisfy the requirements outlined [here](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats).
 Further general loading notes can be found in this [Notion page](https://www.notion.so/d3b/Cbioportal-Study-Load-SOP-58812479fabe4d2fa9f72242e331b5ee).
 See [below](#collaborative-and-publication-workflows) for special cases like publications or collaborative efforts
 ## I have everything and I know I am doing
 Below assumes you have already created the necessary tables from dbt
 1. Run commands as outlined in [scripts/get_study_metadata.py](#scriptsget_study_metadatapy). Copy/move those files to the cBio loader ec2 instance
+1. Recommended, but not required: run [scripts/diff_studies.py](docs/DIFF_STUDY_CLINICAL.md). It will give a summary of metadata changes between what is currently loaded and what you plan to load, to potentially flag any suspicious changes
 1. Copy over the appropriate aws account key and download files. Example using `pbta_all` study:
 
    ```sh
diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md
new file mode 100644
index 0000000..7ba7cd2
--- /dev/null
+++ b/docs/DIFF_STUDY_CLINICAL.md
@@ -0,0 +1,106 @@
+# Compare current versus build
+This documentation addresses a [QC script](../scripts/diff_studies.py) for clinical metadata. It streamlines the process of identifying and summarizing changes slated to be made
+
+```sh
+python3 scripts/diff_studies.py --help
+usage: diff_studies.py [-h] [-u URL] [-s STUDY] [-t TOKEN] [-d DATA_DIR]
+
+Compare local clinical data to server
+
+options:
+  -h, --help            show this help message and exit
+  -u URL, --url URL     url to search against
+  -s STUDY, --study STUDY
+                        Cancer study ID to compare on server
+  -t TOKEN, --token TOKEN
+                        Token file obtained from Web API
+  -d DATA_DIR, --datasheet-dir DATA_DIR
+                        Directory containing data_clinical_*.txt
+```
+
+## INPUTS:
+ - `-u, --url`: URL of the cBioPortal API docs (Swagger spec) for the deployment to query. Default: https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs
+ - `-s, --study`: cBioportal cancer study ID, e.g. `pbta_all`
+ - `-t, --token`: File obtained from navigating to https://pedcbioportal.kidsfirstdrc.org/webAPI#using-data-access-tokens, then clicking on `Download Token`. File is reusable
+ - `-d, --datasheet-dir`: Name of directory containing `data_clinical_patient.txt` and `data_clinical_sample.txt` being vetted for upload
+
+## OUTPUTS:
+Essentially two change log files, `patient_portal_v_build.txt` and `sample_portal_v_build.txt`.
+For the patient and sample views, each file respectively has:
+ - A list, one per line, per ID, per attribute, of what would change if the data were loaded
+ - A list of IDs that would be removed from the portal, if any
+ - A list of IDs that would be added in any
+ - A summary of the number of changes of each attribute type
+
+### patient_portal_v_build.txt example:
+```
+Per Patient changes:
+Patient PT_017WC8PS attribute ETHNICITY would change from NA to Not Available
+Patient PT_01HNFSBZ attribute ETHNICITY would change from NA to Not Available
+Patient PT_01HNFSBZ attribute GERMLINE_SEX_ESTIMATE would change from Unknown to NA
+Patient PT_01HNFSBZ attribute CANCER_PREDISPOSITIONS would change from None documented to NA
+Patient PT_01SH4F1X attribute AGE_IN_DAYS would change from 3838 to NA
+Patient PT_01SH4F1X attribute OS_MONTHS would change from 45 to 54
+Patient PT_0324HWD5 attribute AGE_IN_DAYS would change from 3121 to NA
+Patient PT_047YGDRW attribute ETHNICITY would change from NA to Not Available
+Patient PT_04V47WFC attribute AGE_IN_DAYS would change from 5717 to NA
+Patient PT_08M919BH attribute EFS_MONTHS would change from 44 to 62
+Patient PT_08M919BH attribute OS_MONTHS would change from 44 to 62
+Patient PT_0BSG3R3N attribute AGE_IN_DAYS would change from 3431 to NA
+Patient PT_0BVR16FK attribute ETHNICITY would change from NA to Not Available
+Patient PT_0CE0HFYB attribute GERMLINE_SEX_ESTIMATE would change from Male to NA
+
+...
+
+CHANGE SUMMARY:
+ETHNICITY has 358 change(s)
+GERMLINE_SEX_ESTIMATE has 220 change(s)
+CANCER_PREDISPOSITIONS has 29 change(s)
+AGE_IN_DAYS has 147 change(s)
+OS_MONTHS has 99 change(s)
+EFS_MONTHS has 60 change(s)
+EFS_STATUS has 18 change(s)
+SEX has 9 change(s)
+AGE has 6 change(s)
+OS_STATUS has 4 change(s)
+```
+
+### sample_portal_v_build.txt example:
+```
+Per Sample changes:
+Sample 16510-1 attribute TUMOR_FRACTION would change from 0.349951221921 to 0.34995122192100003
+Sample 16510-15 attribute TUMOR_FRACTION would change from 0.892871847605 to 0.8928718476049999
+Sample 16510-2 attribute TUMOR_FRACTION would change from 0.242536563786 to 0.24253656378600005
+Sample 16510-8 attribute TUMOR_FRACTION would change from 0.557284218924 to 0.5572842189239999
+Sample 7316-100 attribute TUMOR_FRACTION would change from 0.270649989118 to 0.27064998911800003
+Sample 7316-1017 attribute TUMOR_FRACTION would change from 0.570184695999 to 0.559801637737
+Sample 7316-104 attribute TUMOR_FRACTION would change from 0.664343255194 to 0.6643432551940001
+Sample 7316-1045 attribute TUMOR_FRACTION would change from 0.477859261757 to 0.496989582389
+Sample 7316-105 attribute MOLECULAR_SUBTYPE would change from NA to LGG, BRAF V600E
+Sample 7316-105 attribute TUMOR_PLOIDY would change from NA to 2
+Sample 7316-105 attribute CANCER_TYPE_DETAILED would change from NA to Low-grade glioma, BRAF V600E
+Sample 7316-105 attribute CANCER_GROUP would change from NA to Low-grade glioma
+Sample 7316-105 attribute TUMOR_FRACTION would change from NA to 0.823344460708
+Sample 7316-105 attribute PATHOLOGY_FREE_TEXT_DIAGNOSIS would change from NA to pilocytic astrocytoma ii
+
+...
+
+CHANGE SUMMARY:
+27 Samples in build would be added to the portal: 1235928,1235929,1235930,1235931,1235932,1235933,1235934,1235935,1235936,1235937,1235938,1235939,1235940,1235941,1235981,1240110,1240112,1240114,1240116,1242273,1242274,1242276,1250775,1250776,1250777,1250778,1273223
+TUMOR_FRACTION has 1005 change(s)
+MOLECULAR_SUBTYPE has 488 change(s)
+TUMOR_PLOIDY has 403 change(s)
+CANCER_TYPE_DETAILED has 847 change(s)
+CANCER_GROUP has 517 change(s)
+PATHOLOGY_FREE_TEXT_DIAGNOSIS has 507 change(s)
+BROAD_HISTOLOGY has 390 change(s)
+CNS_REGION has 410 change(s)
+CANCER_TYPE has 8 change(s)
+ONCOTREE_CODE has 7 change(s)
+TUMOR_TYPE has 8 change(s)
+TUMOR_TISSUE_TYPE has 17 change(s)
+EXPERIMENT_STRATEGY has 1 change(s)
+SPECIMEN_ID has 1 change(s)
+CBTN_TUMOR_TYPE has 6 change(s)
+SAMPLE_TYPE has 2 change(s)
+```
\ No newline at end of file

From d961379ca8d2019662a2fa7bfd21e50a7494b975 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 29 Feb 2024 11:22:03 -0500
Subject: [PATCH 5/8] :pencil: fixed typo

---
 docs/DIFF_STUDY_CLINICAL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md
index 7ba7cd2..21e3857 100644
--- a/docs/DIFF_STUDY_CLINICAL.md
+++ b/docs/DIFF_STUDY_CLINICAL.md
@@ -29,7 +29,7 @@ Essentially two change log files, `patient_portal_v_build.txt` and `sample_porta
 For the patient and sample views, each file respectively has:
  - A list, one per line, per ID, per attribute, of what would change if the data were loaded
  - A list of IDs that would be removed from the portal, if any
- - A list of IDs that would be added in any
+ - A list of IDs that would be added, if any
  - A summary of the number of changes of each attribute type
 
 ### patient_portal_v_build.txt example:

From 590443a6e373e1d775425c35d25389e63ebeb5e8 Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 29 Feb 2024 15:32:13 -0500
Subject: [PATCH 6/8] Apply suggestions from code review

Co-authored-by: Dan Miller <dmiller15@users.noreply.github.com>
---
 docs/DIFF_STUDY_CLINICAL.md |  2 +-
 scripts/diff_studies.py     | 35 ++++++++++++++---------------------
 2 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md
index 21e3857..fd7d7f0 100644
--- a/docs/DIFF_STUDY_CLINICAL.md
+++ b/docs/DIFF_STUDY_CLINICAL.md
@@ -1,5 +1,5 @@
 # Compare current versus build
-This documentation addresses a [QC script](../scripts/diff_studies.py) for clinical metadata. It streamlines the process of identifying and summarizing changes slated to be made
+This documentation addresses a [QC script](../scripts/diff_studies.py) for clinical metadata. It streamlines the process of identifying and summarizing changes slated to be made.
 
 ```sh
 python3 scripts/diff_studies.py --help
diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py
index 29b1c7b..82a618b 100644
--- a/scripts/diff_studies.py
+++ b/scripts/diff_studies.py
@@ -15,12 +15,9 @@ def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out):
     # gross ID diffs
     portal_clinical_ids = set(portal.keys())
     build_clinical_ids = set(build.keys())
-    portal_only = list(portal_clinical_ids - build_clinical_ids)
-    portal_only.sort()
-    build_only = list(build_clinical_ids - portal_clinical_ids)
-    build_only.sort()
-    common_samp_ids = list(portal_clinical_ids & build_clinical_ids)
-    common_samp_ids.sort()
+    portal_only = sorted(portal_clinical_ids - build_clinical_ids)
+    build_only = sorted(build_clinical_ids - portal_clinical_ids)
+    common_samp_ids = sorted(portal_clinical_ids & build_clinical_ids)
     # gross attribute diffs
     portal_attr_only = list(portal_attr - build_attr)
     build_attr_only = list(build_attr - portal_attr)
@@ -69,7 +66,7 @@ def table_to_dict(in_file, key, aggr_list):
     with open(in_file) as f:
         # skip lines starting with hash until normal header is reached
         for entry in f:
-            if entry[0] != "#":
+            if not entry.startswith("#"):
                 header = entry.rstrip('\n').split('\t')
                 primary = header.index(key)
                 # get aggregate field indices
@@ -89,9 +86,8 @@ def table_to_dict(in_file, key, aggr_list):
             for i in aggr_head:
                 data[i] = split_sort_field(data[i], ";")
             # two loops, for up until primary key, then after.
-            for i in range(0, primary, 1):
-                data_dict[data[primary]][header[i]] = data[i]
-            for i in range((primary + 1), len(data), 1):
+            for i in range(len(data)):
+                if i == primary: continue
                 data_dict[data[primary]][header[i]] = data[i]
     attr_set = set(header)
     # no need for primary key to be reported as an attribute
@@ -115,12 +111,11 @@ def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list):
         clinical_id = getattr(entry, attr_dict[data_type])
         if clinical_id not in data_dict:
             # every entry per sample as sampleId and patientId, patient just patientId. Add keys to match
-            data_dict[clinical_id] = {}
-            data_dict[clinical_id]["PATIENT_ID"] = entry.patientId
+            data_dict[clinical_id] = {"PATIENT_ID": entry.patientId}
         value = entry.value
         attr_id = entry.clinicalAttributeId
         if attr_id in aggr_list:
-            value = split_sort_field(value, ";")
+            value = ';'.join(sorted(value.split(';')))
         # "standardize" status field so that 0:LIVING = LIVING and 1:DECEASED = DECEASED
         if attr_id in status:
             value = value[2:]
@@ -178,22 +173,20 @@ def main():
     portal_sample_attr_keys.update(portal_sample_attr_implicit)
     # drop attributes that are post-load portal-specific
     portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT']
-    for attr in portal_sample_attr_skip:
-        portal_sample_attr_keys.remove(attr)
+    portal_sample_attr_keys -= set(portal_sample_attr_skip)
     # sample-level diffs
-    clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out)
-    sample_diff_out.close()
+    with open('sample_portal_v_build.txt', 'w') as sample_diff_out:
+        clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out)
     # patient-level diffs
     portal_patient_data =  data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list)
     build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list)
     patient_diff_out = open('patient_portal_v_build.txt', 'w')
     portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute])
     portal_patient_attr_skip = ['SAMPLE_COUNT']
-    for attr in portal_patient_attr_skip:
-        portal_patient_attr_keys.remove(attr)
+    portal_patient_attr_keys -= set(portal_patient_attr_skip)
 
-    clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out)
-    patient_diff_out.close()
+    with open('patient_portal_v_build.txt', 'w') as patient_diff_out:
+        clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out)
 
 
 if __name__ == '__main__':

From 4af1b300a117f772fdb35f355a68adc2672e17bd Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 29 Feb 2024 16:03:44 -0500
Subject: [PATCH 7/8] :hammer: update PR suggestions :pencil: update docs

---
 docs/DIFF_STUDY_CLINICAL.md | 115 +++++++++++++++++++-----------------
 scripts/diff_studies.py     |  58 ++++++++----------
 2 files changed, 88 insertions(+), 85 deletions(-)

diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md
index fd7d7f0..595b1b1 100644
--- a/docs/DIFF_STUDY_CLINICAL.md
+++ b/docs/DIFF_STUDY_CLINICAL.md
@@ -30,77 +30,86 @@ For the patient and sample views, each file respectively has:
  - A list, one per line, per ID, per attribute, of what would change if the data were loaded
  - A list of IDs that would be removed from the portal, if any
  - A list of IDs that would be added if any
- - A summary of the number of changes of each attribute type
+ - A summary of the number of changes of each attribute type, printed to STDOUT rather than written to the file
 
 ### patient_portal_v_build.txt example:
 ```
-Per Patient changes:
-Patient PT_017WC8PS attribute ETHNICITY would change from NA to Not Available
-Patient PT_01HNFSBZ attribute ETHNICITY would change from NA to Not Available
-Patient PT_01HNFSBZ attribute GERMLINE_SEX_ESTIMATE would change from Unknown to NA
-Patient PT_01HNFSBZ attribute CANCER_PREDISPOSITIONS would change from None documented to NA
-Patient PT_01SH4F1X attribute AGE_IN_DAYS would change from 3838 to NA
-Patient PT_01SH4F1X attribute OS_MONTHS would change from 45 to 54
-Patient PT_0324HWD5 attribute AGE_IN_DAYS would change from 3121 to NA
-Patient PT_047YGDRW attribute ETHNICITY would change from NA to Not Available
-Patient PT_04V47WFC attribute AGE_IN_DAYS would change from 5717 to NA
-Patient PT_08M919BH attribute EFS_MONTHS would change from 44 to 62
-Patient PT_08M919BH attribute OS_MONTHS would change from 44 to 62
-Patient PT_0BSG3R3N attribute AGE_IN_DAYS would change from 3431 to NA
-Patient PT_0BVR16FK attribute ETHNICITY would change from NA to Not Available
-Patient PT_0CE0HFYB attribute GERMLINE_SEX_ESTIMATE would change from Male to NA
-
-...
-
-CHANGE SUMMARY:
-ETHNICITY has 358 change(s)
-GERMLINE_SEX_ESTIMATE has 220 change(s)
-CANCER_PREDISPOSITIONS has 29 change(s)
-AGE_IN_DAYS has 147 change(s)
-OS_MONTHS has 99 change(s)
-EFS_MONTHS has 60 change(s)
-EFS_STATUS has 18 change(s)
-SEX has 9 change(s)
-AGE has 6 change(s)
-OS_STATUS has 4 change(s)
+Patient attribute       before  after
+PT_017WC8PS     ETHNICITY       NA      Not Available
+PT_01HNFSBZ     CANCER_PREDISPOSITIONS  None documented NA
+PT_01HNFSBZ     ETHNICITY       NA      Not Available
+PT_01HNFSBZ     GERMLINE_SEX_ESTIMATE   Unknown NA
+PT_01SH4F1X     AGE_IN_DAYS     3838    NA
+PT_01SH4F1X     OS_MONTHS       45      54
+PT_0324HWD5     AGE_IN_DAYS     3121    NA
+PT_047YGDRW     ETHNICITY       NA      Not Available
+PT_04V47WFC     AGE_IN_DAYS     5717    NA
+PT_08M919BH     OS_MONTHS       44      62
+PT_08M919BH     EFS_MONTHS      44      62
+PT_0BSG3R3N     AGE_IN_DAYS     3431    NA
+PT_0BVR16FK     ETHNICITY       NA      Not Available
+PT_0CE0HFYB     GERMLINE_SEX_ESTIMATE   Male    NA
+PT_0CVRX4SJ     OS_MONTHS       NA      149
 ```
 
 ### sample_portal_v_build.txt example:
 ```
-Per Sample changes:
-Sample 16510-1 attribute TUMOR_FRACTION would change from 0.349951221921 to 0.34995122192100003
-Sample 16510-15 attribute TUMOR_FRACTION would change from 0.892871847605 to 0.8928718476049999
-Sample 16510-2 attribute TUMOR_FRACTION would change from 0.242536563786 to 0.24253656378600005
-Sample 16510-8 attribute TUMOR_FRACTION would change from 0.557284218924 to 0.5572842189239999
-Sample 7316-100 attribute TUMOR_FRACTION would change from 0.270649989118 to 0.27064998911800003
-Sample 7316-1017 attribute TUMOR_FRACTION would change from 0.570184695999 to 0.559801637737
-Sample 7316-104 attribute TUMOR_FRACTION would change from 0.664343255194 to 0.6643432551940001
-Sample 7316-1045 attribute TUMOR_FRACTION would change from 0.477859261757 to 0.496989582389
-Sample 7316-105 attribute MOLECULAR_SUBTYPE would change from NA to LGG, BRAF V600E
-Sample 7316-105 attribute TUMOR_PLOIDY would change from NA to 2
-Sample 7316-105 attribute CANCER_TYPE_DETAILED would change from NA to Low-grade glioma, BRAF V600E
-Sample 7316-105 attribute CANCER_GROUP would change from NA to Low-grade glioma
-Sample 7316-105 attribute TUMOR_FRACTION would change from NA to 0.823344460708
-Sample 7316-105 attribute PATHOLOGY_FREE_TEXT_DIAGNOSIS would change from NA to pilocytic astrocytoma ii
-
-...
+Sample  attribute       before  after
+16510-1 TUMOR_FRACTION  0.349951221921  0.34995122192100003
+16510-15        TUMOR_FRACTION  0.892871847605  0.8928718476049999
+16510-2 TUMOR_FRACTION  0.242536563786  0.24253656378600005
+16510-8 TUMOR_FRACTION  0.557284218924  0.5572842189239999
+7316-100        TUMOR_FRACTION  0.270649989118  0.27064998911800003
+7316-1017       TUMOR_FRACTION  0.570184695999  0.559801637737
+7316-104        TUMOR_FRACTION  0.664343255194  0.6643432551940001
+7316-1045       TUMOR_FRACTION  0.477859261757  0.496989582389
+7316-105        CNS_REGION      NA      Mixed
+7316-105        CANCER_TYPE_DETAILED    NA      Low-grade glioma, BRAF V600E
+7316-105        MOLECULAR_SUBTYPE       NA      LGG, BRAF V600E
+7316-105        BROAD_HISTOLOGY NA      Low-grade astrocytic tumor
+7316-105        CANCER_GROUP    NA      Low-grade glioma
+7316-105        TUMOR_PLOIDY    NA      2
+7316-105        PATHOLOGY_FREE_TEXT_DIAGNOSIS   NA      pilocytic astrocytoma ii
+7316-105        TUMOR_FRACTION  NA      0.823344460708
+7316-1052       CANCER_TYPE_DETAILED    Diffuse midline glioma, H3 K28-mutant   Diffuse midline glioma, H3 K28-altered
+7316-1052       MOLECULAR_SUBTYPE       DMG, H3 K28     DMG, H3 K28, TP53
+7316-1062       CANCER_TYPE_DETAILED    Diffuse midline glioma, H3 K28-mutant   Diffuse midline glioma, H3 K28-altered
+7316-1068       CANCER_TYPE_DETAILED    Diffuse midline glioma, H3 K28-mutant   Diffuse midline glioma, H3 K28-altered
+7316-1072       CANCER_TYPE_DETAILED    Glial-neuronal tumor NOS        Glial-neuronal tumor,  To be classified
+7316-1072       BROAD_HISTOLOGY Low-grade astrocytic tumor      Neuronal and mixed neuronal-glial tumor
+7316-1072       CANCER_GROUP    Glial-neuronal tumor    Glial-neuronal tumor NOS
+```
 
-CHANGE SUMMARY:
+### STDOUT:
+```
+Sample CHANGE SUMMARY:
 27 Samples in build would be added to the portal: 1235928,1235929,1235930,1235931,1235932,1235933,1235934,1235935,1235936,1235937,1235938,1235939,1235940,1235941,1235981,1240110,1240112,1240114,1240116,1242273,1242274,1242276,1250775,1250776,1250777,1250778,1273223
 TUMOR_FRACTION has 1005 change(s)
-MOLECULAR_SUBTYPE has 488 change(s)
-TUMOR_PLOIDY has 403 change(s)
+CNS_REGION has 410 change(s)
 CANCER_TYPE_DETAILED has 847 change(s)
+MOLECULAR_SUBTYPE has 488 change(s)
+BROAD_HISTOLOGY has 390 change(s)
 CANCER_GROUP has 517 change(s)
+TUMOR_PLOIDY has 403 change(s)
 PATHOLOGY_FREE_TEXT_DIAGNOSIS has 507 change(s)
-BROAD_HISTOLOGY has 390 change(s)
-CNS_REGION has 410 change(s)
-CANCER_TYPE has 8 change(s)
 ONCOTREE_CODE has 7 change(s)
+CANCER_TYPE has 8 change(s)
 TUMOR_TYPE has 8 change(s)
 TUMOR_TISSUE_TYPE has 17 change(s)
 EXPERIMENT_STRATEGY has 1 change(s)
 SPECIMEN_ID has 1 change(s)
 CBTN_TUMOR_TYPE has 6 change(s)
 SAMPLE_TYPE has 2 change(s)
+
+Patient CHANGE SUMMARY:
+ETHNICITY has 358 change(s)
+CANCER_PREDISPOSITIONS has 29 change(s)
+GERMLINE_SEX_ESTIMATE has 220 change(s)
+AGE_IN_DAYS has 147 change(s)
+OS_MONTHS has 99 change(s)
+EFS_MONTHS has 60 change(s)
+EFS_STATUS has 18 change(s)
+SEX has 9 change(s)
+AGE has 6 change(s)
+OS_STATUS has 4 change(s)
 ```
\ No newline at end of file
diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py
index 82a618b..8c51b5b 100644
--- a/scripts/diff_studies.py
+++ b/scripts/diff_studies.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 Script to check a study on pedcbioportal for differences against a local build
 """
@@ -17,45 +18,38 @@ def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out):
     build_clinical_ids = set(build.keys())
     portal_only = sorted(portal_clinical_ids - build_clinical_ids)
     build_only = sorted(build_clinical_ids - portal_clinical_ids)
-    common_samp_ids = sorted(portal_clinical_ids & build_clinical_ids)
+    common_clinical_ids = sorted(portal_clinical_ids & build_clinical_ids)
     # gross attribute diffs
     portal_attr_only = list(portal_attr - build_attr)
     build_attr_only = list(build_attr - portal_attr)
     common_attr = list(portal_attr & build_attr)
     # focus on common samp and common attr, as "everything is different for x" is not that useful
-    print("Per " + clin_type + " changes:", file=out)
+    print(clin_type + "\tattribute\tbefore\tafter", file=out)
     attr_cts = {}
-    for samp_id in common_samp_ids:
+    for clinical_id in common_clinical_ids:
         for attr in common_attr:
             # portal will not have a value for that attr in the struct if none
-            portal_value = portal[samp_id].get(attr, "NA")
-            if portal_value != build[samp_id][attr]:
-                print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal_value, build[samp_id][attr]), file=out)
+            portal_value = portal[clinical_id].get(attr, "NA")
+            if portal_value != build[clinical_id][attr]:
+                print("{}\t{}\t{}\t{}".format(clinical_id, attr, portal_value, build[clinical_id][attr]), file=out)
                 if attr not in attr_cts:
                     attr_cts[attr] = 0
                 attr_cts[attr] += 1
 
-    print("CHANGE SUMMARY:", file=out)
+    # print change summary to STDOUT
+    print(clin_type +" CHANGE SUMMARY:")
     if len(portal_only) > 0:
-        print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)), file=out)
+        print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)))
     if len(build_only) > 0:
-        print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type,  ",".join(build_only)), file=out)
+        print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type,  ",".join(build_only)))
     if len(portal_attr_only) > 0:
-        print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only)), file=out)
+        print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only)))
     if len(build_attr_only) > 0:
-        print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only)), file=out)
+        print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only)))
     for attr in attr_cts:
-        print("{} has {} change(s)".format(attr, attr_cts[attr]), file=out)
-
-
-def split_sort_field(value, sep):
-    """
-    For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter.
-    Therefore, sort them so that when compared, no errors are triggered
-    """
-    value_list = value.split(sep)
-    value_list.sort()
-    return sep.join(value_list)
+        print("{} has {} change(s)".format(attr, attr_cts[attr]))
+    # Print extra newline for readability
+    print ("")
 
 
 def table_to_dict(in_file, key, aggr_list):
@@ -74,7 +68,6 @@ def table_to_dict(in_file, key, aggr_list):
                 for aggr in aggr_list:
                     if aggr in header:
                         aggr_head.append(header.index(aggr))
-
                 break
         data_dict = {}
         for entry in f:
@@ -82,9 +75,10 @@ def table_to_dict(in_file, key, aggr_list):
             # Replace empty string with NA as that is how the portal will return it
             data = ["NA" if d == "" else d for d in data]
             data_dict[data[primary]] = {}
-            # sort aggr fields
+            # Aggregate attributes are values joined by a separator; their order is not significant and may differ between sources
+            # Therefore, sort them so that an order-only difference is not reported as a change
             for i in aggr_head:
-                data[i] = split_sort_field(data[i], ";")
+                data[i] = ';'.join(sorted(data[i].split(';')))
             # two loops, for up until primary key, then after.
             for i in range(len(data)):
                 if i == primary: continue
@@ -114,6 +108,8 @@ def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list):
             data_dict[clinical_id] = {"PATIENT_ID": entry.patientId}
         value = entry.value
         attr_id = entry.clinicalAttributeId
+        # Aggregate attributes are values joined by a separator; their order is not significant and may differ between sources
+        # Therefore, sort them so that an order-only difference is not reported as a change
         if attr_id in aggr_list:
             value = ';'.join(sorted(value.split(';')))
         # "standardize" status field so that 0:LIVING = LIVING and 1:DECEASED = DECEASED
@@ -159,20 +155,20 @@ def main():
                                                 "validate_swagger_spec": False}
     )
     
-    # hardcode for now names of aggregate fields
+    # hardcode for now the names of the aggregate, implicit, and skip fields
     aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"]
+    portal_sample_attr_implicit = ['PATIENT_ID']
+    portal_patient_attr_skip = ['SAMPLE_COUNT']
+    portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT']
     # get attribute keys
     attr_key_obj = cbioportal.Clinical_Attributes.fetchClinicalAttributesUsingPOST(studyIds=[args.study], projection='ID').result()
     # gather sample-level metadata
     portal_sample_data = data_clinical_from_study(cbioportal, args.study, "SAMPLE", aggr_list)
     build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt", "SAMPLE_ID", aggr_list)
-    sample_diff_out = open('sample_portal_v_build.txt', 'w')
     portal_sample_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if not x.patientAttribute])
     # implicit attributes not returned by function that are required for sample view
-    portal_sample_attr_implicit = ['PATIENT_ID']
     portal_sample_attr_keys.update(portal_sample_attr_implicit)
     # drop attributes that are post-load portal-specific
-    portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT']
     portal_sample_attr_keys -= set(portal_sample_attr_skip)
     # sample-level diffs
     with open('sample_portal_v_build.txt', 'w') as sample_diff_out:
@@ -180,11 +176,9 @@ def main():
     # patient-level diffs
     portal_patient_data =  data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list)
     build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list)
-    patient_diff_out = open('patient_portal_v_build.txt', 'w')
     portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute])
-    portal_patient_attr_skip = ['SAMPLE_COUNT']
+    # drop attributes that are post-load portal-specific
     portal_patient_attr_keys -= set(portal_patient_attr_skip)
-
     with open('patient_portal_v_build.txt', 'w') as patient_diff_out:
         clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out)
 

From 92ebd146077bfcf95a4ca8f31e9b2713d3190fab Mon Sep 17 00:00:00 2001
From: Miguel Brown <miguel.a.brown@gmail.com>
Date: Thu, 29 Feb 2024 16:45:12 -0500
Subject: [PATCH 8/8] :hammer: make inputs more explicit

---
 scripts/diff_studies.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py
index 8c51b5b..3356454 100644
--- a/scripts/diff_studies.py
+++ b/scripts/diff_studies.py
@@ -127,13 +127,16 @@ def main():
         "-u", "--url", action="store", dest="url", help="url to search against", default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs"
     )
     parser.add_argument(
-        "-s", "--study", action="store", dest="study", help="Cancer study ID to compare on server"
+        "-c", "--study", action="store", dest="study", help="Cancer study ID to compare on server"
     )
     parser.add_argument(
         "-t", "--token", action="store", dest="token", help="Token file obtained from Web API"
     )
     parser.add_argument(
-        "-d", "--datasheet-dir", action="store", dest="data_dir", help="Directory containing data_clinical_*.txt"
+        "-s", "--datasheet-sample", action="store", dest="data_sample", help="File containing cBio-formatted sample metadata, typically named data_clinical_sample.txt"
+    )
+    parser.add_argument(
+        "-p", "--datasheet-patient", action="store", dest="data_patient", help="File containing cBio-formatted patient metadata, typically named data_clinical_patient.txt"
     )
 
     args = parser.parse_args()
@@ -164,7 +167,7 @@ def main():
     attr_key_obj = cbioportal.Clinical_Attributes.fetchClinicalAttributesUsingPOST(studyIds=[args.study], projection='ID').result()
     # gather sample-level metadata
     portal_sample_data = data_clinical_from_study(cbioportal, args.study, "SAMPLE", aggr_list)
-    build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt", "SAMPLE_ID", aggr_list)
+    build_sample_data, build_sample_attr_keys = table_to_dict(args.data_sample, "SAMPLE_ID", aggr_list)
     portal_sample_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if not x.patientAttribute])
     # implicit attributes not returned by function that are required for sample view
     portal_sample_attr_keys.update(portal_sample_attr_implicit)
@@ -175,7 +178,7 @@ def main():
         clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out)
     # patient-level diffs
     portal_patient_data =  data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list)
-    build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list)
+    build_patient_data, build_patient_attr_keys = table_to_dict(args.data_patient, "PATIENT_ID", aggr_list)
     portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute])
     # drop attributes that are post-load portal-specific
     portal_patient_attr_keys -= set(portal_patient_attr_skip)