diff --git a/.vs/AI-SDC/v16/.suo b/.vs/AI-SDC/v16/.suo
deleted file mode 100644
index 51ce64b8..00000000
Binary files a/.vs/AI-SDC/v16/.suo and /dev/null differ
diff --git a/.vs/ProjectSettings.json b/.vs/ProjectSettings.json
deleted file mode 100644
index f8b48885..00000000
--- a/.vs/ProjectSettings.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "CurrentProjectSetting": null
-}
\ No newline at end of file
diff --git a/.vs/PythonSettings.json b/.vs/PythonSettings.json
deleted file mode 100644
index 533603ce..00000000
--- a/.vs/PythonSettings.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "Interpreter": "Global|VisualStudio|dependencies AI-SDC"
-}
\ No newline at end of file
diff --git a/.vs/VSWorkspaceState.json b/.vs/VSWorkspaceState.json
deleted file mode 100644
index 72ba150c..00000000
--- a/.vs/VSWorkspaceState.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "ExpandedNodes": [
-    "",
-    "\\aisdc",
-    "\\aisdc\\data",
-    "\\aisdc\\experiments",
-    "\\aisdc\\preprocessing"
-  ],
-  "SelectedNode": "\\aisdc\\preprocessing\\loaders.py",
-  "PreviewInSolutionExplorer": false
-}
\ No newline at end of file
diff --git a/.vs/slnx.sqlite b/.vs/slnx.sqlite
deleted file mode 100644
index 981b28d7..00000000
Binary files a/.vs/slnx.sqlite and /dev/null differ
diff --git a/aisdc/preprocessing/loaders.py b/aisdc/preprocessing/loaders.py
index 7423efae..42a7678a 100644
--- a/aisdc/preprocessing/loaders.py
+++ b/aisdc/preprocessing/loaders.py
@@ -72,6 +72,7 @@ def get_data_sklearn(  # pylint: disable = too-many-branches
     medical-mnist-ab-v-br-500 (requires data download)
     medical-mnist-all-100 (requires data download)
     indian liver (requires data download)
+    texas hospitals 10 (requires data download)
     synth-ae (requires data download)
     synth-ae-small (requires data download)
     nursery (downloads automatically)
@@ -138,22 +139,16 @@ def get_data_sklearn(  # pylint: disable = too-many-branches
         )
     if dataset_name == "indian liver":
         return _indian_liver(data_folder)
+    if dataset_name == "texas hospitals 10":
+        return _texas_hospitals(data_folder)
     if dataset_name == "synth-ae":
         return _synth_ae(data_folder)
     if dataset_name == "synth-ae-small":
         return _synth_ae(data_folder, 200)
-    if dataset_name == "synth-ae-large":
-        return _synth_ae(data_folder, 500000)
-    if dataset_name == "synth-ae-extra-large":
-        return _synth_ae(data_folder, 2000000)
-    if dataset_name == "synth-ae-XXL":
-        return _synth_ae(data_folder, 50000000)
     if dataset_name == "nursery":
         return _nursery()
     if dataset_name == "iris":
         return _iris()
-    if dataset_name == "RDMP":
-        return _RDMP(data_folder)
 
     raise UnknownDataset(dataset_name)
@@ -278,17 +273,15 @@ def _synth_ae(
     https://data.england.nhs.uk/dataset/a-e-synthetic-data/resource/81b068e5-6501-4840-a880-a8e7aa56890e # pylint: disable=line-too-long.
     """
-    file_path = os.path.join(data_folder, "AE_England_synthetic.csv")
+    file_path = os.path.join(
+        data_folder, "AE_England_synthetic.csv"  # 'A&E Synthetic Data.csv'
+    )
     if not os.path.exists(file_path):
         help_message = f"""
 Data file {file_path} does not exist. Please download the file from:
 https://data.england.nhs.uk/dataset/a-e-synthetic-data/resource/81b068e5-6501-4840-a880-a8e7aa56890e
-
-Alternatively, download the file directly from the following URL:
-https://nhsengland-direct-uploads.s3-eu-west-1.amazonaws.com/A%26E+Synthetic+Data.7z
-
-Unzip it (7z) and then copy the .csv file into your data folder.
+Unzip it (7z) and then copy the .csv file into your data folder.
""" raise DataNotAvailable(help_message) @@ -382,7 +375,6 @@ def _in_hospital_mortality(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame # download, and where to store files = ["data01.csv", "doi_10.5061_dryad.0p2ngf1zd__v5.zip"] file_path = [os.path.join(data_folder, f) for f in files] - print(file_path) if not any( # pylint: disable=use-a-generator [os.path.exists(fp) for fp in file_path] @@ -391,9 +383,6 @@ def _in_hospital_mortality(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame Data file {file_path[0]} or {file_path[1]} does not exist. Please download the file from: https://datadryad.org/stash/dataset/doi:10.5061/dryad.0p2ngf1zd and place it in the correct folder. It works with either the zip file or uncompressed. -Alternatively download the data file from this URL: - https://datadryad.org/stash/downloads/file_stream/773992 -and then change the name of the file 773992 to data01.csv. """ raise DataNotAvailable(help_message) @@ -414,19 +403,17 @@ def _in_hospital_mortality(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame def _mimic_iaccd(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: - """Loads the mimic_iaccd data and performs pre-processing.""" + """Loads the mimic_iaccd data and performs Alba's pre-processing.""" # Check the data has been downloaded. If not throw an exception with instructions on how to # download, and where to store file_path = os.path.join(data_folder, "mimic2-iaccd", "1.0", "full_cohort_data.csv") - print(file_path, os.path.exists(file_path)) if not os.path.exists(file_path): help_message = f""" The MIMIC2-iaccd data is not available in {data_folder}. The following file should exist: {file_path}. - Please download from https://physionet.org/files/mimic2-iaccd/1.0/full_cohort_data.csv?download - and rename the file to full_cohort_data.csv. + Please download from https://physionet.org/content/mimic2-iaccd/1.0/full_cohort_data.csv """ raise DataNotAvailable(help_message) @@ -466,22 +453,208 @@ def _mimic_iaccd(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: return (X, y) +def _texas_hospitals( + data_folder: str, +) -> Tuple[pd.DataFrame, pd.DataFrame]: # pragma: no cover + # pylint: disable=too-many-statements, too-many-locals + """ + Texas Hospitals Dataset + (https://www.dshs.texas.gov/THCIC/Hospitals/Download.shtm). + + Note: this has been tested repeated in the GRAIMatter project. + However, for licensing reasons we cannot redistribute the data. + Therefore it is omitted from CI test coverage and metrics. + """ + file_list = [ + "PUDF-1Q2006-tab-delimited.zip", + "PUDF-1Q2007-tab-delimited.zip", + "PUDF-1Q2008-tab-delimited.zip", + "PUDF-1Q2009-tab-delimited.zip", + "PUDF-2Q2006-tab-delimited.zip", + "PUDF-2Q2007-tab-delimited.zip", + "PUDF-2Q2008-tab-delimited.zip", + "PUDF-2Q2009-tab-delimited.zip", + "PUDF-3Q2006-tab-delimited.zip", + "PUDF-3Q2007-tab-delimited.zip", + "PUDF-3Q2008-tab-delimited.zip", + "PUDF-3Q2009-tab-delimited.zip", + "PUDF-4Q2006-tab-delimited.zip", + "PUDF-4Q2007-tab-delimited.zip", + "PUDF-4Q2008-tab-delimited.zip", + "PUDF-4Q2009-tab-delimited.zip", + ] + + files_path = [os.path.join(data_folder, "TexasHospitals", f) for f in file_list] + + found = [os.path.exists(file_path) for file_path in files_path] + not_found = [file_path for file_path in files_path if not os.path.exists(file_path)] -def _RDMP( # pylint: disable=too-many-locals, too-many-statements + processed_data_file = "texas_data10_rm_binary.csv" + if not all(found): + help_message = f""" + Some or all data files do not exist. 
+    Please accept their terms & conditions, then download the
+    tab delimited files for each quarter of 2006-2009 from:
+    https://www.dshs.texas.gov/THCIC/Hospitals/Download.shtm
+    and place them in the correct folder.
+
+    Missing files are:
+    {not_found}
+    """
+        raise DataNotAvailable(help_message)
+
+    if not os.path.exists(
+        os.path.join(data_folder, "TexasHospitals", processed_data_file)
+    ):
+        logger.info("Processing Texas Hospitals data (2006-2009)")
+
+        # Load data
+        columns_names = [
+            "THCIC_ID",  # Provider ID. Unique identifier assigned to the provider by DSHS.
+            # Hospitals with fewer than 50 discharges have been aggregated into the
+            # Provider ID '999999'
+            "DISCHARGE_QTR",  # yyyyQm
+            "TYPE_OF_ADMISSION",
+            "SOURCE_OF_ADMISSION",
+            "PAT_ZIP",  # Patient’s five-digit ZIP code
+            "PUBLIC_HEALTH_REGION",  # Public Health Region of patient’s address
+            "PAT_STATUS",  # Code indicating patient status as of the ending date of service for
+            # the period of care reported
+            "SEX_CODE",
+            "RACE",
+            "ETHNICITY",
+            "LENGTH_OF_STAY",
+            "PAT_AGE",  # Code indicating age of patient in days or years on date of discharge.
+            "PRINC_DIAG_CODE",  # diagnosis code for the principal diagnosis
+            "E_CODE_1",  # external cause of injury
+            "PRINC_SURG_PROC_CODE",  # Code for the principal surgical or other procedure performed
+            # during the period covered by the bill
+            "RISK_MORTALITY",  # Assignment of a risk of mortality score from the All Patient
+            # Refined (APR) Diagnosis Related Group (DRG)
+            "ILLNESS_SEVERITY",  # Assignment of a severity of illness score from the All Patient
+            # Refined (APR) Diagnosis Related Group (DRG)
+            "RECORD_ID",
+        ]
+        # obtain the 10 most frequent procedures
+        tmp = []
+        for f in files_path:
+            df = [
+                pd.read_csv(
+                    ZipFile(f).open(i), sep="\t", usecols=["PRINC_SURG_PROC_CODE"]
+                )
+                for i in ZipFile(f).namelist()
+                if "base" in i
+            ][0]
+            df.dropna(inplace=True)
+            tmp.extend(list(df.PRINC_SURG_PROC_CODE))
+        princ_surg_proc_keep = [k for k, v in Counter(tmp).most_common(10)]
+        # remove unnecessary variables
+        del tmp
+
+        # Load the data
+        tx_data = pd.DataFrame()
+        for f in files_path:
+            df = [
+                pd.read_csv(ZipFile(f).open(i), sep="\t", usecols=columns_names)
+                for i in ZipFile(f).namelist()
+                if "base" in i
+            ][0]
+            # keep only those rows with one of the 10 most common principal surgical procedures
+            df = df[df["PRINC_SURG_PROC_CODE"].isin(princ_surg_proc_keep)]
+            # clean up data
+            df.dropna(inplace=True)
+            df.replace("`", pd.NA, inplace=True)
+            df.replace("*", pd.NA, inplace=True)
+            # recode sex as numeric
+            df.SEX_CODE.replace("M", 0, inplace=True)
+            df.SEX_CODE.replace("F", 1, inplace=True)
+            df.SEX_CODE.replace("U", 2, inplace=True)
+            # make DISCHARGE_QTR a numerical variable
+            for d_code in set(list(df.DISCHARGE_QTR)):
+                df.DISCHARGE_QTR.replace(
+                    d_code, "".join(d_code.split("Q")), inplace=True
+                )
+            df.dropna(inplace=True)
+            # merge data
+            tx_data = pd.concat([tx_data, df])
+        # remove unnecessary variables
+        del df
+
+        # Risk mortality, make it binary
+        # 1 Minor
+        # 2 Moderate
+        # 3 Major
+        # 4 Extreme
+        tx_data.RISK_MORTALITY = tx_data.RISK_MORTALITY.astype(int)
+        tx_data.RISK_MORTALITY.replace(1, 0, inplace=True)
+        tx_data.RISK_MORTALITY.replace(2, 0, inplace=True)
+        tx_data.RISK_MORTALITY.replace(3, 1, inplace=True)
+        tx_data.RISK_MORTALITY.replace(4, 1, inplace=True)
+
+        # renumber non-numerical codes for cols
+        cols = ["PRINC_DIAG_CODE", "SOURCE_OF_ADMISSION", "E_CODE_1"]
+        for col in cols:
+            tmp = list(
+                {
+                    x
+                    for x in tx_data[col]
+                    if not str(x).isdigit() and not isinstance(x, float)
+                }  # pylint: disable=consider-using-set-comprehension
+            )
+            n = max(
+                list(
+                    {
+                        int(x)
+                        for x in tx_data[col]
+                        if str(x).isdigit() or isinstance(x, float)
+                    }  # pylint: disable=consider-using-set-comprehension
+                )
+            )
+            for i, x in enumerate(tmp):
+                tx_data[col].replace(x, n + i, inplace=True)
+        del tmp, n
+        # set index
+        tx_data.set_index("RECORD_ID", inplace=True)
+        # final check and drop of NAs
+        tx_data.dropna(inplace=True)
+        # convert all data to numerical
+        tx_data = tx_data.astype(int)
+        # save csv file
+        tx_data.to_csv(os.path.join(data_folder, "TexasHospitals", processed_data_file))
+    else:
+        logger.info("Loading processed Texas Hospitals data (2006-2009) csv file.")
+        # load processed texas data csv file
+        tx_data = pd.read_csv(
+            os.path.join(data_folder, "TexasHospitals", processed_data_file)
+        )
+
+    # extract target
+    var = "RISK_MORTALITY"
+    labels = tx_data[var]
+    # Drop the column that contains the labels
+    tx_data.drop([var], axis=1, inplace=True)
+
+    label_encoder = LabelEncoder()
+    encoded_labels = label_encoder.fit_transform(labels.values)
+    labels = pd.DataFrame({var: encoded_labels})
+
+    return (tx_data, labels)
+
+def _RDMP( # pylint: disable=too-many-locals, too-many-statements
     data_folder: str,
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+
     def find_age(row):
-        date_ = pd.to_datetime("01/06/2020")
-        if row.date_of_death != row.date_of_death:
-            age = np.floor((date_ - row.date_of_birth).days / 365.25)
+        date_ = pd.to_datetime('01/06/2020')
+        if row.date_of_death!=row.date_of_death:
+            age = np.floor((date_-row.date_of_birth).days/365.25)
         else:
-            age = np.floor((row.date_of_death - row.date_of_birth).days / 365.25)
+            age = np.floor((row.date_of_death-row.date_of_birth).days/365.25)
         return age
 
     def hospital_days(row):
-        if row.DischargeDate == row.DischargeDate:
-            t = row.DischargeDate - row.AdmissionDate
-            days = t.days + round(((t.seconds / 60) / 60) / 24)
+        if row.DischargeDate==row.DischargeDate:
+            t = row.DischargeDate-row.AdmissionDate
+            days = t.days+round(((t.seconds/60)/60)/24)
         else:
             days = 0
         return days
@@ -490,20 +663,16 @@ def hospital_days(row):
     os.path.join(data_folder, "RDMP")
     if os.path.exists(os.path.join(data_folder, "RDMP", processed_data_file)):
         logger.info("Loading processed RDMP file.")
-        # load processed csv file
-        df = pd.read_csv(os.path.join(data_folder, "RDMP", processed_data_file))
+        # load processed RDMP csv file
+        df = pd.read_csv(
+            os.path.join(data_folder, "RDMP", processed_data_file)
+        )
     else:
         logger.info("Processing RDMP synthetic data")
-        file_list = [
-            "CarotidArteryScan.csv",
-            "Demography.csv",
-            "HospitalAdmissions.csv",
-        ]
+        file_list = ['CarotidArteryScan.csv', 'Demography.csv', 'HospitalAdmissions.csv']
         files_path = [os.path.join(data_folder, "RDMP", f) for f in file_list]
         found = [os.path.exists(file_path) for file_path in files_path]
-        not_found = [
-            file_path for file_path in files_path if not os.path.exists(file_path)
-        ]
+        not_found = [file_path for file_path in files_path if not os.path.exists(file_path)]
 
         if not all(found):
             help_message = f"""
@@ -513,148 +682,75 @@ def hospital_days(row):
             Missing files are:
             {not_found}
             """
-            raise DataNotAvailable(help_message)
+        raise DataNotAvailable(help_message)
 
         # Load data
-        headers0 = [
-            "R_CC_STEN_A",
-            "R_CC_STEN_B",
-            "R_CC_STEN_C",
-            "R_CC_STEN_D",
-            "R_CC_STEN_S",  # pylint: disable=unreachable
-            "L_IC_STEN_A",
-            "L_IC_STEN_B",
-            "L_IC_STEN_C",
-            "L_IC_STEN_D",
-            "L_IC_STEN_S",
-            "R_IC_STEN_A",
-            "R_IC_STEN_B",
-            "R_IC_STEN_C",
"R_IC_STEN_D", - "R_IC_STEN_S", - "PatientID", - "L_CC_STEN_S", - "L_CC_STEN_D", - "L_CC_STEN_B", - "L_BD_RATIO", - "L_AC_RATIO", - "R_BD_RATIO", - "R_AC_RATIO", - "L_CC_STENOSIS", - "L_CC_PEAK_SYS", - "L_CC_END_DIA", - "L_IC_STENOSIS", - "L_IC_PEAK_SYS", - "L_IC_END_DIA", - "L_EC_STENOSIS", - "L_PLAQUE", - "L_SYMPTOMS", - "L_BRUIT", - "L_CC_STEN_A", - "ON_STEN_STUDY", - "R_VERT_ARTERY", - "R_BRUIT", - "R_SYMPTOMS", - "R_PLAQUE", - "L_CC_STEN_C", - "R_EC_STENOSIS", - "R_IC_PEAK_SYS", - "R_IC_STENOSIS", - "R_CC_END_DIA", - "R_CC_PEAK_SYS", - "R_CC_STENOSIS", - "L_VERT_ARTERY", - "R_IC_END_DIA", - ] - headers1 = [ - "chi", - "sex", - "current_address_L2", - "date_of_death", - "date_of_birth", - ] - headers2 = [ - "chi", - "AdmissionDate", - "DischargeDate", - "MainCondition", - "OtherCondition1", - "OtherCondition2", - "OtherCondition3", - "MainOperation", - "MainOperationB", - "OtherOperation1", - "OtherOperation1B", - "OtherOperation2", - "OtherOperation2B", - "OtherOperation3", - "OtherOperation3B", - ] - - # Process first file - df = pd.read_csv(files_path[0], usecols=headers0, encoding='ISO 8859-1') - # Change name to be the same in all files - df.rename(columns={"PatientID": "chi"}, inplace=True) - df = df.groupby(["chi"]).max() - - # Process second file + headers0 = ['R_CC_STEN_A', 'R_CC_STEN_B', 'R_CC_STEN_C', 'R_CC_STEN_D', 'R_CC_STEN_S',# pylint: disable=unreachable + 'L_IC_STEN_A', 'L_IC_STEN_B', 'L_IC_STEN_C', 'L_IC_STEN_D', 'L_IC_STEN_S', 'R_IC_STEN_A', + 'R_IC_STEN_B', 'R_IC_STEN_C', 'R_IC_STEN_D', 'R_IC_STEN_S', 'PatientID','L_CC_STEN_S', + 'L_CC_STEN_D', 'L_CC_STEN_B', 'L_BD_RATIO', 'L_AC_RATIO', 'R_BD_RATIO', 'R_AC_RATIO', + 'L_CC_STENOSIS', 'L_CC_PEAK_SYS', 'L_CC_END_DIA', 'L_IC_STENOSIS','L_IC_PEAK_SYS', + 'L_IC_END_DIA', 'L_EC_STENOSIS', 'L_PLAQUE','L_SYMPTOMS', 'L_BRUIT', 'L_CC_STEN_A', + 'ON_STEN_STUDY', 'R_VERT_ARTERY', 'R_BRUIT', 'R_SYMPTOMS', 'R_PLAQUE','L_CC_STEN_C', + 'R_EC_STENOSIS', 'R_IC_PEAK_SYS', 'R_IC_STENOSIS', 'R_CC_END_DIA', 'R_CC_PEAK_SYS', + 'R_CC_STENOSIS', 'L_VERT_ARTERY', 'R_IC_END_DIA'] + headers1 = ['chi', 'sex', 'current_address_L2', 'date_of_death', 'date_of_birth'] + headers2 = ['chi','AdmissionDate', 'DischargeDate', 'MainCondition', + 'OtherCondition1', 'OtherCondition2', + 'OtherCondition3', + 'MainOperation', 'MainOperationB', 'OtherOperation1', 'OtherOperation1B', + 'OtherOperation2', + 'OtherOperation2B', 'OtherOperation3', 'OtherOperation3B'] + + #Process first file + df = pd.read_csv(files_path[0], usecols=headers0) + #Change name to be the same in all files + df.rename(columns={'PatientID':'chi'}, inplace=True) + df = df.groupby(['chi']).max() + + #Process second file df_ = pd.read_csv(files_path[1], usecols=headers1) - df_["date_of_birth"] = pd.to_datetime(df_["date_of_birth"]) - df_["date_of_death"] = pd.to_datetime(df_["date_of_death"]) - df_ = df_.groupby(["chi"]).max() + df_['date_of_birth']=pd.to_datetime(df_['date_of_birth']) + df_['date_of_death']=pd.to_datetime(df_['date_of_death']) + df_ = df_.groupby(['chi']).max() - # Merge first and second file - df = df.merge(df_, how="inner", on="chi", suffixes=(False, False)) + #Merge first and second file + df = df.merge(df_, how='inner', on='chi', suffixes=(False, False)) del df_ - # Process third file - df__ = pd.read_csv(files_path[2], usecols=headers2, encoding='ISO 8859-1') - df__["AdmissionDate"] = pd.to_datetime(df__["AdmissionDate"]) - df__["DischargeDate"] = pd.to_datetime(df__["DischargeDate"]) - df__["days_in_hospital"] = df__.apply(hospital_days, axis=1) - number_stays = 
df__.groupby(["chi"]).count()["AdmissionDate"] - dih = df__.groupby(["chi"])["days_in_hospital"].sum() - nc = ( - df__.groupby(["chi"])[[x for x in df__.columns if "Condition" in x]] - .count() - .mean(axis=1) - ) # pylint: disable=line-too-long - no = ( - df__.groupby(["chi"])[[x for x in df__.columns if "Operation" in x]] - .count() - .sum(axis=1) - ) # pylint: disable=line-too-long - df__.drop( - columns=[ - x - for x in df__.columns - if "Date" in x or "Operation" in x or "Condition" in x - ], - inplace=True, - ) # pylint: disable=line-too-long + #Process third file + df__ = pd.read_csv(files_path[2], usecols=headers2) + df__['AdmissionDate']=pd.to_datetime(df__['AdmissionDate']) + df__['DischargeDate']=pd.to_datetime(df__['DischargeDate']) + df__['days_in_hospital'] = df__.apply(hospital_days, axis=1) + number_stays = df__.groupby(['chi']).count()['AdmissionDate'] + dih = df__.groupby(['chi'])['days_in_hospital'].sum() + nc = df__.groupby(['chi'])[[x for x in df__.columns if 'Condition' in x]].count().mean(axis=1) # pylint: disable=line-too-long + no = df__.groupby(['chi'])[[x for x in df__.columns if 'Operation' in x]].count().sum(axis=1) # pylint: disable=line-too-long + df__.drop(columns=[x for x in df__.columns if 'Date' in x or 'Operation' in x or 'Condition' in x], inplace=True) # pylint: disable=line-too-long df__ = pd.DataFrame() - df__["days_in_hospital"] = dih - df__["average_number_conditions"] = nc - df__["total_number_operations"] = no - df__["number_admissions"] = number_stays + df__['days_in_hospital'] = dih + df__['average_number_conditions'] = nc + df__['total_number_operations'] = no + df__['number_admissions'] = number_stays - # merge the third file - df = df.merge(df__, how="inner", on="chi", suffixes=(False, False)) + #merge the third file + df = df.merge(df__, how='inner', on='chi', suffixes=(False, False)) - # Final processing after merging - df["death"] = [1 if x else 0 for x in pd.notna(df.date_of_death)] - df["age"] = df.apply(find_age, axis=1).astype("int64") - df.drop(columns=["date_of_birth", "date_of_death"], inplace=True) + #Final processing after merging + df['death'] = [1 if x else 0 for x in pd.notna(df.date_of_death)] + df['age'] = df.apply(find_age, axis=1).astype('int64') + df.drop(columns=['date_of_birth', 'date_of_death'], inplace=True) - # save the dataframe + #save the dataframe df.to_csv(os.path.join(data_folder, "RDMP", processed_data_file)) - labels = df["death"] - df.drop(columns=["death"], inplace=True) + labels = df['death'] + df.drop(columns=['death'], inplace=True) - # OneHotEncoder + #OneHotEncoder for col in df.columns: - if df[col].dtypes in ("bool", "object"): + if df[col].dtypes in ('bool', 'object'): encoder = LabelEncoder() df[col] = encoder.fit_transform(df[col].values)