diff --git a/.codecov.yml b/.codecov.yml index dcfbc6a8..247dbfbb 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -5,4 +5,5 @@ ignore: - "aisdc/safemodel/classifiers/new_model_template.py" - "aisdc/preprocessing" - "user_stories" + - "examples" ... diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index f976241a..e967f572 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -23,5 +23,5 @@ jobs: - name: pylint run: | - pylint -v --recursive=True aisdc.safemodel aisdc.attacks tests --fail-under 10 + pylint -v --recursive=True aisdc tests --fail-under 10 ... diff --git a/README.md b/README.md index 1f826951..effc4fc7 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ $ pip install aisdc[safemodel] ## Running -To run an example, simply execute the desired script or start up `jupyter notebook` and run one of the notebooks. For example, to run the LiRA example: +To run an example, simply execute the desired script. For example, to run LiRA: ``` $ python -m lira_attack_example diff --git a/aisdc/attacks/attack.py b/aisdc/attacks/attack.py index c15a1f10..ece859ba 100644 --- a/aisdc/attacks/attack.py +++ b/aisdc/attacks/attack.py @@ -9,14 +9,14 @@ class Attack: """Base (abstract) class to represent an attack.""" - def __init__(self): + def __init__(self) -> None: self.attack_config_json_file_name = None def attack(self, target: Target) -> None: """Run an attack.""" raise NotImplementedError - def __str__(self): + def __str__(self) -> str: """Return the string representation of an attack.""" raise NotImplementedError @@ -28,17 +28,16 @@ def _update_params_from_config_file(self) -> None: setattr(self, key, value) @classmethod - def _get_param_names(cls): + def _get_param_names(cls) -> list[str]: """Get parameter names.""" init_signature = inspect.signature(cls.__init__) - parameters = [ + return [ p.name for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD ] - return parameters - def get_params(self): + def get_params(self) -> dict: """Get parameters for this attack. 
Returns diff --git a/aisdc/attacks/attack_report_formatter.py b/aisdc/attacks/attack_report_formatter.py index 50cfd1e6..64f359a2 100644 --- a/aisdc/attacks/attack_report_formatter.py +++ b/aisdc/attacks/attack_report_formatter.py @@ -14,11 +14,11 @@ def cleanup_files_for_release( - move_into_artefacts, - copy_into_release, - release_dir="release_files", - artefacts_dir="training_artefacts", -): + move_into_artefacts: list[str], + copy_into_release: list[str], + release_dir: str = "release_files", + artefacts_dir: str = "training_artefacts", +) -> None: """Move files created during the release process into appropriate folders.""" if not os.path.exists(release_dir): os.makedirs(release_dir) @@ -42,7 +42,7 @@ def cleanup_files_for_release( class GenerateJSONModule: """Create and append to a JSON file.""" - def __init__(self, filename=None): + def __init__(self, filename: str | None = None) -> None: self.filename = filename if self.filename is None: @@ -57,16 +57,12 @@ def __init__(self, filename=None): with open(self.filename, "w", encoding="utf-8") as f: f.write("") - def add_attack_output(self, incoming_json, class_name): + def add_attack_output(self, incoming_json: dict, class_name: str) -> None: """Add a section of JSON to the file which is already open.""" # Read the contents of the file and then clear the file with open(self.filename, "r+", encoding="utf-8") as f: file_contents = f.read() - if file_contents != "": - file_data = json.loads(file_contents) - else: - file_data = {} - + file_data = json.loads(file_contents) if file_contents != "" else {} f.truncate(0) # Add the new JSON to the JSON that was in the file, and re-write @@ -79,11 +75,11 @@ def add_attack_output(self, incoming_json, class_name): file_data[class_name] = incoming_json json.dump(file_data, f) - def get_output_filename(self): + def get_output_filename(self) -> str: """Return the filename of the JSON file which has been created.""" return self.filename - def clean_file(self): + def clean_file(self) -> None: """Delete the file if it exists.""" if os.path.exists(self.filename): os.remove(self.filename) @@ -95,20 +91,20 @@ def clean_file(self): class AnalysisModule: """Wrapper module for metrics analysis modules.""" - def __init__(self): + def __init__(self) -> None: self.immediate_rejection = [] self.support_rejection = [] self.support_release = [] - def process_dict(self): + def process_dict(self) -> dict: """Produce a risk summary output based on analysis in this module.""" raise NotImplementedError() - def get_recommendation(self): + def get_recommendation(self) -> tuple: """Return the three recommendation buckets created by this module.""" return self.immediate_rejection, self.support_rejection, self.support_release - def __str__(self): + def __str__(self) -> str: """Return the string representation of an analysis module.""" raise NotImplementedError() @@ -116,23 +112,21 @@ def __str__(self): class FinalRecommendationModule(AnalysisModule): # pylint: disable=too-many-instance-attributes """Generate the first layer of a recommendation report.""" - def __init__(self, report: dict): + def __init__(self, report: dict) -> None: super().__init__() self.P_VAL_THRESH = 0.05 self.MEAN_AUC_THRESH = 0.65 - self.INSTANCE_MODEL_WEIGHTING_SCORE = 5 self.MIN_SAMPLES_LEAF_SCORE = 5 self.STATISTICALLY_SIGNIFICANT_SCORE = 2 self.MEAN_AUC_SCORE = 4 self.report = report - self.scores = [] self.reasons = [] - def _is_instance_based_model(self, instance_based_model_score): + def _is_instance_based_model(self, instance_based_model_score) 
-> bool: if "model_name" in self.report: if self.report["model_name"] == "SVC": self.scores.append(instance_based_model_score) @@ -146,7 +140,7 @@ def _is_instance_based_model(self, instance_based_model_score): return True return False - def _tree_min_samples_leaf(self, min_samples_leaf_score): + def _tree_min_samples_leaf(self, min_samples_leaf_score: int | float) -> None: # Find min samples per leaf requirement base_path = pathlib.Path(__file__).parents[1] risk_appetite_path = os.path.join(base_path, "safemodel", "rules.json") @@ -158,83 +152,87 @@ def _tree_min_samples_leaf(self, min_samples_leaf_score): rules = json_structure["DecisionTreeClassifier"]["rules"] for entry in rules: - if "keyword" in entry.keys() and entry["keyword"] == "min_samples_leaf": - if "operator" in entry.keys() and entry["operator"] == "min": - min_samples_leaf_appetite = entry["value"] - break - - if ("model_params" in self.report) and min_samples_leaf_appetite is not None: - if "min_samples_leaf" in self.report["model_params"]: - min_samples_leaf = self.report["model_params"]["min_samples_leaf"] - if min_samples_leaf < min_samples_leaf_appetite: - self.scores.append(min_samples_leaf_score) - - msg = "Min samples per leaf < " + str(min_samples_leaf_appetite) - self.reasons.append(msg) - self.support_rejection.append(msg) - else: - msg = "Min samples per leaf > " + str(min_samples_leaf_appetite) - self.support_release.append(msg) + if ( + "keyword" in entry + and entry["keyword"] == "min_samples_leaf" + and "operator" in entry + and entry["operator"] == "min" + ): + min_samples_leaf_appetite = entry["value"] + break + + if ( + ("model_params" in self.report) + and min_samples_leaf_appetite is not None + and "min_samples_leaf" in self.report["model_params"] + ): + min_samples_leaf = self.report["model_params"]["min_samples_leaf"] + if min_samples_leaf < min_samples_leaf_appetite: + self.scores.append(min_samples_leaf_score) + + msg = "Min samples per leaf < " + str(min_samples_leaf_appetite) + self.reasons.append(msg) + self.support_rejection.append(msg) + else: + msg = "Min samples per leaf > " + str(min_samples_leaf_appetite) + self.support_release.append(msg) def _statistically_significant_auc( - self, p_val_thresh, mean_auc_thresh, stat_sig_score, mean_auc_score - ): + self, + p_val_thresh: float, + mean_auc_thresh: float, + stat_sig_score: float, + mean_auc_score: float, + ) -> None: stat_sig_auc = [] - for k in self.report.keys(): - if isinstance(self.report[k], dict): - if "attack_experiment_logger" in self.report[k]: - for i in self.report[k]["attack_experiment_logger"][ + for k in self.report: + if ( + isinstance(self.report[k], dict) + and "attack_experiment_logger" in self.report[k] + ): + for i in self.report[k]["attack_experiment_logger"][ + "attack_instance_logger" + ]: + instance = self.report[k]["attack_experiment_logger"][ "attack_instance_logger" - ]: - instance = self.report[k]["attack_experiment_logger"][ - "attack_instance_logger" - ][i] - - auc_key = "P_HIGHER_AUC" - if ( - auc_key in instance.keys() - and instance[auc_key] < p_val_thresh - ): - stat_sig_auc.append(instance["AUC"]) - - n_instances = len( - self.report[k]["attack_experiment_logger"][ - "attack_instance_logger" - ] + ][i] + + auc_key = "P_HIGHER_AUC" + if auc_key in instance and instance[auc_key] < p_val_thresh: + stat_sig_auc.append(instance["AUC"]) + + n_instances = len( + self.report[k]["attack_experiment_logger"]["attack_instance_logger"] + ) + if len(stat_sig_auc) / n_instances > 0.1: + msg = ">10% AUC are statistically 
significant in experiment " + str( + k + ) + + self.scores.append(stat_sig_score) + self.reasons.append(msg) + self.support_rejection.append(msg) + else: + msg = "<10% AUC are statistically significant in experiment " + str( + k ) - if ( - len(stat_sig_auc) / n_instances > 0.1 - ): # > 10% of AUC are statistically significant - msg = ( - ">10% AUC are statistically significant in experiment " - + str(k) - ) - - self.scores.append(stat_sig_score) + self.support_release.append(msg) + + if len(stat_sig_auc) > 0: + mean = np.mean(np.array(stat_sig_auc)) + if mean > mean_auc_thresh: + msg = "Attack AUC > threshold of " + str(mean_auc_thresh) + msg = msg + " in experiment " + str(k) + + self.scores.append(mean_auc_score) self.reasons.append(msg) self.support_rejection.append(msg) else: - msg = ( - "<10% AUC are statistically significant in experiment " - + str(k) - ) + msg = "Attack AUC <= threshold of " + str(mean_auc_thresh) + msg = msg + " in experiment " + str(k) self.support_release.append(msg) - if len(stat_sig_auc) > 0: - mean = np.mean(np.array(stat_sig_auc)) - if mean > mean_auc_thresh: - msg = "Attack AUC > threshold of " + str(mean_auc_thresh) - msg = msg + " in experiment " + str(k) - - self.scores.append(mean_auc_score) - self.reasons.append(msg) - self.support_rejection.append(msg) - else: - msg = "Attack AUC <= threshold of " + str(mean_auc_thresh) - msg = msg + " in experiment " + str(k) - self.support_release.append(msg) - - def process_dict(self): + def process_dict(self) -> dict: """Return a dictionary summarising the metrics.""" self._tree_min_samples_leaf(self.MIN_SAMPLES_LEAF_SCORE) self._statistically_significant_auc( @@ -254,10 +252,9 @@ def process_dict(self): if self._is_instance_based_model(self.INSTANCE_MODEL_WEIGHTING_SCORE): summarised_score = self.INSTANCE_MODEL_WEIGHTING_SCORE - output = {} - return output + return {} - def __str__(self): + def __str__(self) -> str: """Return string representation of the final recommendation.""" return "Final Recommendation" @@ -265,7 +262,7 @@ def __str__(self): class SummariseUnivariateMetricsModule(AnalysisModule): """Summarise a set of chosen univariate metrics from the output dictionary.""" - def __init__(self, report: dict, metrics_list=None): + def __init__(self, report: dict, metrics_list=None) -> None: super().__init__() if metrics_list is None: @@ -274,31 +271,33 @@ def __init__(self, report: dict, metrics_list=None): self.report = report self.metrics_list = metrics_list - def process_dict(self): + def process_dict(self) -> dict: """Return a dictionary summarising the metrics.""" output_dict = {} - for k in self.report.keys(): - if isinstance(self.report[k], dict): - if "attack_experiment_logger" in self.report[k]: - metrics_dict = {m: [] for m in self.metrics_list} - for _, iteration_value in self.report[k][ - "attack_experiment_logger" - ]["attack_instance_logger"].items(): - for m in metrics_dict: - metrics_dict[m].append(iteration_value[m]) - output = {} - for m in self.metrics_list: - output[m] = { - "min": min(metrics_dict[m]), - "max": max(metrics_dict[m]), - "mean": np.mean(metrics_dict[m]), - "median": np.median(metrics_dict[m]), - } - output_dict[k] = output + for k in self.report: + if ( + isinstance(self.report[k], dict) + and "attack_experiment_logger" in self.report[k] + ): + metrics_dict = {m: [] for m in self.metrics_list} + for _, iteration_value in self.report[k]["attack_experiment_logger"][ + "attack_instance_logger" + ].items(): + for m in metrics_dict: + metrics_dict[m].append(iteration_value[m]) + 
output = {} + for m in self.metrics_list: + output[m] = { + "min": min(metrics_dict[m]), + "max": max(metrics_dict[m]), + "mean": np.mean(metrics_dict[m]), + "median": np.median(metrics_dict[m]), + } + output_dict[k] = output return output_dict - def __str__(self): + def __str__(self) -> str: """Return the string representation of a univariate metrics module.""" return "Summary of Univarite Metrics" @@ -306,7 +305,9 @@ def __str__(self): class SummariseAUCPvalsModule(AnalysisModule): """Summarise a list of AUC values.""" - def __init__(self, report: dict, p_thresh: float = 0.05, correction: str = "bh"): + def __init__( + self, report: dict, p_thresh: float = 0.05, correction: str = "bh" + ) -> None: super().__init__() self.report = report @@ -332,28 +333,29 @@ def _n_sig(self, p_val_list: list[float], correction: str = "none") -> int: def _get_metrics_list(self) -> list[float]: metrics_list = [] - for k in self.report.keys(): - if isinstance(self.report[k], dict): - if "attack_experiment_logger" in self.report[k]: - for _, iteration_value in self.report[k][ - "attack_experiment_logger" - ]["attack_instance_logger"].items(): - metrics_list.append(iteration_value["P_HIGHER_AUC"]) + for k in self.report: + if ( + isinstance(self.report[k], dict) + and "attack_experiment_logger" in self.report[k] + ): + for _, iteration_value in self.report[k]["attack_experiment_logger"][ + "attack_instance_logger" + ].items(): + metrics_list.append(iteration_value["P_HIGHER_AUC"]) return metrics_list - def process_dict(self): + def process_dict(self) -> dict: """Process the dict to summarise the number of significant AUC p-values.""" p_val_list = self._get_metrics_list() - output = { + return { "n_total": len(p_val_list), "p_thresh": self.p_thresh, "n_sig_uncorrected": self._n_sig(p_val_list), "correction": self.correction, "n_sig_corrected": self._n_sig(p_val_list, self.correction), } - return output - def __str__(self): + def __str__(self) -> str: """Return the string representation of a AUC p-values module.""" return f"Summary of AUC p-values at p = ({self.p_thresh})" @@ -366,10 +368,9 @@ def get_metric_list(self, input_dict: dict) -> list[float]: metric_list = [] for _, iteration_value in input_dict["attack_instance_logger"].items(): metric_list.append(iteration_value["PDIF01"]) - metric_list = [np.exp(-m) for m in metric_list] - return metric_list + return [np.exp(-m) for m in metric_list] - def __str__(self): + def __str__(self) -> str: """Return the string representation of a FDIF p-values module.""" return f"Summary of FDIF p-values at p = ({self.p_thresh})" @@ -377,61 +378,66 @@ def __str__(self): class LogLogROCModule(AnalysisModule): """Generate a log-log plot.""" - def __init__(self, report: dict, output_folder=None, include_mean=True): + def __init__( + self, report: dict, output_folder: str | None = None, include_mean: bool = True + ) -> None: super().__init__() self.report = report self.output_folder = output_folder self.include_mean = include_mean - def process_dict(self): + def process_dict(self) -> str: """Create a roc plot for multiple repetitions.""" log_plot_names = [] - for k in self.report.keys(): - if isinstance(self.report[k], dict): - if "attack_experiment_logger" in self.report[k]: - plt.figure(figsize=(8, 8)) - plt.plot([0, 1], [0, 1], "k--") + for k in self.report: + if ( + isinstance(self.report[k], dict) + and "attack_experiment_logger" in self.report[k] + ): + plt.figure(figsize=(8, 8)) + plt.plot([0, 1], [0, 1], "k--") + + # Compute average ROC + base_fpr = np.linspace(0, 1, 
1000) + metrics = self.report[k]["attack_experiment_logger"][ + "attack_instance_logger" + ].values() + all_tpr = np.zeros((len(metrics), len(base_fpr)), float) + + for i, metric_set in enumerate(metrics): + all_tpr[i, :] = np.interp( + base_fpr, metric_set["fpr"], metric_set["tpr"] + ) - # Compute average ROC - base_fpr = np.linspace(0, 1, 1000) - metrics = self.report[k]["attack_experiment_logger"][ - "attack_instance_logger" - ].values() - all_tpr = np.zeros((len(metrics), len(base_fpr)), float) - - for i, metric_set in enumerate(metrics): - all_tpr[i, :] = np.interp( - base_fpr, metric_set["fpr"], metric_set["tpr"] - ) - - for _, metric_set in enumerate(metrics): - plt.plot( - metric_set["fpr"], - metric_set["tpr"], - color="lightsalmon", - linewidth=0.5, - ) - - tpr_mu = all_tpr.mean(axis=0) - plt.plot(base_fpr, tpr_mu, "r") - - plt.xlabel("False Positive Rate") - plt.ylabel("True Positive Rate") - plt.xscale("log") - plt.yscale("log") - plt.tight_layout() - plt.grid() - out_file = f"{self.report['log_id']}-{self.report['metadata']['attack']}.png" - if self.output_folder is not None: - out_file = os.path.join(self.output_folder, out_file) - plt.savefig(out_file) - log_plot_names.append(out_file) - msg = "Log plot(s) saved to " + str(log_plot_names) - return msg - - def __str__(self): + for _, metric_set in enumerate(metrics): + plt.plot( + metric_set["fpr"], + metric_set["tpr"], + color="lightsalmon", + linewidth=0.5, + ) + + tpr_mu = all_tpr.mean(axis=0) + plt.plot(base_fpr, tpr_mu, "r") + + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.xscale("log") + plt.yscale("log") + plt.tight_layout() + plt.grid() + out_file = ( + f"{self.report['log_id']}-{self.report['metadata']['attack']}.png" + ) + if self.output_folder is not None: + out_file = os.path.join(self.output_folder, out_file) + plt.savefig(out_file) + log_plot_names.append(out_file) + return "Log plot(s) saved to " + str(log_plot_names) + + def __str__(self) -> str: """Return the string representation of a ROC log plot module.""" return "ROC Log Plot" @@ -439,7 +445,7 @@ def __str__(self): class GenerateTextReport: """Generate a text report from a JSON input.""" - def __init__(self): + def __init__(self) -> None: self.text_out = [] self.target_json_filename = None self.attack_json_filename = None @@ -449,7 +455,7 @@ def __init__(self): self.support_rejection = [] self.support_release = [] - def _process_target_json(self): + def _process_target_json(self) -> None: """Create a summary of a target model JSON file.""" model_params_of_interest = [ "C", @@ -469,25 +475,25 @@ def _process_target_json(self): output_string = "TARGET MODEL SUMMARY\n" - if "model_name" in json_report.keys(): + if "model_name" in json_report: output_string = ( output_string + "model_name: " + json_report["model_name"] + "\n" ) - if "n_samples" in json_report.keys(): + if "n_samples" in json_report: output_string = output_string + "number of samples used to train: " output_string = output_string + str(json_report["n_samples"]) + "\n" - if "model_params" in json_report.keys(): + if "model_params" in json_report: for param in model_params_of_interest: - if param in json_report["model_params"].keys(): + if param in json_report["model_params"]: output_string = output_string + param + ": " output_string = output_string + str( json_report["model_params"][param] ) output_string = output_string + "\n" - if "model_path" in json_report.keys(): + if "model_path" in json_report: filepath = 
os.path.split(os.path.abspath(self.target_json_filename))[0] self.model_name_from_target = os.path.join( filepath, json_report["model_path"] @@ -495,19 +501,17 @@ def _process_target_json(self): self.text_out.append(output_string) - def pretty_print(self, report: dict, title) -> str: + def pretty_print(self, report: dict, title: str) -> str: """Format JSON code to make it more readable for TREs.""" - returned_string = str(title) + "\n" - - for key in report.keys(): + returned_string = title + "\n" + for key in report: returned_string = returned_string + key + "\n" returned_string = returned_string + pprint.pformat(report[key]) + "\n\n" - return returned_string def process_attack_target_json( self, attack_filename: str, target_filename: str = None - ): + ) -> None: """Create a neat summary of an attack JSON file.""" self.attack_json_filename = attack_filename @@ -563,11 +567,11 @@ def process_attack_target_json( def export_to_file( # pylint: disable=too-many-arguments self, output_filename: str = "summary.txt", - move_files=False, - model_filename=None, - release_dir="release_files", - artefacts_dir="training_artefacts", - ): + move_files: bool = False, + model_filename: str | None = None, + release_dir: str = "release_files", + artefacts_dir: str = "training_artefacts", + ) -> None: """Take the input strings collected and combine into a neat text file.""" copy_of_text_out = self.text_out self.text_out = [] diff --git a/aisdc/attacks/attribute_attack.py b/aisdc/attacks/attribute_attack.py index b978adef..a74b05a1 100644 --- a/aisdc/attacks/attribute_attack.py +++ b/aisdc/attacks/attribute_attack.py @@ -67,7 +67,7 @@ def __init__( # pylint: disable = too-many-arguments self.attack_metrics: dict = {} self.metadata: dict = {} - def __str__(self): + def __str__(self) -> str: """Return the name of the attack.""" return "Attribute inference attack" @@ -155,7 +155,7 @@ def _get_inference_data( # pylint: disable=too-many-locals """Return a dataset of each sample with the attributes to test.""" attack_feature: dict = target.features[feature_id] indices: list[int] = attack_feature["indices"] - unique = np.unique(target.x_orig[:, feature_id]) + unique = np.unique(target.X_orig[:, feature_id]) n_unique: int = len(unique) if attack_feature["encoding"] == "onehot": onehot_enc = OneHotEncoder() @@ -165,12 +165,12 @@ def _get_inference_data( # pylint: disable=too-many-locals # which is only called for categorical data values = unique # samples after encoding (e.g. one-hot) - samples: np.ndarray = target.x_train + samples: np.ndarray = target.X_train # samples before encoding (e.g. 
str) - samples_orig: np.ndarray = target.x_train_orig + samples_orig: np.ndarray = target.X_train_orig if not memberset: - samples = target.x_test - samples_orig = target.x_test_orig + samples = target.X_test + samples_orig = target.X_test_orig n_samples, x_dim = np.shape(samples) x_values = np.zeros((n_samples, n_unique, x_dim), dtype=np.float64) y_values = target.model.predict(samples) @@ -205,7 +205,7 @@ def _infer( # pylint: disable=too-many-locals total: int = 0 # total number of inferences made x_values, y_values, baseline = _get_inference_data(target, feature_id, memberset) n_unique: int = len(x_values[1]) - samples = target.x_train if memberset else target.x_test + samples = target.X_train if memberset else target.X_test for i, x in enumerate(x_values): # each sample to perform inference on # get model confidence scores for all possible values for the sample confidence = target.model.predict_proba(x) @@ -417,8 +417,7 @@ def _attack_brute_force( logger.debug("Brute force attacking categorical features") args = [(target, feature_id, attack_threshold) for feature_id in features] with mp.Pool(processes=n_cpu) as pool: - results = pool.starmap(_infer_categorical, args) - return results + return pool.starmap(_infer_categorical, args) def _get_bounds_risk_for_sample( # pylint: disable=too-many-locals,too-many-arguments @@ -526,14 +525,14 @@ def _get_bounds_risk( target_model: BaseEstimator, feature_name: str, feature_id: int, - x_train: np.ndarray, - x_test: np.ndarray, + X_train: np.ndarray, + X_test: np.ndarray, ) -> dict: """Return a dict containing the dataset risks of a quantitative feature.""" risk: dict = { "name": feature_name, - "train": _get_bounds_risk_for_feature(target_model, feature_id, x_train), - "test": _get_bounds_risk_for_feature(target_model, feature_id, x_test), + "train": _get_bounds_risk_for_feature(target_model, feature_id, X_train), + "test": _get_bounds_risk_for_feature(target_model, feature_id, X_test), } return risk @@ -546,14 +545,13 @@ def _get_bounds_risks(target: Target, features: list[int], n_cpu: int) -> list[d target.model, target.features[feature_id]["name"], feature_id, - target.x_train, - target.x_test, + target.X_train, + target.X_test, ) for feature_id in features ] with mp.Pool(processes=n_cpu) as pool: - results = pool.starmap(_get_bounds_risk, args) - return results + return pool.starmap(_get_bounds_risk, args) def _attribute_inference(target: Target, n_cpu: int) -> dict: @@ -620,7 +618,7 @@ def create_aia_report(output: dict, name: str = "aia_report") -> FPDF: return pdf -def _run_attack_from_configfile(args): +def _run_attack_from_configfile(args: dict) -> None: """Run a command line attack based on saved files described in .json file.""" attack_obj = AttributeAttack( attack_config_json_file_name=str(args.attack_config_json_file_name), @@ -632,7 +630,7 @@ def _run_attack_from_configfile(args): attack_obj.make_report() -def main(): +def main() -> None: """Parse args and invoke relevant code.""" parser = argparse.ArgumentParser(add_help=False) diff --git a/aisdc/attacks/failfast.py b/aisdc/attacks/failfast.py index 390713ab..ac072f2a 100644 --- a/aisdc/attacks/failfast.py +++ b/aisdc/attacks/failfast.py @@ -13,7 +13,7 @@ class FailFast: a test. For the new test a new object will require to be instantiated. 
""" - def __init__(self, attack_obj: Any): + def __init__(self, attack_obj: Any) -> None: self.metric_name = attack_obj.attack_metric_success_name self.metric_success_thresh = attack_obj.attack_metric_success_thresh self.comp_type = attack_obj.attack_metric_success_comp_type @@ -64,7 +64,7 @@ def get_success_count(self) -> int: """Return a count of attack being successful.""" return self.success_count - def get_fail_count(self): + def get_fail_count(self) -> int: """Return a count of attack being not successful.""" return self.fail_count diff --git a/aisdc/attacks/likelihood_attack.py b/aisdc/attacks/likelihood_attack.py index fcbf22e0..23e07a43 100644 --- a/aisdc/attacks/likelihood_attack.py +++ b/aisdc/attacks/likelihood_attack.py @@ -1,6 +1,5 @@ """Likelihood testing scenario from https://arxiv.org/pdf/2112.03570.pdf.""" -# pylint: disable = invalid-name # pylint: disable = too-many-branches from __future__ import annotations @@ -37,13 +36,13 @@ class DummyClassifier: """A Dummy Classifier to allow this code to work with get_metrics.""" - def predict(self, test_X): + def predict(self, X_test): """Return an array of 1/0 depending on value in second column.""" - return 1 * (test_X[:, 1] > 0.5) + return 1 * (X_test[:, 1] > 0.5) - def predict_proba(self, test_X): - """Simply return the test_X.""" - return test_X + def predict_proba(self, X_test): + """Simply return the X_test.""" + return X_test def _logit(p: float) -> float: @@ -68,8 +67,7 @@ def _logit(p: float) -> float: if p > 1 - EPS: # pylint:disable=consider-using-min-builtin p = 1 - EPS p = max(p, EPS) - li = np.log(p / (1 - p)) - return li + return np.log(p / (1 - p)) class LIRAAttack(Attack): @@ -161,7 +159,7 @@ def __str__(self): def attack(self, target: Target) -> None: """Run a LiRA attack from a Target object and a target model. - Needs to have x_train, x_test, y_train and y_test set. + Needs to have X_train, X_test, y_train and y_test set. 
Parameters ---------- @@ -174,12 +172,12 @@ def attack(self, target: Target) -> None: self.run_scenario_from_preds( shadow_clf, - target.x_train, + target.X_train, target.y_train, - target.model.predict_proba(target.x_train), - target.x_test, + target.model.predict_proba(target.X_train), + target.X_test, target.y_test, - target.model.predict_proba(target.x_test), + target.model.predict_proba(target.X_test), ) def _check_and_update_dataset(self, target: Target) -> Target: @@ -210,8 +208,8 @@ def _check_and_update_dataset(self, target: Target) -> Target: ok_pos.append(i) y_test_new.append(classes.index(y)) - if len(y_test_new) != len(target.x_test): - target.x_test = target.x_test[ok_pos, :] + if len(y_test_new) != len(target.X_test): + target.X_test = target.X_test[ok_pos, :] target.y_test = np.array(y_test_new, int) logger.info( "new ytest has values and counts: %s", @@ -265,19 +263,19 @@ def run_scenario_from_preds( # pylint: disable = too-many-statements, too-many- Examples -------- >>> X, y = load_breast_cancer(return_X_y=True, as_frame=False) - >>> train_X, test_X, train_y, test_y = train_test_split( + >>> X_train, X_test, y_train, y_test = train_test_split( >>> X, y, test_size=0.5, stratify=y >>> ) >>> rf = RandomForestClassifier(min_samples_leaf=1, min_samples_split=2) - >>> rf.fit(train_X, train_y) + >>> rf.fit(X_train, y_train) >>> mia_test_probs, mia_test_labels, mia_clf = likelihood_scenario( >>> RandomForestClassifier(min_samples_leaf=1, min_samples_split=2, max_depth=10), - >>> train_X, - >>> train_y, - >>> rf.predict_proba(train_X), - >>> test_X, - >>> test_y, - >>> rf.predict_proba(test_X), + >>> X_train, + >>> y_train, + >>> rf.predict_proba(X_train), + >>> X_test, + >>> y_test, + >>> rf.predict_proba(X_test), >>> n_shadow_models=100 >>> ) """ @@ -288,7 +286,7 @@ def run_scenario_from_preds( # pylint: disable = too-many-statements, too-many- indices = np.arange(0, n_train_rows + n_shadow_rows, 1) # Combine taregt and shadow train, from which to sample datasets - combined_X_train = np.vstack((X_target_train, X_shadow_train)) + combined_x_train = np.vstack((X_target_train, X_shadow_train)) combined_y_train = np.hstack((y_target_train, y_shadow_train)) train_row_to_confidence = {i: [] for i in range(n_train_rows)} @@ -302,19 +300,18 @@ def run_scenario_from_preds( # pylint: disable = too-many-statements, too-many- # Pick the indices to use for training this one np.random.seed(model_idx) # Reproducibility these_idx = np.random.choice(indices, n_train_rows, replace=False) - temp_X_train = combined_X_train[these_idx, :] + temp_x_train = combined_x_train[these_idx, :] temp_y_train = combined_y_train[these_idx] # Fit the shadow model shadow_clf.set_params(random_state=model_idx) - shadow_clf.fit(temp_X_train, temp_y_train) + shadow_clf.fit(temp_x_train, temp_y_train) # map a class to a column class_map = {c: i for i, c in enumerate(shadow_clf.classes_)} # Get the predicted probabilities on the training data confidences = shadow_clf.predict_proba(X_target_train) - # print(f'shadow clf returned confidences with shape {confidences.shape}') these_idx = set(these_idx) for i in range(n_train_rows): @@ -394,10 +391,8 @@ def run_scenario_from_preds( # pylint: disable = too-many-statements, too-many- mia_scores = np.array(mia_scores) mia_labels = np.array(mia_labels) - y_pred_proba, y_test = metrics.get_probabilities( - mia_clf, mia_scores, mia_labels, permute_rows=True - ) - self.attack_metrics = [metrics.get_metrics(y_pred_proba, y_test)] + y_pred_proba = mia_clf.predict_proba(mia_scores) 
+ self.attack_metrics = [metrics.get_metrics(y_pred_proba, mia_labels)] def example(self) -> None: """Run an example attack using data from sklearn. @@ -405,19 +400,19 @@ def example(self) -> None: Generates example data, trains a classifier and tuns the attack """ X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split( + X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.5, stratify=y ) rf = RandomForestClassifier(min_samples_leaf=1, min_samples_split=2) - rf.fit(train_X, train_y) + rf.fit(X_train, y_train) self.run_scenario_from_preds( sklearn.base.clone(rf), - train_X, - train_y, - rf.predict_proba(train_X), - test_X, - test_y, - rf.predict_proba(test_X), + X_train, + y_train, + rf.predict_proba(X_train), + X_test, + y_test, + rf.predict_proba(X_test), ) def _construct_metadata(self) -> None: @@ -514,19 +509,19 @@ def setup_example_data(self) -> None: the attack from the command line. """ X, y = load_breast_cancer(return_X_y=True) - train_X, test_X, train_y, test_y = train_test_split( + X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.5, stratify=y ) rf = RandomForestClassifier(min_samples_split=2, min_samples_leaf=1) - rf.fit(train_X, train_y) - train_data = np.hstack((train_X, train_y[:, None])) + rf.fit(X_train, y_train) + train_data = np.hstack((X_train, y_train[:, None])) np.savetxt("train_data.csv", train_data, delimiter=",") - test_data = np.hstack((test_X, test_y[:, None])) + test_data = np.hstack((X_test, y_test[:, None])) np.savetxt("test_data.csv", test_data, delimiter=",") - train_preds = rf.predict_proba(train_X) - test_preds = rf.predict_proba(test_X) + train_preds = rf.predict_proba(X_train) + test_preds = rf.predict_proba(X_test) np.savetxt("train_preds.csv", train_preds, delimiter=",") np.savetxt("test_preds.csv", test_preds, delimiter=",") @@ -547,23 +542,23 @@ def attack_from_config(self) -> None: # pylint: disable = too-many-locals logger = logging.getLogger("run-attack") logger.info("Loading training data csv from %s", self.training_data_filename) training_data = np.loadtxt(self.training_data_filename, delimiter=",") - train_X = training_data[:, :-1] - train_y = training_data[:, -1].flatten().astype(int) - logger.info("Loaded %d rows", len(train_X)) + X_train = training_data[:, :-1] + y_train = training_data[:, -1].flatten().astype(int) + logger.info("Loaded %d rows", len(X_train)) logger.info("Loading test data csv from %s", self.test_data_filename) test_data = np.loadtxt(self.test_data_filename, delimiter=",") - test_X = test_data[:, :-1] - test_y = test_data[:, -1].flatten().astype(int) - logger.info("Loaded %d rows", len(test_X)) + X_test = test_data[:, :-1] + y_test = test_data[:, -1].flatten().astype(int) + logger.info("Loaded %d rows", len(X_test)) logger.info("Loading train predictions form %s", self.training_preds_filename) train_preds = np.loadtxt(self.training_preds_filename, delimiter=",") - assert len(train_preds) == len(train_X) + assert len(train_preds) == len(X_train) logger.info("Loading test predictions form %s", self.test_preds_filename) test_preds = np.loadtxt(self.test_preds_filename, delimiter=",") - assert len(test_preds) == len(test_X) + assert len(test_preds) == len(X_test) if self.target_model is None: raise ValueError("Target model cannot be None") if self.target_model_hyp is None: @@ -575,7 +570,7 @@ def attack_from_config(self) -> None: # pylint: disable = too-many-locals clf = clf_class(**clf_params) logger.info("Created model: %s", 
str(clf)) self.run_scenario_from_preds( - clf, train_X, train_y, train_preds, test_X, test_y, test_preds + clf, X_train, y_train, train_preds, X_test, y_test, test_preds ) logger.info("Computing metrics") @@ -610,7 +605,6 @@ def _example(args): def _run_attack(args): """Run a command line attack based on saved files described in .json file.""" - # attack_obj = LIRAAttack(**args.__dict__) attack_obj = LIRAAttack( n_shadow_models=args.n_shadow_models, n_shadow_rows_confidences_min=args.n_shadow_rows_confidences_min, diff --git a/aisdc/attacks/multiple_attacks.py b/aisdc/attacks/multiple_attacks.py index 48e4191c..c192fd5b 100644 --- a/aisdc/attacks/multiple_attacks.py +++ b/aisdc/attacks/multiple_attacks.py @@ -19,22 +19,18 @@ class MultipleAttacks(Attack): """Wrap the MIA and AIA attack codes.""" - def __init__( - self, - config_filename: str = None, - ) -> None: - super().__init__() - self.config_filename = config_filename - """Construct an object to execute a worst case attack. + def __init__(self, config_filename: str = None) -> None: + """Construct an object to execute multiple attacks. Parameters ---------- config_filename : str - name of the configuration file which has configurations in a single - JSON file to support running multiple attacks. + Name of a JSON file containing attack configurations. """ + super().__init__() + self.config_filename = config_filename - def __str__(self): + def __str__(self) -> None: """Return the name of the attack.""" return "Multiple Attacks (MIA and AIA) given configurations" @@ -44,8 +40,8 @@ def attack(self, target: Target) -> None: Parameters ---------- target : attacks.target.Target - target as an instance of the Target class. Needs to have x_train, - x_test, y_train and y_test set. + Target as an instance of the Target class. Needs to have X_train, + X_test, y_train and y_test set. 
""" logger = logging.getLogger("attack-multiple attacks") logger.info("Running attacks") @@ -84,10 +80,7 @@ def attack(self, target: Target) -> None: class ConfigFile: """Create a single JSON configuration file.""" - def __init__( - self, - filename: str = None, - ) -> None: + def __init__(self, filename: str = None) -> None: self.filename = filename dirname = os.path.normpath(os.path.dirname(self.filename)) @@ -117,15 +110,11 @@ def read_config_file(self) -> dict: """Read a JSON config file and return dict with configuration objects.""" with open(self.filename, encoding="utf-8") as f: file_contents = f.read() - if file_contents != "": - config_file_data = json.loads(file_contents) - else: - config_file_data = {} - return config_file_data + return json.loads(file_contents) if file_contents != "" else {} -def _run_attack_from_configfile(args): - """Run a command line attack based on saved files described in .json file.""" +def _run_attack_from_configfile(args) -> None: + """Run a command line attack based on saved files described in a JSON file.""" attack_obj = MultipleAttacks( config_filename=str(args.config_filename), ) @@ -134,7 +123,7 @@ def _run_attack_from_configfile(args): attack_obj.attack(target) -def main(): +def main() -> None: """Parse args and invoke relevant code.""" parser = argparse.ArgumentParser(add_help=False) diff --git a/aisdc/attacks/report.py b/aisdc/attacks/report.py index 043154d4..3712fdc5 100644 --- a/aisdc/attacks/report.py +++ b/aisdc/attacks/report.py @@ -3,6 +3,7 @@ import abc import json import os +from typing import Any import numpy as np import pylab as plt @@ -79,7 +80,7 @@ class NumpyArrayEncoder(json.JSONEncoder): """Json encoder that can cope with numpy arrays.""" - def default(self, o): + def default(self, o: Any): """If an object is an np.ndarray, convert to list.""" if isinstance(o, np.ndarray): return o.tolist() @@ -92,7 +93,9 @@ def default(self, o): return json.JSONEncoder.default(self, o) -def _write_dict(pdf, input_dict, indent=0, border=BORDER): +def _write_dict( + pdf: FPDF, input_dict: dict, indent: int = 0, border: int = BORDER +) -> None: """Write a dictionary to the pdf.""" for key, value in input_dict.items(): pdf.set_font("arial", "B", 14) @@ -103,7 +106,13 @@ def _write_dict(pdf, input_dict, indent=0, border=BORDER): pdf.ln(h=5) -def title(pdf, text, border=BORDER, font_size=24, font_style="B"): +def title( + pdf: FPDF, + text: str, + border: int = BORDER, + font_size: int = 24, + font_style: str = "B", +) -> None: """Write a title block.""" pdf.set_font("arial", font_style, font_size) pdf.ln(h=5) @@ -111,14 +120,29 @@ def title(pdf, text, border=BORDER, font_size=24, font_style="B"): pdf.ln(h=5) -def subtitle(pdf, text, indent=10, border=BORDER, font_size=12, font_style="B"): # pylint: disable = too-many-arguments +def subtitle( # pylint: disable = too-many-arguments + pdf: FPDF, + text: str, + indent: int = 10, + border: int = BORDER, + font_size: int = 12, + font_style: str = "B", +) -> None: """Write a subtitle block.""" pdf.cell(indent, border=border) pdf.set_font("arial", font_style, font_size) pdf.cell(75, 10, text, border, 1) -def line(pdf, text, indent=0, border=BORDER, font_size=11, font_style="", font="arial"): # pylint: disable = too-many-arguments +def line( # pylint: disable = too-many-arguments + pdf: FPDF, + text: str, + indent: int = 0, + border: int = BORDER, + font_size: int = 11, + font_style: str = "", + font: str = "arial", +) -> None: """Write a standard block.""" if indent > 0: pdf.cell(indent, border=border) @@ 
-126,7 +150,7 @@ def line(pdf, text, indent=0, border=BORDER, font_size=11, font_style="", font=" pdf.multi_cell(0, 5, text, border, 1) -def _roc_plot_single(metrics, save_name): +def _roc_plot_single(metrics: dict, save_name: str) -> None: """Create a roc_plot for a single experiment.""" plt.figure() plt.plot([0, 1], [0, 1], "k--") @@ -140,18 +164,14 @@ def _roc_plot_single(metrics, save_name): plt.savefig(save_name) -def _roc_plot(metrics, dummy_metrics, save_name): +def _roc_plot(metrics: dict, dummy_metrics: list, save_name: str) -> None: """Create a roc plot for multiple repetitions.""" plt.figure() plt.plot([0, 1], [0, 1], "k--") - if dummy_metrics is None or len(dummy_metrics) == 0: - do_dummy = False - else: - do_dummy = True + do_dummy = bool(dummy_metrics) # Compute average ROC base_fpr = np.linspace(0, 1, 1000) - # base_fpr = np.logspace(-4, 0, 1000) all_tpr = np.zeros((len(metrics), len(base_fpr)), float) for i, metric_set in enumerate(metrics): all_tpr[i, :] = np.interp(base_fpr, metric_set["fpr"], metric_set["tpr"]) @@ -199,23 +219,21 @@ def create_mia_report(attack_output: dict) -> FPDF: Parameters ---------- attack_output : dict - dictionary with following items - - metadata: dict - dictionary of metadata + Dictionary with the following items: - attack_experiment_logger: dict - list of metrics as dictionary items for an experiment - - dummy_attack_experiment_logger: dict - list of metrics as dictionary items across dummy experiments + metadata : dict + Dictionary of metadata. + attack_experiment_logger : dict + List of metrics as dictionary items for an experiment. + dummy_attack_experiment_logger : dict + List of metrics as dictionary items across dummy experiments. Returns ------- pdf : fpdf.FPDF fpdf document object """ - # dummy_metrics = attack_output["dummy_attack_metrics"] + do_dummy = False dummy_metrics = [] mia_metrics = [ v @@ -223,12 +241,7 @@ def create_mia_report(attack_output: dict) -> FPDF: "attack_instance_logger" ].items() ] - # mia_metrics = attack_output["attack_metrics"] metadata = attack_output["metadata"] - if dummy_metrics is None or len(dummy_metrics) == 0: - do_dummy = False - else: - do_dummy = True dest_log_roc = ( os.path.join( @@ -332,7 +345,7 @@ def add_output_to_pdf(report_dest: str, pdf_report: FPDF, attack_type: str) -> N os.remove(path) -def _add_log_roc_to_page(log_roc: str = None, pdf_obj: FPDF = None): +def _add_log_roc_to_page(log_roc: str = None, pdf_obj: FPDF = None) -> None: if log_roc is not None: pdf_obj.add_page() subtitle(pdf_obj, "Log ROC") @@ -340,7 +353,7 @@ def _add_log_roc_to_page(log_roc: str = None, pdf_obj: FPDF = None): pdf_obj.set_font("arial", "", 12) -def create_json_report(output): +def create_json_report(output: dict) -> None: """Create a report in json format for injestion by other tools.""" # Initial work, just dump mia_metrics and dummy_metrics into a json structure return json.dumps(output, cls=NumpyArrayEncoder) @@ -352,16 +365,15 @@ def create_lr_report(output: dict) -> FPDF: Parameters ---------- output : dict - dictionary with following items + Dictionary with the following items: - metadata: dict - dictionary of metadata + metadata : dict + Dictionary of metadata. - attack_experiment_logger: dict - list of metrics as dictionary items for an experiments - In case of LIRA attack scenario, this will have dictionary - items of attack_instance_logger that - will have a single metrics dictionary + attack_experiment_logger : dict + List of metrics as dictionary items for an experiments. 
+ In case of LiRA attack scenario, this will have dictionary items of + `attack_instance_logger` that will have a single metrics dictionary. Returns ------- @@ -396,9 +408,8 @@ def create_lr_report(output: dict) -> FPDF: key: val for key, val in mia_metrics.items() if isinstance(val, float) } for key, value in sub_metrics_dict.items(): - if key in MAPPINGS: - value = MAPPINGS[key](value) - line(pdf, f"{key:>30s}: {value:.4f}", font="courier") + val = MAPPINGS[key](value) if key in MAPPINGS else value + line(pdf, f"{key:>30s}: {val:.4f}", font="courier") pdf.add_page() subtitle(pdf, "ROC Curve") diff --git a/aisdc/attacks/structural_attack.py b/aisdc/attacks/structural_attack.py index a9f26380..f44ec5f0 100644 --- a/aisdc/attacks/structural_attack.py +++ b/aisdc/attacks/structural_attack.py @@ -51,7 +51,7 @@ def get_unnecessary_risk(model: BaseEstimator) -> bool: Notes ----- - Returns 1 if high risk, otherwise 0. + Returns True if high risk, otherwise False. """ if not isinstance( model, (DecisionTreeClassifier, RandomForestClassifier, XGBClassifier) @@ -205,19 +205,23 @@ def __init__( # pylint: disable = too-many-arguments attack_config_json_file_name: str = None, risk_appetite_config: str = "default", target_path: str = None, - output_dir="outputs_structural", - report_name="report_structural", + output_dir: str = "outputs_structural", + report_name: str = "report_structural", ) -> None: """Construct an object to execute a structural attack. Parameters ---------- - report_name : str - name of the pdf and json output reports - target_path : str - path to the saved trained target model and target data + attack_config_json_file_name : str + Name of a JSON file containing an attack configuration. risk_appetite_config : str - path to yaml file specifying TRE risk appetite + Path to yaml file specifying TRE risk appetite. + target_path : str + Path to the saved trained target model and target data. + output_dir : str + Name of a directory to write outputs. + report_name : str + Name of the pdf and json output reports. 
""" super().__init__() logger = logging.getLogger("structural_attack") @@ -257,7 +261,7 @@ def __init__( # pylint: disable = too-many-arguments self.output_dir = output_dir self.report_name = report_name - def __str__(self): + def __str__(self) -> str: """Return the name of the attack.""" return "Structural attack" @@ -280,7 +284,7 @@ def attack(self, target: Target) -> None: raise NotImplementedError(errstr) # get proba values for training data - x = self.target.x_train + x = self.target.X_train y = self.target.y_train assert x.shape[0] == len(y), "length mismatch between trainx and trainy" self.yprobs = self.target.model.predict_proba(x) @@ -300,7 +304,7 @@ def attack(self, target: Target) -> None: # now assess the risk # Degrees of Freedom n_params = get_model_param_count(target.model) - residual_dof = self.target.x_train.shape[0] - n_params + residual_dof = self.target.X_train.shape[0] - n_params self.DoF_risk = 1 if residual_dof < self.DOF_THRESHOLD else 0 # k-anonymity @@ -320,7 +324,7 @@ def attack(self, target: Target) -> None: def dt_get_equivalence_classes(self) -> tuple: """Get details of equivalence classes based on white box inspection.""" - destinations = self.target.model.apply(self.target.x_train) + destinations = self.target.model.apply(self.target.X_train) ret_tuple = np.unique(destinations, return_counts=True) leaves = ret_tuple[0] counts = ret_tuple[1] @@ -332,7 +336,7 @@ def dt_get_equivalence_classes(self) -> tuple: equiv_classes = np.zeros((len(leaves), self.target.model.n_classes_)) for group in range(len(leaves)): sample_id = members[group][0] - sample = self.target.x_train[sample_id] + sample = self.target.X_train[sample_id] proba = self.target.model.predict_proba(sample.reshape(1, -1)) equiv_classes[group] = proba return [equiv_classes, counts, members] @@ -408,7 +412,7 @@ def make_report(self) -> dict: return output -def _run_attack(args): +def _run_attack(args) -> None: """Initialise class and run attack.""" attack_obj = StructuralAttack( risk_appetite_config=args.risk_appetite_config, @@ -423,7 +427,7 @@ def _run_attack(args): _ = attack_obj.make_report() -def _run_attack_from_configfile(args): +def _run_attack_from_configfile(args) -> None: """Initialise class and run attack using config file.""" attack_obj = StructuralAttack( attack_config_json_file_name=str(args.attack_config_json_file_name), @@ -435,7 +439,7 @@ def _run_attack_from_configfile(args): _ = attack_obj.make_report() -def main(): +def main() -> None: """Parse arguments and invoke relevant method.""" logger = logging.getLogger("main") parser = argparse.ArgumentParser(description="Perform a structural attack") diff --git a/aisdc/attacks/target.py b/aisdc/attacks/target.py index 71f88180..0f3e977e 100644 --- a/aisdc/attacks/target.py +++ b/aisdc/attacks/target.py @@ -35,11 +35,11 @@ def __init__(self, model: sklearn.base.BaseEstimator | None = None) -> None: The name of the dataset. n_samples : int The total number of samples in the dataset. - x_train : np.ndarray + X_train : np.ndarray The (processed) training inputs. y_train : np.ndarray The (processed) training outputs. - x_test : np.ndarray + X_test : np.ndarray The (processed) testing inputs. y_test : np.ndarray The (processed) testing outputs. @@ -47,15 +47,15 @@ def __init__(self, model: sklearn.base.BaseEstimator | None = None) -> None: Dictionary describing the dataset features. n_features : int The total number of features. - x_orig : np.ndarray + X_orig : np.ndarray The original (unprocessed) dataset inputs. 
y_orig : np.ndarray The original (unprocessed) dataset outputs. - x_train_orig : np.ndarray + X_train_orig : np.ndarray The original (unprocessed) training inputs. y_train_orig : np.ndarray The original (unprocessed) training outputs. - x_test_orig : np.ndarray + X_test_orig : np.ndarray The original (unprocessed) testing inputs. y_test_orig : np.ndarray The original (unprocessed) testing outputs. @@ -68,17 +68,17 @@ def __init__(self, model: sklearn.base.BaseEstimator | None = None) -> None: """ self.name: str = "" self.n_samples: int = 0 - self.x_train: np.ndarray + self.X_train: np.ndarray self.y_train: np.ndarray - self.x_test: np.ndarray + self.X_test: np.ndarray self.y_test: np.ndarray self.features: dict = {} self.n_features: int = 0 - self.x_orig: np.ndarray + self.X_orig: np.ndarray self.y_orig: np.ndarray - self.x_train_orig: np.ndarray + self.X_train_orig: np.ndarray self.y_train_orig: np.ndarray - self.x_test_orig: np.ndarray + self.X_test_orig: np.ndarray self.y_test_orig: np.ndarray self.n_samples_orig: int = 0 self.model: sklearn.base.BaseEstimator | None = model @@ -86,17 +86,17 @@ def __init__(self, model: sklearn.base.BaseEstimator | None = None) -> None: def add_processed_data( self, - x_train: np.ndarray, + X_train: np.ndarray, y_train: np.ndarray, - x_test: np.ndarray, + X_test: np.ndarray, y_test: np.ndarray, ) -> None: """Add a processed and split dataset.""" - self.x_train = x_train + self.X_train = X_train self.y_train = np.array(y_train, int) - self.x_test = x_test + self.X_test = X_test self.y_test = np.array(y_test, int) - self.n_samples = len(x_train) + len(x_test) + self.n_samples = len(X_train) + len(X_test) def add_feature(self, name: str, indices: list[int], encoding: str) -> None: """Add a feature description to the data dictionary.""" @@ -110,21 +110,21 @@ def add_feature(self, name: str, indices: list[int], encoding: str) -> None: def add_raw_data( # pylint: disable=too-many-arguments self, - x_orig: np.ndarray, + X_orig: np.ndarray, y_orig: np.ndarray, - x_train_orig: np.ndarray, + X_train_orig: np.ndarray, y_train_orig: np.ndarray, - x_test_orig: np.ndarray, + X_test_orig: np.ndarray, y_test_orig: np.ndarray, ) -> None: """Add original unprocessed dataset.""" - self.x_orig = x_orig + self.X_orig = X_orig self.y_orig = y_orig - self.x_train_orig = x_train_orig + self.X_train_orig = X_train_orig self.y_train_orig = y_train_orig - self.x_test_orig = x_test_orig + self.X_test_orig = X_test_orig self.y_test_orig = y_test_orig - self.n_samples_orig = len(x_orig) + self.n_samples_orig = len(X_orig) def __save_model(self, path: str, ext: str, target: dict) -> None: """Save the target model. @@ -220,15 +220,15 @@ def __save_data(self, path: str, target: dict) -> None: target : dict Target class as a dictionary for writing JSON. 
""" - self.__save_numpy(path, target, "x_train") + self.__save_numpy(path, target, "X_train") self.__save_numpy(path, target, "y_train") - self.__save_numpy(path, target, "x_test") + self.__save_numpy(path, target, "X_test") self.__save_numpy(path, target, "y_test") - self.__save_numpy(path, target, "x_orig") + self.__save_numpy(path, target, "X_orig") self.__save_numpy(path, target, "y_orig") - self.__save_numpy(path, target, "x_train_orig") + self.__save_numpy(path, target, "X_train_orig") self.__save_numpy(path, target, "y_train_orig") - self.__save_numpy(path, target, "x_test_orig") + self.__save_numpy(path, target, "X_test_orig") self.__save_numpy(path, target, "y_test_orig") def __load_data(self, path: str, target: dict) -> None: @@ -241,15 +241,15 @@ def __load_data(self, path: str, target: dict) -> None: target : dict Target class as a dictionary read from JSON. """ - self.__load_numpy(path, target, "x_train") + self.__load_numpy(path, target, "X_train") self.__load_numpy(path, target, "y_train") - self.__load_numpy(path, target, "x_test") + self.__load_numpy(path, target, "X_test") self.__load_numpy(path, target, "y_test") - self.__load_numpy(path, target, "x_orig") + self.__load_numpy(path, target, "X_orig") self.__load_numpy(path, target, "y_orig") - self.__load_numpy(path, target, "x_train_orig") + self.__load_numpy(path, target, "X_train_orig") self.__load_numpy(path, target, "y_train_orig") - self.__load_numpy(path, target, "x_test_orig") + self.__load_numpy(path, target, "X_test_orig") self.__load_numpy(path, target, "y_test_orig") def __ge(self) -> str: @@ -262,14 +262,14 @@ def __ge(self) -> str: """ if ( hasattr(self.model, "score") - and hasattr(self, "x_train") + and hasattr(self, "X_train") and hasattr(self, "y_train") - and hasattr(self, "x_test") + and hasattr(self, "X_test") and hasattr(self, "y_test") ): try: - train = self.model.score(self.x_train, self.y_train) - test = self.model.score(self.x_test, self.y_test) + train = self.model.score(self.X_train, self.y_train) + test = self.model.score(self.X_test, self.y_test) return str(test - train) except sklearn.exceptions.NotFittedError: return "not fitted" @@ -351,6 +351,6 @@ def add_safemodel_results(self, data: list) -> None: """ self.safemodel = data - def __str__(self): + def __str__(self) -> str: """Return the name of the dataset used.""" return self.name diff --git a/aisdc/attacks/worst_case_attack.py b/aisdc/attacks/worst_case_attack.py index cbd18d70..17a6f866 100644 --- a/aisdc/attacks/worst_case_attack.py +++ b/aisdc/attacks/worst_case_attack.py @@ -1,15 +1,14 @@ """Run a worst case attack based upon predictive probabilities.""" -# pylint: disable = too-many-lines - from __future__ import annotations import argparse import logging import os import uuid +from collections.abc import Iterable from datetime import datetime -from typing import Any, Iterable +from typing import Any import numpy as np from sklearn.ensemble import RandomForestClassifier @@ -196,7 +195,7 @@ def __init__( # pylint: disable = too-many-arguments, too-many-locals, too-many self.dummy_attack_metric_failfast_summary = None self.metadata = None - def __str__(self): + def __str__(self) -> str: """Return name of attack.""" return "WorstCase attack" @@ -210,13 +209,13 @@ def attack(self, target: Target) -> None: target : attacks.target.Target target as a Target class object """ - train_preds = target.model.predict_proba(target.x_train) - test_preds = target.model.predict_proba(target.x_test) + train_preds = 
target.model.predict_proba(target.X_train) + test_preds = target.model.predict_proba(target.X_test) train_correct = None test_correct = None if self.include_model_correct_feature: - train_correct = 1 * (target.y_train == target.model.predict(target.x_train)) - test_correct = 1 * (target.y_test == target.model.predict(target.x_test)) + train_correct = 1 * (target.y_train == target.model.predict(target.X_train)) + test_correct = 1 * (target.y_test == target.model.predict(target.X_test)) self.attack_from_preds( train_preds, @@ -225,7 +224,7 @@ def attack(self, target: Target) -> None: test_correct=test_correct, ) - def attack_from_prediction_files(self): + def attack_from_prediction_files(self) -> None: """Run attack from saved prediction files. To be used when only saved predictions are available. @@ -374,11 +373,9 @@ def run_attack_reps( # pylint: disable = too-many-locals ) attack_classifier = self.mia_attack_model(**self.mia_attack_model_hyp) attack_classifier.fit(mi_train_x, mi_train_y) - y_pred_proba, y_test = metrics.get_probabilities( - attack_classifier, mi_test_x, mi_test_y, permute_rows=True - ) - mia_metrics.append(metrics.get_metrics(y_pred_proba, y_test)) + y_pred_proba = attack_classifier.predict_proba(mi_test_x) + mia_metrics.append(metrics.get_metrics(y_pred_proba, mi_test_y)) if self.include_model_correct_feature and train_correct is not None: # Compute the Yeom TPR and FPR @@ -453,7 +450,9 @@ def _get_global_metrics(self, attack_metrics: list) -> dict: return global_metrics - def _get_n_significant(self, p_val_list, p_thresh, bh_fdr_correction=False) -> int: + def _get_n_significant( + self, p_val_list: list[float], p_thresh: float, bh_fdr_correction: bool = False + ) -> int: """Return number of p-values significant at p_thresh. Can perform multiple testing correction. @@ -464,11 +463,7 @@ def _get_n_significant(self, p_val_list, p_thresh, bh_fdr_correction=False) -> i n_vals = len(p_val_list) hoch_vals = np.array([(k / n_vals) * P_THRESH for k in range(1, n_vals + 1)]) bh_sig_list = p_val_list <= hoch_vals - if any(bh_sig_list): - n_sig_bh = (np.where(bh_sig_list)[0]).max() + 1 - else: - n_sig_bh = 0 - return n_sig_bh + return np.where(bh_sig_list)[0].max() + 1 if any(bh_sig_list) else 0 def _generate_array(self, n_rows: int, beta: float) -> np.ndarray: """Generate array of predictions, used when doing baseline experiments. @@ -527,11 +522,6 @@ def generate_arrays( def make_dummy_data(self) -> None: """Make dummy data for testing functionality. - Parameters - ---------- - args : dict - Command line arguments - Notes ----- Returns nothing but saves two .csv files. 
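Editor's note on the `_get_n_significant` refactor in `worst_case_attack.py` above: the change collapses the if/else into a single return but keeps the Benjamini-Hochberg step-up counting intact — the result is the largest k whose (sorted) p-value lies below its critical value (k/n)·alpha. The following is a minimal standalone sketch of that counting step only, assuming p-values are sorted ascending as the surrounding code implies; the `n_significant_bh` name, the parameterised threshold, and the example inputs are illustrative and not part of the codebase.

```python
import numpy as np


def n_significant_bh(p_vals: list[float], p_thresh: float = 0.05) -> int:
    """Count p-values significant under a Benjamini-Hochberg step-up correction."""
    p_sorted = np.sort(np.array(p_vals))
    n_vals = len(p_sorted)
    # BH critical values: (k / n) * alpha for k = 1..n
    crit = np.array([(k / n_vals) * p_thresh for k in range(1, n_vals + 1)])
    sig = p_sorted <= crit
    # Largest k whose p-value falls under its critical value; 0 if none do,
    # mirroring the one-line return in the patched _get_n_significant.
    return int(np.where(sig)[0].max() + 1) if any(sig) else 0


print(n_significant_bh([0.001, 0.004, 0.03, 0.2]))  # prints 3 for these toy inputs
```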
@@ -553,7 +543,7 @@ def make_dummy_data(self) -> None: np.savetxt(self.training_preds_filename, train_preds, delimiter=",") np.savetxt(self.test_preds_filename, test_preds, delimiter=",") - def _construct_metadata(self): + def _construct_metadata(self) -> None: """Construct the metadata object after attacks.""" self.metadata = {} # Store all args @@ -639,7 +629,7 @@ def make_report(self) -> dict: return output -def _make_dummy_data(args): +def _make_dummy_data(args) -> None: """Initialise class and run dummy data creation.""" args.__dict__["training_preds_filename"] = "train_preds.csv" args.__dict__["test_preds_filename"] = "test_preds.csv" @@ -654,7 +644,7 @@ def _make_dummy_data(args): attack_obj.make_dummy_data() -def _run_attack(args): +def _run_attack(args) -> None: """Initialise class and run attack from prediction files.""" attack_obj = WorstCaseAttack( n_reps=args.n_reps, @@ -679,7 +669,7 @@ def _run_attack(args): _ = attack_obj.make_report() -def _run_attack_from_configfile(args): +def _run_attack_from_configfile(args) -> None: """Initialise class and run attack from prediction files using config file.""" attack_obj = WorstCaseAttack( attack_config_json_file_name=str(args.attack_config_json_file_name), @@ -691,7 +681,7 @@ def _run_attack_from_configfile(args): _ = attack_obj.make_report() -def main(): +def main() -> None: """Parse arguments and invoke relevant method.""" logger = logging.getLogger("main") parser = argparse.ArgumentParser( diff --git a/aisdc/metrics.py b/aisdc/metrics.py index 2591ad5f..2b50793c 100644 --- a/aisdc/metrics.py +++ b/aisdc/metrics.py @@ -9,8 +9,6 @@ from scipy.stats import norm from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve -# pylint: disable = invalid-name - VAR_THRESH = 1e-2 @@ -34,11 +32,7 @@ def _div(x: float, y: float, default: float) -> float: division : float x / y, or default if y == 0 """ - if y != 0: - division = round(float(x / y), 8) - else: - division = float(default) - return division + return round(float(x / y), 8) if y != 0 else float(default) def _tpr_at_fpr( @@ -76,9 +70,7 @@ def _tpr_at_fpr( tpr_from_thresh = interpolate.interp1d(thresh_vals, tpr_vals) thresh = thresh_from_fpr(fpr) - tpr = tpr_from_thresh(thresh) - - return tpr + return tpr_from_thresh(thresh) def _expected_auc_var(auc: float, num_pos: int, num_neg: int) -> float: @@ -101,12 +93,11 @@ def _expected_auc_var(auc: float, num_pos: int, num_neg: int) -> float: null variance of AUC """ p_xxy = p_xyy = 1 / 3 - var = ( + return ( auc * (1 - auc) + (num_pos - 1) * (p_xxy - auc**2) + (num_neg - 1) * (p_xyy - auc**2) ) / (num_pos * num_neg) - return var def min_max_disc( @@ -176,11 +167,7 @@ def min_max_disc( sdm = np.sqrt(2 * pos_frequency * (1 - pos_frequency) / n_examples) pval = 1 - norm.cdf(mmd, loc=0, scale=sdm) # normal CDF if log_p: - if pval < 1e-50: - pval = -115.13 - else: - pval = np.log(pval) - + pval = -115.13 if pval < 1e-50 else np.log(pval) # Return return maxd, mind, mmd, pval @@ -209,55 +196,25 @@ def auc_p_val(auc: float, n_pos: int, n_neg: int) -> tuple[float, float]: return auc_p, auc_std -def get_probabilities( # pylint: disable=too-many-locals - clf, - X_test: np.ndarray, - y_test: np.ndarray = np.array([]), - permute_rows: bool = False, -): - """Get probabilities for a given model and dataset. +def _permute_rows(X_test: np.ndarray, y_test: np.ndarray) -> None: + """Permute rows. Parameters ---------- - clf : sklearn.Model - trained model X_test : np.ndarray - test data matrix + Array of features to be permuted. 
y_test : np.ndarray - test data labels - permute_rows : boolean - a flag to indicate whether rows should be permuted - - Returns - ------- - y_pred_proba : a list of probabilities for each sample in the dataset - - Notes - ----- - If permute_rows is set to true, y_test must also be supplied. - The function will then return both the predicted probabilities and corresponding y_test + Array of labels to be permuted. """ - if permute_rows and (y_test is None): - raise ValueError("If permute_rows is set to True, y_test must be supplied") - - if permute_rows: - N, _ = np.array(X_test).shape - order = np.random.RandomState( # pylint: disable = no-member - seed=10 - ).permutation(N) - X_test = X_test[order, :] - y_test = y_test[order] - - y_pred_proba = clf.predict_proba(X_test) + N, _ = X_test.shape + order = np.random.RandomState(seed=10).permutation(N) + X_test = X_test[order, :] + y_test = y_test[order] - if permute_rows: - return y_pred_proba, y_test - return y_pred_proba - -def get_metrics( # pylint: disable=too-many-locals, too-many-statements - y_pred_proba: np.ndarray, y_test: np.ndarray -): +def get_metrics( # pylint: disable=too-many-locals + y_pred_proba: np.ndarray, y_test: np.ndarray, permute_rows: bool = True +) -> dict: """Calculate metrics, including attacker advantage for MIA binary. Implemented as Definition 4 on https://arxiv.org/pdf/1709.01604.pdf @@ -267,9 +224,11 @@ def get_metrics( # pylint: disable=too-many-locals, too-many-statements Parameters ---------- y_test : np.ndarray - test data labels + Test data labels. y_pred_proba : np.ndarray of shape [x,2] and type float - predicted probabilities + Predicted probabilities. + permute_rows : bool, default True + Whether to permute arrays, see: https://github.com/AI-SDC/AI-SDC/issues/106 Returns ------- @@ -293,26 +252,20 @@ def get_metrics( # pylint: disable=too-many-locals, too-many-statements * F1 Score - harmonic mean of precision and recall. * Advantage. 
""" - invalid_format = ( - "y_pred must be an array of shape [x,2] with elements of type float" - ) + if len(y_pred_proba.shape) != 2: + raise ValueError("y_pred must be an array of floats of shape [x,2]") + if y_pred_proba.shape[1] != 2: + raise ValueError("Metrics for multiclass classification are unsupported") - shape = y_pred_proba.shape - if len(shape) != 2: - raise ValueError(invalid_format) - - if shape[1] != 2: - raise ValueError( - "Cannot use this function to calculate metrics for multiclass classification" - ) - - metrics = {} + if permute_rows: + _permute_rows(y_pred_proba, y_test) y_pred = np.argmax(y_pred_proba, axis=1) y_pred_proba = y_pred_proba[:, 1] tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel() + metrics = {} # true positive rate or recall metrics["TPR"] = round(float(tp / (tp + fn)), 8) # false positive rate, proportion of negative examples incorrectly classified as positives @@ -334,7 +287,7 @@ def get_metrics( # pylint: disable=too-many-locals, too-many-statements metrics["F1score"] = _div( 2 * metrics["PPV"] * metrics["TPR"], metrics["PPV"] + metrics["TPR"], 0 ) - # Advantage: TPR - FPR + # Advantage metrics["Advantage"] = float(abs(metrics["TPR"] - metrics["FPR"])) # calculate AUC of model diff --git a/aisdc/preprocessing/loaders.py b/aisdc/preprocessing/loaders.py index 515b0f07..c9f380ef 100644 --- a/aisdc/preprocessing/loaders.py +++ b/aisdc/preprocessing/loaders.py @@ -1,11 +1,9 @@ """Handlers to pull in datasets and perform preprocessing.""" -# pylint: disable=import-error, invalid-name, consider-using-with, too-many-return-statements +# pylint: disable=consider-using-with, too-many-return-statements import logging import os -from collections import Counter -from typing import List, Tuple from zipfile import BadZipFile, ZipFile import numpy as np @@ -14,10 +12,6 @@ from sklearn.datasets import fetch_openml, load_iris from sklearn.preprocessing import LabelEncoder, OneHotEncoder -# Following is to stop pylint always sqwarking at pandas things -# pylint: disable = no-member, unsubscriptable-object - - logging.basicConfig(level="DEBUG") logger = logging.getLogger(__file__) @@ -37,7 +31,7 @@ class DataNotAvailable(Exception): def get_data_sklearn( # pylint: disable = too-many-branches dataset_name: str, data_folder: str = os.path.join(PROJECT_ROOT_FOLDER, "data") -) -> Tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[pd.DataFrame, pd.DataFrame]: """Get data in a format sensible for sklearn. 
User passes a name and that dataset is returned as a tuple of pandas @@ -114,7 +108,7 @@ def get_data_sklearn( # pylint: disable = too-many-branches sub_name = dataset_name.split("round")[1].strip() logger.debug(sub_name) feature_df, target_df = get_data_sklearn(sub_name, data_folder) - column_dtype = feature_df.dtypes # pylint: disable = no-member + column_dtype = feature_df.dtypes for i, column in enumerate(feature_df.columns): if column_dtype[i] == "float64": @@ -152,11 +146,11 @@ def get_data_sklearn( # pylint: disable = too-many-branches if dataset_name == "iris": return _iris() if dataset_name == "RDMP": - return _RDMP(data_folder) + return _rdmp(data_folder) raise UnknownDataset(dataset_name) -def _iris() -> Tuple[pd.DataFrame, pd.DataFrame]: +def _iris() -> tuple[pd.DataFrame, pd.DataFrame]: """Get the Sklearn iris data - just first two classes.""" X, y = load_iris(return_X_y=True, as_frame=True) X = X[y < 2] @@ -164,7 +158,7 @@ def _iris() -> Tuple[pd.DataFrame, pd.DataFrame]: return X, pd.DataFrame(y) -def _nursery() -> Tuple[pd.DataFrame, pd.DataFrame]: +def _nursery() -> tuple[pd.DataFrame, pd.DataFrame]: """Return the sklearn nursery dataset.""" data = fetch_openml(data_id=26, as_frame=True) @@ -183,7 +177,7 @@ def _nursery() -> Tuple[pd.DataFrame, pd.DataFrame]: def _images_to_ndarray( images_dir: str, number_to_load: int, label: int, flatten: bool = True -) -> Tuple[np.array, np.array]: +) -> tuple[np.array, np.array]: """Get number_to_load images from the images_dir and create arrays. Patched to support non-flattened images. @@ -193,7 +187,7 @@ def _images_to_ndarray( images_names = sorted(os.listdir(folder_path)) images_names = images_names[:number_to_load] # fix f or macosx - if ".DS_Store" in images_names: # pragma: no cover + if ".DS_Store" in images_names: images_names.remove(".DS_Store") if flatten: @@ -207,8 +201,8 @@ def _images_to_ndarray( def _medical_mnist_loader( # pylint: disable = too-many-locals - data_folder: str, n_per_class: int, classes: List[str] -) -> Tuple[pd.DataFrame, pd.DataFrame]: + data_folder: str, n_per_class: int, classes: list[str] +) -> tuple[pd.DataFrame, pd.DataFrame]: """Get Medical MNIST into pandas format. Borrows heavily from: https://www.kaggle.com/harelshattenstein/medical-mnist-knn @@ -238,7 +232,7 @@ def _medical_mnist_loader( # pylint: disable = too-many-locals with ZipFile(zip_file) as zip_handle: zip_handle.extractall(base_folder) logger.debug("Extracted all") - except BadZipFile: # pragma: no cover + except BadZipFile: logger.error("Encountered bad zip file") raise @@ -271,7 +265,7 @@ def _medical_mnist_loader( # pylint: disable = too-many-locals def _synth_ae( data_folder: str, n_rows: int = 5000 -) -> Tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[pd.DataFrame, pd.DataFrame]: """Get synth ae data. First norws (default 5000) rows from the Synthetic A&E data from NHS England @@ -327,7 +321,7 @@ def _synth_ae( return (X, y) -def _indian_liver(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: +def _indian_liver(data_folder: str) -> tuple[pd.DataFrame, pd.DataFrame]: """Get Indian Liver Patient Dataset. https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv # pylint: disable=line-too-long. 
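For readers skimming the loaders changes, a minimal usage sketch (not part of the patch) of the `tuple[pd.DataFrame, pd.DataFrame]` contract that the modernised annotations describe:

```python
from aisdc.preprocessing.loaders import get_data_sklearn

# "iris" is served from sklearn's bundled data, restricted to the first two
# classes, so no data_folder download is needed for this quick check.
X, y = get_data_sklearn("iris")
print(type(X), X.shape, y.shape)  # two pandas DataFrames with matching row counts
```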
@@ -371,7 +365,7 @@ def _indian_liver(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: return (liver_data, liver_labels) -def _in_hospital_mortality(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: +def _in_hospital_mortality(data_folder: str) -> tuple[pd.DataFrame, pd.DataFrame]: """Get In-hospital mortality data. See: https://datadryad.org/stash/dataset/doi:10.5061/dryad.0p2ngf1zd. @@ -383,9 +377,7 @@ def _in_hospital_mortality(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame file_path = [os.path.join(data_folder, f) for f in files] print(file_path) - if not any( # pylint: disable=use-a-generator - [os.path.exists(fp) for fp in file_path] - ): # pylint: disable=use-a-generator + if not any(os.path.exists(fp) for fp in file_path): help_message = f""" Data file {file_path[0]} or {file_path[1]} does not exist. Please download the file from: https://datadryad.org/stash/dataset/doi:10.5061/dryad.0p2ngf1zd @@ -412,7 +404,7 @@ def _in_hospital_mortality(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame return (features, labels) -def _mimic_iaccd(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: +def _mimic_iaccd(data_folder: str) -> tuple[pd.DataFrame, pd.DataFrame]: """Get the mimic_iaccd data and perform preprocessing.""" # Check the data has been downloaded. # If not throw an exception with instructions on how to @@ -465,12 +457,12 @@ def _mimic_iaccd(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: return (X, y) -def _RDMP( # pylint: disable=too-many-locals, too-many-statements +def _rdmp( # pylint: disable=too-many-locals, too-many-statements data_folder: str, -) -> Tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[pd.DataFrame, pd.DataFrame]: """Get the RDMP dataset.""" - def find_age(row): + def find_age(row: pd.Series) -> int: date_ = pd.to_datetime("01/06/2020") if row.date_of_death != row.date_of_death: age = np.floor((date_ - row.date_of_birth).days / 365.25) @@ -478,7 +470,7 @@ def find_age(row): age = np.floor((row.date_of_death - row.date_of_birth).days / 365.25) return age - def hospital_days(row): + def hospital_days(row: pd.Series) -> int: if row.DischargeDate == row.DischargeDate: t = row.DischargeDate - row.AdmissionDate days = t.days + round(((t.seconds / 60) / 60) / 24) @@ -521,7 +513,7 @@ def hospital_days(row): "R_CC_STEN_B", "R_CC_STEN_C", "R_CC_STEN_D", - "R_CC_STEN_S", # pylint: disable=unreachable + "R_CC_STEN_S", "L_IC_STEN_A", "L_IC_STEN_B", "L_IC_STEN_C", @@ -618,12 +610,12 @@ def hospital_days(row): df__.groupby(["chi"])[[x for x in df__.columns if "Condition" in x]] .count() .mean(axis=1) - ) # pylint: disable=line-too-long + ) no = ( df__.groupby(["chi"])[[x for x in df__.columns if "Operation" in x]] .count() .sum(axis=1) - ) # pylint: disable=line-too-long + ) df__.drop( columns=[ x @@ -631,7 +623,7 @@ def hospital_days(row): if "Date" in x or "Operation" in x or "Condition" in x ], inplace=True, - ) # pylint: disable=line-too-long + ) df__ = pd.DataFrame() df__["days_in_hospital"] = dih df__["average_number_conditions"] = nc diff --git a/aisdc/safemodel/__init__.py b/aisdc/safemodel/__init__.py index 4dd28a47..43faa7b8 100644 --- a/aisdc/safemodel/__init__.py +++ b/aisdc/safemodel/__init__.py @@ -1,3 +1 @@ """Collection of defensive wrappers for preserving the privacy of ML models.""" - -from .reporting import get_reporting_string diff --git a/aisdc/safemodel/classifiers/__init__.py b/aisdc/safemodel/classifiers/__init__.py index c6b45ee1..9565de52 100644 --- a/aisdc/safemodel/classifiers/__init__.py +++ 
b/aisdc/safemodel/classifiers/__init__.py @@ -1,8 +1,17 @@ -"""Makes class for various models supported.""" +"""Supported classifiers.""" from .dp_svc import DPSVC from .safedecisiontreeclassifier import SafeDecisionTreeClassifier from .safekeras import SafeKerasModel from .saferandomforestclassifier import SafeRandomForestClassifier from .safesvc import SafeSVC -from .safetf import Safe_tf_DPModel +from .safetf import SafeTFModel + +__all__ = [ + "DPSVC", + "SafeDecisionTreeClassifier", + "SafeKerasModel", + "SafeRandomForestClassifier", + "SafeSVC", + "SafeTFModel", +] diff --git a/aisdc/safemodel/classifiers/dp_svc.py b/aisdc/safemodel/classifiers/dp_svc.py index f47e2f3e..05070750 100644 --- a/aisdc/safemodel/classifiers/dp_svc.py +++ b/aisdc/safemodel/classifiers/dp_svc.py @@ -1,7 +1,8 @@ """Differentially private SVC.""" +from __future__ import annotations + import logging -from typing import Any import numpy as np from sklearn.linear_model import LogisticRegression @@ -12,9 +13,7 @@ SMALL_NUMBER = 1e-16 # used to set gamma value if zero to avoid divide by zero -# pylint: disable = invalid-name -# pylint: disable=R0902: too-many-instance-attributes -# pylint:disable = fixme +# pylint: disable=too-many-instance-attributes class DPSVC: @@ -53,7 +52,14 @@ class DPSVC: is in {-1,1}) """ - def __init__(self, C=1.0, gamma="scale", dhat=1000, eps=10, **kwargs): + def __init__( + self, + C: float = 1.0, + gamma: str | float = "scale", + dhat: int = 1000, + eps: float = 10, + **kwargs: dict, + ) -> None: self.svc = None self.gamma = gamma self.dpsvc_gamma = None @@ -70,31 +76,27 @@ def __init__(self, C=1.0, gamma="scale", dhat=1000, eps=10, **kwargs): self.noisy_weights = None self.set_params(**kwargs) - def phi_hat(self, input_vector): + def phi_hat(self, input_vector: np.ndarray) -> np.ndarray: """Project a single feature.""" vt1 = (self.rho * input_vector).sum(axis=1) vt = (self.dhat ** (-0.5)) * np.column_stack((np.cos(vt1), np.sin(vt1))) return vt.reshape(2 * self.dhat) - def phi_hat_multi(self, input_features): + def phi_hat_multi(self, input_features: np.ndarray) -> np.ndarray: """Compute feature space for a matrix of inputs.""" - # TODO: could this be vectorised? 
n_data, _ = input_features.shape phi_hat = np.zeros((n_data, 2 * self.dhat), float) for i in range(n_data): phi_hat[i, :] = self.phi_hat(input_features[i, :]) return phi_hat - def k_hat_svm(self, x, y=None): + def k_hat_svm(self, x: np.ndarray, y: np.ndarray | None = None) -> np.ndarray: """Define the version which is sent to sklearn.svm.""" phi_hat_x = self.phi_hat_multi(x) - if y is None: - phi_hat_y = phi_hat_x - else: - phi_hat_y = self.phi_hat_multi(y) + phi_hat_y = phi_hat_x if y is None else self.phi_hat_multi(y) return np.dot(phi_hat_x, phi_hat_y.T) - def fit(self, train_features: Any, train_labels: Any) -> None: + def fit(self, train_features: np.ndarray, train_labels: np.ndarray) -> None: """Fit the model.""" # Check that the data passed is np.ndarray if not isinstance(train_features, np.ndarray) or not isinstance( @@ -178,7 +180,7 @@ def fit(self, train_features: Any, train_labels: Any) -> None: local_logger.info("Fitting Platt scaling") self.platt_transform.fit(ypredn.reshape(-1, 1), train_labels) - def set_params(self, **kwargs) -> None: + def set_params(self, **kwargs: dict) -> None: """Set params.""" for key, value in kwargs.items(): if key == "gamma": @@ -190,20 +192,17 @@ def set_params(self, **kwargs) -> None: else: local_logger.warning("Unsupported parameter: %s", key) - def _raw_outputs(self, test_features: Any) -> np.ndarray: + def _raw_outputs(self, test_features: np.ndarray) -> np.ndarray: """Get the raw output, used by predict and predict_proba.""" projected_features = self.phi_hat_multi(test_features) - out = np.dot(projected_features, self.noisy_weights) + self.intercept - return out + return np.dot(projected_features, self.noisy_weights) + self.intercept - def predict(self, test_features: Any) -> np.ndarray: + def predict(self, test_features: np.ndarray) -> np.ndarray: """Return the predictions.""" out = self._raw_outputs(test_features) - out = 1 * (out > 0) - return out + return 1 * (out > 0) - def predict_proba(self, test_features: Any) -> np.ndarray: + def predict_proba(self, test_features: np.ndarray) -> np.ndarray: """Return the predictive probabilities.""" out = self._raw_outputs(test_features) - pred_probs = self.platt_transform.predict_proba(out.reshape(-1, 1)) - return pred_probs + return self.platt_transform.predict_proba(out.reshape(-1, 1)) diff --git a/aisdc/safemodel/classifiers/new_model_template.py b/aisdc/safemodel/classifiers/new_model_template.py index 3a38cbc6..dad8d4e5 100644 --- a/aisdc/safemodel/classifiers/new_model_template.py +++ b/aisdc/safemodel/classifiers/new_model_template.py @@ -10,14 +10,14 @@ from __future__ import annotations import copy -from typing import Any import numpy as np from dictdiffer import diff from sklearn.ensemble import ModelToMakeSafer # pylint: disable=E0611 from sklearn.tree import DecisionTreeClassifier -from ..safemodel import SafeModel +from aisdc.safemodel.safemodel import SafeModel + from .safedecisiontreeclassifier import decision_trees_are_equal @@ -44,7 +44,7 @@ def check_present( class SafeModelToMakeSafe(SafeModel, ModelToMakeSafer): """Privacy protected ModelToMakeSafer.""" - def __init__(self, **kwargs: Any) -> None: + def __init__(self, **kwargs: dict) -> None: """Create model and apply constraints to params.""" SafeModel.__init__(self) self.k_anonymity = 0 @@ -94,11 +94,14 @@ def additional_checks( # pylint: disable=too-many-nested-blocks,too-many-branch This example shows how to deal with instances of sklearn's tree class as base estimators in a forest (line 99) or as single estimators (lines 
114-118). + + Notes + ----- + Call the super function to deal with any items that are lists. For example: + >>> msg, disclosive = super().additional_checks(curr_separate, saved_separate) """ msg = "" disclosive = False - ## call the super function to deal with any items that are lists - # msg, disclosive = super().additional_checks(curr_separate, saved_separate) # now the relevant ModelToMakeSafer specific things for item in self.examine_seperately_items: if item == "base_estimator": @@ -182,7 +185,7 @@ def get_k_anonymity(self, x: np.ndarray) -> int: for record in range(num_records): # start by assuming everything co-occurs - appears_together = list(range(0, num_records)) + appears_together = list(range(num_records)) # iterate through trees for this_tree in range(num_trees): this_leaf = all_leaves[record][this_tree] diff --git a/aisdc/safemodel/classifiers/safedecisiontreeclassifier.py b/aisdc/safemodel/classifiers/safedecisiontreeclassifier.py index c960341d..be6c637f 100644 --- a/aisdc/safemodel/classifiers/safedecisiontreeclassifier.py +++ b/aisdc/safemodel/classifiers/safedecisiontreeclassifier.py @@ -9,8 +9,8 @@ from dictdiffer import diff from sklearn.tree import DecisionTreeClassifier -from ..reporting import get_reporting_string -from ..safemodel import SafeModel +from aisdc.safemodel.reporting import get_reporting_string +from aisdc.safemodel.safemodel import SafeModel def decision_trees_are_equal( @@ -99,12 +99,11 @@ def decision_tree_internal_trees_are_equal( name="internal_attribute_differs", attr=attr ) same = False - else: - if t1val != t2val: - msg += get_reporting_string( - name="internal_attribute_differs", attr=attr - ) - same = False + elif t1val != t2val: + msg += get_reporting_string( + name="internal_attribute_differs", attr=attr + ) + same = False except BaseException as error: # pylint:disable=broad-except #pragma:no cover msg += get_reporting_string(name="exception_occurred", error=error) return same, msg @@ -114,14 +113,13 @@ def get_tree_k_anonymity(thetree: DecisionTreeClassifier, X: Any) -> int: """Return the smallest number of data items in any leaf.""" leaves = thetree.apply(X) uniqs_counts = np.unique(leaves, return_counts=True) - k_anonymity = np.min(uniqs_counts[1]) - return k_anonymity + return np.min(uniqs_counts[1]) class SafeDecisionTreeClassifier(SafeModel, DecisionTreeClassifier): # pylint: disable=too-many-ancestors """Privacy protected Decision Tree classifier.""" - def __init__(self, **kwargs: Any) -> None: + def __init__(self, **kwargs: dict) -> None: """Create model and apply constraints to params.""" SafeModel.__init__(self) self.basemodel_paramnames = [ diff --git a/aisdc/safemodel/classifiers/safekeras.py b/aisdc/safemodel/classifiers/safekeras.py index ae449527..7e23fc48 100644 --- a/aisdc/safemodel/classifiers/safekeras.py +++ b/aisdc/safemodel/classifiers/safekeras.py @@ -2,7 +2,7 @@ import os import warnings -from typing import Any, Tuple +from typing import Any import numpy as np import tensorflow as tf @@ -11,8 +11,8 @@ from tensorflow.keras import Model as KerasModel # pylint: disable = import-error from tensorflow_privacy import compute_dp_sgd_privacy -from ..reporting import get_reporting_string -from ..safemodel import SafeModel +from aisdc.safemodel.reporting import get_reporting_string +from aisdc.safemodel.safemodel import SafeModel warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.filterwarnings("ignore", category=FutureWarning) @@ -30,7 +30,7 @@ ) -def same_configs(m1: Any, m2: Any) -> Tuple[bool, str]: 
+def same_configs(m1: Any, m2: Any) -> tuple[bool, str]: """Check if two models have the same architecture.""" num_layers = len(m1.layers) if len(m2.layers) != num_layers: @@ -47,7 +47,6 @@ def same_configs(m1: Any, m2: Any) -> Tuple[bool, str]: msg = get_reporting_string( name="layer_configs_differ", layer=layer, length=num_diffs ) - # f"Layer {layer} configs differ in {len(match)} places:\n" for i in range(num_diffs): if match[i][0] == "change": msg += get_reporting_string( @@ -63,7 +62,7 @@ def same_configs(m1: Any, m2: Any) -> Tuple[bool, str]: return True, get_reporting_string(name="same_ann_config") -def same_weights(m1: Any, m2: Any) -> Tuple[bool, str]: +def same_weights(m1: Any, m2: Any) -> tuple[bool, str]: """Check if two nets with same architecture have the same weights.""" num_layers = len(m1.layers) if num_layers != len(m2.layers): @@ -82,7 +81,7 @@ def same_weights(m1: Any, m2: Any) -> Tuple[bool, str]: return True, "weights match" -def check_checkpoint_equality(v1: str, v2: str) -> Tuple[bool, str]: +def check_checkpoint_equality(v1: str, v2: str) -> tuple[bool, str]: """Compare two checkpoints saved with tensorflow save_model. On the assumption that the optimiser is not going to be saved, @@ -118,31 +117,31 @@ def check_checkpoint_equality(v1: str, v2: str) -> Tuple[bool, str]: return same, msg -def check_DP_used(optimizer) -> Tuple[bool, str]: +def check_dp_used(optimizer) -> tuple[bool, str]: """Check whether the DP optimizer was actually the one used.""" key_needed = "_was_dp_gradients_called" critical_val = optimizer.__dict__.get(key_needed, "missing") if critical_val is True: reason = get_reporting_string(name="dp_optimizer_run") - DPused = True + dp_used = True elif critical_val == "missing": reason = get_reporting_string(name="no_dp_gradients_key") - DPused = False + dp_used = False elif critical_val is False: reason = get_reporting_string(name="changed_opt_no_fit") - DPused = False + dp_used = False else: # pragma: no cover # not currently reachable because optimizer class does # not support assignment # but leave in to future-proof reason = get_reporting_string(name="unrecognised_combination") - DPused = False + dp_used = False - return DPused, reason + return dp_used, reason -def check_optimizer_allowed(optimizer) -> Tuple[bool, str]: +def check_optimizer_allowed(optimizer) -> tuple[bool, str]: """Check if the model's optimizer is in our white-list. Default setting is not allowed. @@ -157,19 +156,19 @@ def check_optimizer_allowed(optimizer) -> Tuple[bool, str]: return allowed, reason -def check_optimizer_is_DP(optimizer) -> Tuple[bool, str]: +def check_optimizer_is_dp(optimizer) -> tuple[bool, str]: """Check whether optimizer is one of tensorflow's DP versions.""" - DPused = False + dp_used = False reason = "None" if "_was_dp_gradients_called" not in optimizer.__dict__: reason = get_reporting_string(name="no_dp_gradients_key") else: reason = get_reporting_string(name="found_dp_gradients_key") - DPused = True - return DPused, reason + dp_used = True + return dp_used, reason -def load_safe_keras_model(name: str = "undefined") -> Tuple[bool, Any]: +def load_safe_keras_model(name: str = "undefined") -> tuple[bool, Any]: """Read model from file in appropriate format. Optimizer is deliberately excluded in the save. 
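A hedged usage sketch (not from the patch; assumes an already-compiled `SafeKerasModel` instance named `model`) showing that the snake_case helpers introduced above keep the same `(bool, reason)` return shape as the old `check_DP_used` / `check_optimizer_is_DP` names:

```python
from aisdc.safemodel.classifiers import safekeras

# Each helper reports a boolean verdict plus a human-readable reason string.
dp_used, dp_reason = safekeras.check_dp_used(model.optimizer)
is_dp, is_dp_reason = safekeras.check_optimizer_is_dp(model.optimizer)
allowed, allowed_reason = safekeras.check_optimizer_allowed(model.optimizer)

if not (dp_used and is_dp and allowed):
    print(dp_reason, is_dp_reason, allowed_reason)
```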
@@ -209,12 +208,12 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # initialise all the values that get provided as options to keras # and also l2 norm clipping and learning rates, batch sizes inputs = None - if "inputs" in kwargs.keys(): # pylint: disable=consider-iterating-dictionary + if "inputs" in kwargs: inputs = the_kwargs["inputs"] elif len(args) == 3: # defaults is for Model(input,outputs,names) inputs = args[0] self.outputs = None - if "outputs" in kwargs.keys(): # pylint: disable=consider-iterating-dictionary + if "outputs" in kwargs: outputs = the_kwargs["outputs"] elif len(args) == 3: outputs = args[1] @@ -262,7 +261,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: def dp_epsilon_met( self, num_examples: int, batch_size: int = 0, epochs: int = 0 - ) -> Tuple[bool, str]: + ) -> tuple[bool, str]: """Check if epsilon is sufficient for Differential Privacy. Provides feedback to user if epsilon is not sufficient. @@ -279,7 +278,7 @@ def dp_epsilon_met( def check_epsilon( self, num_samples: int, batch_size: int, epochs: int - ) -> Tuple[bool, str]: + ) -> tuple[bool, str]: """Check if the level of privacy guarantee is within recommended limits.""" msg = "" ok = False @@ -304,9 +303,7 @@ def check_epsilon( print(msg) return ok, msg - def compile( - self, optimizer=None, loss="categorical_crossentropy", metrics=["accuracy"] - ): # pylint:disable=dangerous-default-value) + def compile(self, optimizer=None, loss="categorical_crossentropy", metrics=None): """Compile the safe Keras model. Replaces the optimiser with a DP variant if needed and creates the @@ -314,31 +311,30 @@ def compile( Allow None as default value for optimizer param because we explicitly deal with it. """ + if metrics is None: + metrics = ["accuracy"] + replace_message = get_reporting_string(name="warn_possible_disclosure_risk") - # "WARNING: model parameters may present a disclosure risk" - using_DP_SGD = get_reporting_string(name="using_dp_sgd") - # "Changed parameter optimizer = 'DPKerasSGDOptimizer'" - Using_DP_Adagrad = get_reporting_string(name="using_dp_adagrad") - # "Changed parameter optimizer = 'DPKerasAdagradOptimizer'" - using_DP_Adam = get_reporting_string(name="using_dp_adam") - # "Changed parameter optimizer = 'DPKerasAdamOptimizer'" + using_dp_sgd = get_reporting_string(name="using_dp_sgd") + using_dp_adagrad = get_reporting_string(name="using_dp_adagrad") + using_dp_adam = get_reporting_string(name="using_dp_adam") optimizer_dict = { - None: (using_DP_SGD, tfp.DPKerasSGDOptimizer), + None: (using_dp_sgd, tfp.DPKerasSGDOptimizer), tfp.DPKerasSGDOptimizer: ("", tfp.DPKerasSGDOptimizer), tfp.DPKerasAdagradOptimizer: ("", tfp.DPKerasAdagradOptimizer), tfp.DPKerasAdamOptimizer: ("", tfp.DPKerasAdamOptimizer), "Adagrad": ( - replace_message + Using_DP_Adagrad, + replace_message + using_dp_adagrad, tfp.DPKerasAdagradOptimizer, ), - "Adam": (replace_message + using_DP_Adam, tfp.DPKerasAdamOptimizer), - "SGD": (replace_message + using_DP_SGD, tfp.DPKerasSGDOptimizer), + "Adam": (replace_message + using_dp_adam, tfp.DPKerasAdamOptimizer), + "SGD": (replace_message + using_dp_sgd, tfp.DPKerasSGDOptimizer), } val = optimizer_dict.get(optimizer, "unknown") if val == "unknown": - opt_msg = using_DP_SGD + opt_msg = using_dp_sgd opt_used = tfp.DPKerasSGDOptimizer else: opt_msg = val[0] @@ -360,7 +356,7 @@ def compile( def fit( # pylint:disable=too-many-arguments self, X: Any, - Y: Any, + y: Any, validation_data: Any, epochs: int, batch_size: int, @@ -393,7 +389,7 @@ def fit( # 
pylint:disable=too-many-arguments returnval = super().fit( X, - Y, + y, validation_data=validation_data, epochs=epochs, batch_size=batch_size, @@ -404,11 +400,11 @@ def fit( # pylint:disable=too-many-arguments os.mkdir("tfsaves") self.save("tfsaves/fit_model.tf") # pylint: disable=attribute-defined-outside-init - self.saved_was_dpused, self.saved_reason = check_DP_used(self.optimizer) + self.saved_was_dpused, self.saved_reason = check_dp_used(self.optimizer) self.saved_epsilon = self.current_epsilon return returnval - def posthoc_check(self, verbose: bool = True) -> Tuple[str, bool]: + def posthoc_check(self, verbose: bool = True) -> tuple[str, bool]: """Check whether the model should be considered unsafe. For example, has been changed since fit() was last run, @@ -434,14 +430,14 @@ def posthoc_check(self, verbose: bool = True) -> Tuple[str, bool]: disclosive = True # was the dp-optimiser used during fit() - dpused, dpusedmessage = check_DP_used(self.optimizer) - if not dpused: + dp_used, dpusedmessage = check_dp_used(self.optimizer) + if not dp_used: msg += dpusedmessage disclosive = True # have values been changed since saved immediately after fit()? if ( - dpused != self.saved_was_dpused + dp_used != self.saved_was_dpused or dpusedmessage != self.saved_reason or self.saved_epsilon != self.current_epsilon ): diff --git a/aisdc/safemodel/classifiers/saferandomforestclassifier.py b/aisdc/safemodel/classifiers/saferandomforestclassifier.py index 0934830d..cd1a1edd 100644 --- a/aisdc/safemodel/classifiers/saferandomforestclassifier.py +++ b/aisdc/safemodel/classifiers/saferandomforestclassifier.py @@ -3,13 +3,13 @@ from __future__ import annotations import copy -from typing import Any import numpy as np from sklearn.ensemble import RandomForestClassifier -from ..reporting import get_reporting_string -from ..safemodel import SafeModel +from aisdc.safemodel.reporting import get_reporting_string +from aisdc.safemodel.safemodel import SafeModel + from .safedecisiontreeclassifier import decision_trees_are_equal # pylint: disable=too-many-ancestors, unidiomatic-typecheck @@ -18,7 +18,7 @@ class SafeRandomForestClassifier(SafeModel, RandomForestClassifier): """Privacy protected Random Forest classifier.""" - def __init__(self, **kwargs: Any) -> None: + def __init__(self, **kwargs: dict) -> None: """Create model and apply constraints to params.""" SafeModel.__init__(self) self.basemodel_paramnames = [ @@ -144,7 +144,7 @@ def get_k_anonymity(self, x: np.ndarray) -> int: for record in range(num_records): # start by assuming everything co-occurs - appears_together = list(range(0, num_records)) + appears_together = list(range(num_records)) # iterate through trees for this_tree in range(num_trees): this_leaf = all_leaves[record][this_tree] diff --git a/aisdc/safemodel/classifiers/safesvc.py b/aisdc/safemodel/classifiers/safesvc.py index bac81a0e..5fe804bc 100644 --- a/aisdc/safemodel/classifiers/safesvc.py +++ b/aisdc/safemodel/classifiers/safesvc.py @@ -7,14 +7,22 @@ import numpy as np from dictdiffer import diff -from ..safemodel import SafeModel +from aisdc.safemodel.safemodel import SafeModel + from .dp_svc import DPSVC class SafeSVC(SafeModel, DPSVC): """Privacy protected Support Vector Classifier.""" - def __init__(self, C=1.0, gamma="scale", dhat=1000, eps=10, **kwargs) -> None: + def __init__( + self, + C: float = 1.0, + gamma: str | float = "scale", + dhat: int = 1000, + eps: float = 10, + **kwargs: dict, + ) -> None: """Initialise a differentially private SVC.""" SafeModel.__init__(self) 
DPSVC.__init__(self, C=C, gamma=gamma, dhat=dhat, eps=eps, **kwargs) diff --git a/aisdc/safemodel/classifiers/safetf.py b/aisdc/safemodel/classifiers/safetf.py index d8776f60..558ac3db 100644 --- a/aisdc/safemodel/classifiers/safetf.py +++ b/aisdc/safemodel/classifiers/safetf.py @@ -1,18 +1,11 @@ """Privacy protected TensorFlow model.""" -# pylint: disable=unused-import -from typing import Any - -import tensorflow as tf -import tensorflow_privacy as tf_privacy from tensorflow_privacy import DPModel -from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy -from tensorflow_privacy.privacy.optimizers import dp_optimizer_keras -from ..safemodel import SafeModel +from aisdc.safemodel.safemodel import SafeModel -class Safe_tf_DPModel(SafeModel, DPModel): +class SafeTFModel(SafeModel, DPModel): """Privacy Protected tensorflow_privacy DP-SGD subclass of Keras model.""" # pylint:disable=super-init-not-called diff --git a/aisdc/safemodel/reporting.py b/aisdc/safemodel/reporting.py index f182e276..a273eae9 100644 --- a/aisdc/safemodel/reporting.py +++ b/aisdc/safemodel/reporting.py @@ -1,7 +1,7 @@ """Methods to producte standard reporting strings.""" -def get_reporting_string(**kwargs): +def get_reporting_string(**kwargs: dict) -> str: """Return a standard formatted string from a dictionary of f-strings. Parameters diff --git a/aisdc/safemodel/rules.json b/aisdc/safemodel/rules.json index 9d6eeae7..77c98730 100644 --- a/aisdc/safemodel/rules.json +++ b/aisdc/safemodel/rules.json @@ -35,12 +35,12 @@ "operator": "or", "subexpr": [ { - "keyword": "keyA", + "keyword": "key_a", "operator": "equals", "value": true }, { - "keyword": "keyB", + "keyword": "key_b", "operator": "equals", "value": true } diff --git a/aisdc/safemodel/safemodel.py b/aisdc/safemodel/safemodel.py index 718f777f..cfeb1f81 100644 --- a/aisdc/safemodel/safemodel.py +++ b/aisdc/safemodel/safemodel.py @@ -220,7 +220,7 @@ def __init__(self) -> None: except (ImportError, KeyError, OSError): # pragma: no cover self.researcher = "unknown" - def get_params(self, deep=True): + def get_params(self, deep: bool = True) -> dict: """Get a dictionary of parameter values restricted to those expected.""" the_params = {} for key, val in self.__dict__.items(): @@ -233,34 +233,30 @@ def get_params(self, deep=True): def save(self, name: str = "undefined") -> None: """Write model to file in appropriate format. - Note this is overloaded in SafeKerasClassifer + Note this is overloaded in `SafeKerasClassifer` to deal with tensorflow specifics. Parameters ---------- name : string - The name of the file to save + The name of the file to save. Notes ----- - Optimizer is deliberately excluded. - To prevent possible to restart training and thus - possible back door into attacks. + Optimizer is deliberately excluded to prevent possible restart to + training and thus possible back door into attacks. 
""" self.model_save_file = name if self.model_save_file == "undefined": print("You must input a name with extension to save the model.") else: thename = self.model_save_file.split(".") - # print(f'in save(), parsed filename is {thename}') if len(thename) == 1: print("file name must indicate type as a suffix") else: suffix = self.model_save_file.split(".")[-1] - - if ( - suffix == "pkl" and self.model_type != "KerasModel" - ): # save to pickle + # save to pickle + if suffix == "pkl" and self.model_type != "KerasModel": with open(self.model_save_file, "wb") as file: try: pickle.dump(self, file) @@ -270,10 +266,8 @@ def save(self, name: str = "undefined") -> None: f"{self.model_type}." f"Error message was {type_err}" ) - - elif ( - suffix == "sav" and self.model_type != "KerasModel" - ): # save to joblib + # save to joblib + elif suffix == "sav" and self.model_type != "KerasModel": try: joblib.dump(self, self.model_save_file) except (TypeError, AttributeError, PicklingError) as type_err: @@ -392,7 +386,7 @@ def preliminary_check( Returns ------- msg : string - A message string + A message string. disclosive : bool A boolean value indicating whether the model is potentially disclosive. @@ -416,10 +410,7 @@ def preliminary_check( if temp_disc: disclosive = True - if disclosive: - msg = notok_start + msg - else: - msg = ok_start + msg + msg = notok_start + msg if disclosive else ok_start + msg if verbose: print("Preliminary checks: " + msg) @@ -434,7 +425,7 @@ def get_current_and_saved_models(self) -> tuple[dict, dict]: for key in attribute_names_as_list: if key not in self.ignore_items: try: - value = self.__dict__[key] # jim added + value = self.__dict__[key] current_model[key] = copy.deepcopy(value) except (copy.Error, TypeError) as key_type: logger.warning("%s cannot be copied", key) @@ -469,11 +460,11 @@ def examine_seperate_items( disclosive = True msg += get_reporting_string(name="both_item_removed", item=item) - if curr_vals[item] == "Absent" and not saved_vals[item] == "Absent": + if curr_vals[item] == "Absent" and saved_vals[item] != "Absent": msg += get_reporting_string(name="current_item_removed", item=item) disclosive = True - if saved_vals[item] == "Absent" and not curr_vals[item] == "Absent": + if saved_vals[item] == "Absent" and curr_vals[item] != "Absent": disclosive = True msg += get_reporting_string(name="saved_item_removed", item=item) @@ -540,25 +531,25 @@ def additional_checks( ) -> tuple[str, bool]: """Perform additional posthoc checks. - Placeholder function for additional posthoc checks e.g. keras this + Placeholder function for additional posthoc checks e.g. keras. This version just checks that any lists have the same contents. Parameters ---------- - curr_separate : python dictionary - saved_separate : python dictionary + curr_separate : dict + saved_separate : dict Returns ------- msg : string - A message string + A message string. disclosive : bool - A boolean value to indicate whether the model is potentially disclosive. + A boolean value to indicate whether the model is potentially disclosive. Notes ----- - posthoc checking makes sure that the two dicts have the same set of - keys as defined in the list self.examine_separately + Posthoc checking makes sure that the two dicts have the same set of + keys as defined in the list self.examine_separately. 
""" msg = "" disclosive = False @@ -662,10 +653,10 @@ def run_attack( Notes ----- - Currently implement attack types are: - Likelihood Ratio: lira - Worst_Case Membership inference: worst_case - Single Attribute Inference: attributes + Currently implemented attack types are: + - Likelihood Ratio: lira. + - Worst_Case Membership inference: worst_case. + - Single Attribute Inference: attribute. """ if attack_name == "worst_case": attack_obj = WorstCaseAttack( diff --git a/examples/MIAandAIA_attacks_example.py b/examples/MIAandAIA_attacks_example.py index 0af3eb0b..f9f4c532 100644 --- a/examples/MIAandAIA_attacks_example.py +++ b/examples/MIAandAIA_attacks_example.py @@ -16,20 +16,17 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from aisdc.attacks.multiple_attacks import ( # pylint: disable = import-error - ConfigFile, - MultipleAttacks, -) -from aisdc.attacks.target import Target # pylint: disable = import-error +from aisdc.attacks.multiple_attacks import ConfigFile, MultipleAttacks +from aisdc.attacks.target import Target sys.path.append(os.path.dirname(os.path.dirname(__file__))) if __name__ == "__main__": # [Researcher] Access a dataset nursery_data = fetch_openml(data_id=26, as_frame=True) - x = np.asarray(nursery_data.data, dtype=str) + X = np.asarray(nursery_data.data, dtype=str) y = np.asarray(nursery_data.target, dtype=str) - n_features = np.shape(x)[1] + n_features = np.shape(X)[1] indices: list[list[int]] = [ [0, 1, 2], # parents [3, 4, 5, 6, 7], # has_nurs @@ -44,12 +41,12 @@ # [Researcher] Split into training and test sets # target model train / test split - these are strings ( - x_train_orig, - x_test_orig, + X_train_orig, + X_test_orig, y_train_orig, y_test_orig, ) = train_test_split( - x, + X, y, test_size=0.5, stratify=y, @@ -60,22 +57,22 @@ # one-hot encoding of features and integer encoding of labels label_enc = LabelEncoder() feature_enc = OneHotEncoder() - x_train = feature_enc.fit_transform(x_train_orig).toarray() + X_train = feature_enc.fit_transform(X_train_orig).toarray() y_train = label_enc.fit_transform(y_train_orig) - x_test = feature_enc.transform(x_test_orig).toarray() + X_test = feature_enc.transform(X_test_orig).toarray() y_test = label_enc.transform(y_test_orig) # [Researcher] Define the classifier model = RandomForestClassifier(bootstrap=False) # [Researcher] Train the classifier - model.fit(x_train, y_train) + model.fit(X_train, y_train) # [TRE / Researcher] Wrap the model and data in a Target object target = Target(model=model) target.name = "nursery" - target.add_processed_data(x_train, y_train, x_test, y_test) - target.add_raw_data(x, y, x_train_orig, y_train_orig, x_test_orig, y_test_orig) + target.add_processed_data(X_train, y_train, X_test, y_test) + target.add_raw_data(X, y, X_train_orig, y_train_orig, X_test_orig, y_test_orig) for i in range(n_features): target.add_feature(nursery_data.feature_names[i], indices[i], "onehot") diff --git a/examples/attribute_inference_example.py b/examples/attribute_inference_example.py index fd571451..02e09b2e 100644 --- a/examples/attribute_inference_example.py +++ b/examples/attribute_inference_example.py @@ -10,17 +10,15 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from aisdc.attacks import attribute_attack # pylint: disable = import-error -from aisdc.attacks.target import Target # pylint: disable = import-error - -# pylint: disable = duplicate-code +from aisdc.attacks 
import attribute_attack +from aisdc.attacks.target import Target if __name__ == "__main__": # [Researcher] Access a dataset nursery_data = fetch_openml(data_id=26, as_frame=True) - x = np.asarray(nursery_data.data, dtype=str) + X = np.asarray(nursery_data.data, dtype=str) y = np.asarray(nursery_data.target, dtype=str) - n_features = np.shape(x)[1] + n_features = np.shape(X)[1] indices: list[list[int]] = [ [0, 1, 2], # parents [3, 4, 5, 6, 7], # has_nurs @@ -35,12 +33,12 @@ # [Researcher] Split into training and test sets # target model train / test split - these are strings ( - x_train_orig, - x_test_orig, + X_train_orig, + X_test_orig, y_train_orig, y_test_orig, ) = train_test_split( - x, + X, y, test_size=0.5, stratify=y, @@ -51,42 +49,38 @@ # one-hot encoding of features and integer encoding of labels label_enc = LabelEncoder() feature_enc = OneHotEncoder() - x_train = feature_enc.fit_transform(x_train_orig).toarray() + X_train = feature_enc.fit_transform(X_train_orig).toarray() y_train = label_enc.fit_transform(y_train_orig) - x_test = feature_enc.transform(x_test_orig).toarray() + X_test = feature_enc.transform(X_test_orig).toarray() y_test = label_enc.transform(y_test_orig) # [Researcher] Define the classifier model = RandomForestClassifier(bootstrap=False) # [Researcher] Train the classifier - model.fit(x_train, y_train) - acc_train = model.score(x_train, y_train) - acc_test = model.score(x_test, y_test) + model.fit(X_train, y_train) + acc_train = model.score(X_train, y_train) + acc_test = model.score(X_test, y_test) print(f"Base model train accuracy: {acc_train}") print(f"Base model test accuracy: {acc_test}") # [TRE / Researcher] Wrap the model and data in a Target object target = Target(model=model) target.name = "nursery" - target.add_processed_data(x_train, y_train, x_test, y_test) - target.add_raw_data(x, y, x_train_orig, y_train_orig, x_test_orig, y_test_orig) + target.add_processed_data(X_train, y_train, X_test, y_test) + target.add_raw_data(X, y, X_train_orig, y_train_orig, X_test_orig, y_test_orig) for i in range(n_features): target.add_feature(nursery_data.feature_names[i], indices[i], "onehot") print(f"Dataset: {target.name}") print(f"Features: {target.features}") - print(f"x_train shape = {np.shape(target.x_train)}") + print(f"X_train shape = {np.shape(target.X_train)}") print(f"y_train shape = {np.shape(target.y_train)}") - print(f"x_test shape = {np.shape(target.x_test)}") + print(f"X_test shape = {np.shape(target.X_test)}") print(f"y_test shape = {np.shape(target.y_test)}") # [TRE] Create the attack object with attack parameters - attack_obj = attribute_attack.AttributeAttack( - n_cpu=2, - output_dir="outputs_aia", - # report_name="report_aia", - ) + attack_obj = attribute_attack.AttributeAttack(n_cpu=2, output_dir="outputs_aia") # [TRE] Run the attack attack_obj.attack(target) @@ -122,7 +116,6 @@ config = { "n_cpu": 2, "output_dir": "outputs_aia", - # "report_name": "report_aia", } with open("config_aia_cmd.json", "w", encoding="utf-8") as f: diff --git a/examples/lira_attack_example.py b/examples/lira_attack_example.py index 42430384..dfccb7e2 100644 --- a/examples/lira_attack_example.py +++ b/examples/lira_attack_example.py @@ -16,7 +16,7 @@ *Programmatically* 1. The TRE calls the attack code. 2. The TRE computes and inspects attack metrics. - *Command line + *Command line* 3. The researcher writes out their training and testing data, as well as the predictions that their target model makes on this data. 4. 
The TRE create a config file for the attack, specifying the file names for the files created @@ -26,8 +26,6 @@ Below, [Researcher] and [TRE] are used to denote which task is performed by whom. """ -# pylint: disable = duplicate-code - import json import os import sys @@ -37,23 +35,23 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split -from aisdc.attacks.likelihood_attack import LIRAAttack # pylint: disable = import-error -from aisdc.attacks.target import Target # pylint: disable = import-error +from aisdc.attacks.likelihood_attack import LIRAAttack +from aisdc.attacks.target import Target # [Researcher] Access a dataset X, y = load_breast_cancer(return_X_y=True, as_frame=False) # [Researcher] Split into training and test sets -train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # [Researcher] Define the classifier target_model = RandomForestClassifier(min_samples_split=2, min_samples_leaf=1) # [Researcher] Train the classifier -target_model.fit(train_X, train_y) +target_model.fit(X_train, y_train) # [Researcher] Provide the model and the train and test data to the TRE target = Target(model=target_model) -target.add_processed_data(train_X, train_y, test_X, test_y) +target.add_processed_data(X_train, y_train, X_test, y_test) # [TRE] Creates a config file for the likelihood attack config = { @@ -72,7 +70,6 @@ attack_obj = LIRAAttack( n_shadow_models=100, output_dir="outputs_lira", - # report_name="report_lira", attack_config_json_file_name="lira_config.json", ) @@ -108,7 +105,6 @@ attack_obj = LIRAAttack( n_shadow_models=100, output_dir="outputs_lira", - # report_name="report_lira", attack_config_json_file_name="lira_config.json", shadow_models_fail_fast=True, n_shadow_rows_confidences_min=10, @@ -150,12 +146,12 @@ # the command line rather than programmatically # [Researcher] Dump the training and test predictions to .csv files -np.savetxt("train_preds.csv", target_model.predict_proba(train_X), delimiter=",") -np.savetxt("test_preds.csv", target_model.predict_proba(test_X), delimiter=",") +np.savetxt("train_preds.csv", target_model.predict_proba(X_train), delimiter=",") +np.savetxt("test_preds.csv", target_model.predict_proba(X_test), delimiter=",") # [Researcher] Dump the training and test data to a .csv file -np.savetxt("train_data.csv", np.hstack((train_X, train_y[:, None])), delimiter=",") -np.savetxt("test_data.csv", np.hstack((test_X, test_y[:, None])), delimiter=",") +np.savetxt("train_data.csv", np.hstack((X_train, y_train[:, None])), delimiter=",") +np.savetxt("test_data.csv", np.hstack((X_test, y_test[:, None])), delimiter=",") # [Researcher] Dump the target model and target data target.save(path="target_model_for_lira") @@ -195,7 +191,6 @@ config = { "n_shadow_models": 150, "output_dir": "outputs_lira", - # "report_name": "report_lira", "training_data_filename": "train_data.csv", "test_data_filename": "test_data.csv", "training_preds_filename": "train_preds.csv", @@ -217,7 +212,6 @@ config = { "n_shadow_models": 150, "output_dir": "outputs_lira", - # "report_name": "report_lira", "shadow_models_fail_fast": True, "n_shadow_rows_confidences_min": 10, "training_data_filename": "train_data.csv", diff --git a/examples/safemodel_attack_integration_bothcalls.py b/examples/safemodel_attack_integration_bothcalls.py index 58d43790..64f86bfa 100644 --- a/examples/safemodel_attack_integration_bothcalls.py +++ 
b/examples/safemodel_attack_integration_bothcalls.py @@ -7,18 +7,16 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from aisdc.attacks.target import Target # pylint: disable=import-error -from aisdc.safemodel.classifiers import ( # pylint: disable=import-error - SafeDecisionTreeClassifier, -) +from aisdc.attacks.target import Target +from aisdc.safemodel.classifiers import SafeDecisionTreeClassifier if __name__ == "__main__": # [Researcher] Access a dataset nursery_data = fetch_openml(data_id=26, as_frame=True) - x = np.asarray(nursery_data.data, dtype=str) + X = np.asarray(nursery_data.data, dtype=str) y = np.asarray(nursery_data.target, dtype=str) - n_features = np.shape(x)[1] + n_features = np.shape(X)[1] indices: list[list[int]] = [ [0, 1, 2], # parents [3, 4, 5, 6, 7], # has_nurs @@ -33,12 +31,12 @@ # [Researcher] Split into training and test sets # target model train / test split - these are strings ( - x_train_orig, - x_test_orig, + X_train_orig, + X_test_orig, y_train_orig, y_test_orig, ) = train_test_split( - x, + X, y, test_size=0.5, stratify=y, @@ -49,29 +47,29 @@ # one-hot encoding of features and integer encoding of labels label_enc = LabelEncoder() feature_enc = OneHotEncoder() - x_train = feature_enc.fit_transform(x_train_orig).toarray() + X_train = feature_enc.fit_transform(X_train_orig).toarray() y_train = label_enc.fit_transform(y_train_orig) - x_test = feature_enc.transform(x_test_orig).toarray() + X_test = feature_enc.transform(X_test_orig).toarray() y_test = label_enc.transform(y_test_orig) # [Researcher] Build a model model = SafeDecisionTreeClassifier(random_state=1) - model.fit(x_train, y_train) + model.fit(X_train, y_train) msg, disclosive = model.preliminary_check() # [TRE / Researcher] Wrap the model and data in a Target object target = Target(model=model) target.name = "nursery" - target.add_processed_data(x_train, y_train, x_test, y_test) - target.add_raw_data(x, y, x_train_orig, y_train_orig, x_test_orig, y_test_orig) + target.add_processed_data(X_train, y_train, X_test, y_test) + target.add_raw_data(X, y, X_train_orig, y_train_orig, X_test_orig, y_test_orig) for i in range(n_features): target.add_feature(nursery_data.feature_names[i], indices[i], "onehot") logging.info("Dataset: %s", target.name) logging.info("Features: %s", target.features) - logging.info("x_train shape = %s", np.shape(target.x_train)) + logging.info("X_train shape = %s", np.shape(target.X_train)) logging.info("y_train shape = %s", np.shape(target.y_train)) - logging.info("x_test shape = %s", np.shape(target.x_test)) + logging.info("X_test shape = %s", np.shape(target.X_test)) logging.info("y_test shape = %s", np.shape(target.y_test)) # [TRE / Researcher] Perform disclosure checks diff --git a/examples/worst_case_attack_example.py b/examples/worst_case_attack_example.py index abcf7589..e23e4574 100644 --- a/examples/worst_case_attack_example.py +++ b/examples/worst_case_attack_example.py @@ -25,8 +25,8 @@ from sklearn.model_selection import train_test_split from sklearn.svm import SVC -from aisdc.attacks import worst_case_attack # pylint: disable = import-error -from aisdc.attacks.target import Target # pylint: disable = import-error +from aisdc.attacks import worst_case_attack +from aisdc.attacks.target import Target sys.path.append(os.path.dirname(os.path.dirname(__file__))) @@ -34,23 +34,23 @@ X, y = load_breast_cancer(return_X_y=True, as_frame=False) # [Researcher] Split into training and test sets -train_X, test_X, 
train_y, test_y = train_test_split(X, y, test_size=0.3) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # [Researcher] Define the classifier target_model = SVC(gamma=0.1, probability=True) # [Researcher] Train the classifier -target_model.fit(train_X, train_y) +target_model.fit(X_train, y_train) # [Researcher] Provide the model and the train and test data to the TRE # [TRE] Compute the predictions on the training and test sets -train_preds = target_model.predict_proba(train_X) -test_preds = target_model.predict_proba(test_X) +train_preds = target_model.predict_proba(X_train) +test_preds = target_model.predict_proba(X_test) # [TRE / Researcher] Wrap the model and data in a Target object target = Target(model=target_model) -target.add_processed_data(train_X, train_y, test_X, test_y) +target.add_processed_data(X_train, y_train, X_test, y_test) # [TRE] Create the attack object attack_obj = worst_case_attack.WorstCaseAttack( @@ -74,7 +74,7 @@ output_dir="outputs_worstcase", # # If report_name is given, it creates pdf and json files with the specified name; # # otherwise it create output files with default name 'report_worstcase' - # report_name="programmatically_worstcase_example1_report", + # e.g., report_name="programmatically_worstcase_example1_report", attack_metric_success_name="P_HIGHER_AUC", # threshold for a given metric for failure/success counters attack_metric_success_thresh=0.05, @@ -285,7 +285,6 @@ "train_beta": 5, "test_beta": 2, "output_dir": "outputs_worstcase", - # "report_name": "report_worstcase", "training_preds_filename": "train_preds.csv", "test_preds_filename": "test_preds.csv", "attack_metric_success_name": "P_HIGHER_AUC", @@ -313,7 +312,6 @@ "train_beta": 5, "test_beta": 2, "output_dir": "outputs_worstcase", - # "report_name": "report_worstcase", "training_preds_filename": "train_preds.csv", "test_preds_filename": "test_preds.csv", } diff --git a/pyproject.toml b/pyproject.toml index 77f6499a..5c02de7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,40 +40,52 @@ extend-include = ["*.ipynb"] lint.select = [ # "ANN", # flake8-annotations "ARG", # flake8-unused-arguments -# "B", # flake8-bugbear -# "C4", # flake8-comprehensions + "B", # flake8-bugbear + "C4", # flake8-comprehensions # "C90", # mccabe "D", # pydocstyle # "DTZ", # flake8-datetimez # "E", # pycodestyle -# "EM", # flake8-errmsg -# "ERA", # eradicate -# "F", # Pyflakes + "EM", # flake8-errmsg + "ERA", # eradicate + "F", # Pyflakes "I", # isort "ICN", # flake8-import-conventions -# "N", # pep8-naming + "N", # pep8-naming # "PD", # pandas-vet "PGH", # pygrep-hooks -# "PIE", # flake8-pie + "PIE", # flake8-pie # "PL", # Pylint "PLC", # Pylint "PLE", # Pylint # "PLR", # Pylint -# "PLW", # Pylint + "PLW", # Pylint "PT", # flake8-pytest-style "Q", # flake8-quotes -# "RET", # flake8-return + "RET", # flake8-return "RUF100", # Ruff-specific # "S", # flake8-bandit -# "SIM", # flake8-simplify + "SIM", # flake8-simplify # "T20", # flake8-print -# "TID", # flake8-tidy-imports -# "UP", # pyupgrade + "TID", # flake8-tidy-imports + "UP", # pyupgrade "W", # pycodestyle "YTT", # flake8-2020 ] lint.ignore = [ + "ANN101", # missing-type-self + "EM101", # raw-string-in-exception + "EM102", # f-string-in-exception + "N818", # error-suffix-on-exception-name +] + +[tool.ruff.lint.pep8-naming] +extend-ignore-names = [ + "X", "X_train", "X_val", "X_test", "X_predict", + "X_target_train", "X_shadow_train", "N", "C", + "X_train_orig", "X_test_orig", "X_orig", "X_transformed", + "X_encoded", ] 
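Since a diff leaves little room for commentary, an illustrative Python snippet (not part of the patch) of the kind of code the `extend-ignore-names` list above protects now that pep8-naming (`N`) is enabled:

```python
from sklearn.model_selection import train_test_split


def make_split(X, y):
    # Without the exemption list, rule N806 (lowercase variables inside
    # functions) would flag these sklearn-style feature-matrix names.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    return X_train, X_test, y_train, y_test
```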
[tool.ruff.lint.pydocstyle] @@ -84,4 +96,6 @@ docstring-code-format = true docstring-code-line-length = 80 [tool.ruff.lint.extend-per-file-ignores] -"tests/**/*" = ["S101"] +"user_stories/**/*" = ["ANN"] +"tests/**/*" = ["S101", "PLR2004", "ANN"] +"aisdc/attacks/structural_attack.py" = ["PLR2004"] diff --git a/tests/attacks/test_attack_report_formatter.py b/tests/attacks/test_attack_report_formatter.py index 292e4ec3..f2fba435 100644 --- a/tests/attacks/test_attack_report_formatter.py +++ b/tests/attacks/test_attack_report_formatter.py @@ -52,7 +52,7 @@ def get_test_report(): def get_target_report(): """Create a mock target model dictionary for use with tests.""" - target_formatted = { + return { "data_name": "", "n_samples": 12960, "features": {}, @@ -63,8 +63,6 @@ def get_target_report(): "model_params": {"C": 1.0}, } - return target_formatted - class TestGenerateReport(unittest.TestCase): """Class which tests the attack_report_formatter.py file.""" @@ -82,9 +80,7 @@ def process_json_from_file(self, json_formatted): g.export_to_file(output_filename) with open(output_filename, encoding="utf-8") as file: - data = file.read() - - return data + return file.read() def test_not_implemented(self): """Test to make sure analysis module fails expectedly when functions are called directly.""" @@ -383,15 +379,15 @@ def test_univariate_metrics_module(self): returned = f.process_dict() wca_auc = returned["WorstCaseAttack"]["AUC"] - for k in wca_auc.keys(): + for k in wca_auc: assert auc_value == pytest.approx(wca_auc[k]) wca_acc = returned["WorstCaseAttack"]["ACC"] - for k in wca_acc.keys(): + for k in wca_acc: assert acc_value == pytest.approx(wca_acc[k]) wca_fdif = returned["WorstCaseAttack"]["FDIF01"] - for k in wca_fdif.keys(): + for k in wca_fdif: assert fdif01_value == pytest.approx(wca_fdif[k]) assert str(f) == "Summary of Univarite Metrics" diff --git a/tests/attacks/test_attacks_target.py b/tests/attacks/test_attacks_target.py index 7fed9d7e..fa021aae 100644 --- a/tests/attacks/test_attacks_target.py +++ b/tests/attacks/test_attacks_target.py @@ -29,13 +29,13 @@ def test_target(get_target): assert tre_target.n_samples == target.n_samples assert tre_target.n_samples_orig == target.n_samples_orig assert tre_target.n_features == target.n_features - assert np.array_equal(tre_target.x_train, target.x_train) + assert np.array_equal(tre_target.X_train, target.X_train) assert np.array_equal(tre_target.y_train, target.y_train) - assert np.array_equal(tre_target.x_test, target.x_test) + assert np.array_equal(tre_target.X_test, target.X_test) assert np.array_equal(tre_target.y_test, target.y_test) - assert np.array_equal(tre_target.x_orig, target.x_orig) + assert np.array_equal(tre_target.X_orig, target.X_orig) assert np.array_equal(tre_target.y_orig, target.y_orig) - assert np.array_equal(tre_target.x_train_orig, target.x_train_orig) + assert np.array_equal(tre_target.X_train_orig, target.X_train_orig) assert np.array_equal(tre_target.y_train_orig, target.y_train_orig) - assert np.array_equal(tre_target.x_test_orig, target.x_test_orig) + assert np.array_equal(tre_target.X_test_orig, target.X_test_orig) assert np.array_equal(tre_target.y_test_orig, target.y_test_orig) diff --git a/tests/attacks/test_attribute_inference_attack.py b/tests/attacks/test_attribute_inference_attack.py index bcb120f0..1a7bd6bb 100644 --- a/tests/attacks/test_attribute_inference_attack.py +++ b/tests/attacks/test_attribute_inference_attack.py @@ -29,7 +29,7 @@ def pytest_generate_tests(metafunc): def 
fixture_common_setup(get_target): """Get ready to test some code.""" target = get_target - target.model.fit(target.x_train, target.y_train) + target.model.fit(target.X_train, target.y_train) attack_obj = attribute_attack.AttributeAttack(n_cpu=7, report_name="aia_report") return target, attack_obj @@ -64,7 +64,7 @@ def test_categorical_via_modified_attack_brute_force(common_setup): t_low_train_samples = t_low["train"][4] # Check the number of samples in the dataset - assert len(target.x_train) == t_low_train_samples + assert len(target.X_train) == t_low_train_samples # Check that all samples are correct for this threshold assert t_low_correct == t_low_total @@ -73,14 +73,14 @@ def test_categorical_via_modified_attack_brute_force(common_setup): t_high = _infer_categorical(target, feature, threshold) t_high_correct = t_high["train"][0] t_high_train_samples = t_high["train"][4] - assert len(target.x_train) == t_high_train_samples + assert len(target.X_train) == t_high_train_samples assert t_high_correct == 0 def test_continuous_via_modified_bounds_risk(common_setup): """Test continuous variables get_bounds_risk().""" target, _ = common_setup - returned = _get_bounds_risk(target.model, "dummy", 8, target.x_train, target.x_test) + returned = _get_bounds_risk(target.model, "dummy", 8, target.X_train, target.X_test) # Check the number of parameters returned assert len(returned.keys()) == 3 # Check the value of the returned parameters @@ -88,7 +88,7 @@ def test_continuous_via_modified_bounds_risk(common_setup): assert returned["test"] == 0 -def test_AIA_on_nursery(common_setup): +def test_aia_on_nursery(common_setup): """Test AIA on the nursery data with an added continuous feature.""" target, attack_obj = common_setup attack_obj.attack(target) @@ -99,7 +99,7 @@ def test_AIA_on_nursery(common_setup): assert "categorical" in keys -def test_AIA_on_nursery_from_cmd(common_setup): +def test_aia_on_nursery_from_cmd(common_setup): """Test AIA on the nursery data with an added continuous feature.""" target, _ = common_setup target.save(path="tests/test_aia_target") diff --git a/tests/attacks/test_failfast.py b/tests/attacks/test_failfast.py index 9cacc784..df888632 100644 --- a/tests/attacks/test_failfast.py +++ b/tests/attacks/test_failfast.py @@ -27,8 +27,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.04, attack_metric_success_comp_type="lte", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert not failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert not failfast_obj.check_attack_success(metrics) # Option 2 attack_obj = worst_case_attack.WorstCaseAttack( @@ -36,8 +36,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.06, attack_metric_success_comp_type="lte", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert failfast_obj.check_attack_success(metrics) # Option 3 attack_obj = worst_case_attack.WorstCaseAttack( @@ -45,8 +45,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.04, attack_metric_success_comp_type="lt", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert not failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert not failfast_obj.check_attack_success(metrics) # Option 4 attack_obj = worst_case_attack.WorstCaseAttack( @@ -54,8 +54,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.06, 
attack_metric_success_comp_type="lt", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert failfast_obj.check_attack_success(metrics) # Option 5 attack_obj = worst_case_attack.WorstCaseAttack( @@ -63,8 +63,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.04, attack_metric_success_comp_type="gte", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert failfast_obj.check_attack_success(metrics) # Option 6 attack_obj = worst_case_attack.WorstCaseAttack( @@ -72,8 +72,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.06, attack_metric_success_comp_type="gte", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert not failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert not failfast_obj.check_attack_success(metrics) # Option 7 attack_obj = worst_case_attack.WorstCaseAttack( @@ -81,8 +81,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.04, attack_metric_success_comp_type="gt", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert failfast_obj.check_attack_success(metrics) # Option 8 attack_obj = worst_case_attack.WorstCaseAttack( @@ -90,8 +90,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.06, attack_metric_success_comp_type="gt", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert not failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert not failfast_obj.check_attack_success(metrics) # Option 9 attack_obj = worst_case_attack.WorstCaseAttack( @@ -99,8 +99,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.05, attack_metric_success_comp_type="eq", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert failfast_obj.check_attack_success(metrics) # Option 10 attack_obj = worst_case_attack.WorstCaseAttack( @@ -108,8 +108,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.06, attack_metric_success_comp_type="eq", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert not failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert not failfast_obj.check_attack_success(metrics) # Option 11 attack_obj = worst_case_attack.WorstCaseAttack( @@ -117,8 +117,8 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.05, attack_metric_success_comp_type="not_eq", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert not failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + assert not failfast_obj.check_attack_success(metrics) # Option 12 attack_obj = worst_case_attack.WorstCaseAttack( @@ -126,9 +126,9 @@ def test_parse_boolean_argument(self): attack_metric_success_thresh=0.06, attack_metric_success_comp_type="not_eq", ) - failfast_Obj = failfast.FailFast(attack_obj) - assert failfast_Obj.check_attack_success(metrics) - assert failfast_Obj.get_fail_count() == 0 + failfast_obj = failfast.FailFast(attack_obj) + assert failfast_obj.check_attack_success(metrics) + assert failfast_obj.get_fail_count() == 0 def 
test_attack_success_fail_counts_and_overall_attack_success(self): """Test success and fail counts of attacks. @@ -147,18 +147,18 @@ def test_attack_success_fail_counts_and_overall_attack_success(self): attack_metric_success_comp_type="lte", attack_metric_success_count_thresh=3, ) - failfast_Obj = failfast.FailFast(attack_obj) - _ = failfast_Obj.check_attack_success(metrics) + failfast_obj = failfast.FailFast(attack_obj) + _ = failfast_obj.check_attack_success(metrics) metrics["P_HIGHER_AUC"] = 0.07 - _ = failfast_Obj.check_attack_success(metrics) + _ = failfast_obj.check_attack_success(metrics) metrics["P_HIGHER_AUC"] = 0.03 - _ = failfast_Obj.check_attack_success(metrics) - assert not failfast_Obj.check_overall_attack_success(attack_obj) + _ = failfast_obj.check_attack_success(metrics) + assert not failfast_obj.check_overall_attack_success(attack_obj) metrics["P_HIGHER_AUC"] = 0.02 - _ = failfast_Obj.check_attack_success(metrics) + _ = failfast_obj.check_attack_success(metrics) metrics["P_HIGHER_AUC"] = 0.01 - _ = failfast_Obj.check_attack_success(metrics) - assert failfast_Obj.get_success_count() == 3 - assert failfast_Obj.get_fail_count() == 2 - assert failfast_Obj.check_overall_attack_success(attack_obj) + _ = failfast_obj.check_attack_success(metrics) + assert failfast_obj.get_success_count() == 3 + assert failfast_obj.get_fail_count() == 2 + assert failfast_obj.check_overall_attack_success(attack_obj) diff --git a/tests/attacks/test_lira_attack.py b/tests/attacks/test_lira_attack.py index 2f119592..4732f97d 100644 --- a/tests/attacks/test_lira_attack.py +++ b/tests/attacks/test_lira_attack.py @@ -48,28 +48,28 @@ def test_predict(dummy_classifier_setup): def fixture_lira_classifier_setup(): """Set up common things for LiRA.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) target_model = RandomForestClassifier( n_estimators=100, min_samples_split=2, min_samples_leaf=1 ) - target_model.fit(train_X, train_y) + target_model.fit(X_train, y_train) target = Target(target_model) - target.add_processed_data(train_X, train_y, test_X, test_y) + target.add_processed_data(X_train, y_train, X_test, y_test) target.save(path="test_lira_target") # Dump training and test data to csv np.savetxt( "train_data.csv", - np.hstack((train_X, train_y[:, None])), + np.hstack((X_train, y_train[:, None])), delimiter=",", ) - np.savetxt("test_data.csv", np.hstack((test_X, test_y[:, None])), delimiter=",") + np.savetxt("test_data.csv", np.hstack((X_test, y_test[:, None])), delimiter=",") # dump the training and test predictions into files np.savetxt( "train_preds.csv", - target_model.predict_proba(train_X), + target_model.predict_proba(X_train), delimiter=",", ) - np.savetxt("test_preds.csv", target_model.predict_proba(test_X), delimiter=",") + np.savetxt("test_preds.csv", target_model.predict_proba(X_test), delimiter=",") return target @@ -105,14 +105,14 @@ def test_check_and_update_dataset(lira_classifier_setup): attack_obj = LIRAAttack(n_shadow_models=N_SHADOW_MODELS) # now make test[0] have a class not present in training set# - local_test_y = np.copy(target.y_test) - local_test_y[0] = 5 + local_y_test = np.copy(target.y_test) + local_y_test[0] = 5 local_target = Target(target.model) local_target.add_processed_data( - target.x_train, target.y_train, target.x_test, local_test_y + target.X_train, target.y_train, target.X_test, local_y_test ) - 
unique_classes_pre = set(local_test_y) - n_test_examples_pre = len(local_test_y) + unique_classes_pre = set(local_y_test) + n_test_examples_pre = len(local_y_test) local_target = attack_obj._check_and_update_dataset( # pylint: disable=protected-access local_target ) diff --git a/tests/attacks/test_metrics.py b/tests/attacks/test_metrics.py index 11a19741..e95fd75a 100644 --- a/tests/attacks/test_metrics.py +++ b/tests/attacks/test_metrics.py @@ -7,15 +7,7 @@ import numpy as np import pytest -from aisdc.metrics import ( - _div, - _tpr_at_fpr, - get_metrics, - get_probabilities, - min_max_disc, -) - -# pylint: disable = invalid-name +from aisdc.metrics import _div, _tpr_at_fpr, get_metrics, min_max_disc PREDICTED_CLASS = np.array([0, 1, 0, 0, 1, 1]) TRUE_CLASS = np.array([0, 0, 0, 1, 1, 1]) @@ -73,52 +65,16 @@ def test_valid_input(self): assert pytest.approx(tpr) == 0.0 -class TestProbabilities(unittest.TestCase): - """Test the checks on the input parameters of the get_probabilites function.""" - - def test_permute_rows_errors(self): - """Test error when permute_rows is True, but no y_test is supplied.""" - clf = DummyClassifier() - testX = [] - with pytest.raises(ValueError, match="not enough values to unpack.*"): - get_probabilities(clf, testX, permute_rows=True) - - def test_permute_rows_with_permute_rows(self): - """Test permute_rows = True succeeds.""" - clf = DummyClassifier() - testX = np.zeros((4, 2)) - testY = np.zeros((4, 2)) - - returned = get_probabilities(clf, testX, testY, permute_rows=True) - - # Check the function returns two arguments - assert len(returned) == 2 - - # Check that the second argument is the same shape as testY - assert testY.shape == returned[1].shape - - # Check that the function is returning the right thing: predict_proba - assert clf.predict_proba(testX).shape == returned[0].shape - - def test_permute_rows_without_permute_rows(self): - """Test permute_rows = False succeeds.""" - clf = DummyClassifier() - testX = np.zeros((4, 2)) - y_pred_proba = get_probabilities(clf, testX, permute_rows=False) - # Check the function returns pnly y_pred_proba - assert clf.predict_proba(testX).shape == y_pred_proba.shape - - class TestMetrics(unittest.TestCase): """Test the metrics with some dummy predictions.""" def test_metrics(self): """Test each individual metric with dummy data.""" clf = DummyClassifier() - testX = [] - testy = TRUE_CLASS - y_pred_proba = get_probabilities(clf, testX, testy, permute_rows=False) - metrics = get_metrics(y_pred_proba, testy) + X_test = [] + y_test = TRUE_CLASS + y_pred_proba = clf.predict_proba(X_test) + metrics = get_metrics(y_pred_proba, y_test) assert metrics["TPR"] == pytest.approx(2 / 3) assert metrics["FPR"] == pytest.approx(1 / 3) assert metrics["FAR"] == pytest.approx(1 / 3) @@ -171,7 +127,7 @@ def test_tpr(self): assert tpr == pytest.approx(1) -class Test_Div(unittest.TestCase): +class TestDiv(unittest.TestCase): """Test the _div functionality.""" def test_div(self): diff --git a/tests/attacks/test_multiple_attacks.py b/tests/attacks/test_multiple_attacks.py index e10d0dc7..1437e178 100644 --- a/tests/attacks/test_multiple_attacks.py +++ b/tests/attacks/test_multiple_attacks.py @@ -24,7 +24,7 @@ def pytest_generate_tests(metafunc): def fixture_common_setup(get_target): """Get ready to test some code.""" target = get_target - target.model.fit(target.x_train, target.y_train) + target.model.fit(target.X_train, target.y_train) attack_obj = MultipleAttacks(config_filename="test_single_config.json") return target, attack_obj diff --git 
a/tests/attacks/test_structural_attack.py b/tests/attacks/test_structural_attack.py index 00c5f877..86b71f80 100644 --- a/tests/attacks/test_structural_attack.py +++ b/tests/attacks/test_structural_attack.py @@ -22,7 +22,7 @@ def get_target(modeltype: str, **kwparams: dict) -> Target: """Load dataset and create target of the desired type.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # these types should be handled if modeltype == "dt": @@ -42,11 +42,11 @@ def get_target(modeltype: str, **kwparams: dict) -> Target: raise NotImplementedError("model type passed to get_model unknown") # Train the classifier - target_model.fit(train_X, train_y) + target_model.fit(X_train, y_train) # Wrap the model and data in a Target object target = Target(model=target_model) - target.add_processed_data(train_X, train_y, test_X, test_y) + target.add_processed_data(X_train, y_train, X_test, y_test) return target diff --git a/tests/attacks/test_worst_case_attack.py b/tests/attacks/test_worst_case_attack.py index 8b09651f..6860a606 100644 --- a/tests/attacks/test_worst_case_attack.py +++ b/tests/attacks/test_worst_case_attack.py @@ -43,17 +43,17 @@ def test_config_file_arguments_parsin(): def test_attack_from_predictions_cmd(): """Running attack using configuration file and prediction files.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) + X_train, X_test, train_y, test_y = train_test_split(X, y, test_size=0.3) model = SVC(gamma=0.1, probability=True) - model.fit(train_X, train_y) + model.fit(X_train, train_y) - ytr_pred = model.predict_proba(train_X) - yte_pred = model.predict_proba(test_X) + ytr_pred = model.predict_proba(X_train) + yte_pred = model.predict_proba(X_test) np.savetxt("ypred_train.csv", ytr_pred, delimiter=",") np.savetxt("ypred_test.csv", yte_pred, delimiter=",") target = Target(model=model) - target.add_processed_data(train_X, train_y, test_X, test_y) + target.add_processed_data(X_train, train_y, X_test, test_y) target.save(path="test_worstcase_target") @@ -88,15 +88,15 @@ def test_attack_from_predictions_cmd(): def test_report_worstcase(): """Tests worst case attack directly.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) + X_train, X_test, train_y, test_y = train_test_split(X, y, test_size=0.3) model = SVC(gamma=0.1, probability=True) - model.fit(train_X, train_y) - _ = model.predict_proba(train_X) - _ = model.predict_proba(test_X) + model.fit(X_train, train_y) + _ = model.predict_proba(X_train) + _ = model.predict_proba(X_test) target = Target(model=model) - target.add_processed_data(train_X, train_y, test_X, test_y) + target.add_processed_data(X_train, train_y, X_test, test_y) # with multiple reps attack_obj = worst_case_attack.WorstCaseAttack( @@ -131,13 +131,13 @@ def test_report_worstcase(): def test_attack_with_correct_feature(): """Test the attack when the model correctness feature is used.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) + X_train, X_test, train_y, test_y = train_test_split(X, y, test_size=0.3) model = SVC(gamma=0.1, probability=True) - model.fit(train_X, train_y) + model.fit(X_train, train_y) target = Target(model=model) - 
target.add_processed_data(train_X, train_y, test_X, test_y) + target.add_processed_data(X_train, train_y, X_test, test_y) # with multiple reps attack_obj = worst_case_attack.WorstCaseAttack( @@ -163,17 +163,17 @@ def test_attack_with_correct_feature(): def test_attack_from_predictions(): """Checks code that runs attacks from predictions.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) + X_train, X_test, train_y, test_y = train_test_split(X, y, test_size=0.3) model = SVC(gamma=0.1, probability=True) - model.fit(train_X, train_y) - ytr_pred = model.predict_proba(train_X) - yte_pred = model.predict_proba(test_X) + model.fit(X_train, train_y) + ytr_pred = model.predict_proba(X_train) + yte_pred = model.predict_proba(X_test) np.savetxt("ypred_train.csv", ytr_pred, delimiter=",") np.savetxt("ypred_test.csv", yte_pred, delimiter=",") target = Target(model=model) - target.add_processed_data(train_X, train_y, test_X, test_y) + target.add_processed_data(X_train, train_y, X_test, test_y) attack_obj = worst_case_attack.WorstCaseAttack( # How many attacks to run -- in each the attack model is trained on a different @@ -197,17 +197,17 @@ def test_attack_from_predictions(): def test_attack_from_predictions_no_dummy(): """Checks code that runs attacks from predictions.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) + X_train, X_test, train_y, test_y = train_test_split(X, y, test_size=0.3) model = SVC(gamma=0.1, probability=True) - model.fit(train_X, train_y) - ytr_pred = model.predict_proba(train_X) - yte_pred = model.predict_proba(test_X) + model.fit(X_train, train_y) + ytr_pred = model.predict_proba(X_train) + yte_pred = model.predict_proba(X_test) np.savetxt("ypred_train.csv", ytr_pred, delimiter=",") np.savetxt("ypred_test.csv", yte_pred, delimiter=",") target = Target(model=model) - target.add_processed_data(train_X, train_y, test_X, test_y) + target.add_processed_data(X_train, train_y, X_test, test_y) attack_obj = worst_case_attack.WorstCaseAttack( # How many attacks to run -- in each the attack model is trained on a different @@ -310,15 +310,15 @@ def test_non_rf_mia(): an AttributeError we now it *is* trying to use the SVC. 
""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) - train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) + X_train, X_test, train_y, test_y = train_test_split(X, y, test_size=0.3) model = SVC(gamma=0.1, probability=True) - model.fit(train_X, train_y) - ytr_pred = model.predict_proba(train_X) - yte_pred = model.predict_proba(test_X) + model.fit(X_train, train_y) + ytr_pred = model.predict_proba(X_train) + yte_pred = model.predict_proba(X_test) target = Target(model=model) - target.add_processed_data(train_X, train_y, test_X, test_y) + target.add_processed_data(X_train, train_y, X_test, test_y) attack_obj = worst_case_attack.WorstCaseAttack( mia_attack_model=SVC, diff --git a/tests/conftest.py b/tests/conftest.py index 9ecfb436..f7c29180 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ """Common utility functions for testing.""" +import contextlib import os import shutil from datetime import date @@ -77,20 +78,16 @@ def _cleanup(): yield for folder in folders: - try: + with contextlib.suppress(Exception): shutil.rmtree(folder) - except Exception: # pylint: disable=broad-exception-caught - pass files.append( # from attack_report_formater.py "ATTACK_RESULTS" + str(date.today().strftime("%d_%m_%Y")) + ".json" ) for file in files: - try: + with contextlib.suppress(Exception): os.remove(file) - except Exception: # pylint: disable=broad-exception-caught - pass @pytest.fixture() @@ -125,8 +122,8 @@ def get_target(request) -> Target: # pylint: disable=too-many-locals # [Researcher] Split into training and test sets # target model train / test split - these are strings ( - x_train_orig, - x_test_orig, + X_train_orig, + X_test_orig, y_train_orig, y_test_orig, ) = train_test_split( @@ -138,8 +135,8 @@ def get_target(request) -> Target: # pylint: disable=too-many-locals ) # now resample the training data reduce number of examples - _, x_train_orig, _, y_train_orig = train_test_split( - x_train_orig, + _, X_train_orig, _, y_train_orig = train_test_split( + X_train_orig, y_train_orig, test_size=0.05, stratify=y_train_orig, @@ -150,31 +147,30 @@ def get_target(request) -> Target: # pylint: disable=too-many-locals # one-hot encoding of features and integer encoding of labels label_enc = LabelEncoder() feature_enc = OneHotEncoder() - x_train = feature_enc.fit_transform(x_train_orig).toarray() + X_train = feature_enc.fit_transform(X_train_orig).toarray() y_train = label_enc.fit_transform(y_train_orig) - x_test = feature_enc.transform(x_test_orig).toarray() + X_test = feature_enc.transform(X_test_orig).toarray() y_test = label_enc.transform(y_test_orig) # add dummy continuous valued attribute from N(0.5,0.05) - dummy_tr = 0.5 + 0.05 * np.random.randn(x_train.shape[0]) - dummy_te = 0.5 + 0.05 * np.random.randn(x_test.shape[0]) - dummy_all = np.hstack((dummy_tr, dummy_te)).reshape(-1, 1) + dummy_tr = 0.5 + 0.05 * np.random.randn(X_train.shape[0]) + dummy_te = 0.5 + 0.05 * np.random.randn(X_test.shape[0]) dummy_tr = dummy_tr.reshape(-1, 1) dummy_te = dummy_te.reshape(-1, 1) - x_train = np.hstack((x_train, dummy_tr)) - x_train_orig = np.hstack((x_train_orig, dummy_tr)) - x_test = np.hstack((x_test, dummy_te)) - x_test_orig = np.hstack((x_test_orig, dummy_te)) - xmore = np.concatenate((x_train_orig, x_test_orig)) - n_features = np.shape(x_train_orig)[1] + X_train = np.hstack((X_train, dummy_tr)) + X_train_orig = np.hstack((X_train_orig, dummy_tr)) + X_test = np.hstack((X_test, dummy_te)) + X_test_orig = np.hstack((X_test_orig, dummy_te)) + xmore = 
np.concatenate((X_train_orig, X_test_orig)) + n_features = np.shape(X_train_orig)[1] # wrap target = Target(model=model) target.name = "nursery" - target.add_processed_data(x_train, y_train, x_test, y_test) + target.add_processed_data(X_train, y_train, X_test, y_test) for i in range(n_features - 1): target.add_feature(nursery_data.feature_names[i], indices[i], "onehot") target.add_feature("dummy", indices[n_features - 1], "float") - target.add_raw_data(xmore, y, x_train_orig, y_train_orig, x_test_orig, y_test_orig) + target.add_raw_data(xmore, y, X_train_orig, y_train_orig, X_test_orig, y_test_orig) return target diff --git a/tests/preprocessing/test_loaders.py b/tests/preprocessing/test_loaders.py index 1fde7ae6..34191a3f 100644 --- a/tests/preprocessing/test_loaders.py +++ b/tests/preprocessing/test_loaders.py @@ -113,12 +113,9 @@ def test_data_absent(): def test_mimic(): """Load the mimic2 dataset.""" - # try: x_df, y_df = loaders.get_data_sklearn("mimic2-iaccd", DATA_FOLDER) assert x_df.shape == (1064, 38), f"x_df shape is {x_df.shape}" assert y_df.shape == (1064, 1) - # except DataNotAvailable: - # pass def test_in_hospital(): diff --git a/tests/safemodel/test_attacks.py b/tests/safemodel/test_attacks.py index 18fef622..1027cf30 100644 --- a/tests/safemodel/test_attacks.py +++ b/tests/safemodel/test_attacks.py @@ -24,11 +24,11 @@ def test_superclass(): print(str(my_attack)) -def test_NumpyArrayEncoder(): +def test_numpy_array_encoder(): """Conversion routine from reports.py.""" i32 = np.int32(2) i64 = np.int64(2) - twoDarray = np.zeros((2, 2)) + array_2d = np.zeros((2, 2)) my_encoder = report.NumpyArrayEncoder() retval = my_encoder.default(i32) @@ -37,7 +37,7 @@ def test_NumpyArrayEncoder(): retval = my_encoder.default(i64) assert isinstance(retval, int) - retval = my_encoder.default(twoDarray) + retval = my_encoder.default(array_2d) assert isinstance(retval, list) with pytest.raises(TypeError): diff --git a/tests/safemodel/test_attacks_via_safemodel.py b/tests/safemodel/test_attacks_via_safemodel.py index 99e48db2..7cb25431 100644 --- a/tests/safemodel/test_attacks_via_safemodel.py +++ b/tests/safemodel/test_attacks_via_safemodel.py @@ -20,7 +20,7 @@ def test_attacks_via_request_release(get_target): """Test vulnerable, hacked model then call request_release.""" target = get_target assert target.__str__() == "nursery" # pylint: disable=unnecessary-dunder-call - target.model.fit(target.x_train, target.y_train) + target.model.fit(target.X_train, target.y_train) target.model.min_samples_leaf = 10 target.model.request_release(path=RES_DIR, ext="pkl", target=target) @@ -33,11 +33,11 @@ def test_attacks_via_request_release(get_target): def test_run_attack_lira(get_target): """Test the lira attack via safemodel.""" target = get_target - target.model.fit(target.x_train, target.y_train) + target.model.fit(target.X_train, target.y_train) _, disclosive = target.model.preliminary_check() assert not disclosive print(np.unique(target.y_test, return_counts=True)) - print(np.unique(target.model.predict(target.x_test), return_counts=True)) + print(np.unique(target.model.predict(target.X_test), return_counts=True)) metadata = target.model.run_attack(target, "lira", RES_DIR, "lira_res") assert len(metadata) > 0 # something has been added @@ -50,7 +50,7 @@ def test_run_attack_lira(get_target): def test_run_attack_worstcase(get_target): """Test the worst case attack via safemodel.""" target = get_target - target.model.fit(target.x_train, target.y_train) + target.model.fit(target.X_train, target.y_train) 
_, disclosive = target.model.preliminary_check() assert not disclosive metadata = target.model.run_attack(target, "worst_case", RES_DIR, "wc_res") @@ -65,7 +65,7 @@ def test_run_attack_worstcase(get_target): def test_run_attack_attribute(get_target): """Test the attribute attack via safemodel.""" target = get_target - target.model.fit(target.x_train, target.y_train) + target.model.fit(target.X_train, target.y_train) _, disclosive = target.model.preliminary_check() assert not disclosive metadata = target.model.run_attack(target, "attribute", RES_DIR, "attr_res") @@ -107,6 +107,6 @@ def test_attack_args(): def test_run_attack_unknown(get_target): """Test an unknown attack via safemodel.""" target = get_target - target.model.fit(target.x_train, target.y_train) + target.model.fit(target.X_train, target.y_train) metadata = target.model.run_attack(target, "unknown", RES_DIR, "unk") assert metadata["outcome"] == "unrecognised attack type requested" diff --git a/tests/safemodel/test_safekeras2.py b/tests/safemodel/test_safekeras2.py index 084c3862..63d602c4 100644 --- a/tests/safemodel/test_safekeras2.py +++ b/tests/safemodel/test_safekeras2.py @@ -21,10 +21,7 @@ EPOCHS = 1 n_classes = 4 -# expected accuracy -# ACC = 0.325 if platform.system() == "Darwin" else 0.3583333492279053 ACC = 0.3583333492279053 -# UNSAFE_ACC = 0.325 if platform.system() == "Darwin" else 0.3583333492279053 UNSAFE_ACC = 0.3583333492279053 RES_DIR = "RES" @@ -36,18 +33,18 @@ def get_data(): yall = np.asarray(iris["target"], dtype=np.float64) xall = np.vstack([xall, (7, 2.0, 4.5, 1)]) yall = np.append(yall, n_classes) - X, Xval, y, yval = train_test_split( + X, X_val, y, y_val = train_test_split( xall, yall, test_size=0.2, shuffle=True, random_state=12345 ) y = tf.one_hot(y, n_classes) - yval = tf.one_hot(yval, n_classes) - return X, y, Xval, yval + y_val = tf.one_hot(y_val, n_classes) + return X, y, X_val, y_val def make_small_model(num_hidden_layers=2): """Make the keras model.""" # get data - X, y, Xval, yval = get_data() + X, y, X_val, y_val = get_data() # set seed and kernel initialisers for repeatability tf.random.set_seed(12345) initializer = tf.keras.initializers.Zeros() @@ -66,7 +63,7 @@ def make_small_model(num_hidden_layers=2): epochs=EPOCHS, ) - return model, X, y, Xval, yval + return model, X, y, X_val, y_val def check_init_completed(model: SafeKerasModel): @@ -215,7 +212,7 @@ def test_same_weights(): # pylint : disable=too-many-locals assert not same3, errstr -def test_DP_optimizer_checks(): +def test_dp_optimizer_checks(): """Test the various checks that DP optimiser was used.""" # make model model1, _, _, _, _ = make_small_model(num_hidden_layers=1) @@ -234,7 +231,7 @@ def test_DP_optimizer_checks(): model.compile(loss=loss, optimizer=oktype) opt_ok, msg = safekeras.check_optimizer_allowed(model.optimizer) assert opt_ok, msg - opt_is_dp, _ = safekeras.check_optimizer_is_DP(model.optimizer) + opt_is_dp, _ = safekeras.check_optimizer_is_dp(model.optimizer) assert opt_is_dp # not ok optimizer @@ -244,43 +241,43 @@ def test_DP_optimizer_checks(): model.optimizer = tf.keras.optimizers.get("SGD") opt_ok, msg = safekeras.check_optimizer_allowed(model1.optimizer) assert not opt_ok, msg - opt_is_dp, msg = safekeras.check_optimizer_is_DP(model1.optimizer) + opt_is_dp, msg = safekeras.check_optimizer_is_dp(model1.optimizer) assert not opt_is_dp, msg -def test_DP_used(): +def test_dp_used(): """Test the various checks that DP optimiser was used.""" # should pass after model compiled **and** fitted with DP optimizer - 
model1, X, y, Xval, yval = make_small_model(num_hidden_layers=1) + model1, X, y, X_val, y_val = make_small_model(num_hidden_layers=1) loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model1.compile(loss=loss) - dp_used, msg = safekeras.check_DP_used(model1.optimizer) + dp_used, msg = safekeras.check_dp_used(model1.optimizer) assert not dp_used - model1.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) - dp_used, msg = safekeras.check_DP_used(model1.optimizer) + model1.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) + dp_used, msg = safekeras.check_dp_used(model1.optimizer) assert dp_used # this model gets changed to non-DP by calling the superclass compile() # so should fail all the checks model2, _, _, _, _ = make_small_model(num_hidden_layers=1) super(SafeKerasModel, model2).compile(loss=loss, optimizer="SGD") - model2.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) - dp_used, msg = safekeras.check_DP_used(model2.optimizer) + model2.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) + dp_used, msg = safekeras.check_dp_used(model2.optimizer) assert not dp_used, msg def test_checkpoints_are_equal(): """Test the check for checkpoint equality.""" - model1, X, y, Xval, yval = make_small_model(num_hidden_layers=1) + model1, X, y, X_val, y_val = make_small_model(num_hidden_layers=1) loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model1.compile(loss=loss) - model1.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model1.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) model1.save("fit.tf") - model1.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS * 2, batch_size=20) + model1.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS * 2, batch_size=20) model1.save("refit.tf") # same arch, different weights @@ -293,9 +290,9 @@ def test_checkpoints_are_equal(): assert same, msg # different architecture - model2, X, y, Xval, yval = make_small_model(num_hidden_layers=3) + model2, X, y, X_val, y_val = make_small_model(num_hidden_layers=3) model2.compile(loss=loss) - model2.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model2.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) model2.save("fit2.tf") same, msg = safekeras.check_checkpoint_equality("fit.tf", "fit2.tf") @@ -320,12 +317,12 @@ def test_checkpoints_are_equal(): def test_load(): """Test the loading functionality.""" # make a model, train then save it - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) model.save("keras_save.tf") # won't load with invalid names @@ -386,15 +383,15 @@ def test_second_keras_model_created(): assert model2.noise_multiplier == 0.7 -def test_keras_model_compiled_as_DP(): +def test_keras_model_compiled_as_dp(): """Test Compile DP.""" model, X, _, _, _ = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - isDP, _ = safekeras.check_optimizer_is_DP(model.optimizer) - assert isDP, 
"failed check that optimizer is dP" + is_dp, _ = safekeras.check_optimizer_is_dp(model.optimizer) + assert is_dp, "failed check that optimizer is dP" right_epsilon = 20.363059561511612 model.check_epsilon(X.shape[0], 20, 10) @@ -411,21 +408,21 @@ def test_keras_model_compiled_as_DP(): def test_keras_basic_fit(): """Test SafeKeras using recommended values.""" - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - isDP, msg = safekeras.check_optimizer_is_DP(model.optimizer) - assert isDP, "failed check that optimizer is dP" + is_dp, msg = safekeras.check_optimizer_is_dp(model.optimizer) + assert is_dp, "failed check that optimizer is dP" # first check that stops when not DP if they say refine ok, msg = model.fit( X, y, - validation_data=(Xval, yval), + validation_data=(X_val, y_val), epochs=10, batch_size=X.shape[0], refine_epsilon=True, @@ -433,11 +430,11 @@ def test_keras_basic_fit(): assert not ok # now default (False) - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) - DPused, msg = safekeras.check_DP_used(model.optimizer) + dp_used, msg = safekeras.check_dp_used(model.optimizer) assert ( - DPused + dp_used ), "Failed check that DP version of optimiser was actually used in training" loss, acc = model.evaluate(X, y) @@ -455,13 +452,13 @@ def test_keras_basic_fit(): def test_keras_save_actions(): """Test save action.""" # create, compile and train model - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) # start with .tf and .h5 which should work names = ("safekeras.tf", "safekeras.h5") @@ -479,23 +476,23 @@ def test_keras_save_actions(): def test_keras_unsafe_l2_norm(): """Test SafeKeras using unsafe values.""" - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - isDP, msg = safekeras.check_optimizer_is_DP(model.optimizer) - assert isDP, "failed check that optimizer is dP" + is_dp, msg = safekeras.check_optimizer_is_dp(model.optimizer) + assert is_dp, "failed check that optimizer is dP" model.l2_norm_clip = 0.9 - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) - DPused, msg = safekeras.check_DP_used(model.optimizer) + dp_used, msg = safekeras.check_dp_used(model.optimizer) assert ( - DPused + dp_used ), "Failed check that DP version of optimiser was actually used in training" loss, acc = model.evaluate(X, y) @@ -516,23 +513,23 @@ def test_keras_unsafe_l2_norm(): def test_keras_unsafe_noise_multiplier(): """Test SafeKeras using unsafe values.""" - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, 
optimizer=None) - isDP, msg = safekeras.check_optimizer_is_DP(model.optimizer) - assert isDP, "failed check that optimizer is dP" + is_dp, msg = safekeras.check_optimizer_is_dp(model.optimizer) + assert is_dp, "failed check that optimizer is dP" model.noise_multiplier = 1.0 - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) - DPused, msg = safekeras.check_DP_used(model.optimizer) + dp_used, msg = safekeras.check_dp_used(model.optimizer) assert ( - DPused + dp_used ), "Failed check that DP version of optimiser was actually used in training" loss, acc = model.evaluate(X, y) @@ -554,23 +551,23 @@ def test_keras_unsafe_noise_multiplier(): def test_keras_unsafe_min_epsilon(): """Test SafeKeras using unsafe values.""" - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - isDP, msg = safekeras.check_optimizer_is_DP(model.optimizer) - assert isDP, "failed check that optimizer is dP" + is_dp, msg = safekeras.check_optimizer_is_dp(model.optimizer) + assert is_dp, "failed check that optimizer is dP" model.min_epsilon = 4 - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) - DPused, msg = safekeras.check_DP_used(model.optimizer) + dp_used, msg = safekeras.check_dp_used(model.optimizer) assert ( - DPused + dp_used ), "Failed check that DP version of optimiser was actually used in training" loss, acc = model.evaluate(X, y) @@ -591,23 +588,23 @@ def test_keras_unsafe_min_epsilon(): def test_keras_unsafe_delta(): """Test SafeKeras using unsafe values.""" - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - isDP, msg = safekeras.check_optimizer_is_DP(model.optimizer) - assert isDP, "failed check that optimizer is dP" + is_dp, msg = safekeras.check_optimizer_is_dp(model.optimizer) + assert is_dp, "failed check that optimizer is dP" model.delta = 1e-6 - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) - DPused, msg = safekeras.check_DP_used(model.optimizer) + dp_used, msg = safekeras.check_dp_used(model.optimizer) assert ( - DPused + dp_used ), "Failed check that DP version of optimiser was actually used in training" loss, acc = model.evaluate(X, y) @@ -627,23 +624,23 @@ def test_keras_unsafe_delta(): def test_keras_unsafe_batch_size(): """Test SafeKeras using unsafe values.""" - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - isDP, msg = safekeras.check_optimizer_is_DP(model.optimizer) - assert isDP, "failed check that optimizer is dP" + is_dp, msg = safekeras.check_optimizer_is_dp(model.optimizer) + assert is_dp, "failed check that optimizer is dP" model.batch_size = 34 - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) - 
DPused, msg = safekeras.check_DP_used(model.optimizer) + dp_used, msg = safekeras.check_dp_used(model.optimizer) assert ( - DPused + dp_used ), "Failed check that DP version of optimiser was actually used in training" loss, acc = model.evaluate(X, y) @@ -660,23 +657,23 @@ def test_keras_unsafe_batch_size(): def test_keras_unsafe_learning_rate(): """Test SafeKeras using unsafe values.""" - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - isDP, msg = safekeras.check_optimizer_is_DP(model.optimizer) - assert isDP, "failed check that optimizer is dP" + is_dp, msg = safekeras.check_optimizer_is_dp(model.optimizer) + assert is_dp, "failed check that optimizer is dP" model.learning_rate = 0.2 - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) - DPused, msg = safekeras.check_DP_used(model.optimizer) + dp_used, msg = safekeras.check_dp_used(model.optimizer) assert ( - DPused + dp_used ), "Failed check that DP version of optimiser was actually used in training" loss, acc = model.evaluate(X, y) @@ -695,13 +692,13 @@ def test_keras_unsafe_learning_rate(): def test_create_checkfile(): """Test create checkfile.""" # create, compile and train model - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - model.fit(X, y, validation_data=(Xval, yval), epochs=EPOCHS, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=20) # start with .tf and .h5 which should work exts = ("tf", "h5") @@ -729,12 +726,12 @@ def test_create_checkfile(): def test_posthoc_check(): """Test the posthoc checking function.""" # make a model, train then save it - model, X, y, Xval, yval = make_small_model() + model, X, y, X_val, y_val = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( from_logits=False, reduction=tf.losses.Reduction.NONE ) model.compile(loss=loss, optimizer=None) - model.fit(X, y, validation_data=(Xval, yval), epochs=1, batch_size=20) + model.fit(X, y, validation_data=(X_val, y_val), epochs=1, batch_size=20) # should be ok _, disclosive = model.posthoc_check() diff --git a/tests/safemodel/test_safemodel.py b/tests/safemodel/test_safemodel.py index 88528974..02991758 100644 --- a/tests/safemodel/test_safemodel.py +++ b/tests/safemodel/test_safemodel.py @@ -23,14 +23,14 @@ class DummyClassifier: # pylint: disable=too-many-arguments def __init__( - self, at_least_5f=5.0, at_most_5i=5, exactly_boo="boo", keyA=True, keyB=True + self, at_least_5f=5.0, at_most_5i=5, exactly_boo="boo", key_a=True, key_b=True ): """Instantiate a dummy classifier.""" self.at_least_5f = at_least_5f self.at_most_5i = at_most_5i self.exactly_boo = exactly_boo - self.keyA = keyA - self.keyB = keyB + self.key_a = key_a + self.key_b = key_b def fit(self, x: np.ndarray, y: np.ndarray): """Fit a dummy classifier.""" @@ -60,8 +60,8 @@ def __init__(self, **kwargs) -> None: "at_least_5f", "at_most_5i", "exactly_boo", - "keyA", - "keyB", + "key_a", + "key_b", ) the_kwds = {} for key, val in kwargs.items(): @@ -273,33 +273,33 @@ def test_check_model_param_or(): part1 = get_reporting_string( name="different_than_fixed_value", 
- key="keyA", + key="key_a", cur_val=False, val="True", ) part2 = get_reporting_string( name="different_than_fixed_value", - key="keyB", + key="key_b", cur_val=False, val="True", ) # or - branch 1 - model = SafeDummyClassifier(keyA=False) + model = SafeDummyClassifier(key_a=False) correct_msg = ok_start + part1 msg, disclosive = model.preliminary_check() assert msg == correct_msg, f"Correct msg:\n{correct_msg}\nActual msg:\n{msg}\n" assert not disclosive # or branch 2 - model = SafeDummyClassifier(keyB=False) + model = SafeDummyClassifier(key_b=False) correct_msg = ok_start + part2 msg, disclosive = model.preliminary_check() assert msg == correct_msg, f"Correct msg:\n{correct_msg}\nActual msg:\n{msg}\n" assert not disclosive # fail or - model = SafeDummyClassifier(keyA=False, keyB=False) + model = SafeDummyClassifier(key_a=False, key_b=False) correct_msg = notok_start + part1 + part2 msg, disclosive = model.preliminary_check() assert msg == correct_msg, f"Correct msg:\n{correct_msg}\nActual msg:\n{msg}\n" @@ -446,15 +446,12 @@ def test_get_saved_model_exception(): """Test the exception handling in get_current_and_saved_models().""" model = SafeDummyClassifier() # add generator which can't be pickled or copied - model.a_generator = ( # pylint: disable=attribute-defined-outside-init i for i in [1, 2, 3] ) current, saved = model.get_current_and_saved_models() assert saved == {} # since we haven;t called fit() - assert ( # pylint: disable=consider-iterating-dictionary - "a_generator" not in current.keys() - ) + assert "a_generator" not in current def test_generic_additional_tests(): @@ -511,11 +508,11 @@ def test_request_release_without_attacks(): # no file provided, has k_anonymity - RES_DIR = "RES" - json_filename = os.path.normpath(os.path.join(f"{RES_DIR}", "target.json")) - model_filename = os.path.normpath(os.path.join(f"{RES_DIR}", "model.pkl")) + res_dir = "RES" + json_filename = os.path.normpath(os.path.join(f"{res_dir}", "target.json")) + model_filename = os.path.normpath(os.path.join(f"{res_dir}", "model.pkl")) - model.request_release(path=RES_DIR, ext="pkl") + model.request_release(path=res_dir, ext="pkl") # check that pikle and the json files have been created assert os.path.isfile(model_filename) diff --git a/tests/safemodel/test_safetf.py b/tests/safemodel/test_safetf.py index d140659f..16023289 100644 --- a/tests/safemodel/test_safetf.py +++ b/tests/safemodel/test_safetf.py @@ -7,8 +7,8 @@ from aisdc.safemodel.classifiers import safetf -def test_Safe_tf_DPModel_l2_and_noise(): +def test_safe_tf_dpmodel_l2_and_noise(): """Test user is informed this is not implemented yet.""" with pytest.raises(NotImplementedError): # with values for the l2 and noise params - safetf.Safe_tf_DPModel(1.5, 2.0, True) + safetf.SafeTFModel(1.5, 2.0, True) diff --git a/user_stories/default_config.yaml b/user_stories/default_config.yaml index 1beefd29..f41a5a79 100644 --- a/user_stories/default_config.yaml +++ b/user_stories/default_config.yaml @@ -28,9 +28,9 @@ data_processing_filename: user_story_2/data_processing_researcher.py data_processing_function_name: process_dataset # User story 3 -x_train_path: "x_train.txt" +X_train_path: "X_train.txt" y_train_path: "y_train.txt" -x_test_path: "x_test.txt" +X_test_path: "X_test.txt" y_test_path: "y_test.txt" attack_output_name: "attack_output" diff --git a/user_stories/user_story_1/user_story_1_researcher_template.py b/user_stories/user_story_1/user_story_1_researcher_template.py index 8437b1a5..33a0af6e 100644 --- 
a/user_stories/user_story_1/user_story_1_researcher_template.py +++ b/user_stories/user_story_1/user_story_1_researcher_template.py @@ -1,19 +1,20 @@ """RESEARCHER EXAMPLE FOR USER STORY 1. -This file is an example of a researcher creating/training a machine learning model and requesting -for it to be released. +This file is an example of a researcher creating/training a machine learning +model and requesting for it to be released. -This specific example uses the nursery dataset: data is read in and pre-processed, and a classifier -is trained and tested on this dataset. +This specific example uses the nursery dataset: data is read in and +pre-processed, and a classifier is trained and tested on this dataset. -This example follows User Story 1 +This example follows User Story 1. Steps: -- Researcher reads in data and processes it -- Researcher creates and trains a classifier -- Researcher runs experiments themselves to check if their model is disclosive or not -- Once satisfied, researcher calls request_release() to make it ready for TRE output checking +- Researcher reads in data and processes it. +- Researcher creates and trains a classifier. +- Researcher runs experiments themselves to check if their model is disclosive + or not. +- Once satisfied, researcher calls request_release() to make it ready for TRE output checking. """ import logging @@ -24,15 +25,14 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from aisdc.attacks.target import Target # pylint: disable=import-error -from aisdc.safemodel.classifiers import ( # pylint: disable=import-error - SafeDecisionTreeClassifier, -) +from aisdc.attacks.target import Target +from aisdc.safemodel.classifiers import SafeDecisionTreeClassifier -def main(): # pylint: disable=too-many-statements, disable=too-many-locals +def main(): """Create and train a model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc + # This section is not necessary but helpful - cleans up files that are + # created by aisdc save_directory = "training_artefacts" print("Creating directory for training artefacts") @@ -43,7 +43,8 @@ print("Acting as researcher...") print() - # Read in and pre-process the dataset - replace this with your data reading/pre-processing code + # Read in and pre-process the dataset - replace this with your data + # reading/pre-processing code print(os.getcwd()) filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv") print("Reading data from " + filename) @@ -67,8 +68,8 @@ ] ( - x_train_orig, - x_test_orig, + X_train_orig, + X_test_orig, y_train_orig, y_test_orig, ) = train_test_split( @@ -81,9 +82,9 @@ label_enc = LabelEncoder() feature_enc = OneHotEncoder() - x_train = feature_enc.fit_transform(x_train_orig).toarray() + X_train = feature_enc.fit_transform(X_train_orig).toarray() y_train = label_enc.fit_transform(y_train_orig) - x_test = feature_enc.transform(x_test_orig).toarray() + X_test = feature_enc.transform(X_test_orig).toarray() y_test = label_enc.transform(y_test_orig) logging.getLogger("attack-reps").setLevel(logging.WARNING) @@ -92,30 +93,31 @@ # Create and train a SafeDecisionTree classifier on the above 
data model = SafeDecisionTreeClassifier(random_state=1) - model.fit(x_train, y_train) + model.fit(X_train, y_train) # Run a preliminary check to make sure the model is not disclosive _, _ = model.preliminary_check() - # Wrap the model and data in a Target object -- needed in order to call request_release() + # Wrap the model and data in a Target object -- needed in order to + # call request_release() target = Target(model=model) target.name = "nursery" - target.add_processed_data(x_train, y_train, x_test, y_test) + target.add_processed_data(X_train, y_train, X_test, y_test) target.add_raw_data( - data, labels, x_train_orig, y_train_orig, x_test_orig, y_test_orig + data, labels, X_train_orig, y_train_orig, X_test_orig, y_test_orig ) for i in range(n_features): target.add_feature(data_df.columns[i], indices[i], "onehot") logging.info("Dataset: %s", target.name) logging.info("Features: %s", target.features) - logging.info("x_train shape = %s", np.shape(target.x_train)) + logging.info("X_train shape = %s", np.shape(target.X_train)) logging.info("y_train shape = %s", np.shape(target.y_train)) - logging.info("x_test shape = %s", np.shape(target.x_test)) + logging.info("X_test shape = %s", np.shape(target.X_test)) logging.info("y_test shape = %s", np.shape(target.y_test)) - # Researcher can check for themselves whether their model passes individual disclosure checks - # Leave this code as-is for output disclosure checking + # Researcher can check for themselves whether their model passes individual + # disclosure checks. Leave this code as-is for output disclosure checking. save_filename = "direct_results" print("==========> first running attacks explicitly via run_attack()") for attack_name in ["worst_case", "attribute", "lira"]: @@ -130,21 +132,23 @@ else: logging.info(" %s : %s", key, val) - # Modify/re-run all of the above code until you're happy with the model you've created - # If the tests do not pass, try changing the model or hyperparameters until the tests pass + # Modify/re-run all of the above code until you're happy with the model + # you've created. If the tests do not pass, try changing the model or + # hyperparameters until the tests pass. When you are satisfied and ready to + # release your model, call the request_release() function with the Target + # class you created above. - # when you are satisfied and ready to release your model, call the request release() function - # with the Target class you created above - # This code will run checks for the TRE staff + # This code will run checks for the TRE staff. + + # NOTE: you should only do this when you have confirmed that the above + # tests pass. You would not normally waste your and the TRE's time calling + # this unless you have already checked that your model is not disclosive or + # can provide a justification for an exception request. - # NOTE: you should only do this when you have confirmed that the above tests pass - # You would not normally waste your and TRE time calling this unless you have already - # checked that your model is not disclosive or can provide a justification for an exception - # request print("===> now running attacks implicitly via request_release()") model.request_release(path=save_directory, ext="pkl", target=target) - # The files generated can be found in this file location + # The files generated can be found in this file location. 
print(f"Please see the files generated in: {save_directory}") diff --git a/user_stories/user_story_1/user_story_1_tre.py b/user_stories/user_story_1/user_story_1_tre.py index faf21ff9..5486fb8e 100644 --- a/user_stories/user_story_1/user_story_1_tre.py +++ b/user_stories/user_story_1/user_story_1_tre.py @@ -14,9 +14,7 @@ import yaml -from aisdc.attacks.attack_report_formatter import ( # pylint: disable=import-error - GenerateTextReport, -) +from aisdc.attacks.attack_report_formatter import GenerateTextReport def generate_report(directory, attack_results, target, outfile): diff --git a/user_stories/user_story_2/data_processing_researcher.py b/user_stories/user_story_2/data_processing_researcher.py index e3ad658a..1ac7d152 100644 --- a/user_stories/user_story_2/data_processing_researcher.py +++ b/user_stories/user_story_2/data_processing_researcher.py @@ -1,15 +1,17 @@ """SUPPORTING FILE FOR USER STORY 2. -This file is an example of a function created by a researcher that will pre-process a dataset +This file is an example of a function created by a researcher that will +pre-process a dataset. -To use: write a function that will process your input data, and output the processed version +To use: write a function that will process your input data, and output the +processed version. NOTE: in order to work, this function needs to: - - take a single parameter (the data to be processed) - - return a dictionary - - which contains the keys ] - ['n_features_raw_data', 'x_transformed', 'y_transformed', 'train_indices'] +- take a single parameter (the data to be processed) +- return a dictionary +- which contains the keys ] + ['n_features_raw_data', 'X_transformed', 'y_transformed', 'train_indices'] """ import numpy as np @@ -28,23 +30,23 @@ def process_dataset(data): label_enc = LabelEncoder() feature_enc = OneHotEncoder() - x_transformed = feature_enc.fit_transform(data).toarray() + X_transformed = feature_enc.fit_transform(data).toarray() y_transformed = label_enc.fit_transform(labels) - row_indices = np.arange(np.shape(x_transformed)[0]) + row_indices = np.arange(np.shape(X_transformed)[0]) - # This step is not necessary, however it's the simplest way of getting training indices from - # the data - # Any method of generating indices of samples to be used for training will work here + # This step is not necessary, however it's the simplest way of getting + # training indices from the data. Any method of generating indices of + # samples to be used for training will work here. ( - x_train, - x_test, + X_train, + X_test, y_train, y_test, train_indices, test_indices, - ) = train_test_split( # pylint: disable=unused-variable - x_transformed, + ) = train_test_split( + X_transformed, y_transformed, row_indices, test_size=0.5, @@ -54,7 +56,7 @@ def process_dataset(data): returned = {} returned["n_features_raw_data"] = n_features_raw_data - returned["x_transformed"] = x_transformed + returned["X_transformed"] = X_transformed returned["y_transformed"] = y_transformed returned["train_indices"] = train_indices diff --git a/user_stories/user_story_2/user_story_2_researcher_template.py b/user_stories/user_story_2/user_story_2_researcher_template.py index 1d45a163..084c1ec6 100644 --- a/user_stories/user_story_2/user_story_2_researcher_template.py +++ b/user_stories/user_story_2/user_story_2_researcher_template.py @@ -1,19 +1,20 @@ """RESEARCHER EXAMPLE FOR USER STORY 2. 
-This file is an example of a researcher creating/training a machine learning model and to -be released form a secure environment +This file is an example of a researcher creating/training a machine learning +model to be released from a secure environment. -This specific example uses the nursery dataset: data is read in and pre-processed, and a -classifier is trained and tested on this dataset. +This specific example uses the nursery dataset: data is read in and +pre-processed, and a classifier is trained and tested on this dataset. -This example follows User Story 2 +This example follows User Story 2. Steps: -- Researcher creates a function to read and process a dataset, which a TRE can also use and call -- Researcher creates and trains a classifier on this data -- Researcher emails (or otherwise contacts) TRE to request the model be released -- TREs will use this code/functions to test the model themselves +- Researcher creates a function to read and process a dataset, which a TRE can + also use and call. +- Researcher creates and trains a classifier on this data. +- Researcher emails (or otherwise contacts) TRE to request the model be released. +- TREs will use this code/functions to test the model themselves. """ import logging @@ -23,15 +24,14 @@ import pandas as pd from data_processing_researcher import process_dataset -from aisdc.attacks.target import Target # pylint: disable=import-error -from aisdc.safemodel.classifiers import ( # pylint: disable=import-error - SafeDecisionTreeClassifier, -) +from aisdc.attacks.target import Target +from aisdc.safemodel.classifiers import SafeDecisionTreeClassifier -def run_user_story(): # pylint: disable=too-many-locals +def run_user_story(): """Create and train a model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc + # This section is not necessary but helpful - cleans up files that are + # created by aisdc directory = "training_artefacts" print("Creating directory for training artefacts") @@ -47,34 +47,34 @@ def run_user_story(): # pylint: disable=too-many-locals print("Reading data from " + filename) data = pd.read_csv(filename) - # Write a function to pre-process the data that the TRE can call - # (see data_processing_researcher.py) - # Use the output of this function to split the data into training/testing sets + # Write a function to pre-process the data that the TRE can call (see + # data_processing_researcher.py). Use the output of this function to split + # the data into training/testing sets. + # NOTE: to use this user story/script, the process_dataset function MUST: - # take a single parameter (the data to be processed) - # return a dictionary - # which contains the keys - # ['n_features_raw_data', 'x_transformed', 'y_transformed', 'train_indices'] - # as in this example + # take a single parameter (the data to be processed) and return a dictionary + # which contains the keys: + # >>> ['n_features_raw_data', 'X_transformed', 'y_transformed', 'train_indices'] + # as in this example.
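# --- Editor's illustrative sketch (not part of the diff above) ----------------
# A minimal process_dataset() that satisfies the contract spelled out in the
# comments above: one parameter in, a dict out with the four required keys.
# The "class" label column mirrors the nursery example elsewhere in this diff;
# the 50/50 split and the function name are placeholders, not requirements.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def process_dataset_sketch(data):
    """Pre-process a pandas DataFrame into the dict shape the TRE scripts expect."""
    labels = np.asarray(data["class"])
    features = data.drop(columns=["class"], inplace=False)
    X_transformed = OneHotEncoder().fit_transform(features).toarray()
    y_transformed = LabelEncoder().fit_transform(labels)
    row_indices = np.arange(np.shape(X_transformed)[0])
    # Only the training indices are needed for the returned dict.
    _, _, _, _, train_indices, _ = train_test_split(
        X_transformed, y_transformed, row_indices, test_size=0.5, stratify=y_transformed
    )
    return {
        "n_features_raw_data": np.shape(features)[1],
        "X_transformed": X_transformed,
        "y_transformed": y_transformed,
        "train_indices": train_indices,
    }
# -------------------------------------------------------------------------------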
returned = process_dataset(data) - x_transformed = returned["x_transformed"] + X_transformed = returned["X_transformed"] y_transformed = returned["y_transformed"] train_indices = set(returned["train_indices"]) - x_train = [] - x_test = [] + X_train = [] + X_test = [] y_train = [] y_test = [] for i, label in enumerate(y_transformed): if i in train_indices: - x_train.append(x_transformed[i]) + X_train.append(X_transformed[i]) y_train.append(label) else: - x_test.append(x_transformed[i]) + X_test.append(X_transformed[i]) y_test.append(label) logging.getLogger("attack-reps").setLevel(logging.WARNING) @@ -83,24 +83,24 @@ def run_user_story(): # pylint: disable=too-many-locals # Build a model and request its release model = SafeDecisionTreeClassifier(random_state=1) - model.fit(x_train, y_train) + model.fit(X_train, y_train) model.request_release(path=directory, ext="pkl") # Wrap the model and data in a Target object target = Target(model=model) target.name = "nursery" - target.add_processed_data(x_train, y_train, x_test, y_test) + target.add_processed_data(X_train, y_train, X_test, y_test) - # NOTE: we assume here that the researcher does not use the target.save() function and - # instead provides only the model and the list of indices - # which have been used to split the dataset, which will allow a TRE to re-create the input - # data used in training + # NOTE: we assume here that the researcher does not use the target.save() + # function and instead provides only the model and the list of indices + # which have been used to split the dataset, which will allow a TRE to + # re-create the input data used in training. logging.info("Dataset: %s", target.name) logging.info("Features: %s", target.features) - logging.info("x_train shape = %s", np.shape(target.x_train)) + logging.info("X_train shape = %s", np.shape(target.X_train)) logging.info("y_train shape = %s", np.shape(target.y_train)) - logging.info("x_test shape = %s", np.shape(target.x_test)) + logging.info("X_test shape = %s", np.shape(target.X_test)) logging.info("y_test shape = %s", np.shape(target.y_test)) diff --git a/user_stories/user_story_2/user_story_2_tre.py b/user_stories/user_story_2/user_story_2_tre.py index c1148267..ef632c89 100644 --- a/user_stories/user_story_2/user_story_2_tre.py +++ b/user_stories/user_story_2/user_story_2_tre.py @@ -18,12 +18,8 @@ import pandas as pd import yaml -from aisdc.attacks.attack_report_formatter import ( # pylint: disable=import-error - GenerateTextReport, -) -from aisdc.attacks.target import Target # pylint: disable=import-error - -# from .data_processing_researcher import process_dataset +from aisdc.attacks.attack_report_formatter import GenerateTextReport +from aisdc.attacks.target import Target def process_dataset(filename, function_name, data_to_be_processed): @@ -35,10 +31,8 @@ def process_dataset(filename, function_name, data_to_be_processed): spec = importlib.util.spec_from_file_location(function_name, filename) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) - function = getattr(module, function_name) - result = function(data_to_be_processed) - return result + return function(data_to_be_processed) def generate_report( @@ -50,7 +44,7 @@ def generate_report( attack_results, target_filename, outfile, -): # pylint: disable=too-many-locals, disable=too-many-arguments +): """Generate report based on target model.""" print() print("Acting as TRE...") @@ -73,31 +67,31 @@ def generate_report( returned = process_dataset( data_processing_filename, 
data_processing_function_name, data ) - x_transformed = returned["x_transformed"] + X_transformed = returned["X_transformed"] y_transformed = returned["y_transformed"] train_indices = set(returned["train_indices"]) - x_train = [] - x_test = [] + X_train = [] + X_test = [] y_train = [] y_test = [] for i, label in enumerate(y_transformed): if i in train_indices: - x_train.append(x_transformed[i]) + X_train.append(X_transformed[i]) y_train.append(label) else: - x_test.append(x_transformed[i]) + X_test.append(X_transformed[i]) y_test.append(label) - x_train = np.array(x_train) + X_train = np.array(X_train) y_train = np.array(y_train) - x_test = np.array(x_test) + X_test = np.array(X_test) y_test = np.array(y_test) # Wrap the model and data in a Target object target = Target(model=target_model) - target.add_processed_data(x_train, y_train, x_test, y_test) + target.add_processed_data(X_train, y_train, X_test, y_test) # TRE calls request_release() print("===> now running attacks implicitly via request_release()") @@ -133,10 +127,11 @@ def run_user_story(release_config: dict): ) -if __name__ == "__main__": # pragma:no cover +if __name__ == "__main__": parser = argparse.ArgumentParser( description=( - "Generate a risk report after request_release() has been called by researcher" + "Generate a risk report after request_release() " + "has been called by researcher" ) ) @@ -155,7 +150,7 @@ def run_user_story(release_config: dict): try: with open(args.config_file, encoding="utf-8") as handle: config = yaml.load(handle, Loader=yaml.loader.SafeLoader) - except AttributeError as error: # pragma:no cover + except AttributeError as error: print( "Invalid command. Try --help to get more details" f"error message is {error}" ) diff --git a/user_stories/user_story_3/user_story_3_researcher_template.py b/user_stories/user_story_3/user_story_3_researcher_template.py index d9b0a804..a768dbaf 100644 --- a/user_stories/user_story_3/user_story_3_researcher_template.py +++ b/user_stories/user_story_3/user_story_3_researcher_template.py @@ -1,20 +1,21 @@ """RESEARCHER EXAMPLE FOR USER STORY 3. -This file is an example of a researcher creating/training a machine learning model and to be -released form a secure environment +This file is an example of a researcher creating/training a machine learning +model to be released from a secure environment. -This specific example uses the nursery dataset: data is read in and pre-processed, and a classifier -is trained and tested on this dataset. +This specific example uses the nursery dataset: data is read in and +pre-processed, and a classifier is trained and tested on this dataset. -This example follows User Story 3 +This example follows User Story 3. Steps: -- Researcher creates and pre-processes a dataset -- Researcher creates and trains a classifier on this data -- Reasercher saves the model manually (e.g. using pickle, not through request_release() or similar) -- Researcher emails (or otherwise contacts) TRE to request the model be released -- TREs will use this model and data to test the model themselves +- Researcher creates and pre-processes a dataset. +- Researcher creates and trains a classifier on this data. +- Researcher saves the model manually (e.g. using pickle, not through + request_release() or similar). +- Researcher emails (or otherwise contacts) TRE to request the model be released. +- TREs will use this model and data to test the model themselves.
""" import os @@ -28,16 +29,18 @@ from sklearn.preprocessing import LabelEncoder, OneHotEncoder -def run_user_story(): # pylint: disable=too-many-locals +def run_user_story(): """Create and train a model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc + # This section is not necessary but helpful - cleans up files that are + # created by aisdc directory = "training_artefacts" print("Creating directory for training artefacts") if not os.path.exists(directory): os.makedirs(directory) - # Read in and pre-process the dataset - replace this with your data reading/pre-processing code + # Read in and pre-process the dataset - replace this with your data + # reading/pre-processing code. filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv") print("Reading data from " + filename) data = pd.read_csv(filename) @@ -48,12 +51,12 @@ def run_user_story(): # pylint: disable=too-many-locals data = data.drop(columns=["class"], inplace=False) feature_encoder = OneHotEncoder() - x_encoded = feature_encoder.fit_transform(data).toarray() + X_encoded = feature_encoder.fit_transform(data).toarray() feature_dataframe = pd.DataFrame( - x_encoded, columns=feature_encoder.get_feature_names_out() + X_encoded, columns=feature_encoder.get_feature_names_out() ) - x_train, x_test, y_train, y_test = train_test_split( + X_train, X_test, y_train, y_test = train_test_split( feature_dataframe.values, target_dataframe.values.flatten(), test_size=0.7, @@ -62,12 +65,13 @@ def run_user_story(): # pylint: disable=too-many-locals # Save the training and test data to a file which a TRE can access print("Saving training/testing data to ./" + directory) - np.savetxt(os.path.join(directory, "x_train.txt"), x_train, fmt="%d") + np.savetxt(os.path.join(directory, "X_train.txt"), X_train, fmt="%d") np.savetxt(os.path.join(directory, "y_train.txt"), y_train, fmt="%d") - np.savetxt(os.path.join(directory, "x_test.txt"), x_test, fmt="%d") + np.savetxt(os.path.join(directory, "X_test.txt"), X_test, fmt="%d") np.savetxt(os.path.join(directory, "y_test.txt"), y_test, fmt="%d") - # Create, train and test a model - replace this with your training and testing code + # Create, train and test a model + # Replace this with your training and testing code hyperparameters = {} hyperparameters["min_samples_split"] = 5 hyperparameters["min_samples_leaf"] = 5 @@ -75,10 +79,10 @@ def run_user_story(): # pylint: disable=too-many-locals hyperparameters["bootstrap"] = False target_model = RandomForestClassifier(**hyperparameters) - target_model.fit(x_train, y_train) + target_model.fit(X_train, y_train) - train_acc = accuracy_score(y_train, target_model.predict(x_train)) - test_acc = accuracy_score(y_test, target_model.predict(x_test)) + train_acc = accuracy_score(y_train, target_model.predict(X_train)) + test_acc = accuracy_score(y_test, target_model.predict(X_test)) print(f"Training accuracy on model: {train_acc:.2f}") print(f"Testing accuracy on model: {test_acc:.2f}") diff --git a/user_stories/user_story_3/user_story_3_tre.py b/user_stories/user_story_3/user_story_3_tre.py index 5f91ea92..1613c557 100644 --- a/user_stories/user_story_3/user_story_3_tre.py +++ b/user_stories/user_story_3/user_story_3_tre.py @@ -18,27 +18,23 @@ import numpy as np import yaml -from aisdc.attacks.attack_report_formatter import ( # pylint: disable=import-error - GenerateTextReport, -) -from aisdc.attacks.likelihood_attack import LIRAAttack # pylint: disable=import-error -from 
aisdc.attacks.target import Target # pylint: disable=import-error -from aisdc.attacks.worst_case_attack import ( # pylint: disable=import-error - WorstCaseAttack, -) +from aisdc.attacks.attack_report_formatter import GenerateTextReport +from aisdc.attacks.likelihood_attack import LIRAAttack +from aisdc.attacks.target import Target +from aisdc.attacks.worst_case_attack import WorstCaseAttack def generate_report( directory, target_model, - x_train, + X_train, y_train, - x_test, + X_test, y_test, attack_output_name, target_filename, outfile, -): # pylint: disable=too-many-arguments, disable=too-many-locals +): """Generate report based on target model.""" print() print("Acting as TRE...") @@ -61,9 +57,9 @@ def generate_report( # Read the training/testing data as supplied by the researcher print("Reading training/testing data from ./" + directory) - train_x = np.loadtxt(os.path.join(directory, x_train)) + train_x = np.loadtxt(os.path.join(directory, X_train)) train_y = np.loadtxt(os.path.join(directory, y_train)) - test_x = np.loadtxt(os.path.join(directory, x_test)) + test_x = np.loadtxt(os.path.join(directory, X_test)) test_y = np.loadtxt(os.path.join(directory, y_test)) # Wrap the training and test data into the Target object @@ -126,9 +122,9 @@ def run_user_story(release_config: dict): generate_report( release_config["training_artefacts_dir"], release_config["target_model"], - release_config["x_train_path"], + release_config["X_train_path"], release_config["y_train_path"], - release_config["x_test_path"], + release_config["X_test_path"], release_config["y_test_path"], release_config["attack_output_name"], release_config["target_results"], @@ -139,7 +135,8 @@ def run_user_story(release_config: dict): if __name__ == "__main__": parser = argparse.ArgumentParser( description=( - "Generate a risk report after request_release() has been called by researcher" + "Generate a risk report after request_release() " + "has been called by researcher" ) ) diff --git a/user_stories/user_story_4/user_story_4_tre.py b/user_stories/user_story_4/user_story_4_tre.py index 00c97873..f8e7b0a9 100644 --- a/user_stories/user_story_4/user_story_4_tre.py +++ b/user_stories/user_story_4/user_story_4_tre.py @@ -10,23 +10,14 @@ """ import argparse -import json -import logging import os -import pickle import numpy as np import pandas as pd import yaml -from aisdc.attacks.attack_report_formatter import ( # pylint: disable=import-error - GenerateTextReport, -) -from aisdc.attacks.likelihood_attack import LIRAAttack # pylint: disable=import-error -from aisdc.attacks.target import Target # pylint: disable=import-error -from aisdc.attacks.worst_case_attack import ( # pylint: disable=import-error - WorstCaseAttack, -) +from aisdc.attacks.attack_report_formatter import GenerateTextReport +from aisdc.attacks.worst_case_attack import WorstCaseAttack def generate_report( @@ -35,7 +26,7 @@ def generate_report( test_probabilities, attack_output_name, outfile, -): # pylint: disable=too-many-arguments, disable=too-many-locals +): """Generate report based on target model.""" print() print("Acting as TRE...") @@ -108,7 +99,8 @@ def run_user_story(release_config: dict): if __name__ == "__main__": parser = argparse.ArgumentParser( description=( - "Generate a risk report after request_release() has been called by researcher" + "Generate a risk report after request_release() " + "has been called by researcher" ) ) diff --git a/user_stories/user_story_7/user_story_7_researcher_template.py 
b/user_stories/user_story_7/user_story_7_researcher_template.py index a6becb40..f949f7fb 100644 --- a/user_stories/user_story_7/user_story_7_researcher_template.py +++ b/user_stories/user_story_7/user_story_7_researcher_template.py @@ -1,22 +1,24 @@ """RESEARCHER EXAMPLE FOR USER STORY 7. -This file is an example of a researcher creating/training a machine learning model and to be -released form a secure environment +This file is an example of a researcher creating/training a machine learning +model to be released from a secure environment. -This specific example uses the nursery dataset: data is read in and pre-processed, and a classifier -is trained and tested on this dataset. +This specific example uses the nursery dataset: data is read in and +pre-processed, and a classifier is trained and tested on this dataset. -This example follows User Story 7 +This example follows User Story 7. -NOTE: this user story is an example of a model that cannot be released since the researcher has not -provided enough data +NOTE: this user story is an example of a model that cannot be released since +the researcher has not provided enough data. Steps: -- Researcher creates and pre-processes a dataset -- Researcher creates and trains a classifier on this data -- Reasercher saves the model manually (e.g. using pickle, not through request_release() or similar) -- Researcher does not save the training/testing data, and therefore the TRE cannot verify the model +- Researcher creates and pre-processes a dataset. +- Researcher creates and trains a classifier on this data. +- Researcher saves the model manually (e.g. using pickle, not through + request_release() or similar). +- Researcher does not save the training/testing data, and therefore the TRE + cannot verify the model.
""" import logging @@ -27,15 +29,14 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from aisdc.attacks.target import Target # pylint: disable=import-error -from aisdc.safemodel.classifiers import ( # pylint: disable=import-error - SafeDecisionTreeClassifier, -) +from aisdc.attacks.target import Target +from aisdc.safemodel.classifiers import SafeDecisionTreeClassifier -def run_user_story(): # pylint: disable=too-many-locals +def run_user_story(): """Create and train model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc + # This section is not necessary but helpful - cleans up files that are + # created by aisdc directory = "training_artefacts" print("Creating directory for training artefacts") @@ -46,7 +47,8 @@ def run_user_story(): # pylint: disable=too-many-locals print("Acting as researcher...") print() - # Read in and pre-process the dataset - replace this with your data reading/pre-processing code + # Read in and pre-process the dataset - replace this with your data + # reading/pre-processing code filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv") print("Reading data from " + filename) data_df = pd.read_csv(filename) @@ -67,7 +69,7 @@ def run_user_story(): # pylint: disable=too-many-locals ] # Split into training and test sets - (x_train_orig, x_test_orig, y_train_orig, y_test_orig) = train_test_split( + (X_train_orig, X_test_orig, y_train_orig, y_test_orig) = train_test_split( data, labels, test_size=0.5, @@ -78,9 +80,9 @@ def run_user_story(): # pylint: disable=too-many-locals # Preprocess dataset label_enc = LabelEncoder() feature_enc = OneHotEncoder() - x_train = feature_enc.fit_transform(x_train_orig).toarray() + X_train = feature_enc.fit_transform(X_train_orig).toarray() y_train = label_enc.fit_transform(y_train_orig) - x_test = feature_enc.transform(x_test_orig).toarray() + X_test = feature_enc.transform(X_test_orig).toarray() y_test = label_enc.transform(y_test_orig) logging.getLogger("attack-reps").setLevel(logging.WARNING) @@ -89,27 +91,28 @@ def run_user_story(): # pylint: disable=too-many-locals # Create, train and test a model - replace this with your training and testing code model = SafeDecisionTreeClassifier(random_state=1) - model.fit(x_train, y_train) + model.fit(X_train, y_train) model.request_release(path=directory, ext="pkl") # Wrap the model and data in a Target object target = Target(model=model) target.name = "nursery" - target.add_processed_data(x_train, y_train, x_test, y_test) + target.add_processed_data(X_train, y_train, X_test, y_test) target.add_raw_data( - data, labels, x_train_orig, y_train_orig, x_test_orig, y_test_orig + data, labels, X_train_orig, y_train_orig, X_test_orig, y_test_orig ) for i in range(n_features): target.add_feature(data_df.columns[i], indices[i], "onehot") - # NOTE: we assume here that the researcher does not use the target.save() function - # and instead provides only the model, preventing this model from being checked by the TRE + # NOTE: we assume here that the researcher does not use the target.save() + # function and instead provides only the model, preventing this model from + # being checked by the TRE. 
logging.info("Dataset: %s", target.name) logging.info("Features: %s", target.features) - logging.info("x_train shape = %s", np.shape(target.x_train)) + logging.info("X_train shape = %s", np.shape(target.X_train)) logging.info("y_train shape = %s", np.shape(target.y_train)) - logging.info("x_test shape = %s", np.shape(target.x_test)) + logging.info("X_test shape = %s", np.shape(target.X_test)) logging.info("y_test shape = %s", np.shape(target.y_test)) diff --git a/user_stories/user_story_7/user_story_7_tre.py b/user_stories/user_story_7/user_story_7_tre.py index 8860a4c7..654e1e7c 100644 --- a/user_stories/user_story_7/user_story_7_tre.py +++ b/user_stories/user_story_7/user_story_7_tre.py @@ -50,7 +50,8 @@ def run_user_story(release_config: dict): if __name__ == "__main__": parser = argparse.ArgumentParser( description=( - "Generate a risk report after request_release() has been called by researcher" + "Generate a risk report after request_release() " + "has been called by researcher" ) ) diff --git a/user_stories/user_story_8/data_processing_researcher.py b/user_stories/user_story_8/data_processing_researcher.py index e3ad658a..e039ffaf 100644 --- a/user_stories/user_story_8/data_processing_researcher.py +++ b/user_stories/user_story_8/data_processing_researcher.py @@ -9,7 +9,7 @@ - take a single parameter (the data to be processed) - return a dictionary - which contains the keys ] - ['n_features_raw_data', 'x_transformed', 'y_transformed', 'train_indices'] + ['n_features_raw_data', 'X_transformed', 'y_transformed', 'train_indices'] """ import numpy as np @@ -28,23 +28,23 @@ def process_dataset(data): label_enc = LabelEncoder() feature_enc = OneHotEncoder() - x_transformed = feature_enc.fit_transform(data).toarray() + X_transformed = feature_enc.fit_transform(data).toarray() y_transformed = label_enc.fit_transform(labels) - row_indices = np.arange(np.shape(x_transformed)[0]) + row_indices = np.arange(np.shape(X_transformed)[0]) # This step is not necessary, however it's the simplest way of getting training indices from # the data # Any method of generating indices of samples to be used for training will work here ( - x_train, - x_test, + X_train, + X_test, y_train, y_test, train_indices, test_indices, ) = train_test_split( # pylint: disable=unused-variable - x_transformed, + X_transformed, y_transformed, row_indices, test_size=0.5, @@ -54,7 +54,7 @@ def process_dataset(data): returned = {} returned["n_features_raw_data"] = n_features_raw_data - returned["x_transformed"] = x_transformed + returned["X_transformed"] = X_transformed returned["y_transformed"] = y_transformed returned["train_indices"] = train_indices diff --git a/user_stories/user_story_8/user_story_8_researcher_template.py b/user_stories/user_story_8/user_story_8_researcher_template.py index fd3eb7bd..7203afe9 100644 --- a/user_stories/user_story_8/user_story_8_researcher_template.py +++ b/user_stories/user_story_8/user_story_8_researcher_template.py @@ -1,35 +1,36 @@ """RESEARCHER EXAMPLE FOR USER STORY 8. -This file is an example of a researcher creating/training a machine learning model and to -be released form a secure environment +This file is an example of a researcher creating/training a machine learning +model and to be released form a secure environment. -This specific example uses the nursery dataset: data is read in and pre-processed, and a -classifier is trained and tested on this dataset. 
+This specific example uses the nursery dataset: data is read in and +pre-processed, and a classifier is trained and tested on this dataset. -This example follows User Story 8 +This example follows User Story 8. Steps: -- Researcher creates a function to read and process a dataset, which a TRE can also use and call -- Researcher creates and trains a classifier on this data -- Researcher emails (or otherwise contacts) TRE to request the model be released -- TREs will use this code/functions to test the model themselves +- Researcher creates a function to read and process a dataset, which a TRE can + also use and call. +- Researcher creates and trains a classifier on this data. +- Researcher emails (or otherwise contacts) TRE to request the model be released. +- TREs will use this code/functions to test the model themselves. """ import logging import os import pickle -import numpy as np import pandas as pd from data_processing_researcher import process_dataset from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score -def run_user_story(): # pylint: disable=too-many-locals +def run_user_story(): """Create and train a model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc + # This section is not necessary but helpful - cleans up files that are + # created by aisdc directory = "training_artefacts" print("Creating directory for training artefacts") @@ -52,27 +53,27 @@ def run_user_story(): # pylint: disable=too-many-locals # take a single parameter (the data to be processed) # return a dictionary # which contains the keys - # ['n_features_raw_data', 'x_transformed', 'y_transformed', 'train_indices'] + # >>> ['n_features_raw_data', 'X_transformed', 'y_transformed', 'train_indices'] # as in this example returned = process_dataset(data) - x_transformed = returned["x_transformed"] + X_transformed = returned["X_transformed"] y_transformed = returned["y_transformed"] train_indices = set(returned["train_indices"]) - x_train = [] - x_test = [] + X_train = [] + X_test = [] y_train = [] y_test = [] for i, label in enumerate(y_transformed): if i in train_indices: - x_train.append(x_transformed[i]) + X_train.append(X_transformed[i]) y_train.append(label) else: - x_test.append(x_transformed[i]) + X_test.append(X_transformed[i]) y_test.append(label) logging.getLogger("attack-reps").setLevel(logging.WARNING) @@ -88,10 +89,10 @@ def run_user_story(): # pylint: disable=too-many-locals # Build a model target_model = RandomForestClassifier(**hyperparameters) - target_model.fit(x_train, y_train) + target_model.fit(X_train, y_train) - train_acc = accuracy_score(y_train, target_model.predict(x_train)) - test_acc = accuracy_score(y_test, target_model.predict(x_test)) + train_acc = accuracy_score(y_train, target_model.predict(X_train)) + test_acc = accuracy_score(y_test, target_model.predict(X_test)) print(f"Training accuracy on model: {train_acc:.2f}") print(f"Testing accuracy on model: {test_acc:.2f}") diff --git a/user_stories/user_story_8/user_story_8_tre.py b/user_stories/user_story_8/user_story_8_tre.py index 41d71bf3..e676ff41 100644 --- a/user_stories/user_story_8/user_story_8_tre.py +++ b/user_stories/user_story_8/user_story_8_tre.py @@ -50,7 +50,8 @@ def run_user_story(release_config: dict): if __name__ == "__main__": parser = argparse.ArgumentParser( description=( - "Generate a risk report after request_release() has been called by researcher" + "Generate a risk report after request_release() " + "has been 
called by researcher" ) )