Merge pull request #103 from fetchai/docs_improvements
COLE-379 Docs improvements
evsmithx authored Feb 10, 2021
2 parents cb0a856 + 8173fa7 commit 1e98fd0
Showing 24 changed files with 133 additions and 155 deletions.
103 changes: 36 additions & 67 deletions colearn/utils/plot.py
@@ -1,65 +1,20 @@
import matplotlib.axes._axes as mpl_ax
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np

from colearn.utils.results import Results


class ColearnPlot:
def __init__(self, n_learners: int, score_name: str = "user-defined score"):
def __init__(self, score_name: str = "user-defined score"):
self.score_name = score_name
self.n_learners = n_learners
self.results_axes: mpl_ax.Axes = plt.subplot(2, 1, 1, label="sub1")
self.votes_axes: mpl_ax.Axes = plt.subplot(2, 1, 2, label="sub2")

def _process_statistics(self, results: Results):
results.h_test_scores = []
results.h_vote_scores = []

results.mean_test_scores = []
results.mean_vote_scores = []

results.max_test_scores = []
results.max_vote_scores = []

for r in range(len(results.data)):
results.mean_test_scores.append(
np.mean(np.array(results.data[r].test_scores))
)
results.mean_vote_scores.append(
np.mean(np.array(results.data[r].vote_scores))
)
results.max_test_scores.append(np.max(np.array(results.data[r].test_scores)))
results.max_vote_scores.append(np.max(np.array(results.data[r].vote_scores)))

# gather individual scores
for i in range(self.n_learners):
results.h_test_scores.append([])
results.h_vote_scores.append([])

for r in range(len(results.data)):
results.h_test_scores[i].append(results.data[r].test_scores[i])
results.h_vote_scores[i].append(results.data[r].vote_scores[i])

results.highest_test_score = np.max(np.array(results.h_test_scores))
results.highest_vote_score = np.max(np.array(results.h_vote_scores))

results.highest_mean_test_score = np.max(results.mean_test_scores)
results.highest_mean_vote_score = np.max(results.mean_vote_scores)

results.current_mean_test_score = results.mean_test_scores[-1]
results.current_mean_vote_score = results.mean_vote_scores[-1]

results.current_max_test_score = results.max_test_scores[-1]
results.current_max_vote_score = results.max_vote_scores[-1]

results.mean_mean_test_score = np.mean(np.array(results.h_test_scores))
results.mean_mean_vote_score = np.mean(np.array(results.h_vote_scores))

def plot_results(self, results, block=False):
# Prepare data for plotting
self._process_statistics(results)
results.process_statistics()

plt.ion()
plt.show(block=False)
@@ -72,33 +27,33 @@ def plot_results(self, results, block=False):
self.results_axes.set_xlim(-0.5, len(results.mean_test_scores) - 0.5)
self.results_axes.set_xticks(np.arange(0, len(results.mean_test_scores), step=1))

rounds = range(len(results.mean_test_scores))

for i in range(self.n_learners):
n_rounds = len(results.data)
n_learners = len(results.data[0].vote_scores)
for i in range(n_learners):
self.results_axes.plot(
rounds,
range(n_rounds),
results.h_test_scores[i],
"b--",
alpha=0.5,
label=f"test {self.score_name}",
)
self.results_axes.plot(
rounds,
range(n_rounds),
results.h_vote_scores[i],
"r--",
alpha=0.5,
label=f"vote {self.score_name}",
)

(line_mean_test_score,) = self.results_axes.plot(
rounds,
range(n_rounds),
results.mean_test_scores,
"b",
linewidth=3,
label=f"mean test {self.score_name}",
)
(line_mean_vote_score,) = self.results_axes.plot(
rounds,
range(n_rounds),
results.mean_vote_scores,
"r",
linewidth=3,
@@ -120,20 +75,30 @@ def plot_votes(self, results: Results, block=False):

results_list = results.data

data = np.array([res.votes for res in results_list])
votes_array = np.array([res.votes for res in results_list])

votes_array = votes_array.transpose()

data = data.transpose()
self.votes_axes.matshow(data, aspect="auto", vmin=0, vmax=1)
coloured_votes_array = np.zeros((votes_array.shape[0], votes_array.shape[1], 3), dtype=np.int)

n_learners = data.shape[0]
n_rounds = data.shape[1]
green_colour = np.array([204, 255, 204], dtype=np.int)
red_colour = np.array([255, 153, 153], dtype=np.int)
coloured_votes_array[votes_array == 1] = green_colour
coloured_votes_array[votes_array == 0] = red_colour

# make extra legend entries
red_patch_handle = mpatches.Patch(color=red_colour / 256, label='Negative vote')
green_patch_handle = mpatches.Patch(color=green_colour / 256, label='Positive vote')

self.votes_axes.imshow(coloured_votes_array, aspect="auto", interpolation='nearest')

n_learners = votes_array.shape[0]
n_rounds = votes_array.shape[1]

# draw gridlines
self.votes_axes.set_xticks(range(n_rounds))

ticks = [""] + ["Learner " + str(i) for i in range(n_learners)] + [""]
ticks_loc = self.votes_axes.get_yticks().tolist()
self.votes_axes.yaxis.set_major_locator(mticker.FixedLocator(ticks_loc))
ticks = ["Learner " + str(i) for i in range(n_learners)]
self.votes_axes.set_yticks(range(n_learners))
self.votes_axes.set_yticklabels(ticks)

pos_xs = []
Expand All @@ -148,10 +113,14 @@ def plot_votes(self, results: Results, block=False):
neg_xs.append(i + 1)
neg_ys.append(res.block_proposer)

self.votes_axes.scatter(pos_xs, pos_ys, marker="*", s=150, label="Positive overall vote")
self.votes_axes.scatter(neg_xs, neg_ys, marker="X", s=150, label="Negative overall vote")
self.votes_axes.scatter(pos_xs, pos_ys, marker=r"$\checkmark$", c="green", s=150,
label="Proposer and positive overall vote")
self.votes_axes.scatter(neg_xs, neg_ys, marker="X", c="red", s=150,
label="Proposer and negative overall vote")
self.votes_axes.set_xlabel("training round")
self.votes_axes.legend()

handles, _ = self.votes_axes.get_legend_handles_labels()
self.votes_axes.legend(handles=handles + [green_patch_handle, red_patch_handle])

# Gridlines based on minor ticks
self.votes_axes.set_xticks(np.arange(-0.5, n_rounds, 1), minor=True)
27 changes: 12 additions & 15 deletions colearn/utils/results.py
@@ -1,4 +1,5 @@
from typing import List
import numpy as np


class Result:
@@ -21,23 +22,19 @@ def __init__(self):
self.mean_test_scores = []
self.mean_vote_scores = []

self.max_test_scores = []
self.max_vote_scores = []

self.highest_test_score = 0
self.highest_vote_score = 0

self.highest_mean_test_score = 0
self.highest_mean_vote_score = 0

self.current_mean_test_score = 0
self.current_mean_vote_score = 0
def process_statistics(self):
self.h_test_scores = []
self.h_vote_scores = []

self.current_max_test_score = 0
self.current_max_vote_score = 0
n_rounds = len(self.data)
self.mean_test_scores = [np.mean(np.array(self.data[r].test_scores)) for r in range(n_rounds)]
self.mean_vote_scores = [np.mean(np.array(self.data[r].vote_scores)) for r in range(n_rounds)]

self.mean_mean_test_score = 0
self.mean_mean_vote_score = 0
# gather individual scores
n_learners = len(self.data[0].vote_scores)
for i in range(n_learners):
self.h_test_scores.append([self.data[r].test_scores[i] for r in range(n_rounds)])
self.h_vote_scores.append([self.data[r].vote_scores[i] for r in range(n_rounds)])


def print_results(results: Results):
5 changes: 0 additions & 5 deletions colearn_grpc/test_example_mli_factory.py
@@ -15,28 +15,24 @@ def factory() -> ExampleMliFactory:
return ExampleMliFactory()


@pytest.mark.slow
def test_setup(factory):
assert len(factory.get_models()) > 0
assert len(factory.get_dataloaders()) > 0
assert len(factory.get_compatibilities()) > 0


@pytest.mark.slow
def test_model_names(factory):
for task in TaskType:
assert task.name in factory.get_models().keys()


@pytest.mark.slow
def test_dataloader_names(factory):
for task in TaskType:
assert task.name in factory.get_dataloaders().keys()

assert len(factory.get_dataloaders()[TaskType.KERAS_MNIST.name]) > 0


@pytest.mark.slow
def test_compatibilities(factory):
for task in TaskType:
assert task.name in factory.get_models().keys()
@@ -56,7 +52,6 @@ def mnist_config():
}


@pytest.mark.slow
def test_get_mnist(factory, mnist_config):

model_params = json.dumps({'model_type': mnist_config['model_type']})
10 changes: 5 additions & 5 deletions docs/demo.md
@@ -10,7 +10,7 @@ There are five potential datasets for the demo
* PYTORCH_XRAY is a Pytorch implementation of a binary classification task that requires predicting pneumonia from images of chest X-rays.
The data need to be downloaded from [kaggle](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia)
* PYTORCH_COVID_XRAY is a Pytorch implementation of a 3-class classification task that requires predicting no finding, covid or pneumonia from images of chest X-rays.
This dataset is currently unavailable.
This dataset is not currently publicly available.
* FRAUD The fraud dataset consists of information about credit card transactions, and the task is to predict whether
transactions are fraudulent or not.
The data need to be downloaded from [kaggle](https://www.kaggle.com/c/ieee-fraud-detection)
@@ -36,14 +36,14 @@ Arguments to run the demo:
```

## Running MNIST
The simplest task to run is MNIST because this doesn't require downloading the data.
This runs the MNIST task with five learners for 15 rounds.
The simplest task to run is MNIST because the data are downloaded automatically from `tensorflow_datasets`.
The command below runs the MNIST task with five learners for 15 rounds.
```bash
examples/run_demo.py --task KERAS_MNIST --n_learners 5 --n_rounds 15
```
You should see a graph of the vote score and the test score (the score used here is categorical accuracy).
New model is accepted (blue star) if amount of possitive votes (yellow color) is higher than 0.5.
New model is rejected (orange cross) if amount of negative votes (purple color) is lower than 0.5.
The new model is accepted if the fraction of positive votes (green colour) is higher than 0.5.
The new model is rejected if the fraction of negative votes (red colour) is 0.5 or higher.
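
As a minimal illustration of the acceptance rule (hypothetical variable names, not the colearn API):

```python
# One boolean vote per learner on the proposed model update.
votes = [True, True, False, True, False]

# The update is accepted when more than half of the votes are positive.
accepted = sum(votes) / len(votes) > 0.5
print(accepted)  # True: 3 of the 5 learners voted positively
```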

![Alt text](images/mnist_plot.png?raw=true "Collective learning graph")

40 changes: 33 additions & 7 deletions docs/differential_privacy.md
@@ -1,11 +1,37 @@
# What is differential privacy?
Differential privacy (DP) is a system for publicly sharing information about a dataset by describing the patterns
of groups within the dataset while withholding information about individuals in the dataset.
The idea behind differential privacy is that if the effect of making an arbitrary single substitution in
the database is small enough, the query result cannot be used to infer much about any single individual,
and therefore provides privacy.
To make a machine learning system that protects privacy we first need to have a definition of what privacy is.
Differential privacy (DP) is one such definition.
First we need three concepts: the _database_ is a collection of data about _individuals_ (for example, their medical records), and we want to make a _query_ about that data (for example, "How much does smoking increase someone's risk of cancer?").
DP says that privacy is preserved if the result of the query cannot be used to determine if any particular individual is present in the database.

So if person A has their medical data in a database, and the query that we want to make on that database is
"How much does smoking increase someone's risk of cancer" then the result of that query shouldn't disclose whether or not person A's details are in the database.

From this comes the idea of _sensitivity_ of a query.
The _sensitivity_ of a query determines how much the result of the query depends on an individual's data.
For example, the query "How much does smoking increase the risk of cancer for adults in the UK?" is less sensitive than the query "How much does smoking increase the risk of cancer for men aged 50-55 in Cambridge?" because the second query uses a smaller set of individuals.

## Epsilon-differential privacy
Epsilon-differential privacy (EDP) is one scheme for providing differential privacy.
In EDP all queries have random noise added to them, so they are no longer deterministic.
So if the query was "What fraction of people in the database are male" and the true result is 0.5, then the results of calling this query three times might be 0.53, 0.49 and 0.51.
This makes it harder to tell if an individual's data is in the database, because the effect of adding a person can't be distinguished from the effect of the random noise.
Intuitively this is a bit like blurring an image: adding noise obscures personal information.
The amount of personal information that is revealed isn't zero, but it is guaranteed to be below a certain threshold.

The level of privacy that is provided is controlled by the parameter epsilon: the smaller epsilon is, the more noise is added and the more privacy is preserved.
Queries that are more sensitive have more noise added, because they reveal more information about individuals.
It is important to add as little noise as possible, because adding more noise obscures the patterns that you want to extract from the data.
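
As a minimal sketch of this mechanism (illustrative only: the sensitivity and epsilon values are made up, and this is not colearn's implementation), Laplace noise scaled by sensitivity/epsilon can be added to a query result:

```python
import numpy as np

def noisy_query(true_result: float, sensitivity: float, epsilon: float) -> float:
    """Return the query result with Laplace noise added.

    The noise scale grows with the query's sensitivity and shrinks as
    epsilon grows, so smaller epsilon means more noise and more privacy.
    """
    noise = np.random.laplace(loc=0.0, scale=sensitivity / epsilon)
    return true_result + noise

# A query whose true answer is 0.5; repeated calls give slightly different answers.
print([round(noisy_query(0.5, sensitivity=0.01, epsilon=1.0), 3) for _ in range(3)])
```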

## Differential privacy when training neural networks
Each training step for a neural network can be thought of as a complicated query on a database of training data.
Differential privacy mechanisms tell you how much noise you need to add to guarantee a certain level of privacy.
The `opacus` and `tensorflow-privacy` libraries implement epsilon-differential privacy for training neural networks in pytorch and keras respectively.


# How to use differential privacy with colearn
The opacus and tensorflow-privacy libraries implement DP for pytorch and keras respectively.
To see an example of using them see [dp_pytorch]({{ repo_root }}/examples/pytorch_mnist_diffpriv.py)
By using `opacus` and `tensorflow-privacy` we can make collective learning use differential privacy.
The learner that is proposing weights does so using a DP-enabled optimiser.

For an example of this, see [dp_pytorch]({{ repo_root }}/examples/pytorch_mnist_diffpriv.py)
and [dp_keras]({{ repo_root }}/examples/keras_mnist_diffpriv.py).
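
As a rough sketch of what a DP-enabled optimiser does internally, each example's gradient is clipped and calibrated Gaussian noise is added before the update. The deliberately slow per-example loop below is for clarity only; libraries such as `opacus` do this efficiently. This is not the opacus API and not colearn's code, and the hyperparameter values are placeholders:

```python
import torch

def dp_sgd_step(model, loss_fn, batch_x, batch_y, lr=0.1,
                max_grad_norm=1.0, noise_multiplier=1.1):
    """One conceptual DP-SGD step: clip per-example gradients, then add noise."""
    params = [p for p in model.parameters() if p.requires_grad]
    summed_grads = [torch.zeros_like(p) for p in params]

    for x, y in zip(batch_x, batch_y):
        model.zero_grad()
        loss = loss_fn(model(x.unsqueeze(0)), y.unsqueeze(0))
        loss.backward()
        # Clip this example's gradient so no single example has too much influence.
        total_norm = torch.sqrt(sum(p.grad.norm() ** 2 for p in params))
        clip_factor = torch.clamp(max_grad_norm / (total_norm + 1e-6), max=1.0)
        for g_sum, p in zip(summed_grads, params):
            g_sum += p.grad * clip_factor

    with torch.no_grad():
        for p, g_sum in zip(params, summed_grads):
            # Gaussian noise calibrated to the clipping bound hides any one example.
            noise = torch.normal(0.0, noise_multiplier * max_grad_norm, size=p.shape)
            p -= lr * (g_sum + noise) / len(batch_x)
```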
6 changes: 4 additions & 2 deletions docs/examples.md
@@ -13,7 +13,8 @@ This is a list of examples that we've implemented to show you how to use Collect
### Fraud
The fraud dataset consists of information about credit card transactions.
The task is to predict whether transactions are fraudulent or not.
The data needs to be downloaded from [Kaggle](https://www.kaggle.com/c/ieee-fraud-detection)
The data needs to be downloaded from [Kaggle](https://www.kaggle.com/c/ieee-fraud-detection),
and the data directory passed in with the flag `--data_dir`.

* [fraud_mli]({{ repo_root }}/examples/mli_fraud.py).
Uses the `MachineLearningInterface` directly and detects fraud in bank transactions.
@@ -28,7 +29,8 @@ This is a list of examples that we've implemented to show you how to use Collect
Uses the `PytorchLearner` helper class.
### Xray
A binary classification task that requires predicting pneumonia from images of chest X-rays.
The data need to be downloaded from [Kaggle](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia)
The data need to be downloaded from [Kaggle](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia),
and the data directory passed in with the flag `--data_dir`.

* [xray_keras]({{ repo_root }}/examples/keras_xray.py).
Uses the `KerasLearner` helper class.
Binary file modified docs/images/mnist_plot.png
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -15,7 +15,7 @@ pip install .[keras]
pip install .[pytorch]
```

To install all the extras, including the ones required for the examples, use:
To install both the keras and pytorch extras use:
```
pip install .[all]
```
2 changes: 0 additions & 2 deletions docs/intro_tutorial_keras.md
@@ -90,5 +90,3 @@ plot_results(results, n_learners, block=False,
score_name=all_learner_models[0].criterion)
plot_votes(results, block=True)
```

Simple!
2 changes: 0 additions & 2 deletions docs/intro_tutorial_pytorch.md
@@ -96,5 +96,3 @@ plot_results(results, n_learners, score_name=score_name)
plot_votes(results, block=True)

```

Simple!
3 changes: 1 addition & 2 deletions examples/keras_cifar.py
@@ -115,8 +115,7 @@ def get_model():
# Get initial score
results.data.append(initial_result(all_learner_models))

plot = ColearnPlot(n_learners=n_learners,
score_name=all_learner_models[0].criterion)
plot = ColearnPlot(score_name=all_learner_models[0].criterion)

for round_index in range(n_rounds):
results.data.append(
3 changes: 1 addition & 2 deletions examples/keras_fraud.py
@@ -115,8 +115,7 @@ def get_model():
# Get initial score
results.data.append(initial_result(all_learner_models))

plot = ColearnPlot(n_learners=n_learners,
score_name="loss")
plot = ColearnPlot(score_name="loss")

for round_index in range(n_rounds):
results.data.append(
3 changes: 1 addition & 2 deletions examples/keras_mnist.py
@@ -100,8 +100,7 @@ def get_model():
results = Results()
results.data.append(initial_result(all_learner_models))

plot = ColearnPlot(n_learners=n_learners,
score_name=all_learner_models[0].criterion)
plot = ColearnPlot(score_name=all_learner_models[0].criterion)

for round_index in range(n_rounds):
results.data.append(