Merge pull request #103 from fetchai/docs_improvements
COLE-379 Docs improvements
evsmithx authored Feb 10, 2021
2 parents cb0a856 + 8173fa7 commit 1e98fd0
Showing 24 changed files with 133 additions and 155 deletions.
103 changes: 36 additions & 67 deletions colearn/utils/plot.py
@@ -1,65 +1,20 @@
import matplotlib.axes._axes as mpl_ax
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np

from colearn.utils.results import Results


class ColearnPlot:
def __init__(self, n_learners: int, score_name: str = "user-defined score"):
def __init__(self, score_name: str = "user-defined score"):
self.score_name = score_name
self.n_learners = n_learners
self.results_axes: mpl_ax.Axes = plt.subplot(2, 1, 1, label="sub1")
self.votes_axes: mpl_ax.Axes = plt.subplot(2, 1, 2, label="sub2")

def _process_statistics(self, results: Results):
results.h_test_scores = []
results.h_vote_scores = []

results.mean_test_scores = []
results.mean_vote_scores = []

results.max_test_scores = []
results.max_vote_scores = []

for r in range(len(results.data)):
results.mean_test_scores.append(
np.mean(np.array(results.data[r].test_scores))
)
results.mean_vote_scores.append(
np.mean(np.array(results.data[r].vote_scores))
)
results.max_test_scores.append(np.max(np.array(results.data[r].test_scores)))
results.max_vote_scores.append(np.max(np.array(results.data[r].vote_scores)))

# gather individual scores
for i in range(self.n_learners):
results.h_test_scores.append([])
results.h_vote_scores.append([])

for r in range(len(results.data)):
results.h_test_scores[i].append(results.data[r].test_scores[i])
results.h_vote_scores[i].append(results.data[r].vote_scores[i])

results.highest_test_score = np.max(np.array(results.h_test_scores))
results.highest_vote_score = np.max(np.array(results.h_vote_scores))

results.highest_mean_test_score = np.max(results.mean_test_scores)
results.highest_mean_vote_score = np.max(results.mean_vote_scores)

results.current_mean_test_score = results.mean_test_scores[-1]
results.current_mean_vote_score = results.mean_vote_scores[-1]

results.current_max_test_score = results.max_test_scores[-1]
results.current_max_vote_score = results.max_vote_scores[-1]

results.mean_mean_test_score = np.mean(np.array(results.h_test_scores))
results.mean_mean_vote_score = np.mean(np.array(results.h_vote_scores))

def plot_results(self, results, block=False):
# Prepare data for plotting
self._process_statistics(results)
results.process_statistics()

plt.ion()
plt.show(block=False)
@@ -72,33 +27,33 @@ def plot_results(self, results, block=False):
self.results_axes.set_xlim(-0.5, len(results.mean_test_scores) - 0.5)
self.results_axes.set_xticks(np.arange(0, len(results.mean_test_scores), step=1))

rounds = range(len(results.mean_test_scores))

for i in range(self.n_learners):
n_rounds = len(results.data)
n_learners = len(results.data[0].vote_scores)
for i in range(n_learners):
self.results_axes.plot(
rounds,
range(n_rounds),
results.h_test_scores[i],
"b--",
alpha=0.5,
label=f"test {self.score_name}",
)
self.results_axes.plot(
rounds,
range(n_rounds),
results.h_vote_scores[i],
"r--",
alpha=0.5,
label=f"vote {self.score_name}",
)

(line_mean_test_score,) = self.results_axes.plot(
rounds,
range(n_rounds),
results.mean_test_scores,
"b",
linewidth=3,
label=f"mean test {self.score_name}",
)
(line_mean_vote_score,) = self.results_axes.plot(
rounds,
range(n_rounds),
results.mean_vote_scores,
"r",
linewidth=3,
@@ -120,20 +75,30 @@ def plot_votes(self, results: Results, block=False):

results_list = results.data

data = np.array([res.votes for res in results_list])
votes_array = np.array([res.votes for res in results_list])

votes_array = votes_array.transpose()

data = data.transpose()
self.votes_axes.matshow(data, aspect="auto", vmin=0, vmax=1)
coloured_votes_array = np.zeros((votes_array.shape[0], votes_array.shape[1], 3), dtype=np.int)

n_learners = data.shape[0]
n_rounds = data.shape[1]
green_colour = np.array([204, 255, 204], dtype=np.int)
red_colour = np.array([255, 153, 153], dtype=np.int)
coloured_votes_array[votes_array == 1] = green_colour
coloured_votes_array[votes_array == 0] = red_colour

# make extra legend entries
red_patch_handle = mpatches.Patch(color=red_colour / 256, label='Negative vote')
green_patch_handle = mpatches.Patch(color=green_colour / 256, label='Positive vote')

self.votes_axes.imshow(coloured_votes_array, aspect="auto", interpolation='nearest')

n_learners = votes_array.shape[0]
n_rounds = votes_array.shape[1]

# draw gridlines
self.votes_axes.set_xticks(range(n_rounds))

ticks = [""] + ["Learner " + str(i) for i in range(n_learners)] + [""]
ticks_loc = self.votes_axes.get_yticks().tolist()
self.votes_axes.yaxis.set_major_locator(mticker.FixedLocator(ticks_loc))
ticks = ["Learner " + str(i) for i in range(n_learners)]
self.votes_axes.set_yticks(range(n_learners))
self.votes_axes.set_yticklabels(ticks)

pos_xs = []
Expand All @@ -148,10 +113,14 @@ def plot_votes(self, results: Results, block=False):
neg_xs.append(i + 1)
neg_ys.append(res.block_proposer)

self.votes_axes.scatter(pos_xs, pos_ys, marker="*", s=150, label="Positive overall vote")
self.votes_axes.scatter(neg_xs, neg_ys, marker="X", s=150, label="Negative overall vote")
self.votes_axes.scatter(pos_xs, pos_ys, marker=r"$\checkmark$", c="green", s=150,
label="Proposer and positive overall vote")
self.votes_axes.scatter(neg_xs, neg_ys, marker="X", c="red", s=150,
label="Proposer and negative overall vote")
self.votes_axes.set_xlabel("training round")
self.votes_axes.legend()

handles, _ = self.votes_axes.get_legend_handles_labels()
self.votes_axes.legend(handles=handles + [green_patch_handle, red_patch_handle])

# Gridlines based on minor ticks
self.votes_axes.set_xticks(np.arange(-0.5, n_rounds, 1), minor=True)
27 changes: 12 additions & 15 deletions colearn/utils/results.py
@@ -1,4 +1,5 @@
from typing import List
import numpy as np


class Result:
@@ -21,23 +22,19 @@ def __init__(self):
self.mean_test_scores = []
self.mean_vote_scores = []

self.max_test_scores = []
self.max_vote_scores = []

self.highest_test_score = 0
self.highest_vote_score = 0

self.highest_mean_test_score = 0
self.highest_mean_vote_score = 0

self.current_mean_test_score = 0
self.current_mean_vote_score = 0
def process_statistics(self):
self.h_test_scores = []
self.h_vote_scores = []

self.current_max_test_score = 0
self.current_max_vote_score = 0
n_rounds = len(self.data)
self.mean_test_scores = [np.mean(np.array(self.data[r].test_scores)) for r in range(n_rounds)]
self.mean_vote_scores = [np.mean(np.array(self.data[r].vote_scores)) for r in range(n_rounds)]

self.mean_mean_test_score = 0
self.mean_mean_vote_score = 0
# gather individual scores
n_learners = len(self.data[0].vote_scores)
for i in range(n_learners):
self.h_test_scores.append([self.data[r].test_scores[i] for r in range(n_rounds)])
self.h_vote_scores.append([self.data[r].vote_scores[i] for r in range(n_rounds)])


def print_results(results: Results):
5 changes: 0 additions & 5 deletions colearn_grpc/test_example_mli_factory.py
@@ -15,28 +15,24 @@ def factory() -> ExampleMliFactory:
return ExampleMliFactory()


@pytest.mark.slow
def test_setup(factory):
assert len(factory.get_models()) > 0
assert len(factory.get_dataloaders()) > 0
assert len(factory.get_compatibilities()) > 0


@pytest.mark.slow
def test_model_names(factory):
for task in TaskType:
assert task.name in factory.get_models().keys()


@pytest.mark.slow
def test_dataloader_names(factory):
for task in TaskType:
assert task.name in factory.get_dataloaders().keys()

assert len(factory.get_dataloaders()[TaskType.KERAS_MNIST.name]) > 0


@pytest.mark.slow
def test_compatibilities(factory):
for task in TaskType:
assert task.name in factory.get_models().keys()
@@ -56,7 +52,6 @@ def mnist_config():
}


@pytest.mark.slow
def test_get_mnist(factory, mnist_config):

model_params = json.dumps({'model_type': mnist_config['model_type']})
10 changes: 5 additions & 5 deletions docs/demo.md
@@ -10,7 +10,7 @@ There are five potential datasets for the demo
* PYTORCH_XRAY is a Pytorch implementation of a binary classification task that requires predicting pneumonia from images of chest X-rays.
The data need to be downloaded from [kaggle](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia)
* PYTORCH_COVID_XRAY is a Pytorch implementation of a 3-class classification task that requires predicting no finding, covid or pneumonia from images of chest X-rays.
This dataset is currently unavailable.
This dataset is not currently publicly available.
* FRAUD The fraud dataset consists of information about credit card transactions, and the task is to predict whether
transactions are fraudulent or not.
The data need to be downloaded from [kaggle](https://www.kaggle.com/c/ieee-fraud-detection)
@@ -36,14 +36,14 @@ Arguments to run the demo:
```

## Running MNIST
The simplest task to run is MNIST because this doesn't require downloading the data.
This runs the MNIST task with five learners for 15 rounds.
The simplest task to run is MNIST because the data are downloaded automatically from `tensorflow_datasets`.
The command below runs the MNIST task with five learners for 15 rounds.
```bash
examples/run_demo.py --task KERAS_MNIST --n_learners 5 --n_rounds 15
```
You should see a graph of the vote score and the test score (the score used here is categorical accuracy).
New model is accepted (blue star) if amount of possitive votes (yellow color) is higher than 0.5.
New model is rejected (orange cross) if amount of negative votes (purple color) is lower than 0.5.
The new model is accepted if the fraction of positive votes (green colour) is higher than 0.5.
The new model is rejected if the fraction of negative votes (red colour) is 0.5 or higher.
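
As a minimal illustration of the acceptance rule (hypothetical variable names, not the colearn API):

```python
# One boolean vote per learner on the proposed model update.
votes = [True, True, False, True, False]

# The update is accepted when more than half of the votes are positive.
accepted = sum(votes) / len(votes) > 0.5
print(accepted)  # True: 3 of the 5 learners voted positively
```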

![Alt text](images/mnist_plot.png?raw=true "Collective learning graph")

40 changes: 33 additions & 7 deletions docs/differential_privacy.md
@@ -1,11 +1,37 @@
# What is differential privacy?
Differential privacy (DP) is a system for publicly sharing information about a dataset by describing the patterns
of groups within the dataset while withholding information about individuals in the dataset.
The idea behind differential privacy is that if the effect of making an arbitrary single substitution in
the database is small enough, the query result cannot be used to infer much about any single individual,
and therefore provides privacy.
To make a machine learning system that protects privacy we first need to have a definition of what privacy is.
Differential privacy (DP) is one such definition.
First we need three concepts: the _database_ is a collection of data about _individuals_ (for example, their medical records), and we want to make a _query_ about that data (for example, "How much does smoking increase someone's risk of cancer?").
DP says that privacy is preserved if the result of the query cannot be used to determine if any particular individual is present in the database.

So if person A has their medical data in a database, and the query that we want to make on that database is
"How much does smoking increase someone's risk of cancer" then the result of that query shouldn't disclose whether or not person A's details are in the database.

From this comes the idea of _sensitivity_ of a query.
The _sensitivity_ of a query determines how much the result of the query depends on an individual's data.
For example, the query "How much does smoking increase the risk of cancer for adults in the UK?" is less sensitive than the query "How much does smoking increase the risk of cancer for men aged 50-55 in Cambridge?" because the second query uses a smaller set of individuals.

## Epsilon-differential privacy
Epsilon-differential privacy (EDP) is one scheme for providing differential privacy.
In EDP all queries have random noise added to them, so they are no longer deterministic.
So if the query was "What fraction of people in the database are male" and the true result is 0.5, then the results of calling this query three times might be 0.53, 0.49 and 0.51.
This makes it harder to tell if an individual's data is in the database, because the effect of adding a person can't be distinguished from the effect of the random noise.
Intuitively this is a bit like blurring an image: adding noise obscures personal information.
The amount of personal information that is revealed isn't zero, but it is guaranteed to be below a certain threshold.

The level of privacy that is provided is controlled by the parameter epsilon: the smaller epsilon is, the more noise is added and the more privacy is preserved.
Queries that are more sensitive have more noise added, because they reveal more information about individuals.
It is important to add as little noise as possible, because adding more noise obscures the patterns that you want to extract from the data.
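
As a minimal sketch of this mechanism (illustrative only: the sensitivity and epsilon values are made up, and this is not colearn's implementation), Laplace noise scaled by sensitivity/epsilon can be added to a query result:

```python
import numpy as np

def noisy_query(true_result: float, sensitivity: float, epsilon: float) -> float:
    """Return the query result with Laplace noise added.

    The noise scale grows with the query's sensitivity and shrinks as
    epsilon grows, so smaller epsilon means more noise and more privacy.
    """
    noise = np.random.laplace(loc=0.0, scale=sensitivity / epsilon)
    return true_result + noise

# A query whose true answer is 0.5; repeated calls give slightly different answers.
print([round(noisy_query(0.5, sensitivity=0.01, epsilon=1.0), 3) for _ in range(3)])
```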

## Differential privacy when training neural networks
Each training step for a neural network can be thought of as a complicated query on a database of training data.
Differential privacy mechanisms tell you how much noise you need to add to guarantee a certain level of privacy.
The `opacus` and `tensorflow-privacy` libraries implement epsilon-differential privacy for training neural networks in pytorch and keras respectively.


# How to use differential privacy with colearn
The opacus and tensorflow-privacy libraries implement DP for pytorch and keras respectively.
To see an example of using them see [dp_pytorch]({{ repo_root }}/examples/pytorch_mnist_diffpriv.py)
By using `opacus` and `tensorflow-privacy` we can make collective learning use differential privacy.
The learner that is proposing weights does so using a DP-enabled optimiser.

For an example of this, see [dp_pytorch]({{ repo_root }}/examples/pytorch_mnist_diffpriv.py)
and [dp_keras]({{ repo_root }}/examples/keras_mnist_diffpriv.py).
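
As a rough sketch of what a DP-enabled optimiser does internally, each example's gradient is clipped and calibrated Gaussian noise is added before the update. The deliberately slow per-example loop below is for clarity only; libraries such as `opacus` do this efficiently. This is not the opacus API and not colearn's code, and the hyperparameter values are placeholders:

```python
import torch

def dp_sgd_step(model, loss_fn, batch_x, batch_y, lr=0.1,
                max_grad_norm=1.0, noise_multiplier=1.1):
    """One conceptual DP-SGD step: clip per-example gradients, then add noise."""
    params = [p for p in model.parameters() if p.requires_grad]
    summed_grads = [torch.zeros_like(p) for p in params]

    for x, y in zip(batch_x, batch_y):
        model.zero_grad()
        loss = loss_fn(model(x.unsqueeze(0)), y.unsqueeze(0))
        loss.backward()
        # Clip this example's gradient so no single example has too much influence.
        total_norm = torch.sqrt(sum(p.grad.norm() ** 2 for p in params))
        clip_factor = torch.clamp(max_grad_norm / (total_norm + 1e-6), max=1.0)
        for g_sum, p in zip(summed_grads, params):
            g_sum += p.grad * clip_factor

    with torch.no_grad():
        for p, g_sum in zip(params, summed_grads):
            # Gaussian noise calibrated to the clipping bound hides any one example.
            noise = torch.normal(0.0, noise_multiplier * max_grad_norm, size=p.shape)
            p -= lr * (g_sum + noise) / len(batch_x)
```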
6 changes: 4 additions & 2 deletions docs/examples.md
@@ -13,7 +13,8 @@ This is a list of examples that we've implemented to show you how to use Collect
### Fraud
The fraud dataset consists of information about credit card transactions.
The task is to predict whether transactions are fraudulent or not.
The data needs to be downloaded from [Kaggle](https://www.kaggle.com/c/ieee-fraud-detection)
The data needs to be downloaded from [Kaggle](https://www.kaggle.com/c/ieee-fraud-detection),
and the data directory passed in with the flag `--data_dir`.

* [fraud_mli]({{ repo_root }}/examples/mli_fraud.py).
Uses the `MachineLearningInterface` directly and detects fraud in bank transactions.
@@ -28,7 +29,8 @@ This is a list of examples that we've implemented to show you how to use Collect
Uses the `PytorchLearner` helper class.
### Xray
A binary classification task that requires predicting pneumonia from images of chest X-rays.
The data need to be downloaded from [Kaggle](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia)
The data need to be downloaded from [Kaggle](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia),
and the data directory passed in with the flag `--data_dir`.

* [xray_keras]({{ repo_root }}/examples/keras_xray.py).
Uses the `KerasLearner` helper class.
Binary file modified docs/images/mnist_plot.png
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -15,7 +15,7 @@ pip install .[keras]
pip install .[pytorch]
```

To install all the extras, including the ones required for the examples, use:
To install both the keras and pytorch extras use:
```
pip install .[all]
```
2 changes: 0 additions & 2 deletions docs/intro_tutorial_keras.md
@@ -90,5 +90,3 @@ plot_results(results, n_learners, block=False,
score_name=all_learner_models[0].criterion)
plot_votes(results, block=True)
```

Simple!
2 changes: 0 additions & 2 deletions docs/intro_tutorial_pytorch.md
@@ -96,5 +96,3 @@ plot_results(results, n_learners, score_name=score_name)
plot_votes(results, block=True)

```

Simple!
3 changes: 1 addition & 2 deletions examples/keras_cifar.py
@@ -115,8 +115,7 @@ def get_model():
# Get initial score
results.data.append(initial_result(all_learner_models))

plot = ColearnPlot(n_learners=n_learners,
score_name=all_learner_models[0].criterion)
plot = ColearnPlot(score_name=all_learner_models[0].criterion)

for round_index in range(n_rounds):
results.data.append(
3 changes: 1 addition & 2 deletions examples/keras_fraud.py
@@ -115,8 +115,7 @@ def get_model():
# Get initial score
results.data.append(initial_result(all_learner_models))

plot = ColearnPlot(n_learners=n_learners,
score_name="loss")
plot = ColearnPlot(score_name="loss")

for round_index in range(n_rounds):
results.data.append(
3 changes: 1 addition & 2 deletions examples/keras_mnist.py
@@ -100,8 +100,7 @@ def get_model():
results = Results()
results.data.append(initial_result(all_learner_models))

plot = ColearnPlot(n_learners=n_learners,
score_name=all_learner_models[0].criterion)
plot = ColearnPlot(score_name=all_learner_models[0].criterion)

for round_index in range(n_rounds):
results.data.append(