-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexample_rf.py
112 lines (86 loc) · 3.31 KB
/
example_rf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from lib.dataset import EnvironmentalDataset
from lib.raster import PatchExtractor
from lib.evaluation import evaluate
from lib.metrics import ValidationAccuracyMultipleBySpecies
from lib.metrics import ValidationAccuracyMultiple
# This is an example of how to use a scikit-learn model with our dataset (here, a random forest classifier).
# SETTINGS

# input files
TRAINSET_PATH = './data/train_dataset.csv'
TESTSET_PATH = './data/test_dataset.csv'
RASTER_PATH = './data/rasters/'

# names of the CSV columns read from the train and test files
ID = 'id'
LABEL = 'Label'
LATITUDE = 'Latitude'
LONGITUDE = 'Longitude'

# dataset construction: fraction of the training file held out for validation
VAL_SIZE = 0.1

# environmental patches: size handed to the patch extractor
PATCH_SIZE = 1

# model parameters
N_LABELS = 4520  # number of columns of the full prediction matrix
MAX_DEPTH = 17  # max_depth of the random forest
N_TREES = 100  # n_estimators of the random forest

# evaluation: metrics passed to evaluate(), each built with the list [1, 10, 30]
METRICS = (
    ValidationAccuracyMultipleBySpecies([1, 10, 30]),
    ValidationAccuracyMultiple([1, 10, 30]),
)
# READ DATASET
# The training file is a ';'-separated CSV; only the id, label and
# latitude/longitude columns are used below.
df = pd.read_csv(TRAINSET_PATH, sep=';', header='infer', low_memory=False)
ids, labels, positions = (
    df[ID].to_numpy(),
    df[LABEL].to_numpy(),
    df[[LATITUDE, LONGITUDE]].to_numpy(),
)

# Hold out VAL_SIZE of the rows for validation. The three arrays are split
# together so they stay aligned, and the seed is fixed so the split is repeatable.
(
    y_train, y_val,
    pos_train, pos_val,
    ids_train, ids_val,
) = train_test_split(labels, positions, ids, test_size=VAL_SIZE, random_state=42)

# One patch extractor, loaded with all default rasters, is shared by every dataset.
extractor = PatchExtractor(RASTER_PATH, size=PATCH_SIZE, verbose=True)
extractor.add_all()

# Wrap each split in an EnvironmentalDataset backed by the shared extractor.
train_set = EnvironmentalDataset(y_train, pos_train, ids_train, patch_extractor=extractor)
validation_set = EnvironmentalDataset(y_val, pos_val, ids_val, patch_extractor=extractor)
# CONSTRUCT MODEL
# NOTE: no random_state is passed, so the fitted forest may differ between runs.
clf = RandomForestClassifier(
    n_estimators=N_TREES,
    max_depth=MAX_DEPTH,
    n_jobs=16,
)

# TRAINING
print('Training...')
features, targets = train_set.numpy()
clf.fit(features, targets)

# Persist the fitted model so the final evaluation can reload it from disk.
model_path = 'rf.skl'
print('Saving SKL model: ' + model_path)
joblib.dump(clf, model_path)
# VALIDATION
print('Validation: ')
val_inputs, val_labels = validation_set.numpy()
seen_class_probs = clf.predict_proba(val_inputs)

# fit() is not told the label set or the number of classes; it infers them from
# the training data. The label behind each predict_proba column is given by
# clf.classes_. With a random split, some species may appear only in the
# validation or test data and therefore have no column in the prediction.
# The probabilities are scattered into a zero matrix with one column per
# label (N_LABELS) so that every species is covered.
n_samples = seen_class_probs.shape[0]
full_probs = np.zeros((n_samples, N_LABELS))
full_probs[:, clf.classes_] = seen_class_probs

print(evaluate(full_probs, val_labels, METRICS))
# FINAL EVALUATION ON TEST SET
# Read the test file (same ';'-separated layout as the training file).
df = pd.read_csv(TESTSET_PATH, sep=';', header='infer', low_memory=False)
ids, labels, positions = (
    df[ID].to_numpy(),
    df[LABEL].to_numpy(),
    df[[LATITUDE, LONGITUDE]].to_numpy(),
)
test_set = EnvironmentalDataset(labels, positions, ids, patch_extractor=extractor)

# Reload the model saved after training.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
path = 'rf.skl'
print('Loading SKL model: ' + path)
clf = joblib.load(path)

# Predict, then expand the per-seen-class probabilities to one column per
# label (N_LABELS), using clf.classes_ as the column indices.
inputs, labels = test_set.numpy()
seen_class_probs = clf.predict_proba(inputs)
n_samples = seen_class_probs.shape[0]
full_probs = np.zeros((n_samples, N_LABELS))
full_probs[:, clf.classes_] = seen_class_probs

# evaluate
print('Final test:')
print(evaluate(full_probs, labels, METRICS, final=True))