# -*- coding: utf-8 -*-
"""DNNClassifier.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jqNYPuEMMv6TgDlwcj4doNJH5p1e7V7Z
A program that runs TensorFlow's DNNClassifier through the Estimator API, building the feature
columns with TensorFlow's feature_column module and feeding the data with the tf.data Dataset API.
"""
# import necessary packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import itertools
import numpy as np
import requests
import os
import gzip
import shutil
# set pandas' maximum number of rows to display
pd.set_option('display.max_rows', 1000)
# set pandas' maximum number of columns to display
pd.set_option('display.max_columns', 50)
# download the dataset and extract it
f = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz")
with open('g.gz', 'wb') as fs:
    for chunks in f.iter_content(chunk_size=128):
        fs.write(chunks)
with gzip.open("g.gz", 'rb') as f_in:
    with open('covtype.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
#!gunzip covtype.data.gz
#!mv covtype.data covtype.csv
# read the dataset; the raw covtype file has no header row, so let pandas assign integer column labels
df = pd.read_csv('covtype.csv', header=None)
# keep only the needed columns, selected by position
cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 54]
df = df[df.columns[cols]]
# assign new column names to the extracted dataset
column_names = ['elevation', 'aspect', 'slope', 'hd_to_hydrology', 'vd_to_hydrology', 'hd_to_roadways',
                'hillshade_9am', 'hillshade_noon', 'hillshade_3pm', 'hd_to_firepoints', 'cover_type']
df.columns = column_names
# shuffle the data and reset the index (reassign, otherwise the shuffled frame is discarded)
df = df.sample(frac=1).reset_index(drop=True)
# print the first 5 rows of the dataframe
print(df.head())
# get the information regarding the dataframe
print(df.info())
# get the statistics of the data
print(df.describe().transpose())
# plot the boxplot using seaborn
sns.set(style='ticks')
sns.boxplot(data=df)
plt.show()
# plot the pairplot of the dataframe
sns.pairplot(data=df)
plt.show()
# compute the correlation among the features in the dataset
corr_data = df.corr(method='pearson')
print(corr_data)
# plot the correlation matrix data using matplotlib
plt.matshow(corr_data)
plt.xticks(range(len(corr_data.columns)), corr_data.columns)
plt.yticks(range(len(corr_data.columns)), corr_data.columns)
plt.colorbar()
plt.show()
# define the columns to be scaled
df_subcategories = df.loc[:, df.columns.isin(['aspect', 'slope', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm'])]
# instantiate the MinMaxScaler for the independent variables
x_scaler = MinMaxScaler()
# fit and transform the data
df_scaled = x_scaler.fit_transform(df_subcategories)
df.loc[:, df.columns.isin(['aspect', 'slope', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm'])] = df_scaled
print(df.head())
# define the various feature column names
continuous_columns = ['aspect', 'slope', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm']
bucketized_columns = ['elevation', 'hd_to_hydrology', 'vd_to_hydrology', 'hd_to_roadways', 'hd_to_firepoints']
crossed_columns = [('hd_to_hydrology', 'vd_to_hydrology'), ('hillshade_9am', 'hillshade_noon', 'hillshade_3pm')]
# convert the continuous columns into numeric feature columns
continuous_cols = [tf.feature_column.numeric_column(c) for c in continuous_columns]
print(continuous_cols)
# to bucketize a column, first convert it into a numeric feature column and then into a bucketized feature column, with the bucket edges set via the 'boundaries' parameter
bucketized_cols = [tf.feature_column.numeric_column(c) for c in bucketized_columns]
elevation_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[0], boundaries=[i for i in range(1800, 4000, 100)])
print(elevation_buckt_cols)
hd_to_hydrology_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[1], boundaries=[i for i in range(0, 1500, 100)])
print(hd_to_hydrology_buckt_cols)
vd_to_hydrology_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[2], boundaries=[i for i in range(-180, 700, 100)])
print(vd_to_hydrology_buckt_cols)
hd_to_roadways_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[3], boundaries=[i for i in range(0, 8000, 100)])
print(hd_to_roadways_buckt_cols)
hd_to_firepoints_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[4], boundaries=[i for i in range(0, 8000, 100)])
print(hd_to_firepoints_buckt_cols)
buckt_cols = [elevation_buckt_cols, hd_to_hydrology_buckt_cols, vd_to_hydrology_buckt_cols, hd_to_roadways_buckt_cols, hd_to_firepoints_buckt_cols]
# convert the columns to crossed feature columns
hydrology_crossed_cols = tf.feature_column.crossed_column([hd_to_hydrology_buckt_cols, vd_to_hydrology_buckt_cols], hash_bucket_size=1000)
print(hydrology_crossed_cols)
# crossing requires categorical (string/integer) inputs, so bucketize the scaled hillshade columns before crossing them
hillshade_buckt_cols = [tf.feature_column.bucketized_column(source_column=c, boundaries=list(np.linspace(0.1, 0.9, 9))) for c in continuous_cols[2:]]
hillshade_crossed_cols = tf.feature_column.crossed_column(hillshade_buckt_cols, hash_bucket_size=500)
print(hillshade_crossed_cols)
crossed_cols = [hydrology_crossed_cols, hillshade_crossed_cols]
# a DNN cannot consume sparse crossed columns directly, so wrap them in indicator columns
crossed_indicator_cols = [tf.feature_column.indicator_column(c) for c in crossed_cols]
# check if any null values are present in the dataset
print(df.isnull().sum())
# instantiate the estimator DNNClassifier with the feature columns defined earlier, the number of neurons in each hidden layer (given as a list) and the number of classes
# (cover_type labels run from 1 to 7, so n_classes=8 keeps the raw labels in the valid range)
estimator = tf.estimator.DNNClassifier(feature_columns=continuous_cols + buckt_cols + crossed_indicator_cols,
                                       hidden_units=[2056, 1024, 512, 256, 128], n_classes=8)
# split the dataset into train and test sets
train_data = df.sample(frac=0.8, random_state=101)
test_data = df.drop(train_data.index)
print(train_data.shape, test_data.shape)
print(df.head())
# write the dataframes to csv files to be read by the tensorflow dataset pipeline
train_data.to_csv('train_data.csv', index=None, header=True)
test_data.to_csv('test_data.csv', index=None, header=True)
trains = 'train_data.csv'
tests = 'test_data.csv'
# define the per-column default values: 0.0 floats for the features and an integer 0 for the label
record_default = [[0.0] for i in range(len(column_names) - 1)]
record_default.append([0])
# build the input function to be used for training, evaluation and prediction
def input_fn(csv_data, batch_size, num_epochs=None):
    # define a parser function to be applied to each csv line later
    def parse_csv(value):
        # decode the csv line using tf.decode_csv
        columns = tf.decode_csv(value, record_defaults=record_default)
        # get the features and the label
        features = dict(zip(column_names, columns))
        labels = features.pop('cover_type')
        return features, labels
    # use tensorflow's TextLineDataset to read the csv file
    dataset = tf.data.TextLineDataset(csv_data)
    # skip the header
    dataset = dataset.skip(1)
    # map the dataset using the parser function defined above
    dataset = dataset.map(parse_csv)
    # repeat the dataset for num_epochs (indefinitely if None) to keep feeding the model
    dataset = dataset.repeat(num_epochs)
    # define the batch size of the dataset
    dataset = dataset.batch(batch_size)
    # create an iterator to iterate over the dataset
    iterator = dataset.make_one_shot_iterator()
    # get the next batch of data from the dataset
    features, labels = iterator.get_next()
    return features, labels
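# (Not part of the original pipeline.) A small, optional sanity check of the input pipeline,
# assuming the same TF 1.x graph/session API used throughout this script: pull a single batch
# from input_fn and print the feature shapes and the labels before training.
check_feats, check_labels = input_fn(trains, batch_size=4, num_epochs=1)
with tf.Session() as sess:
    f_batch, l_batch = sess.run([check_feats, check_labels])
    print({k: v.shape for k, v in f_batch.items()}, l_batch)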
# train the estimator model
estimator.train(steps=1000, input_fn=lambda: input_fn(trains, batch_size=128, num_epochs=100))
# evaluate the estimator model
ev = estimator.evaluate(steps=None, input_fn=lambda: input_fn(tests, batch_size=128, num_epochs=1))
# get the evaluation loss
loss = ev['loss']
print('Loss: ', loss)
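# The input function above is meant for training, evaluation and prediction, but no prediction
# step is shown; the sketch below is one possible way to get class predictions from the trained
# estimator. Re-using the test csv and a batch size of 128 here are illustrative choices, not
# part of the original script.
predictions = estimator.predict(input_fn=lambda: input_fn(tests, batch_size=128, num_epochs=1))
for pred in itertools.islice(predictions, 5):
    # each prediction is a dict; 'class_ids' holds the predicted cover type
    print(pred['class_ids'])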