# -*- coding: utf-8 -*-
"""DNNClassifier.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jqNYPuEMMv6TgDlwcj4doNJH5p1e7V7Z
A program that runs TensorFlow's DNNClassifier through the Estimator API, building the feature
columns with TensorFlow's feature_column module and feeding the data with the tf.data Dataset API.
"""
# import necessary packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import itertools
import numpy as np
import requests
import os
import gzip
import shutil
# set pandas' maximum number of rows to display
pd.set_option('display.max_rows', 1000)
# set pandas' maximum number of columns to display
pd.set_option('display.max_columns', 50)
# download the dataset and extract it
f = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz")
with open('g.gz', 'wb') as fs:
    for chunks in f.iter_content(chunk_size=128):
        fs.write(chunks)
with gzip.open("g.gz", 'rb') as f_in:
    with open('covtype.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
#!gunzip covtype.data.gz
#!mv covtype.data covtype.csv
# read the dataset; the raw covtype file has no header row, so let pandas assign integer column labels
df = pd.read_csv('covtype.csv', header=None)
# keep only the needed columns, selected by position
cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 54]
df = df[df.columns[cols]]
# assign new column names to the extracted dataset
column_names = ['elevation', 'aspect', 'slope', 'hd_to_hydrology', 'vd_to_hydrology', 'hd_to_roadways',
                'hillshade_9am', 'hillshade_noon', 'hillshade_3pm', 'hd_to_firepoints', 'cover_type']
df.columns = column_names
# shuffle the data and reset the index (reassign, otherwise the shuffled frame is discarded)
df = df.sample(frac=1).reset_index(drop=True)
# print the first 5 rows of the dataframe
print(df.head())
# get the information regarding the dataframe
print(df.info())
# get the statistics of the data
print(df.describe().transpose())
# plot the boxplot using seaborn
sns.set(style='ticks')
sns.boxplot(data=df)
plt.show()
# plot the pairplot of the dataframe
sns.pairplot(data=df)
plt.show()
# compute the correlation among the features in the dataset
corr_data = df.corr(method='pearson')
print(corr_data)
# plot the correlation matrix data using matplotlib
plt.matshow(corr_data)
plt.xticks(range(len(corr_data.columns)), corr_data.columns)
plt.yticks(range(len(corr_data.columns)), corr_data.columns)
plt.colorbar()
plt.show()
# define the columns to be scaled
df_subcategories = df.loc[:, df.columns.isin(['aspect', 'slope', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm'])]
# instantiate the MinMaxScaler for the independent variables
x_scaler = MinMaxScaler()
# fit and transform the data
df_scaled = x_scaler.fit_transform(df_subcategories)
df.loc[:, df.columns.isin(['aspect', 'slope', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm'])] = df_scaled
print(df.head())
# define the various feature column names
continuous_columns = ['aspect', 'slope', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm']
bucketized_columns = ['elevation', 'hd_to_hydrology', 'vd_to_hydrology', 'hd_to_roadways', 'hd_to_firepoints']
crossed_columns = [('hd_to_hydrology', 'vd_to_hydrology'), ('hillshade_9am', 'hillshade_noon', 'hillshade_3pm')]
# convert the continuous columns into numeric feature columns
continuous_cols = [tf.feature_column.numeric_column(c) for c in continuous_columns]
print(continuous_cols)
# to bucketize a column, first convert it into a numeric feature column and then into a bucketized feature column, with the bucket edges set via the 'boundaries' parameter
bucketized_cols = [tf.feature_column.numeric_column(c) for c in bucketized_columns]
elevation_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[0], boundaries=[i for i in range(1800, 4000, 100)])
print(elevation_buckt_cols)
hd_to_hydrology_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[1], boundaries=[i for i in range(0, 1500, 100)])
print(hd_to_hydrology_buckt_cols)
vd_to_hydrology_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[2], boundaries=[i for i in range(-180, 700, 100)])
print(vd_to_hydrology_buckt_cols)
hd_to_roadways_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[3], boundaries=[i for i in range(0, 8000, 100)])
print(hd_to_roadways_buckt_cols)
hd_to_firepoints_buckt_cols = tf.feature_column.bucketized_column(source_column=bucketized_cols[4], boundaries=[i for i in range(0, 8000, 100)])
print(hd_to_firepoints_buckt_cols)
buckt_cols = [elevation_buckt_cols, hd_to_hydrology_buckt_cols, vd_to_hydrology_buckt_cols, hd_to_roadways_buckt_cols, hd_to_firepoints_buckt_cols]
# convert the columns to crossed feature columns
hydrology_crossed_cols = tf.feature_column.crossed_column([hd_to_hydrology_buckt_cols, vd_to_hydrology_buckt_cols], hash_bucket_size=1000)
print(hydrology_crossed_cols)
# crossing requires categorical (string/integer) inputs, so bucketize the scaled hillshade columns before crossing them
hillshade_buckt_cols = [tf.feature_column.bucketized_column(source_column=c, boundaries=list(np.linspace(0.1, 0.9, 9))) for c in continuous_cols[2:]]
hillshade_crossed_cols = tf.feature_column.crossed_column(hillshade_buckt_cols, hash_bucket_size=500)
print(hillshade_crossed_cols)
crossed_cols = [hydrology_crossed_cols, hillshade_crossed_cols]
# a DNN cannot consume sparse crossed columns directly, so wrap them in indicator columns
crossed_indicator_cols = [tf.feature_column.indicator_column(c) for c in crossed_cols]
# check if any null values are present in the dataset
print(df.isnull().sum())
# instantiate the estimator DNNClassifier with the feature columns defined earlier, the number of neurons in each hidden layer (given as a list) and the number of classes
# (cover_type labels run from 1 to 7, so n_classes=8 keeps the raw labels in the valid range)
estimator = tf.estimator.DNNClassifier(feature_columns=continuous_cols + buckt_cols + crossed_indicator_cols,
                                       hidden_units=[2056, 1024, 512, 256, 128], n_classes=8)
# split the dataset into train and test sets
train_data = df.sample(frac=0.8, random_state=101)
test_data = df.drop(train_data.index)
print(train_data.shape, test_data.shape)
print(df.head())
# write the dataframes to csv files to be read by the tensorflow dataset pipeline
train_data.to_csv('train_data.csv', index=None, header=True)
test_data.to_csv('test_data.csv', index=None, header=True)
trains = 'train_data.csv'
tests = 'test_data.csv'
# define the per-column default values: 0.0 floats for the features and an integer 0 for the label
record_default = [[0.0] for i in range(len(column_names) - 1)]
record_default.append([0])
# build the input function to be used for training, evaluation and prediction
def input_fn(csv_data, batch_size, num_epochs=None):
    # define a parser function to be applied to each csv line later
    def parse_csv(value):
        # decode the csv line using tf.decode_csv
        columns = tf.decode_csv(value, record_defaults=record_default)
        # get the features and the label
        features = dict(zip(column_names, columns))
        labels = features.pop('cover_type')
        return features, labels
    # use tensorflow's TextLineDataset to read the csv file
    dataset = tf.data.TextLineDataset(csv_data)
    # skip the header
    dataset = dataset.skip(1)
    # map the dataset using the parser function defined above
    dataset = dataset.map(parse_csv)
    # repeat the dataset for num_epochs (indefinitely if None) to keep feeding the model
    dataset = dataset.repeat(num_epochs)
    # define the batch size of the dataset
    dataset = dataset.batch(batch_size)
    # create an iterator to iterate over the dataset
    iterator = dataset.make_one_shot_iterator()
    # get the next batch of data from the dataset
    features, labels = iterator.get_next()
    return features, labels
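# (Not part of the original pipeline.) A small, optional sanity check of the input pipeline,
# assuming the same TF 1.x graph/session API used throughout this script: pull a single batch
# from input_fn and print the feature shapes and the labels before training.
check_feats, check_labels = input_fn(trains, batch_size=4, num_epochs=1)
with tf.Session() as sess:
    f_batch, l_batch = sess.run([check_feats, check_labels])
    print({k: v.shape for k, v in f_batch.items()}, l_batch)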
# train the estimator model
estimator.train(steps=1000, input_fn=lambda: input_fn(trains, batch_size=128, num_epochs=100))
# evaluate the estimator model
ev = estimator.evaluate(steps=None, input_fn=lambda: input_fn(tests, batch_size=128, num_epochs=1))
# get the evaluation loss
loss = ev['loss']
print('Loss: ', loss)
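# The input function above is meant for training, evaluation and prediction, but no prediction
# step is shown; the sketch below is one possible way to get class predictions from the trained
# estimator. Re-using the test csv and a batch size of 128 here are illustrative choices, not
# part of the original script.
predictions = estimator.predict(input_fn=lambda: input_fn(tests, batch_size=128, num_epochs=1))
for pred in itertools.islice(predictions, 5):
    # each prediction is a dict; 'class_ids' holds the predicted cover type
    print(pred['class_ids'])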