-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_model.py
115 lines (87 loc) · 3.68 KB
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# for data
import numpy as np
import pandas as pd
# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# for training testing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# for saving the regression model to a file for reusability
from joblib import Parallel, delayed
import joblib
import os.path
# create the directory if it does not already exist
def check_folder_exists(path_to_folder):
    """Create the directory ``path_to_folder`` if it is not already present.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    The path is used exactly as given (relative paths resolve against the
    current working directory), so the directory that is created is always
    the one the caller asked for, including when the path is absolute.

    Args:
        path_to_folder: Relative or absolute path of the directory to ensure.

    Raises:
        FileExistsError: If ``path_to_folder`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which was racy and
    # tested a different path than the one that was then created.
    os.makedirs(path_to_folder, exist_ok=True)
def main():
    """Train a random-forest regressor on the gold price data and save it.

    Reads ``gld_price_data.csv`` from the current working directory, prints an
    overview of the data, writes a correlation heatmap and a ``GLD``
    distribution plot into ``media_plots/``, fits a ``RandomForestRegressor``
    on an 80/20 train/test split, and stores the fitted model together with
    the held-out test split in ``pickle_files/``.
    """
    # One seed shared by the split and the forest so runs are reproducible.
    random_state_value = 1
    gold_data = pd.read_csv("gld_price_data.csv")

    check_folder_exists("pickle_files")
    check_folder_exists("media_plots")

    # get an overview of the csv data
    # print the first and last five rows of data
    print(gold_data.head(), end="\n\n")
    print(gold_data.tail(), end="\n\n")
    print("Number of columns and rows -", gold_data.shape, end="\n\n")
    print("Column name, type, null count and data type - ", end="\n\n")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None").
    gold_data.info()
    print()

    # check number of missing values
    missing_values = gold_data.isnull().sum()
    print("Missing values -")
    print(missing_values, end="\n\n")

    # statistical measures of data
    print("Statiscal measures of data-")
    print(gold_data.describe(), end="\n\n")

    gold_data = gold_data.drop(["Date"], axis=1)  # date is irrelevant

    # check what values correlate with each other
    correlation = gold_data.corr()
    print("Corelation of all values with eachother")
    print(correlation, end="\n\n")

    # visualize this correlation
    sns.heatmap(
        correlation,
        cbar=True,
        square=True,
        fmt=".1f",
        annot=True,
        annot_kws={"size": 8},
        cmap="rocket",
    )
    plt.savefig("media_plots/correlation_heatmap.png", bbox_inches="tight")
    # Release the figure once it is on disk; pyplot keeps figures alive until
    # they are explicitly closed.
    plt.close()

    # correlation values for gold
    print("Correlation of gold to the rest of the values -")
    print(correlation["GLD"], end="\n\n")

    # how gold prices are distributed
    sns.displot(gold_data["GLD"], color="gold")
    plt.savefig("media_plots/distribution_plot.png", bbox_inches="tight")
    plt.close()

    # Features are every remaining column except the target; target is GLD.
    X = gold_data.drop(["GLD"], axis=1)
    Y = gold_data["GLD"]
    print("datatype of non-gold values -", type(X))
    print("datatype of gold values -", type(Y))
    print("Value of non gold entites - ")
    print(X, end="\n\n")  # silver, uso , sp500 , usd/eur
    print("Value of gold entites - ")
    print(Y, end="\n\n")  # gold column

    # split data into training and testing data - test_size=0.2 gives an
    # 80-20 split
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=random_state_value
    )

    # check count of training and testing data, i.e. that the split is 80-20
    print("size of training data for non-gold values -", X_train.shape[0])
    print("size of testing data for non-gold values -", X_test.shape[0])
    print("size of training data for gold values -", Y_train.shape[0])
    # Y_test holds the gold target, so it is labelled as gold values.
    print("size of testing data for gold values -", Y_test.shape[0], end="\n\n")

    # model training - n_estimators is the number of decision trees
    regressor = RandomForestRegressor(
        n_estimators=100, random_state=random_state_value
    )
    regressor.fit(X_train, Y_train)

    # save the fitted model and the held-out test split as pickle files
    joblib.dump(regressor, "pickle_files/regressor.pkl")
    X_test.to_pickle("pickle_files/X_test.pkl")
    Y_test.to_pickle("pickle_files/Y_test.pkl")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()