-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathml_imdb.py
83 lines (61 loc) · 2.98 KB
/
ml_imdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
from matplotlib import pyplot
import os
# Switch the working directory to the scraping folder and load the movie table
# from the CSV stored there.
os.chdir('/Desktop/web_scraping/')
data = pd.read_csv('final_movie_details.csv')

# Exploratory scatter plots, one figure per (x, y) column pair:
# first 'imdb' against 'metascore', then 'metascore' against 'votes'.
for x_col, y_col in (('imdb', 'metascore'), ('metascore', 'votes')):
    pyplot.scatter(data[x_col], data[y_col])
    pyplot.show()
# The scatter plot above suggests a roughly linear relationship between the
# 'imdb' and 'metascore' columns, so try a simple linear regression on them.
## ML model: predict 'imdb' from 'metascore' alone
X = data.loc[:, 'metascore'].values
y = data.loc[:, 'imdb'].values

# Splitting the dataset into the Training set and Test set.
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. Fall back to the old module only so
# that very old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Hold out 33% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# scikit-learn estimators expect a 2-D feature matrix, so the 1-D column is
# reshaped to (n_samples, 1) before fitting and predicting.
regressor.fit(X_train.reshape(-1, 1), y_train.reshape(-1, 1))

# Predict the test results: y_pred holds the model's 'imdb' estimates for the
# held-out rows and can be compared against the actual values in y_test.
y_pred = regressor.predict(X_test.reshape(-1, 1))

# Visualising the Training set results: red points are the actual
# (metascore, imdb) pairs, the blue line is the fitted regression line.
# Points close to the line are the ones the model predicts accurately.
pyplot.scatter(X_train, y_train, color='red')
pyplot.plot(X_train, regressor.predict(X_train.reshape(-1, 1)), color='blue')
pyplot.title('IMDB V/S METASCORE (Training set)')
pyplot.xlabel('Metascore')
pyplot.ylabel('IMDB')
pyplot.show()

# Visualising the Test set results. The blue line is still drawn from the
# training-set predictions: it is the same fitted line either way.
pyplot.scatter(X_test, y_test, color='red')
pyplot.plot(X_train, regressor.predict(X_train.reshape(-1, 1)), color='blue')
pyplot.title('IMDB V/S METASCORE (Test set)')
pyplot.xlabel('Metascore')
pyplot.ylabel('IMDB')
pyplot.show()

from sklearn.metrics import mean_squared_error
# Print the test-set mean squared error; as a bare expression it would be
# silently discarded when the file is run as a script.
mse_metascore = mean_squared_error(y_test, y_pred)
print('Test MSE (metascore only): {}'.format(mse_metascore))
# Value recorded by the original author: 0.18041462828221905
# The author read this as a good score, i.e. metascore has a fairly good
# linear relation with imdb.
## Second model: predict 'imdb' from two features, 'metascore' and 'votes'
X1 = data.loc[:, ['metascore', 'votes']].values
y1 = data.loc[:, 'imdb'].values

# Splitting the dataset into the Training set and Test set.
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. Fall back to the old module only so
# that very old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Hold out 33% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.33, random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# X_train is already 2-D here (one column per feature), so no reshape is needed.
regressor.fit(X_train, y_train)

# Predict the test results for the held-out rows.
y_pred = regressor.predict(X_test)

from sklearn.metrics import mean_squared_error
# Print the test-set mean squared error; as a bare expression it would be
# silently discarded when the file is run as a script.
mse_metascore_votes = mean_squared_error(y_test, y_pred)
print('Test MSE (metascore + votes): {}'.format(mse_metascore_votes))
# Value recorded by the original author: 0.15729132122310804 (good score)
# Count how many rows share each distinct value of the 'movie duration' column.
dur = data.loc[:, 'movie duration'].value_counts()

# Background reading on multi-label classification:
#https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/
#https://www.analyticsvidhya.com/blog/2019/04/predicting-movie-genres-nlp-multi-label-classification/
### TODO: build a multi-label text classifier, since a movie description can
### carry several tags at once (e.g. action plus comedy).