-
Notifications
You must be signed in to change notification settings - Fork 0
/
making_models.py
31 lines (18 loc) · 914 Bytes
/
making_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
def combineFeatures(row):
    """Build the combined text string for one movie row.

    Joins the row's ``genres``, ``director``, ``keywords`` and ``cast``
    values with single spaces and appends ``popularity`` converted with
    ``str``.

    Args:
        row: Any object indexable by the five column names above (for
            example a DataFrame row or a plain dict). The four text
            fields must already be strings; a non-string value raises
            ``TypeError``.

    Returns:
        The space-separated combined feature string.
    """
    parts = [
        row['genres'],
        row['director'],
        row['keywords'],
        row['cast'],
        # popularity may be numeric, so it is the one field that is coerced.
        str(row['popularity']),
    ]
    return " ".join(parts)
# Load the movie metadata; the path is resolved against the current
# working directory.
data = pd.read_csv('movie_dataset.csv')

# Columns that are merged into one text field per movie.
features = ['genres', 'director', 'keywords', 'cast', 'popularity']
for f in features:
    # Replace missing values with '' so combineFeatures can concatenate
    # the fields without hitting a non-string value.
    data[f] = data[f].fillna('')

# axis=1 applies combineFeatures to each row rather than each column.
data['combinedFeatures'] = data.apply(combineFeatures, axis=1)

# TF-IDF vectorization of the combined feature text (not just genres),
# with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combinedFeatures'])

# Cosine similarity of the TF-IDF matrix against itself, i.e. every row
# compared with every other row.
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Persist the similarity matrix so it can be loaded without recomputing.
with open('movie_similarity.pkl', 'wb') as file:
    pickle.dump(cosine_similarities, file)