-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
102 lines (91 loc) · 3.35 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import collections
import io
import json
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
def getjsons(filename):
    """Load a JSON file and return the decoded Python object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If the content is not valid JSON.
    """
    # io.open accepts ``encoding`` on both Python 2 and 3; decoding explicitly
    # as UTF-8 keeps non-ASCII review text independent of the system locale.
    with io.open(filename, 'r', encoding='utf-8') as fp:
        return json.load(fp)
def getsentiment(df):
    """Label each row of ``df`` as "Positive" or "Negative" from its rating.

    Args:
        df: DataFrame with a 'Rating' column whose values are accepted by
            ``float()`` (numbers or numeric strings).

    Returns:
        A NumPy string array with one label per row, in row order. A rating
        of 3.0 or higher is "Positive"; anything lower is "Negative".

    Raises:
        ValueError/TypeError: If a rating cannot be converted with ``float()``.
    """
    labels = [
        "Positive" if float(rating) >= 3.0 else "Negative"
        for rating in df['Rating']
    ]
    # Explicit dtype so an empty frame still yields a string array instead of
    # NumPy's default float64 for an empty list.
    return np.array(labels, dtype=str)
def createdf(dictionary):
    """Build a review DataFrame from a mapping of review text to rating.

    Args:
        dictionary: Mapping whose keys become the 'Review' column and whose
            values become the 'Rating' column.

    Returns:
        DataFrame with columns 'Review', 'Rating' and 'Sentiment', where
        'Sentiment' is the label array returned by ``getsentiment``.
    """
    # On Python 3 ``items()`` is a view, which older pandas releases refuse
    # as constructor input; a concrete list of pairs works everywhere.
    rows = list(dictionary.items())
    df = pd.DataFrame(rows, columns=['Review', 'Rating'])
    df['Sentiment'] = getsentiment(df)
    return df
def mostcommon(df, name):
    """Print the 20 most frequent words found in a movie's reviews.

    Tokenisation and counting are delegated to scikit-learn's
    ``CountVectorizer`` with its default settings.

    Args:
        df: DataFrame with a 'Review' column holding the review texts.
        name: Display name of the movie, used in the printed heading.

    Returns:
        None. Output is a heading followed by one ``word : count`` line per
        word, in ascending order of count (ties ordered by word).
    """
    vectorizer = CountVectorizer()
    # A single fit_transform learns the vocabulary and builds the
    # document-term matrix; a separate fit() beforehand is redundant.
    dtm = vectorizer.fit_transform(df['Review'].values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()

    # Summing columns on the sparse matrix avoids materialising a dense
    # documents x vocabulary table just to compute per-word totals.
    totals = np.asarray(dtm.sum(axis=0)).ravel()
    ranked = sorted(zip(totals, names))

    # %-formatting inside print(...) produces the same text on Python 2 and 3.
    print("\nTop 20 words to describe %s" % name)
    for total, word in ranked[-20:]:
        print("%s : %s" % (word, total))
# (file stem, display title) for every movie. The order matters: the
# cross-validation folds below are cut from this list by position.
movies = [
    ("starwars", "Star Wars"),
    ("abouttime", "About Time"),
    ("taken", "Taken"),
    ("toystory", "Toy Story"),
    ("cloudatlas", "Cloud Atlas"),
    ("stepbrothers", "Stepbrothers"),
    ("saw", "Saw"),
    ("saw2", "Saw 2"),
    ("titanic", "Titanic"),
    ("piratesofthecaribbean", "Pirates of the Caribbean"),
]

# Three separate passes (load everything, build every frame, then report) so
# that a missing JSON file still fails before anything is printed.
raw_reviews = [getjsons("movies_data/%s.json" % stem) for stem, _ in movies]
dataframes = [createdf(reviews) for reviews in raw_reviews]
for (_, title), frame in zip(movies, dataframes):
    mostcommon(frame, title)

# Per-movie module-level names, kept for backward compatibility.
(starwars, abouttime, taken, toystory, cloudatlas,
 stepbrothers, saw, saw2, titanic, piratesofthecaribbean) = dataframes

# Five folds of two movies each. Fold 1 holds out the last pair of movies,
# fold 2 the pair before it, and so on; the remaining eight movies (in list
# order) form the matching training set.
fold_size = 2
train_groups = []
test_groups = []
for fold in range(len(dataframes) // fold_size):
    stop = len(dataframes) - fold * fold_size
    start = stop - fold_size
    test_groups.append(dataframes[start:stop])
    train_groups.append(dataframes[:start] + dataframes[stop:])

# Training lists for folds 2-4 under their previous module-level names.
t2, t3, t4 = train_groups[1:4]

train_sets = [pd.concat(group) for group in train_groups]
test_sets = [pd.concat(group) for group in test_groups]
train1, train2, train3, train4, train5 = train_sets
test1, test2, test3, test4, test5 = test_sets

# Written in the order train1, test1, train2, test2, ...
for number, (train, test) in enumerate(zip(train_sets, test_sets), 1):
    train.to_csv("csv_files/train%d.csv" % number)
    test.to_csv("csv_files/test%d.csv" % number)