-
Notifications
You must be signed in to change notification settings - Fork 3
/
format_data.py
30 lines (23 loc) · 1.29 KB
/
format_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import pandas # Works with data in table format
import glob # Lets you essentially do a wildcard filename search
import pickle # Lets you easily dump and load python objects to disk
### Data should be in a folder named 'Raw Data' inside the directory this script is run from
#
# Column labels for the headerless TSV exports.  Only "text" is consumed
# below; the other labels are the original author's best guesses.
_COLUMN_NAMES = ["id", "text", "user", "unknown feature 1",
                 "location", "time", "maybe latitude?",
                 "maybe longitude?", "unknown feature 2"]


def _read_tweet_texts(path):
    """Return the "text" column of one tab-separated file as a list.

    The file is parsed without a header row, using ``_COLUMN_NAMES`` as the
    column labels.  Lines that pandas considers malformed (more fields than
    expected) are skipped with a warning instead of aborting the read.
    Lines with too few fields are NOT skipped; pandas pads them with NaN.
    """
    read_options = dict(sep="\t", header=None, names=_COLUMN_NAMES)
    # Plain text mode already applies universal-newline handling; the old
    # 'U' flag was removed in Python 3.11.  The `with` block guarantees the
    # handle is closed even if parsing fails.
    with open(path, "r") as handle:
        try:
            frame = pandas.read_csv(handle, on_bad_lines="warn",
                                    **read_options)
        except TypeError:
            # pandas < 1.3 does not know `on_bad_lines`; use the legacy
            # keyword there (it was removed again in pandas 2.0).
            handle.seek(0)
            frame = pandas.read_csv(handle, error_bad_lines=False,
                                    **read_options)
    return frame["text"].tolist()


raw_data = []
# Sort the matches so the order of the pickled list is reproducible;
# glob.glob() returns files in arbitrary order.
for f in sorted(glob.glob('Raw Data/*.tsv')):
    # Grab the column containing an actual tweet and add it to the giant
    # list - not sure why there are multiple files...
    raw_data.extend(_read_tweet_texts(f))
# TODO: Figure out how many lines we are throwing away because of bad
# formatting - if we are throwing a lot away figure out why
with open('raw_text_data.pkl', 'wb') as pickle_file:
    pickle.dump(raw_data, pickle_file)
# Note from David: This is literally 8 lines of code - Python has killer
# libraries - try doing this much work in 8 lines of C++ code!