# IOTweets.py
import pandas as pd
from collections import Counter
from datetime import datetime
from nltk.tokenize import TweetTokenizer
import csv
#----------------------------build_df(filepath, bitri)------------------------------------
# build a dataframe containing the columns "word" and "occurence" from a vocabulary file
# filepath : the path of the file in which the vocabulary is written
# bitri : True if the vocabulary contains bi-/tri-grams, False if it contains single words
def build_df(filepath, bitri):
    tknzr = TweetTokenizer(preserve_case=False)
    # load the vocabulary
    df = pd.read_table(filepath_or_buffer=filepath, encoding="utf-8", header=None, names=["word"])
    # remove blank-space words (pollution)
    df["len"] = df["word"].map(lambda x: len(tokenize(tknzr, x)))
    if bitri:
        df = df[df["len"] >= 2]
    else:
        df = df[df["len"] == 2]
    df = df.drop(labels=["len"], axis=1)
    # build the dataframe: the first token of each line is the count, the rest is the word (or n-gram)
    if bitri:
        df["occurence"] = df["word"].map(lambda x: tokenize(tknzr, x)[0])
        df["word"] = df["word"].map(lambda x: tuple(tokenize(tknzr, x)[1:]))
        return df
    else:
        df["occurence"] = df["word"].map(lambda x: tokenize(tknzr, x)[0])
        df["word"] = df["word"].map(lambda x: str(tokenize(tknzr, x)[1]))
        return df
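# Illustrative usage of build_df (sketch only; "vocab_cut.txt" is a hypothetical path whose
# lines are expected to look like "12 hello" or, for bitri=True, "12 hello world"):
#   unigram_df = build_df("vocab_cut.txt", bitri=False)
#   print(unigram_df.head())   # columns: "word" (str) and "occurence" (the count, kept as a string token)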
#----------------------------tokenize(tknzr, line)------------------------------------
# transforms a line into a list of its tokens
# tknzr : the tokenizer to be used to tokenize
# line : the line to be tokenized
def tokenize(tknzr, line):
    tokens = tknzr.tokenize(line)
    tokens = [tok for t in tokens for tok in t.split()]  # split at spaces and flatten
    return tokens
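# Illustrative usage of tokenize (sketch; the expected output is shown as a comment):
#   tknzr = TweetTokenizer(preserve_case=False)
#   tokenize(tknzr, "Hello World :) #nlp")   # -> ['hello', 'world', ':)', '#nlp'] (lowercased by preserve_case=False)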
#----------------------------import_(path)------------------------------------
# import the tweets stored in the file at the given path as a list of tweets (strings)
# path : the path of the file in which the tweets are written
def import_(path):
    with open(path, 'r', encoding="utf-8") as f:
        tweets = [line.strip() for line in f]
    return tweets
#----------------------------import_without_id(path)------------------------------------
# import the tweets stored in the file at the given path as a list of tweets (strings),
# dropping the leading id of each line
# path : the path of the file in which the tweets are written
def import_without_id(path):
    with open(path, 'r', encoding="utf-8") as f:
        tweets = [line.strip().split(",", 1)[-1] for line in f]  # drop the leading "<id>," prefix
    return tweets
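# Illustrative usage (sketch; the file names are hypothetical). import_ keeps each line as-is,
# while import_without_id drops a leading "<id>," prefix as in "123456,some tweet text":
#   tweets = import_("train_tweets.txt")
#   test_tweets = import_without_id("test_data.txt")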
#----------------------------export(tweets, name)------------------------------------
# export a list of tweets (strings) to a file with the given name
# tweets : the tweets to export (a list of strings)
# name : the name of the file in which the tweets should be written
def export(tweets, name):
    with open(name, 'w', encoding="utf-8") as f:
        f.write("\n".join(tweets))
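# Illustrative usage of export (sketch; the output path is hypothetical):
#   export(["first tweet", "second tweet"], "cleaned_tweets.txt")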
#----------------------------write_vocab_to_file(vocab_counter, dest_file_name)------------------------------------
# export the counts of a given counter to a file named dest_file_name
# vocab_counter : the counter that contains the words and their occurrences
# dest_file_name : the name of the file in which the vocabulary should be written
def write_vocab_to_file(vocab_counter, dest_file_name):
    with open(dest_file_name, "w", encoding="utf-8") as outputfile:
        for token, count in vocab_counter.most_common():
            outputfile.write(str(count))
            outputfile.write(" ")
            if type(token) is tuple:
                outputfile.write(" ".join(token))
            else:
                outputfile.write(str(token))
            outputfile.write("\n")
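# Illustrative usage of write_vocab_to_file (sketch; the output path is hypothetical).
# Each line of the resulting file is "<count> <token>"; tuple tokens are joined with spaces:
#   counter = Counter({"hello": 3, ("hello", "world"): 2})
#   write_vocab_to_file(counter, "vocab_example.txt")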
#----------------------------write_vocab(tweets, cut_threshold, file_name, bitri)------------------------------------
# build the vocabulary counter of the given tweets and write it to a file
# (relies on build_vocab_counter, which is expected to be defined or imported elsewhere in the project)
# tweets : the tweets from which the vocabulary is built
# cut_threshold : cut threshold forwarded to build_vocab_counter
# file_name : base name of the file in which the vocabulary should be written
# bitri : whether bi-/tri-grams are counted instead of single words
def write_vocab(tweets, cut_threshold, file_name, bitri):
    counter = build_vocab_counter(tweets, cut_threshold, bitri)
    write_vocab_to_file(counter, (file_name + "_cut=" + str(cut_threshold) + "_bitri=" + str(bitri)))
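# Illustrative usage of write_vocab (sketch; assumes build_vocab_counter is available and the
# threshold/file name below are hypothetical). The output file name is suffixed with the cut
# threshold and the bitri flag, e.g. "vocab_cut=5_bitri=False":
#   write_vocab(tweets, cut_threshold=5, file_name="vocab", bitri=False)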
#----------------------------write_relevance_to_file(relevant, dest_file_name)------------------------------------
# relevant : the dataframe of relevance that will be written
# dest_file_name : the name of the file in which the relevance should be written
def write_relevance_to_file(relevant, dest_file_name):
    with open(dest_file_name, "w", encoding="utf-8") as outputfile:
        for index, row in relevant.iterrows():
            outputfile.write(str(row["ratio"]))
            outputfile.write("\t")
            if type(row["word"]) is tuple:
                outputfile.write(" ".join(row["word"]))
            else:
                outputfile.write(str(row["word"]))
            outputfile.write("\n")
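# Illustrative usage of write_relevance_to_file (sketch; the path is hypothetical). The dataframe
# is expected to carry "ratio" and "word" columns; each output line is "<ratio>\t<word>":
#   relevant = pd.DataFrame({"ratio": [0.8, 0.3], "word": ["good", ("not", "good")]})
#   write_relevance_to_file(relevant, "relevance_example.txt")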
#----------------------------write_index_to_file(vocab, dest_file_name)------------------------------------
# vocab : the vocabulary dataframe (with columns "index" and "word") to be written
# dest_file_name : the name of the file in which the index should be written
def write_index_to_file(vocab, dest_file_name):
    with open(dest_file_name, "w", encoding="utf-8") as outputfile:
        for index, row in vocab.iterrows():
            outputfile.write(str(row["index"]))
            outputfile.write("\t")
            if type(row["word"]) is tuple:
                outputfile.write(" ".join(row["word"]))
            else:
                outputfile.write(str(row["word"]))
            outputfile.write("\n")
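# Illustrative usage of write_index_to_file (sketch; the path is hypothetical). The dataframe
# is expected to carry "index" and "word" columns:
#   vocab = pd.DataFrame({"index": [0, 1], "word": ["hello", ("hello", "world")]})
#   write_index_to_file(vocab, "index_example.txt")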
#----------------------------extract_relevant(relevant_filename)------------------------------------
# returns the relevance extracted from a file, as a dictionary mapping tokens (tuples) to ratios
# relevant_filename : the name of the file that contains the relevance
def extract_relevant(relevant_filename):
    with open(relevant_filename, 'r', encoding="utf-8") as f:
        relevance = {}
        for line in f:
            split = line.split()
            token = tuple(split[1:])
            relevance[token] = split[0]
    return relevance
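# Illustrative usage of extract_relevant (sketch; the path is hypothetical). Tokens come back
# as tuples (single words become 1-tuples) and the ratios are kept as strings:
#   relevance = extract_relevant("relevance_example.txt")
#   # e.g. {("good",): "0.8", ("not", "good"): "0.3"}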
#----------------------------extract_index(relevant_filename)------------------------------------
# returns a dictionary mapping each token (as a tuple) to its line index in the file
# relevant_filename : the name of the file that contains the vocabulary/relevance
def extract_index(relevant_filename):
    with open(relevant_filename, 'r', encoding="utf-8") as f:
        relevance = {}
        for index, line in enumerate(f):
            split = line.split()
            token = tuple(split[1:])
            relevance[token] = index
    return relevance
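#----------------------------illustrative round-trip demo------------------------------------
# Minimal end-to-end sketch tying the helpers above together. It is illustrative only: it runs
# entirely on temporary files (no real project paths are assumed) and needs nothing beyond the
# imports at the top of this file plus the standard library. Run the module directly to try it.
if __name__ == "__main__":
    import os
    import tempfile

    demo_tweets = ["hello world :)", "hello again world"]
    tmpdir = tempfile.mkdtemp()

    # export the toy tweets, then re-import them
    tweets_path = os.path.join(tmpdir, "tweets.txt")
    export(demo_tweets, tweets_path)
    reread = import_(tweets_path)

    # count unigrams with the same tokenizer settings used elsewhere in this file
    tknzr = TweetTokenizer(preserve_case=False)
    counter = Counter(tok for tweet in reread for tok in tokenize(tknzr, tweet))

    # write the vocabulary to a file and read back a token -> line-index mapping
    vocab_path = os.path.join(tmpdir, "vocab.txt")
    write_vocab_to_file(counter, vocab_path)
    print(extract_index(vocab_path))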