import tweepy
import json
from string import punctuation
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import six
from google.cloud import translate_v2 as translate


class Tweets:
"""The Tweets class represents a collection of tweets comprised of the id and the tweet text content.
Various tools for processing are provided in the class to remove unwanted text features and translation.
Version 2
Attributes
----------
api : Tweepy API
This is the Tweepy API object used by the class to retrieve tweets by their id.
translate_client : Google Cloud API
This is the Google Cloud API object used by the class to translate tweet text to english.
tweets : {id: text}
A dict containing the tweets
stopwords : {language: set(nltk.stopwords.words(language))}
A dict of sets comprising of the nltk stopwords for each language that will be analyzed in this project
punctuation : str
A string of punctuations
"""

    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret, path_to_service_account):
        """Initialises the Tweets object with credentials for the Twitter and Google Cloud APIs, as well as
        the stopword sets for the chosen languages.
        :param consumer_key: Tweepy consumer key
        :param consumer_secret: Tweepy consumer secret
        :param access_token: Tweepy access token
        :param access_token_secret: Tweepy access token secret
        :param path_to_service_account: Path to Google API service_account.json
        """
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
self.api = tweepy.API(auth)
self.translate_client = translate.Client.from_service_account_json(path_to_service_account)
self.tweets = {}
        # dictionary of sets: set membership tests are O(1), which keeps stopword filtering fast
self.stopwords = {
"en": set(stopwords.words('english')),
"fr": set(stopwords.words('french')),
"de": set(stopwords.words('german')),
"es": set(stopwords.words('spanish')),
"nl": set(stopwords.words('dutch')),
"it": set(stopwords.words('italian'))
}
self.punctuation = punctuation + '΄´’…“”–—―»«'
        # Tokenizer: lowercases, shortens elongated words and removes handles.
        # reduce_len shortens character runs, for example "waaaaaayyyy" becomes "waaayyy".
        self.tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
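
    # Note: the stopword sets above require the NLTK stopwords corpus to be available.
    # If it is missing, a one-off download is needed (assumed setup step):
    #
    #   import nltk
    #   nltk.download('stopwords')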

    def reset(self):
        """Resets the tweets dict to empty.
        """
        self.tweets = {}

    def addTweets(self, ids: list):
        """Adds the tweets with the given ids to the tweets dict.
        :param ids: list of Twitter tweet ids
        """
        tweets = self.api.statuses_lookup(ids)
        for t in tweets:
            self.tweets[str(t.id)] = t.text
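
    # Note: Twitter's statuses/lookup endpoint returns at most 100 tweets per call,
    # so statuses_lookup is best fed ids in batches of 100 or fewer. A minimal sketch
    # of a batched variant (hypothetical helper, not part of the original class):
    #
    #   def addTweetsBatched(self, ids: list, batch_size: int = 100):
    #       for start in range(0, len(ids), batch_size):
    #           self.addTweets(ids[start:start + batch_size])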

    def removeTweets(self, ids: list):
        """Removes the tweets with the given ids from the tweets dict.
        :param ids: list of Twitter tweet ids
        """
        for tweet_id in ids:
            if str(tweet_id) in self.tweets:
                self.tweets.pop(str(tweet_id))

    def saveJSON(self, file_name):
        """Saves the tweets dictionary to a JSON file.
        :param file_name: file name/directory for saving
        """
        if not file_name.endswith('.json'):
            file_name += '.json'
        to_save = {}
        for i, tweet in enumerate(self):
            to_save[i] = {'id': tweet, 'text': self[tweet]}
        with open(file_name, 'w+') as f:
            json.dump(to_save, f)
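
    # For illustration, a file written by saveJSON looks roughly like:
    #
    #   {"0": {"id": "1344871407654731777", "text": "..."},
    #    "1": {"id": "1344871397026361345", "text": "..."}}
    #
    # i.e. keys are enumeration indices and each value holds the tweet id and text,
    # which is the shape loadJSON unpacks below.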

    def loadJSON(self, file_name):
        """Loads a tweets dictionary from a JSON file and merges it into the tweets dict.
        :param file_name: file name/directory for loading
        """
        if not file_name.endswith('.json'):
            file_name += '.json'
        with open(file_name) as f:
            jsontweets = json.load(f)
        tweets = {jsontweets[key]['id']: jsontweets[key]['text'] for key in jsontweets}
        self.tweets.update(tweets)

    def _pp(self, tweet, lang):
        """Pre-processes a tweet.
        Hashtags and emojis are kept in this version; the tokenizer lowercases the text, shortens elongated
        words and strips handles.
        :param tweet: tweet
        :param lang: language of tweet
        :return: pre-processed tweet
        """
        # Remove HTML special entities (e.g. &amp;)
        tweet_no_special_entities = re.sub(r'&\w*;', '', tweet)
# Remove tickers
tweet_no_tickers = re.sub(r'\$\w*', '', tweet_no_special_entities)
# Remove hyperlinks
tweet_no_hyperlinks = re.sub(r'https?://.*/\w*', '', tweet_no_tickers)
        # Remove punctuation (keeping @ so the tokenizer can strip handles) and split 's, 't, 've with a space
        # for the filter; re.escape guards the character class against regex metacharacters
        tweet_no_punctuation = re.sub(r'[' + re.escape(self.punctuation.replace('@', '')) + ']+', ' ',
                                      tweet_no_hyperlinks)
# Remove words with 2 or fewer letters
tweet_no_small_words = re.sub(r'\b\w{1,2}\b', '', tweet_no_punctuation)
# Remove whitespace (including new line characters)
tweet_no_whitespace = re.sub(r'\s\s+', ' ', tweet_no_small_words)
# Remove single space remaining at the front of the tweet.
tweet_no_whitespace = tweet_no_whitespace.lstrip(' ')
tw_list = self.tknzr.tokenize(tweet_no_whitespace)
# Remove stopwords
list_no_stopwords = [i for i in tw_list if i not in self.stopwords[lang]]
# Final filtered tweet
tweet_filtered = ' '.join(list_no_stopwords)
        # Example:
        # _pp(tweet=' RT @Amila #Test\nTom\'s newly listed Co. &amp; Mary\'s unlisted Group to supply tech for nlTK.\nh.. $TSLA $AAPL https:// t.co/x34afsfQsh', lang='en')
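        # Traced by hand, the example above should yield roughly
        #   'test tom newly listed mary unlisted group supply tech nltk'
        # (assumed output; the exact result depends on the installed NLTK tokenizer and stopword data).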
return tweet_filtered

    def preProcess(self, language):
        """Pre-processes all the tweets in the tweets dict.
        :param language: language of the tweets
        """
        temp_tweets = {tweet_id: self._pp(tweet=self[tweet_id], lang=language) for tweet_id in self}
        self.tweets = temp_tweets
        return self.tweets

    def _translate(self, tweet, source_language):
        """Translates a tweet into English.
        :param tweet: tweet
        :param source_language: language of tweet
        :return: tweet translated from source_language to English
        """
        if isinstance(tweet, six.binary_type):
            tweet = tweet.decode("utf-8")
        # Text can also be a sequence of strings, in which case this method
        # will return a sequence of results for each text.
        return self.translate_client.translate(
            tweet, target_language='en', source_language=source_language)["translatedText"]

    def translate(self, language):
        """Translates all the tweets in the tweets dict to English.
        :param language: language of the tweets
        """
        temp_tweets = {tweet_id: self._translate(tweet=self[tweet_id], source_language=language) for tweet_id in self}
        self.tweets = temp_tweets
        return self.tweets

    def __repr__(self):
        """Represents Tweets as its underlying tweets dict.
        Returns
        -------
        str
            The string representation of the tweets dict.
        """
        return repr(self.tweets)

    def __iter__(self):
        """Iterates over the tweet ids in the tweets dict.
        """
        for tweet in self.tweets:
            yield tweet

    def __getitem__(self, item):
        """
        Returns
        -------
        str
            The tweet text stored under the given id.
        """
        return self.tweets[item]

    def values(self):
        """
        Returns
        -------
        The tweet texts (the values of the tweets dict).
        """
        return self.tweets.values()


if __name__ == '__main__':
# Updated demo
from decouple import config
tweets = Tweets(config('TWITTER_API_KEY'),
config('TWITTER_API_SECRET'),
config('TWITTER_ACCESS_TOKEN_KEY'),
config('TWITTER_ACCESS_TOKEN_SECRET'),
'service_account.json')
# tweet ids taken from 2021-01-01_clean-dataset.tsv
tweets.addTweets(['1344871397026361345', '1344871397286359041', '1344871407654731777'])
print('Added')
for t in tweets:
print(t, ' ', tweets[t])
tweets.removeTweets(['1344871397026361345', '1344871397286359041'])
print('After removal')
for t in tweets:
print(t, ' ', tweets[t])
tweets.saveJSON('test')
print('After Saving')
for t in tweets:
print(t, ' ', tweets[t])
tweets.addTweets(['1344871397026361345', '1344871397286359041'])
print('After Adding')
for t in tweets:
print(t, ' ', tweets[t])
tweets.loadJSON('test.json')
print('After Loading')
for t in tweets:
print(t, ' ', tweets[t])
tweets.preProcess('en')
for t in tweets:
print(t, ' ', tweets[t])
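
    # The demo above covers adding, removing, saving, loading and pre-processing.
    # Translation can be exercised the same way, assuming valid Google Cloud
    # credentials and non-English tweet ids (the id below is a placeholder):
    #
    #   tweets.reset()
    #   tweets.addTweets(['<french_tweet_id>'])
    #   tweets.translate('fr')
    #   tweets.preProcess('en')
    #   for t in tweets:
    #       print(t, ' ', tweets[t])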