top_wordcloud.py
# -*- coding: utf-8 -*-
"""WordCloud.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Ynd3GOj1I6Vdot79UvTzrvFausmlxGIZ
"""
import matplotlib.pyplot as plt
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import (WordCloud, get_single_color_func)
#from PIL import Image
#import seaborn as sns
#import random
import streamlit as st
import pandas as pd
import numpy as np
import nltk
import re # library for regular expression operations
#import string # for string operations
# module for stop words that come with NLTK
from nltk.corpus import stopwords, wordnet
#from nltk.stem import PorterStemmer # module for stemming
from nltk.stem import WordNetLemmatizer  # module for lemmatization
#from nltk.tokenize import TweetTokenizer # module for tokenizing strings
#from nltk.tokenize import sent_tokenize
#from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#from textblob import TextBlob
#from textblob import Word
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
# Word Cloud
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Load data
tweets = pd.read_csv('tweets_EDA_clean.csv', encoding='utf-8', index_col=0)
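# The CSV is assumed to contain at least a 'text' column holding the raw tweet
# text; that is the only column the pre-processing below reads.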
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Data Pre-Processing #
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
def app():
    def clean_text(tweet):
        """Clean tweet text: lowercase it, strip noise, and remove stop words.
        Input: raw tweet text.
        Output: cleaned text.
        """
        temp = tweet.lower()
        temp = re.sub(r'\\n', " ", temp)          # escaped newline markers left in the raw text
        temp = re.sub(r'&\S+', " ", temp)         # HTML entities such as &amp;
        temp = re.sub(r'@[a-z0-9_]+', " ", temp)  # mentions
        temp = re.sub(r'#[a-z0-9_]+', " ", temp)  # hashtags
        temp = re.sub(r'http\S+', " ", temp)      # URLs
        temp = re.sub(r'covid19|covid-19|coronavirus|virus', "covid", temp)
        temp = re.sub(r'vaccine|vaccination', "vaccine", temp)
        temp = re.sub(r'covid\s+vaccine', "vaccine", temp)
        temp = re.sub(r'[()!?]', ' ', temp)
        temp = re.sub(r'\[.*?\]', ' ', temp)      # bracketed fragments
        temp = re.sub(r'[^a-z0-9]', " ", temp)    # anything left that is not alphanumeric
        # Remove English stop words
        stop_words = stopwords.words('english')
        temp = [w for w in temp.split() if w not in stop_words]
        return " ".join(temp)
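    # Illustrative example (hypothetical tweet, not taken from the dataset):
    #   clean_text("@user Check https://t.co/xyz #covid19 vaccines!")
    #   returns "check vaccines" (mention, URL and hashtag stripped, stop words dropped)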
    # Clean data
    tweets['pre_cleaned_text'] = tweets['text'].apply(clean_text)

    # Lemmatize with POS tag
    def get_wordnet_pos(word):
        """Map a POS tag to the first character that lemmatize() accepts."""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    # Reduce each word to its root with the lemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    #tweets['cleaned_text'] = tweets['pre_cleaned_text'].apply(lambda x: " ".join(
    #    wordnet_lemmatizer.lemmatize(word, pos='v') for word in x.split()))
    tweets['cleaned_text'] = tweets['pre_cleaned_text'].apply(lambda x: " ".join(
        wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x.split()))
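    # For example, "running" is POS-tagged VBG, which maps to wordnet.VERB, so
    # lemmatize("running", wordnet.VERB) yields "run"; the default noun lookup
    # would have left "running" unchanged.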
    # Initialize VADER
    sentimentAnalyser = SentimentIntensityAnalyzer()

    def calculate_sentiment(text):
        # Run VADER on the text and return its compound score
        scores = sentimentAnalyser.polarity_scores(text)
        return scores['compound']

    # Categorize a 'sentiment_score' value as Positive, Negative, or Neutral
    def getCategory(score):
        if score > 0.05:
            return 'Positive'
        elif score < -0.05:
            return 'Negative'
        else:
            return 'Neutral'
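    # The compound score is VADER's normalized sum of valence scores, bounded to
    # [-1, 1]; the +/-0.05 cut-offs above are the thresholds recommended in the
    # VADER documentation for separating positive, neutral and negative text.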
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #         Sentiment Analysis          #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    # Apply the scorer to every row of "cleaned_text", store the results in
    # "sentiment_score", then map each score to a category in "analysis"
    tweets['sentiment_score'] = tweets['cleaned_text'].apply(calculate_sentiment)
    tweets['analysis'] = tweets['sentiment_score'].apply(getCategory)

    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #         Feature Extraction          #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    def get_word_counts(documents):
        """Tokenize the documents and count word occurrences (bag of words)."""
        vectorizer = CountVectorizer()
        words_trans = vectorizer.fit_transform(documents)
        # get_feature_names() was removed in scikit-learn 1.2; use
        # get_feature_names_out() on recent versions
        words = vectorizer.get_feature_names_out()
        return words_trans, words

    # Build the bag of words as a DataFrame
    X, words = get_word_counts(tweets['cleaned_text'])
    words_df = pd.DataFrame(
        list(zip(words, np.ravel(X.sum(axis=0)))),
        columns=["Word", "Word_Count_All_Sentiments"]
    )
    # Sort by overall frequency, most frequent first
    words_df = words_df.sort_values(
        by=["Word_Count_All_Sentiments"], ascending=False)
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #     Overall Sentiment per Word      #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    total_tuple = []
    for word in words_df.Word:  # loop through each word in the vocabulary
        temp2 = []
        # collect the sentiment score of every tweet containing the word
        for ind, text in enumerate(tweets['cleaned_text']):
            if word in text.split():  # match whole tokens, so e.g. "art" does not hit "part"
                temp2.append(tweets.loc[ind, 'sentiment_score'])
        # average the scores for this word (the small epsilon guards against an
        # empty list) and record the (word, mean) pair
        average_num = sum(temp2) / (len(temp2) + 0.0001)
        total_tuple.append((word, average_num))

    # Split words into positive and negative lists; these feed color_to_words below
    res_pos = [sub[0] for sub in total_tuple if sub[1] > 0.05]
    res_neg = [sub[0] for sub in total_tuple if sub[1] < -0.05]
    #res_neu = [sub[0] for sub in total_tuple if (sub[1] <= 0.05) & (sub[1] >= -0.05)]
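    # Example with hypothetical numbers: if "vaccine" occurred in three tweets
    # scored 0.6, -0.2 and 0.2, its average would be ~0.2 > 0.05, so "vaccine"
    # would land in res_pos and be drawn in green.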
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #             Word Cloud              #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    class GroupedColorFunc(object):
        """Color function object that assigns DIFFERENT SHADES of the specified
        colors to certain words, based on a color-to-words mapping.
        Uses wordcloud.get_single_color_func.

        Parameters
        ----------
        color_to_words : dict(str -> list(str))
            A dictionary that maps a color to a list of words.
        default_color : str
            Color assigned to any word that is not a member of any
            value of color_to_words.
        """

        def __init__(self, color_to_words, default_color):
            self.color_func_to_words = [
                (get_single_color_func(color), set(words))
                for (color, words) in color_to_words.items()]
            self.default_color_func = get_single_color_func(default_color)

        def get_color_func(self, word):
            """Return the single_color_func associated with the word."""
            try:
                color_func = next(
                    color_func for (color_func, words) in self.color_func_to_words
                    if word in words)
            except StopIteration:
                color_func = self.default_color_func
            return color_func

        def __call__(self, word, **kwargs):
            return self.get_color_func(word)(word, **kwargs)
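    # Illustrative behaviour (hypothetical lists): a GroupedColorFunc built from
    # {'#00ff00': ['good'], 'red': ['bad']} with default 'grey' draws "good" in a
    # green shade, "bad" in a red shade, and every other word in grey.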
    def wordcloud(text, display_words):
        # Generate a word cloud and recolor it with the sentiment-based mapping
        wc = WordCloud(max_words=display_words, random_state=42,
                       max_font_size=119, width=800, height=500).generate(text)
        color_to_words = {
            # positive words are drawn in shades of green
            '#00ff00': res_pos,
            # negative words are drawn in shades of red
            'red': res_neg
        }
        # Words not in any color_to_words list fall back to grey
        default_color = 'grey'
        # Create a color function with multiple tones
        # (SimpleGroupedColorFunc would give single tones instead)
        grouped_color_func = GroupedColorFunc(color_to_words, default_color)
        # Apply our color function
        wc.recolor(color_func=grouped_color_func)
        # Plot; passing the figure explicitly avoids the deprecated
        # global-pyplot usage that a bare st.pyplot() triggers
        fig = plt.figure(figsize=(25, 25))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        st.pyplot(fig)
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #        Slider, Word Cloud         #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    # with open('style.css') as f:
    #     st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
    #st.title("*WordCloud for Covid-19 Sentiment on Twitter*")
    title1 = '<p style="color:Blue; font-size: 40px;">Covid-19 Sentiment on Twitter</p>'
    st.markdown(title1, unsafe_allow_html=True)
    title2 = '<p style="color:Blue; font-size: 32px;">Wordcloud</p>'
    st.markdown(title2, unsafe_allow_html=True)
    #st.subheader("Wordcloud")
    st.markdown("The word cloud visualization displays the top words used in the tweets. "
                "The size of a word reflects its frequency: the bigger the word, the more "
                "often it appears. The colors represent the `sentiment scores` associated "
                "with each word, which range from -1 (most negative) to 1 (most positive).")
    text1 = ('<p>Sentiment scores greater than <code>0.05</code> indicate '
             '<em id="positiveSent">Positive Sentiment</em> in <em id="positiveSent">green</em>, '
             'scores less than <code>-0.05</code> indicate '
             '<em id="negativeSent">Negative Sentiment</em> in <em id="negativeSent">red</em>, '
             'and <em id="neutralSent">Neutral Sentiment</em> is shown in '
             '<em id="neutralSent">grey</em>.</p>')
    st.markdown(text1, unsafe_allow_html=True)
    st.markdown("Use the slider to choose how many words to display. The word cloud shows "
                "that many of the most frequent words in the tweets, colored by polarity "
                "as defined above.")

    # --------------------------------------------------------
    all_sentiments = ' '.join(tweets['cleaned_text'])
    num_words = st.slider(label='Number of Words to Display',
                          min_value=5, max_value=150, value=75, step=5)
    st.write("###### Slider number:", num_words)
    wordcloud(all_sentiments, num_words)
    # --------------------------------------------------------
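
# This module is presumably imported by a multipage Streamlit controller that
# calls app(); the entry-point wiring is not shown in this file. To try the page
# standalone (an assumption, not part of the original source), call app() at
# module level and run:  streamlit run top_wordcloud.py
# app()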