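"""Utility functions for a WhatsApp chat-analysis app: top-line statistics,
most-active and most-tagged users, activity maps, word clouds, emoji
frequencies, sentiment analysis, and zipped plot export."""
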
import base64
import io
import re
from collections import Counter
from zipfile import ZipFile

import emoji
import nltk
import pandas as pd
from urlextract import URLExtract
from wordcloud import WordCloud

# Download the NLTK resources needed for stop-word removal and sentiment analysis
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)

from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer


# Calculate message, word, media, link, and emoji counts for the selected user.
def top_stats(selected_user, df):
    if selected_user != 'All':
        df = df[df['User'] == selected_user]

    # Number of messages
    message_count = df.shape[0]

    # Combine all messages into a single string for the word and emoji counts
    all_messages = ' '.join(df['Message'].astype(str))

    # Total number of words
    words_count = len(all_messages.split())

    # Number of media messages
    media_count = df[df['Message'] == '<Media omitted>'].shape[0]

    # Number of links shared
    extract = URLExtract()
    links_count = len(extract.find_urls(all_messages))

    # Number of emojis shared
    emojis_count = sum(1 for c in all_messages if c in emoji.EMOJI_DATA)

    return message_count, words_count, media_count, links_count, emojis_count
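
# Example (hypothetical data; the app builds `df` with 'User' and 'Message'
# columns in a preprocessing step outside this file):
#   messages, words, media, links, emojis = top_stats('All', df)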


# Find the most mentioned (tagged) users.
def most_tagged_users(df):
    # Combine all messages into a single string
    all_messages = ' '.join(df['Message'].astype(str))

    # Find tagged users in the format @91XXXXXXXXXX (an "@" followed by 12 digits)
    tagged_users = re.findall(r'@\d{12}', all_messages)

    # Replace the leading "@" with "+" so the tags read as phone numbers
    tagged_users = [user.replace('@', '+') for user in tagged_users]

    # Count each tagged user and sort by frequency, most tagged first
    tag_counts = Counter(tagged_users)
    most_tagged_user_df = pd.DataFrame(list(tag_counts.items()), columns=['Tagged Users', 'Frequency'])
    most_tagged_user_df = most_tagged_user_df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)
    return most_tagged_user_df


# Find the most active users and their share of the total messages.
def most_chat_users(df):
    # Count the number of messages sent by each user and keep the top five
    top_users_sr = df['User'].value_counts().head()

    # Percentage of all messages contributed by each of those users
    total_messages = df.shape[0]
    user_percentages_sr = (top_users_sr / total_messages * 100).round(2)

    # DataFrame with user names and their message percentages
    top_users_contribution_df = pd.DataFrame({
        'User': user_percentages_sr.index,
        'Contribution(%)': user_percentages_sr.values,
    })
    return top_users_sr, top_users_contribution_df
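
# Example (returns the raw counts Series and the percentage DataFrame together):
#   top_users_sr, contribution_df = most_chat_users(df)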


# Return a DataFrame of the emojis used and their frequencies.
def emoji_analysis(selected_user, df):
    if selected_user != 'All':
        df = df[df['User'] == selected_user]

    # Combine all messages into a single string
    all_messages = ' '.join(df['Message'].astype(str))

    # Extract emojis from the combined messages and count each one
    emojis_list = [c for c in all_messages if c in emoji.EMOJI_DATA]
    emojis_freq = Counter(emojis_list)

    # Convert the counts to a DataFrame sorted by frequency (descending)
    emojis_freq_df = pd.DataFrame(list(emojis_freq.items()), columns=['Emoji', 'Frequency'])
    emojis_freq_df = emojis_freq_df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)
    return emojis_freq_df


# Return a DataFrame with the count of messages per day.
def get_daily_timeline(selected_user, df):
    # If a single user is selected, filter the DataFrame down to that user
    if selected_user != 'All':
        df = df[df['User'] == selected_user]

    # Group by 'Date' (kept in the order the dates appear) and count the
    # messages per day
    daily_timeline = df.groupby('Date', sort=False).count()['Message'].reset_index()
    return daily_timeline
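
# Example (hypothetical; the app draws this with Matplotlib on the caller's side):
#   timeline = get_daily_timeline('All', df)
#   ax.plot(timeline['Date'], timeline['Message'])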


# ---- utility functions for activity map -----

def get_week_activity_map(selected_user, df):
    # If a single user is selected, filter the DataFrame down to that user
    if selected_user != 'All':
        df = df[df['User'] == selected_user]

    # Series counting how many messages fall on each day of the week
    week_activity_map_sr = df['day_name'].value_counts()
    return week_activity_map_sr


def get_month_activity_map(selected_user, df):
    # If a single user is selected, filter the DataFrame down to that user
    if selected_user != 'All':
        df = df[df['User'] == selected_user]

    # Series counting how many messages fall in each month
    month_activity_map_sr = df['month_name'].value_counts()
    return month_activity_map_sr


# Build a day-vs-hour heatmap of message counts.
# Returns: pandas.DataFrame with day names as rows and hours as columns.
def get_day_hour_heatmap(selected_user, df):
    # If a single user is selected, filter the DataFrame down to that user
    if selected_user != 'All':
        df = df[df['User'] == selected_user]

    # Group the data by 'hour' and 'day_name', counting the number of messages
    data = df.groupby(['hour', 'day_name'], as_index=False)['Message'].count()

    # Pivot so that days are rows and hours are columns
    data = data.pivot(index='day_name', columns='hour', values='Message')

    # Reorder the rows Monday..Sunday; hours with no messages pivot to NaN,
    # so fill them with zero for a cleaner heatmap
    data = data.reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
    return data.fillna(0)
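
# Example (hypothetical; seaborn is not imported in this module, but the
# pivoted frame plugs straight into a heatmap on the caller's side):
#   sns.heatmap(get_day_hour_heatmap('All', df), ax=ax)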


# ----- utility functions for word cloud -----

def generate_wordcloud(selected_user, df):
    # Load custom stop words from file
    with open('stop_words.txt', 'r') as f:
        custom_words = set(f.read().splitlines())

    # Merge the custom set with nltk's English stop words
    stop_words = set(stopwords.words('english')).union(custom_words)

    # Filter the DataFrame based on the selected user
    if selected_user != 'All':
        df = df[df['User'] == selected_user]

    # Drop media placeholders and deleted messages, and copy so the stop-word
    # removal below does not modify the caller's DataFrame
    df = df[df['Message'] != '<Media omitted>']
    df = df[df['Message'] != 'This message was deleted'].copy()

    # Remove stop words from a single message
    def remove_stop_words(message):
        words = message.lower().split()
        return ' '.join(word for word in words if word not in stop_words)

    # Apply stop-word removal to the 'Message' column
    df['Message'] = df['Message'].apply(remove_stop_words)

    # Combine all messages into a single string
    all_messages = ' '.join(df['Message'])

    # Remove emojis from the combined messages
    all_messages = ''.join(c for c in all_messages if c not in emoji.EMOJI_DATA)

    # Remove user mentions (phone-number tags such as @91XXXXXXXXXX)
    all_messages = re.sub(r'@\d{12}', '', all_messages)

    # Return None if no words remain after filtering
    if not all_messages.strip():
        return None, None

    # Create the WordCloud
    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    wordcloud = wc.generate(all_messages)

    # Find the five most common words and put them in a DataFrame
    words = all_messages.split()
    most_common_word_df = pd.Series(words).value_counts().head().reset_index()
    most_common_word_df.columns = ['Most Common Words', 'Frequency']
    return wordcloud, most_common_word_df
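
# Example (the caller renders the cloud as an image, e.g. with Matplotlib):
#   wordcloud, common_df = generate_wordcloud('All', df)
#   if wordcloud is not None:
#       ax.imshow(wordcloud)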


# ----- utility functions for sentiment analysis --------

# Calculate the VADER compound sentiment score for a single message.
def calculate_sentiment_score(message, sent):
    # Convert emojis to their text descriptions so they contribute to the score
    message = emoji.demojize(message)
    scores = sent.polarity_scores(message)
    return scores['compound']


# Identify the users with the most positive, negative, and neutral sentiments.
def user_sentiment_contributors(df):
    # Initialize the SentimentIntensityAnalyzer
    sent = SentimentIntensityAnalyzer()

    # Track the total compound sentiment score and total message count per user
    user_scores = {}
    user_message_counts = {}

    # Drop media placeholders and deleted messages
    df = df[df['Message'] != '<Media omitted>']
    df = df[df['Message'] != 'This message was deleted']

    # Accumulate sentiment scores and message counts for each user
    for _, row in df.iterrows():
        user = row['User']
        compound_score = calculate_sentiment_score(row['Message'], sent)
        user_scores[user] = user_scores.get(user, 0) + compound_score
        user_message_counts[user] = user_message_counts.get(user, 0) + 1

    # Average each user's compound score over their message count
    weighted_scores = {
        user: user_scores[user] / user_message_counts[user]
        for user in user_scores if user_message_counts[user] > 0
    }

    most_positive_user = max(weighted_scores, key=weighted_scores.get)
    most_negative_user = min(weighted_scores, key=weighted_scores.get)
    most_neutral_user = min(weighted_scores, key=lambda user: abs(weighted_scores[user]))
    return {
        'Most Positive User': most_positive_user,
        'Most Negative User': most_negative_user,
        'Most Neutral User': most_neutral_user,
    }
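
# Example (returns a dict keyed 'Most Positive User', 'Most Negative User',
# and 'Most Neutral User'):
#   contributors = user_sentiment_contributors(df)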


# Count the positive, negative, and neutral messages for the selected user.
def sentiment_analysis(selected_user, df):
    # Filter the DataFrame based on the selected user
    if selected_user != 'All':
        df = df[df['User'] == selected_user]

    # Initialize the SentimentIntensityAnalyzer
    sent = SentimentIntensityAnalyzer()
    positive = 0
    negative = 0
    neutral = 0

    # Drop media placeholders and deleted messages
    df = df[df['Message'] != '<Media omitted>']
    df = df[df['Message'] != 'This message was deleted']

    # Classify each message by its compound score, using cutoffs of +0.05
    # and -0.05 (the conventional VADER thresholds)
    for msg in df['Message']:
        compound_score = calculate_sentiment_score(msg, sent)
        if compound_score > 0.05:
            positive += 1
        elif compound_score < -0.05:
            negative += 1
        else:
            neutral += 1
    return positive, negative, neutral
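
# Example:
#   positive, negative, neutral = sentiment_analysis('All', df)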


# Generate a zip file containing all plots.
# Args: plot_data (list): (name, Matplotlib figure) pairs to include in the zip.
# Returns: str: base64-encoded binary data of the zip file containing the plots.
def generate_all_plots_zip(plot_data):
    # BytesIO buffer that holds the zip file in memory
    with io.BytesIO() as zip_buffer:
        # ZipFile object for writing into the buffer
        with ZipFile(zip_buffer, 'w') as zipf:
            for name, fig in plot_data:
                # Render each figure to PNG in its own in-memory buffer
                with io.BytesIO() as img_buffer:
                    fig.savefig(img_buffer, format='png')
                    img_buffer.seek(0)
                    # Write the image into the zip under a unique name
                    zipf.writestr(f'{name}.png', img_buffer.read())
        # Rewind the buffer and encode the finished zip in base64
        zip_buffer.seek(0)
        b64 = base64.b64encode(zip_buffer.read()).decode()
    return b64
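
# Example (hypothetical; a Streamlit app can embed the base64 string in an
# HTML download link, a common way to serve an in-memory zip):
#   b64 = generate_all_plots_zip([('daily_timeline', fig1), ('heatmap', fig2)])
#   href = f'<a href="data:application/zip;base64,{b64}" download="plots.zip">Download all plots</a>'


# A minimal smoke test, assuming the column layout used throughout this module
# ('User', 'Message', 'Date', 'day_name', 'month_name', 'hour'); the real
# DataFrame comes from the app's preprocessing step, which is not in this file.
if __name__ == '__main__':
    demo_df = pd.DataFrame({
        'User': ['Alice', 'Bob', 'Alice'],
        'Message': ['Hello 😀', 'Check https://example.com', '<Media omitted>'],
        'Date': pd.to_datetime(['2024-01-01', '2024-01-01', '2024-01-02']),
        'day_name': ['Monday', 'Monday', 'Tuesday'],
        'month_name': ['January', 'January', 'January'],
        'hour': [9, 10, 11],
    })
    print(top_stats('All', demo_df))
    print(sentiment_analysis('All', demo_df))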