Merge pull request #3 from tardigrde/message
added more test data and hot fixes
tardigrde committed Aug 11, 2020
2 parents 87e177c + a5f9c9d commit 353f471
Showing 20 changed files with 482 additions and 197 deletions.
22 changes: 11 additions & 11 deletions ConversationAnalyzer.py
@@ -19,22 +19,22 @@ def __str__(self):
     def stats(self):
         return self.get_stats(self.df)
 
-    # TODO has to be tested
-    def get_stats(self, df, subject='all', start=None, end=None, period=None):
-        df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period)
-        stats = ConversationStats(df)
-        return stats
-
     def get_time_series_data(self, subject='all', **kwargs):
         time_series = generate_time_series(**kwargs)
         return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject)
 
-    def get_plotable_time_series_data(self, interval_stats, statistic):
+    @staticmethod
+    def get_plottable_time_series_data(interval_stats, statistic):
         for k, v in interval_stats.items():
             if isinstance(v, ConversationStats):
                 interval_stats[k] = getattr(v, statistic)
         return interval_stats
 
+    def get_stats(self, df, subject='all', start=None, end=None, period=None):
+        df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period)
+        stats = ConversationStats(df)
+        return stats
+
     @staticmethod
     @subject_checker
     @date_checker
@@ -91,13 +91,13 @@ def unique_msg_count(self):
     # 3.
     @property
     def most_used_msgs(self):
-        # TODO first few (1-10) messages
+        # TODO LATER first few (1-10) messages
         return self.messages.value_counts()
 
     # 4.
     @property
     def msg_frequency(self):
-        # TODO this has been most likely depracated
+        # NOTE this has most likely been deprecated OR?
         pass
 
     # 5.
@@ -132,12 +132,12 @@ def char_count(self):
     # 10.
     @property
     def most_used_chars(self):
-        return None # TODO or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string
+        return None # TODO LATER or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string
 
     # 11.
     @property
     def rate_of_media_messages(self):
-        pass # TODO what?
+        pass # NOTE what?
 
     def get_words(self):
         token_list = self.messages.str.lower().str.split()
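
Taken together, the hunks above rename get_plotable_time_series_data to get_plottable_time_series_data, make it a @staticmethod, and move get_stats below it. A minimal sketch of the resulting call pattern, mirroring the call sites in Visualizer.py further down (name and messages here are placeholders, not part of the commit):

from ConversationAnalyzer import ConversationAnalyzer

analyzer = ConversationAnalyzer(name, messages)  # messages: one conversation's DataFrame
interval_stats = analyzer.get_time_series_data(subject='all', start=None, end=None, period='y')
# As a @staticmethod it can now also be called on the class itself:
plottable = ConversationAnalyzer.get_plottable_time_series_data(interval_stats, statistic='msg_count')
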
4 changes: 2 additions & 2 deletions Conversations.py
@@ -43,7 +43,7 @@ def extract_names_from_convos(jsons):
         else:
             name_data_map[key] = {
                 'title': msg.title,
-                'compact_name': msg.compact_names, # TODO is list ok for if length is only 1??
+                'compact_name': msg.compact_names,
                 # 'participants': msg.participants + ['Levente Csőke'],
                 'participants': msg.participants,
                 'messages': msg.df,
@@ -67,7 +67,7 @@ def to_df(self):
         self._df = pd.DataFrame(self.decoded.get('messages'))
 
     def set_date_as_index(self):
-        # TODO maybe not needed; could calculate real time
+        # NOTE maybe not needed; could calculate real time
         date_series = self._df.timestamp_ms.apply(self.ts_to_date)
         self._df = self._df.set_index(date_series).iloc[::-1]
 
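
set_date_as_index converts the millisecond timestamp_ms column into a date index. The ts_to_date helper itself is not shown in this diff; a hypothetical implementation consistent with that usage:

from datetime import datetime

def ts_to_date(ts_ms):
    # Assumption: Facebook exports store timestamps in milliseconds since the Unix epoch
    return datetime.fromtimestamp(ts_ms / 1000)
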
2 changes: 1 addition & 1 deletion Group.py
@@ -1,4 +1,4 @@
-# TODO groups should be searched by looking into jsons unfortunately :(
+# TODO LATER groups should be searched by looking into jsons unfortunately :(
 # because of directory says others
 # maybe we dont use groups right away?
 
25 changes: 10 additions & 15 deletions MessagingAnalyzer.py
@@ -1,26 +1,22 @@
-from utils import year_converter, month_converter, generate_time_series, get_stats_for_intervals
+from utils import year_converter, month_converter, generate_date_series, get_stats_for_intervals
 from datetime import datetime, date, timedelta
 import pandas as pd
 from ConversationAnalyzer import ConversationAnalyzer
 
 """
 """
 
 
 class MessagingAnalyzer:
     def __init__(self, names, people):
-        # TODO input people only. class ill know what to do
+        # TODO input people only. class will know what to do
         self.names = names
         self.people = people
 
     def time_series_analysis_for_all(self, subject=None, **kwargs):
-        time_series = generate_time_series(**kwargs)
+        time_series = generate_date_series(**kwargs)
         stacked_df = self.stack_dfs(self.people)
         interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject)
 
     def get_stats(self, df, subject='all', start=None, end=None, period=None):
-        # TODO
+        # TODO LATER
         # here you have to do something with it
         pass
 
@@ -29,14 +25,14 @@ def get_count(self, attribute, subject='all', start=None, end=None, period=None):
         # we have a list of names we want to iterate over
         for name in self.names:
             stats = self.get_conversation_stats(name=name, subject=subject, start=start, end=end, period=period)
-            if stats is not None: # TODO too explicit; needed because it is possible that None will be returned, if t got an empty df
+            if stats is not None:
                 count += getattr(stats, attribute)
         return count
 
     def get_conversation_stats(self, name, subject='all', start=None, end=None, period=None):
         messages = self.people.get(name).messages
         analyzer = ConversationAnalyzer(name, messages)
-        if analyzer is None: # TODO this is too explicit ?!
+        if analyzer is None:
             return None
         return analyzer.get_stats(messages, subject=subject, start=start, end=end, period=period)
 
@@ -80,7 +76,7 @@ def most_used_messages_by_me(self, **kwargs):
         >>> s2 = pd.Series([3, 2, 1, 1])
         >>> s1_vc = s1.value_counts()
         >>> s2_vc = s2.value_counts()
-        TODO (later) most used is already a problem:
+        TODO LATER most used is already a problem:
           - because its a series of all the unique messages/words ever used in a convo
           - it contains strings like ':d', ':p' and 'xd'
           - from all the convos the result of value_counts has to be cleared
@@ -101,14 +97,14 @@ def most_used_words_by_partners(self, **kwargs):
 
     # 5. Number of messages sent/got on busiest period (by year/month/day/hour)
     def days_when_most_messages_sent(self):
-        # TODO hard algorithmic problem
+        # TODO LATER hard algorithmic problem
         pass
 
     def days_when_most_messages_received(self):
         pass
 
     def hours_when_most_messages_sent(self):
-        # TODO
+        # TODO LATER
         # is this referring to the absolute hour most messages sent??
         # like: 2014.07.25. 15h-16h
         # OR
@@ -131,5 +127,4 @@ def stack_dfs(people):
         for data in people.values():
             if data.messages is not None:
                 dfs.append(data.messages)
-        # TODO do I need to sort by index (date)? yes!
-        return pd.concat(dfs).sort_index() # TODO why ignore_index??
+        return pd.concat(dfs).sort_index()
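
The TODO LATER in the most_used_messages_by_me docstring concerns combining per-conversation value_counts results. One way to merge two such series, shown here as a sketch rather than anything in the commit: align on the union of their indices, treat missing counts as zero, and add.

import pandas as pd

s1_vc = pd.Series([1, 2, 3, 4]).value_counts()
s2_vc = pd.Series([3, 2, 1, 1]).value_counts()

# A message missing from one conversation simply counts as 0 there
combined = s1_vc.add(s2_vc, fill_value=0)
print(combined.sort_values(ascending=False))  # 1 -> 3, then 2 and 3 -> 2 each, 4 -> 1
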
36 changes: 20 additions & 16 deletions Visualizer.py
@@ -4,33 +4,37 @@
 from People import People
 from ConversationAnalyzer import ConversationAnalyzer
 
 
 # plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120})
 
+TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data'
+
+
 class Visualizer:
     def __init__(self):
         pass
 
-    def plot_time_series(self, x, y, title="Time series", xlabel='Date', ylabel='Value', dpi=100):
+    def plot_convos(self, names):
+        people = People(path=TEST_DATA_PATH)
+        for name in names:
+            data = self.set_up_data(people, name, period='d')
+            df = pd.DataFrame(data.items(), columns=['date', 'value'])
+            self.plot_time_series(x=df.date, y=df.value, title=name)
+
+    @staticmethod
+    def set_up_data(people, name, period='y'):
+        analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages)
+        interval_stats = analyzer.get_time_series_data(subject='all', start=None, end=None, period=period)
+        return analyzer.get_plottable_time_series_data(interval_stats, statistic='msg_count')
+
+    @staticmethod
+    def plot_time_series(x, y, title="Time series", xlabel='Date', ylabel='Value', dpi=100):
         plt.figure(figsize=(16, 5), dpi=dpi)
         plt.plot(x, y, color='tab:red')
         plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
         plt.show()
 
 
-def set_up(people, name, interval='y'):
-    analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages)
-    interval_stats = analyzer.get_time_series_data()
-    stats = interval_stats.get(interval)
-    return analyzer.get_plotable_time_series_data(stats, statistic='msg_count')
-
-
 if __name__ == "__main__":
-    v = Visualizer()
-    TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data'
-    people = People(path=TEST_DATA_PATH)
     names = ['Teflon Musk', 'Tőke Hal']
-    for name in names:
-        data = set_up(people, name, interval='d')
-        df = pd.DataFrame(data.items(), columns=['date', 'value'])
-        v.plot_time_series(x=df.date, y=df.value, title=name)
+    v = Visualizer()
+    v.plot_convos(names)
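
For reference, a sketch of the data flow plot_convos relies on: set_up_data returns a mapping of dates to values, which becomes a two-column DataFrame. The literal dates and counts below are invented for illustration; only the shape is implied by the data.items() call above.

import pandas as pd

data = {'2014-12-01': 3, '2014-12-02': 0}  # hypothetical output of set_up_data
df = pd.DataFrame(data.items(), columns=['date', 'value'])
# df.date and df.value then feed plot_time_series as x and y
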
9 changes: 6 additions & 3 deletions tests/TestConversationAnalyzer.py
@@ -43,7 +43,7 @@ def _stats(name, **kwargs):
     return _stats
 
 
-# TODO extend all functions with all the data
+# TODO LATER or not extend all functions with all the data
 def test_stats_toke_hal_all(statistics):
     stats = statistics('Tőke Hal')
 
@@ -133,8 +133,8 @@ def test_stats_toke_hal_me_2014_12(statistics):
 def test_stats_teflon_musk(statistics):
     stats = statistics('Teflon Musk')
     assert stats.msg_count == 6
-    assert stats.unique_msg_count == 2 # TODO this does not count media messages
-    # assert stats.most_used_msgs == 0 # TODO should only return the most used or e.g. top10 most used
+    assert stats.unique_msg_count == 2
+    # assert stats.most_used_msgs == 0 # TODO LATER should only return the most used or e.g. top10 most used
     # assert stats.msg_frequency == 0
     assert stats.word_count == 14
     assert stats.unique_word_count == 7
@@ -224,6 +224,9 @@ def test_stats_teflon_musk_all_2014_12(statistics):
     assert stats.char_count == 0
     # assert stats.most_used_chars == 0
 
+class TestConversationAnalyzer: # Foo Bar
+    pass
+
 
 def test_time_series_analysis_for_user(analyze):
     analyzer = analyze('Teflon Musk')
