-
Notifications
You must be signed in to change notification settings - Fork 0
/
subreddits-plot.py
executable file
·134 lines (112 loc) · 3.56 KB
/
subreddits-plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
"""Plot subreddits creation and relative size.

Reads a CSV file of subreddits (columns used: ``subreddit``, ``subscribers``,
``created`` and ``category``) and draws a bubble chart of creation date
against subscriber count on a logarithmic y-axis, coloured by category.
The figure is saved as a PNG alongside the input file (same name, ``.png``
suffix) and then displayed.

Usage:
    subreddits-plot.py -i INPUT.csv
"""

__author__ = "Joseph Reagle"
__copyright__ = "Copyright (C) 2024 Joseph Reagle"
__license__ = "GPLv3"
__version__ = "0.2"
import argparse
from pathlib import Path
import matplotlib.dates as mdates
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from adjustText import adjust_text
# Command-line interface: a single required option naming the CSV file to plot.
parser = argparse.ArgumentParser(description="Plot subreddits creation and relative size.")
# type=Path makes args.input a pathlib.Path rather than a plain string.
parser.add_argument(
    "-i",
    "--input",
    required=True,
    type=Path,
    help="Path to the input CSV file",
)
args = parser.parse_args()
# Load the input CSV; pandas ignores anything following a "#" on a line.
df = pd.read_csv(args.input, comment="#")
# Normalise column types and order the rows chronologically:
#   subscribers -> numeric, with unparseable entries coerced to NaN
#   created     -> datetime, parsed strictly as YYYY-MM-DD
df = df.assign(
    subscribers=pd.to_numeric(df["subscribers"], errors="coerce"),
    created=pd.to_datetime(df["created"], format="%Y-%m-%d"),
).sort_values("created")
# NOTE(review): ADJUST_CIRCUMFERENCE is not referenced anywhere else in the
# visible script -- confirm whether it is still needed.
ADJUST_CIRCUMFERENCE = 4  # Adjustable parameter for bubble size
# Scale subscriber counts so that the largest subreddit in the file is 1000.
df["relative_size"] = df["subscribers"].div(df["subscribers"].max()).mul(1000)
# Colour assigned to each subreddit category; used both as the scatter-plot
# palette and to build the category legend.
category_colors = dict(
    general="magenta",
    relationship="blue",
    legal="teal",
    finance="olive",
    health="red",
    fashion="pink",
    gender="green",
    disclosure="purple",
    judgement="orange",
)
# Cut-offs applied before plotting.
THRESHOLD_SIZE = 10_000  # Ignore subreddits with fewer subscribers than this
THRESHOLD_YEAR = 2024  # Ignore subreddits created after this year
# Keep only the rows passing both cut-offs; rows whose subscriber count is NaN
# fail the size comparison and are therefore dropped as well.
is_large_enough = df["subscribers"] >= THRESHOLD_SIZE
is_old_enough = df["created"].dt.year <= THRESHOLD_YEAR
df_filtered = df.loc[is_large_enough & is_old_enough]
# Single knob that scales both ends of the marker-size range together.
BUBBLE_SCALE_FACTOR = 5  # Adjustable factor for overall bubble size
smallest_bubble = 20 * BUBBLE_SCALE_FACTOR
largest_bubble = 2000 * BUBBLE_SCALE_FACTOR
# Bubble chart: creation date on x, subscriber count on y, one bubble per
# subreddit, coloured by category and sized by relative_size.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x="created",
    y="subscribers",
    hue="category",
    size="relative_size",
    sizes=(smallest_bubble, largest_bubble),
    palette=category_colors,
    data=df_filtered,
    alpha=0.7,
    # Seaborn's automatic legend is suppressed; a categories-only legend is
    # built separately.
    legend=False,
)
# Axis cosmetics, applied to the current axes: logarithmic subscriber axis
# plus axis labels and a title.
ax = plt.gca()
ax.set_yscale("log")
ax.set_xlabel("Date Created")
ax.set_ylabel("Number of Subscribers")
ax.set_title("Creation and Size of Advice Subreddits")
# Hand-built legend showing categories only (no bubble-size entries).
# Each entry is a proxy artist: a marker-only line drawn in white, with a
# round marker filled in the category's colour.
proxy_style = {"marker": "o", "color": "w", "markersize": 10}
legend_elements = [
    plt.Line2D([0], [0], label=cat, markerfacecolor=color, **proxy_style)
    for cat, color in category_colors.items()
]
# Anchor the legend's upper-left corner just outside the right edge of the axes.
plt.legend(
    handles=legend_elements,
    title="Category",
    loc="upper left",
    bbox_to_anchor=(1.05, 1),
)
# Label every plotted subreddit at its data point. Each label gets a white
# outline (stroke) so it stays readable on top of the coloured bubbles.
texts = [
    plt.text(
        created,
        subscribers,
        name,
        fontsize=10,
        va="center",
        ha="left",
        path_effects=[path_effects.withStroke(linewidth=3, foreground="white")],
    )
    for created, subscribers, name in zip(
        df_filtered["created"],
        df_filtered["subscribers"],
        df_filtered["subreddit"],
    )
]
# Let adjustText reposition the labels; the connector lines are given a line
# width of 0.0.
adjust_text(texts, arrowprops={"arrowstyle": "-", "color": "k", "lw": 0.0})
# Widen the x-axis so bubbles and labels near the edges are not clipped:
# 100 extra days on the left and 200 on the right.
left_edge, right_edge = plt.xlim()
# The current limits are converted to datetimes, shifted, and converted back
# to Matplotlib date numbers.
padded_left = mdates.num2date(left_edge) - pd.Timedelta(days=100)
padded_right = mdates.num2date(right_edge) + pd.Timedelta(days=200)
plt.xlim(mdates.date2num(padded_left), mdates.date2num(padded_right))
plt.tight_layout()
# Write the figure beside the input file (same name, ".png" suffix), then
# display it. args.input is already a Path (argparse type=Path), so it is
# used directly.
output_file = args.input.with_suffix(".png")
plt.savefig(output_file, dpi=300)
plt.show()