-
Notifications
You must be signed in to change notification settings - Fork 0
/
reddit_pickle_scraper.py
77 lines (68 loc) · 3.07 KB
/
reddit_pickle_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import pickle

import pandas as pd
import praw
# Reddit API credentials. They are read from the environment so that no secret
# has to be written into this file; a missing variable raises KeyError at
# start-up, naming the variable that still needs to be set.
user_agent = os.environ["REDDIT_USER_AGENT"]
reddit_client_id = os.environ["REDDIT_CLIENT_ID"]
reddit_client_secret = os.environ["REDDIT_CLIENT_SECRET"]

# PRAW client used by depressed_data() and non_depressed_data().
r = praw.Reddit(
    user_agent=user_agent,
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
)
# Scratch lists. Nothing else in this file reads or writes them; they are kept
# so the module's global names stay available.
depressed_posts_titles = []
# Original, misspelled name of the list above; kept as an alias of the same
# list object for backward compatibility.
epressed_posts_titles = depressed_posts_titles
depressed_posts_content = []
depressed_comments = []

# Column-oriented accumulator filled by depressed_data(): every key maps to a
# list that receives one entry per scraped post.
depressed_dict = {
    "title": [],
    "id": [],
    "num_comments": [],
    "comments": [],
    "content": [],
}

# Same layout, filled by non_depressed_data(). The lists are separate objects
# from the ones in depressed_dict, so the two data sets never mix.
non_depressed_dict = {
    "title": [],
    "id": [],
    "num_comments": [],
    "comments": [],
    "content": [],
}
def depressed_data(subreddit):
    """Scrape the ``top`` listing of ``subreddit`` into ``depressed_dict``.

    Walks up to 1000 submissions returned by ``r.subreddit(subreddit).top``.
    For each one, the comment tree is first expanded with
    ``replace_more(limit=100)``; then the title, id, reported comment count,
    the list of comment bodies and the self-text are appended to the matching
    lists of the module-level ``depressed_dict``.

    After the loop a DataFrame is built from ``depressed_dict``, rows with a
    repeated ``id`` are dropped, and the summed ``num_comments`` column plus
    the frame itself are printed. That frame is local to this call:
    ``depressed_dict`` itself is not de-duplicated.

    Args:
        subreddit: Subreddit name, passed to ``r.subreddit``.

    Returns:
        None. Results are left in ``depressed_dict`` and written to stdout.
    """
    listing = r.subreddit(subreddit).top(limit=1000)
    for submission in listing:
        # Expand the comment tree before any of the post's fields are stored.
        submission.comments.replace_more(limit=100)

        depressed_dict['title'].append(submission.title)
        depressed_dict['id'].append(submission.id)
        depressed_dict['num_comments'].append(submission.num_comments)

        # Flatten the comment tree and keep only the text of each comment.
        bodies = [reply.body for reply in submission.comments.list()]
        depressed_dict['comments'].append(bodies)
        depressed_dict['content'].append(submission.selftext)

    # One row per distinct post id; the first occurrence wins.
    summary = pd.DataFrame(depressed_dict).drop_duplicates(subset=['id'])
    total = sum(summary['num_comments'])
    print("Total number of comments: ", total)
    print(summary)
def non_depressed_data(subreddit):
    """Scrape the ``hot`` listing of ``subreddit`` into ``non_depressed_dict``.

    Walks up to 1000 submissions returned by ``r.subreddit(subreddit).hot``.
    For each one, the comment tree is first expanded with
    ``replace_more(limit=100)``; then the title, id, reported comment count,
    the list of comment bodies and the self-text are appended to the matching
    lists of the module-level ``non_depressed_dict``.

    After the loop a DataFrame is built from ``non_depressed_dict``, rows with
    a repeated ``id`` are dropped, and the summed ``num_comments`` column plus
    the frame itself are printed. That frame is local to this call:
    ``non_depressed_dict`` itself is not de-duplicated.

    Args:
        subreddit: Subreddit name, passed to ``r.subreddit``.

    Returns:
        None. Results are left in ``non_depressed_dict`` and written to stdout.
    """
    hot_posts = r.subreddit(subreddit).hot(limit=1000)
    for entry in hot_posts:
        # Expand the comment tree before any of the post's fields are stored.
        entry.comments.replace_more(limit=100)

        non_depressed_dict['title'].append(entry.title)
        non_depressed_dict['id'].append(entry.id)
        non_depressed_dict['num_comments'].append(entry.num_comments)

        # Flatten the comment tree and keep only the text of each comment.
        texts = [item.body for item in entry.comments.list()]
        non_depressed_dict['comments'].append(texts)
        non_depressed_dict['content'].append(entry.selftext)

    # One row per distinct post id; the first occurrence wins.
    frame = pd.DataFrame(non_depressed_dict).drop_duplicates(subset=['id'])
    comment_total = sum(frame['num_comments'])
    print("Total number of comments: ", comment_total)
    print(frame)
# Run both scrapes. Each call fills its module-level dict and prints a summary.
depressed_data('depression')
non_depressed_data('askreddit')

# Persist the raw (not de-duplicated) depressed-post dict. The `with` block
# closes the file even if pickling raises.
with open("depressed.pickle", "wb") as depressed_pickle:
    pickle.dump(depressed_dict, depressed_pickle)

# Round-trip check: reload the file that was just written and print the same
# total as the scrape did. pickle.load is acceptable here only because the
# file was produced by this script; never point it at an untrusted file.
with open("depressed.pickle", "rb") as test_depressed_pickle:
    test_depressed_pickle_dict = pickle.load(test_depressed_pickle)
depressed_df2 = pd.DataFrame(test_depressed_pickle_dict)
depressed_df2.drop_duplicates(subset=['id'], inplace=True)
print("test Total number of comments: ", sum(depressed_df2['num_comments']))

# Same persist-and-verify sequence for the non-depressed data set.
with open("non_depressed.pickle", "wb") as non_depressed_pickle:
    pickle.dump(non_depressed_dict, non_depressed_pickle)

with open("non_depressed.pickle", "rb") as test_non_depressed_pickle:
    test_non_depressed_pickle_dict = pickle.load(test_non_depressed_pickle)
non_depressed_df2 = pd.DataFrame(test_non_depressed_pickle_dict)
non_depressed_df2.drop_duplicates(subset=['id'], inplace=True)
print("test Total number of comments: ", sum(non_depressed_df2['num_comments']))