question_scrape.py
import requests
import bs4
import csv
import os
import errno
from multiprocessing import Pool

base_dir = os.path.abspath(os.path.dirname(__file__))
clue_dir = os.path.join(base_dir, 'clues')
base_link = "http://www.j-archive.com/"
season_suffix = "showseason.php?season="
all_seasons = "listseasons.php"
# string offsets at which the numeric id starts in the various j-archive URLs
season_id_start = 47
season_id_without_base_start = 22
episode_id_start = 46
fieldnames = ['category', 'value', 'clues', 'answer']
episodes_already_checked = []
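
# A quick illustration of the offset constants above (the game_id value is
# hypothetical; the offsets assume j-archive's URL layout at scrape time):
#   "http://www.j-archive.com/showgame.php?game_id=6000"[episode_id_start:]
#     -> "6000"
#   "showseason.php?season=12"[season_id_without_base_start:]
#     -> "12"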

# checks if the directory exists and creates it if missing
def make_sure_path_exists(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
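
# On Python 3.2+ the standard library offers the same effect directly, a
# drop-in alternative to the helper above:
#
#   os.makedirs(path, exist_ok=True)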

# grabs the content div from the provided link
def get_content(link):
    response = requests.get(link)
    # name a parser explicitly so bs4 does not warn and parses consistently
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    content = soup.find('div', id='content')
    return content

# returns a list of links from a given soup structure
def get_links_from_content(content):
    links = []
    for row_tag in content.findAll('tr'):
        link_tag = row_tag.find('a')
        if link_tag is not None:  # skip rows without an anchor (e.g. header rows)
            links.append(link_tag.get('href'))
    return links
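
# The row shape this walk assumes (a simplified sketch of j-archive's
# listing markup, which may differ in detail):
#
#   <tr><td><a href="...">Show #...</a></td>...</tr>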

# returns a list of season URLs
#
# each entry needs the base_link prefix in order to work
def j_archive_season_urls():
    content = get_content(base_link + all_seasons)
    grabbed_season_urls = get_links_from_content(content)
    return grabbed_season_urls

# returns a list of episode URLs from a given season URL
def j_archive_episode_urls(link):
    content = get_content(link)
    table = content.find('table')
    episode_urls = get_links_from_content(table)
    return episode_urls

# creates a csv file of clues from a given episode URL
def get_clues(link):
    content = get_content(link)
    print('start: ' + link[episode_id_start:])
    with open(os.path.join(clue_dir, str(link[episode_id_start:]) + '.csv'),
              'w', encoding='utf-8', newline='') as clue_csv_file:
        clue_csv_writer = csv.writer(clue_csv_file, quoting=csv.QUOTE_ALL)
        clue_csv_writer.writerow(fieldnames)
        for game_round in content.findAll('table', 'round'):
            categories = [i.text for i in game_round.findAll('td', class_='category_name')]
            for clue in game_round.findAll('td', 'clue'):
                clue_text = clue.find('td', 'clue_text')
                if clue_text is not None:
                    # clue ids look like "clue_J_3_1"; the third field is the
                    # 1-based category column
                    category_id = clue_text.get('id')
                    category_number = int(category_id.split("_")[2])
                    category = categories[category_number - 1]
                    # the correct response is embedded as HTML inside the
                    # toggle(...) call in the div's onmouseover attribute
                    answer_div = clue.find('div')
                    answer_mouse_over = answer_div['onmouseover']
                    answer_mouse_over_code = answer_mouse_over.split("', '")[2]
                    answer_reparsed = bs4.BeautifulSoup(answer_mouse_over_code, 'html.parser')
                    answer = answer_reparsed.find_all('em')[0].text
                    # the second td in the clue's header table holds the dollar value
                    value_td = answer_div.findAll('td')[1]
                    value_string = value_td.text
                    if value_string[:2] == "DD":
                        # daily doubles render as "DD: $1,000"
                        value = int(value_string[5:].replace(',', ''))
                    else:
                        # regular values render as "$400"; strip any comma
                        # too, in case larger amounts are formatted with one
                        value = int(value_string[1:].replace(',', ''))
                    clue_csv_writer.writerow([category,
                                              value,
                                              clue_text.text,
                                              answer])
    print('end::: ' + link[episode_id_start:])
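
# A sketch of the onmouseover attribute the answer parsing assumes (the
# exact markup is illustrative and may vary between archive pages):
#
#   toggle('clue_J_1_1', 'clue_J_1_1_stuck',
#          '<em class="correct_response">the answer</em>')
#
# split("', '")[2] isolates the third argument, and reparsing it with
# BeautifulSoup pulls the answer text out of the <em> tag.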

# builds a list of episode URLs whose clue csv files are missing from clue_dir
#
# useful for updating csv files or resuming after timeouts
def get_missing_ids(seasons_selected):
    # csv filenames are "<episode id>.csv", so drop the last four characters
    clue_dir_list = set([csv_file[:-4] for csv_file in os.listdir(clue_dir)])
    missing_ep_id = []
    for season_link in seasons_selected:
        print("checking season " + season_link[season_id_without_base_start:])
        episode_urls = j_archive_episode_urls(base_link + season_link)
        for ep in episode_urls:
            if ep[episode_id_start:] not in clue_dir_list:
                print(ep)
                missing_ep_id.append(ep)
    return missing_ep_id

if __name__ == "__main__":
    make_sure_path_exists(clue_dir)
    season_urls = j_archive_season_urls()
    missing_ep_ids = get_missing_ids(season_urls)
    # scrape the missing episodes in parallel; the context manager ensures
    # the worker processes are cleaned up when the map finishes
    with Pool(8) as pool:
        pool.map(get_clues, missing_ep_ids)
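
# Hypothetical single-episode run, handy for testing the scraper without
# the pool (the game_id below is made up):
#
#   make_sure_path_exists(clue_dir)
#   get_clues("http://www.j-archive.com/showgame.php?game_id=6000")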