forked from colinpollock/seinfeld-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
executable file
·74 lines (52 loc) · 2.1 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python
import re
import sys
# Typographic symbols that get flattened to plain ASCII.  Each row is
# (replacement, literal character, entity names / numeric codes).  The
# numeric codes cover both the Windows-1252 style reference (e.g. #145)
# and the Unicode code point (e.g. #8216) for the same symbol.
# Quotes, the bullet and the en dash all collapse to an apostrophe.
_TYPOGRAPHIC_SYMBOLS = (
    ("'", "\u2018", ("lsquo", "#145", "#8216")),     # left single quote
    ("'", "\u2019", ("rsquo", "#146", "#8217")),     # right single quote
    ("'", "\u201c", ("ldquo", "#147", "#8220")),     # left double quote
    ("'", "\u201d", ("rdquo", "#148", "#8221")),     # right double quote
    ("'", "\u2022", ("bull", "#149", "#8226")),      # bullet
    ("'", "\u2013", ("ndash", "#150", "#8211")),     # en dash
    ("...", "\u2026", ("hellip", "#133", "#8230")),  # horizontal ellipsis
)


def _entity(name):
    """Return the full entity reference for ``name`` (ampersand + name + semicolon)."""
    # Entity references are assembled at runtime instead of being written
    # out as literals: a tool that decodes entities in this source file
    # would otherwise turn each replacement into a no-op (or, for the
    # double-quote entity, into a syntax error).
    return "&" + name + ";"


def unescape(s):
    """Replace HTML entities and typographic symbols with plain ASCII.

    Args:
        s: text taken from the HTML of an episode page.

    Returns:
        ``s`` with the lt/gt entities decoded, the quot entity and every
        symbol in ``_TYPOGRAPHIC_SYMBOLS`` (as an entity or as the literal
        character) replaced by its ASCII stand-in, and finally the amp
        entity decoded.
    """
    s = s.replace(_entity("lt"), "<")
    s = s.replace(_entity("gt"), ">")
    for replacement, literal, names in _TYPOGRAPHIC_SYMBOLS:
        s = s.replace(literal, replacement)
        for name in names:
            s = s.replace(_entity(name), replacement)
    s = s.replace(_entity("quot"), "'")
    # This has to be last: decoding the ampersand earlier would turn an
    # escaped entity (amp entity followed by "lt;") into a real entity
    # that the steps above would then decode a second time.
    s = s.replace(_entity("amp"), "&")
    return s
def remove_tags(text):
    """Return ``text`` with HTML tags removed.

    A tag is everything from a ``<`` up to the next ``>``.  The pattern
    uses a negated character class rather than ``.`` so that a tag whose
    attributes wrap onto a following line is removed as well.  A ``<``
    that is never followed by ``>`` is left untouched.
    """
    return re.sub(r'<[^>]*>', '', text)
def parse_episode_info(html):
    """Return a dict with meta-info about the episode.

    Args:
        html: HTML text containing the episode header fields
            ("pc: ... season N, episode M", "Episode N - title",
            "Broadcast date:", "Written by", "Directed by").

    Returns:
        A dict with the keys ``season_num`` and ``episode_num`` (ints),
        ``title``, ``date`` and ``director`` (strings as they appear in
        the HTML) and ``writers`` (a tuple of stripped, non-empty names).

    Raises:
        AttributeError: if any of the expected fields is missing, because
            ``re.search`` then returns ``None``.
    """
    season, episode = re.search(
        r'pc: .*? season (\d+), episode (\d+)', html).groups()
    title = re.search(r'Episode \d+(.*?) - (.*?)<', html).group(2)
    date = re.search(r'Broadcast date: (.*?)<', html).group(1)
    credit = re.search(r'Written [bB]y:? (.*?)<', html).group(1)
    # Names are separated by "," or "&"; the ampersand may still be in its
    # HTML-escaped form, so an optional "amp;" is consumed with it.
    # Strip before filtering so that "A, & B" does not yield an empty name.
    names = (piece.strip() for piece in re.split(r',|&(?:amp;)?', credit))
    writers = tuple(name for name in names if name)
    director = re.search(r'Directed [bB]y (.*?)<', html).group(1)
    return {
        'season_num': int(season),
        'episode_num': int(episode),
        'title': title,
        'date': date,
        'writers': writers,
        'director': director,
    }
def parse_script(html):
    """Yield (speaker, utterance) pairs found in the script HTML.

    A line of dialogue is an all-caps speaker name, an optional
    parenthesised note, a colon, and then the text up to the next line
    break or paragraph tag.  The terminating tag is accepted in any
    letter case and in self-closing form (``<br>``, ``<BR>``, ``<br/>``,
    ``<br />``, ``<p>``, ``</p>``).

    Args:
        html: the script portion of an episode page.

    Yields:
        ``(speaker, text)`` tuples, with ``text`` passed through
        ``unescape``.
    """
    utterances = re.findall(
        r'([A-Z]+)(?: \(.*?\))?: (.*?)</?(?:[bB][rR]|[pP])\s*/?>', html)
    for i, (speaker, utterance_text) in enumerate(utterances):
        # Skip the monologues at the beginning of episodes: a first
        # utterance by JERRY that runs to more than 100 words.  The
        # speaker pattern only matches capitals, so a plain comparison
        # is enough.
        is_opening_monologue = (
            i == 0
            and speaker == 'JERRY'
            and len(utterance_text.split()) > 100
        )
        if is_opening_monologue:
            continue
        yield (speaker, unescape(utterance_text))
def scrape_episode(html):
    """Split an episode page into its meta-info and its dialogue.

    The page is cut at lines of 30 or more ``=`` characters.  The text
    before the first such line is parsed by ``parse_episode_info`` and
    the text between the first and second is parsed by ``parse_script``.

    Args:
        html: the full HTML text of one episode page.

    Returns:
        A tuple ``(info, utterances)`` where ``info`` is the dict from
        ``parse_episode_info`` and ``utterances`` is the generator
        returned by ``parse_script`` (it is consumed lazily).

    Raises:
        IndexError: if the page contains no separator line.
    """
    # Normalise non-breaking spaces (named entity, numeric reference or
    # the literal U+00A0 character) to ordinary spaces so the patterns in
    # the parsers, which contain plain spaces, can match.
    html = re.sub(r'&(?:nbsp|#160);|\u00a0', ' ', html)
    sections = re.split(r'={30}.*', html)
    info = parse_episode_info(sections[0])
    utterances = parse_script(sections[1])
    return (info, utterances)