-
Notifications
You must be signed in to change notification settings - Fork 0
/
movies.py
178 lines (135 loc) · 4.91 KB
/
movies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import re
from enum import Enum
from typing import Dict, List, Optional, Set

import movie_data
from quote import QuoteCleanPattern
# Script names recognised by this module; Movie rejects anything not listed here.
names = [
    "phantom_menace",
    "attack_of_the_clones",
    "revenge_of_the_sith",
]
class WhichMovie(Enum):
    """Enumerates the prequel movies, plus an ``NA`` member for "no movie"."""

    # Sentinel member (-1); MovieData uses it as the default for ``which``.
    NA = -1

    # The actual movies, numbered 0 through 2.
    PHANTOM_MENACE = 0
    ATTACK_OF_THE_CLONES = 1
    REVENGE_OF_THE_SITH = 2
# Enum member -> script name for each real movie (WhichMovie.NA has no entry).
movie_dict = {
    WhichMovie.PHANTOM_MENACE: "phantom_menace",
    WhichMovie.ATTACK_OF_THE_CLONES: "attack_of_the_clones",
    WhichMovie.REVENGE_OF_THE_SITH: "revenge_of_the_sith",
}

# Reverse lookup (script name -> enum member), derived from movie_dict so the
# two tables always hold the same pairs in the same order.
name_dict = {title: member for member, title in movie_dict.items()}
class NotAPrequel(Exception):
    """Error raised when a movie identifier is not one of the known prequels."""

    def __init__(self, movie: str):
        """
        :param movie: the rejected identifier; it is quoted in the error message
        """
        message = f'"{movie}" is not a proper Prequel movie format'
        super().__init__(message)
class MovieData:
    """
    Per-movie configuration used when parsing a script.

    Regex patterns are applied on a line-by-line basis.

    :param character_pattern: regex Pattern that finds every character in the script, regardless of whether
        they are speaking on that particular line.
        This data is stored to make sure you get the longest possible name later, which is implemented in
        parse_script.
    :param quote_pattern: regex Pattern that captures in the *first group* the character and in the
        *second group* the actual quote
    :param ignored: uppercase words that are not actually characters, or characters that you are sure don't speak
    :param blacklist_substrings: blocked substrings in character names, such as script place descriptions,
        e.g. 'INT.'
    :param mappings: *words* that may be exactly equal to another word in a longer character name and that
        all map to the same character; the goal is to use the most information possible for the character
        name and unify all those different script characters into a single character with whatever name
        it is mapped to
    :param short_characters: characters whose names could be a substring of other names
        and yet still be a distinct character
    :param strict: strings that, even though they may be contained in longer character names,
        form stand-alone characters when present as speaking, e.g.: "CLONE SERGEANT" is not
        "AT-ST CLONE SERGEANT", but "DOOKU" is "COUNT DOOKU"
    :param quote_clean_patterns: instructions on what to match and what to replace each match with.
        Useful in Attack of the Clones, where the dialogues contain tabs and newlines that need to be
        replaced with spaces.
    :param which: used for logging, to tell which movie this data belongs to
    """

    def __init__(
        self,
        character_pattern: re.Pattern,
        quote_pattern: re.Pattern,
        ignored: List[str],
        blacklist_substrings,
        mappings: Optional[Dict[str, str]] = None,
        short_characters: Optional[Set[str]] = None,
        strict: Optional[List[str]] = None,
        quote_clean_patterns: Optional[List[QuoteCleanPattern]] = None,
        which=WhichMovie.NA,
    ):
        self.character_pattern = character_pattern
        self.quote_pattern = quote_pattern
        self.ignored = ignored
        self.blacklist_substrings = blacklist_substrings
        # Default values are evaluated once, at definition time, so a mutable
        # default would be shared by every instance. Build a fresh container
        # per instance whenever the caller did not supply one.
        self.mappings = {} if mappings is None else mappings
        self.short_characters = set() if short_characters is None else short_characters
        self.which = which
        self.quote_clean_patterns = [] if quote_clean_patterns is None else quote_clean_patterns
        self.strict = [] if strict is None else strict

    def filter(self, character: str) -> bool:
        """
        Decide whether ``character`` should be kept as a character name.

        :param character: candidate character name
        :return: False if the name is listed in ``ignored`` or contains any of the
            ``blacklist_substrings``; True otherwise
        :raises TypeError: if ``character`` is not a string
        """
        if not isinstance(character, str):
            # TypeError is still an Exception, so existing handlers keep working.
            raise TypeError("Invalid type")
        if character in self.ignored:
            return False
        return not any(blocked in character for blocked in self.blacklist_substrings)

    def assert_ready(self):
        """
        Sanity-check that the required fields were provided.

        :raises AssertionError: if a pattern equals the empty string or ``ignored`` equals
            the empty tuple
        """
        # NOTE(review): these comparisons only catch an empty *string* pattern and an
        # empty *tuple*; an empty list for ``ignored`` still passes. Kept as-is because
        # tightening them could reject data that is accepted today.
        assert self.character_pattern != ""
        assert self.quote_pattern != ""
        assert self.ignored != ()

    def __repr__(self):
        return f'<MovieData for="{self.which}">'
def load_data(which: WhichMovie) -> MovieData:
    """
    Build the MovieData for the requested movie from the matching ``movie_data`` entry.

    ``short_characters`` is not passed on, so the returned object keeps the
    MovieData default for it.

    :param which: the movie whose data should be loaded
    :return: a MovieData populated from the matching ``movie_data`` attribute
    :raises NotAPrequel: if ``which`` is not one of the three real movies
        (this includes ``WhichMovie.NA``)
    """
    # Look the source up by enum member instead of indexing a list with
    # ``which.value``: NA has the value -1, which as a list index silently
    # selects the last movie rather than failing.
    sources = {
        WhichMovie.PHANTOM_MENACE: movie_data.phantom_menace,
        WhichMovie.ATTACK_OF_THE_CLONES: movie_data.attack_of_the_clones,
        WhichMovie.REVENGE_OF_THE_SITH: movie_data.revenge_of_the_sith,
    }
    try:
        movie = sources[which]
    except KeyError:
        # Report the rejected value itself; the lookup error carries no useful detail.
        raise NotAPrequel(str(which)) from None

    return MovieData(
        movie.character_pattern,
        movie.quote_pattern,
        ignored=movie.ignored,
        blacklist_substrings=movie.blacklist,
        mappings=movie.mappings,
        strict=movie.strict,
        which=which,
        quote_clean_patterns=movie.quote_clean_patterns,
    )
class Movie:
    """A single prequel movie: its script name, enum member and parsing data."""

    def __init__(self, name: str):
        """
        :param name: script name; must be an entry of the module-level ``names`` list
        :raises NotAPrequel: if ``name`` is not a known movie
        """
        if name not in names:
            # NotAPrequel's constructor requires the offending name; raising the
            # bare class fails with TypeError while the exception is being built.
            raise NotAPrequel(name)
        self.name: str = name
        self.which: WhichMovie = name_dict[name]
        self.data: MovieData = load_data(self.which)

    def assert_ready(self):
        """Assert that data is loaded, then delegate to the data's own readiness check."""
        assert self.data
        self.data.assert_ready()

    def __repr__(self):
        return f'<Movie "{self.name}">'
class Movies:
    """Collection of Movie objects built from a list of script names."""

    def __init__(self, names: List[str]):
        """
        :param names: script names to load, in the order they should be stored
        :raises NotAPrequel: if any entry is not a known movie name
        """
        self.movies = []
        for name in names:
            # The ``names`` parameter shadows the module-level list, so checking
            # ``name not in names`` here could never fail. Validate against the
            # module-level name_dict of known script names instead.
            if name not in name_dict:
                raise NotAPrequel(name)
            self.movies.append(Movie(name))

    def __iter__(self):
        # __iter__ has to return an iterator; handing back the list itself makes
        # ``for movie in movies`` raise TypeError.
        return iter(self.movies)

    def __contains__(self, item: str):
        """Return True if a movie with script name ``item`` is part of this collection."""
        return any(movie.name == item for movie in self.movies)

    def __repr__(self):
        return f'<Movies num={len(self.movies)}>'

    def __len__(self):
        return len(self.movies)
# Module-wide collection with one Movie per entry of ``names``, built at import time.
movies = Movies(names=names)
class CachedMovie(Movie):
    """Movie subclass whose instances can be called to look up an already-loaded movie."""

    def __call__(self, name: str):
        """
        Return the movie called ``name`` from the module-level ``movies`` collection.

        The movie's ``assert_ready`` check runs before it is returned.

        :param name: script name of the movie to look up
        :return: the matching Movie
        :raises NotAPrequel: if no movie in the collection has that name
        """
        # Read the underlying list directly so the lookup does not depend on how
        # the collection implements iteration.
        for movie in movies.movies:
            if movie.name == name:
                movie.assert_ready()
                return movie
        # Only give up after every movie has been examined.
        raise NotAPrequel(name)