-
Notifications
You must be signed in to change notification settings - Fork 0
/
LetterboxdFilm.py
219 lines (169 loc) · 7.27 KB
/
LetterboxdFilm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
from typing import Optional

from requests import Session
from selectolax.parser import HTMLParser
# Attribute names accepted by LetterboxdFilm.get_tabbed_attribute().
# Each name is matched against link hrefs of the form "/<name>/" on the
# film page, so entries are singular and use "-" in place of spaces.
TABBED_ATTRS = [
    "actor",
    "additional-directing",
    "additional-photography",
    "art-direction",
    "assistant-director",
    "camera-operator",
    "choreography",
    "cinematography",
    "composer",
    "costume-design",
    "country",
    "director",
    "editor",
    "executive-producer",
    "genre",
    "hairstyling",
    "language",
    "lighting",
    "makeup",
    "mini-theme",
    "original-writer",
    "producer",
    "production-design",
    "set-decoration",
    "songs",
    "sound",
    "special-effects",
    "story",
    "studio",
    "stunts",
    "theme",
    "title-design",
    "visual-effects",
    "writer",
]
class LetterboxdFilm:
    """
    Fetches the pages relevant to a film on Letterboxd and provides a
    simple interface to the information in their HTML.

    The following information is retrieved and stored upon initialization,
    and exposed through read-only properties:
        - Title (`title`)
        - Year (`year`)
        - Film page URL (`url`)
        - Film page HTML (`page_html`)

    Any other film information is accessed through CSS-based searches on
    the HTML, implemented as instance methods.
    """

    # Seconds to wait on each HTTP request, so that a stalled connection
    # cannot block the caller forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, film_url: str):
        """
        Download and parse the film page at `film_url` and its stats page.

        If `film_url` does not contain "/film/", or the film page does not
        answer with HTTP status 200, an "Invalid URL" message is printed
        and `SystemExit` is raised.
        """
        try:
            stats_url = self._build_stats_url(film_url)
        except ValueError:
            print(f"\nInvalid URL: {film_url}\n")
            raise SystemExit from None
        self._stats_url = stats_url

        with Session() as s:
            page_response = s.get(film_url, timeout=self._REQUEST_TIMEOUT)
            # An explicit check instead of `assert`, which is stripped
            # when Python runs with -O.
            if page_response.status_code != 200:
                print(f"\nInvalid URL: {film_url}\n")
                raise SystemExit
            stats_response = s.get(stats_url, timeout=self._REQUEST_TIMEOUT)

        page_html = HTMLParser(page_response.text)

        self._url = film_url
        self._html = page_html
        self._title = page_html.css("span.js-widont")[0].text()
        self._year = page_html.css("a[href^='/films/year/']")[0].text()
        self._stats_html = HTMLParser(stats_response.text)

    @staticmethod
    def _build_stats_url(film_url: str) -> str:
        """
        Return the URL of the stats page belonging to `film_url`.

        "/csi" is inserted in front of the "/film/" path segment and
        "stats/" is appended; a missing trailing slash is added first so
        that the film slug and "stats/" do not run together.

        Raises `ValueError` if `film_url` contains no "/film/" segment.
        """
        insert_index = film_url.find("/film/")
        if insert_index == -1:
            raise ValueError(f"not a Letterboxd film URL: {film_url!r}")
        if not film_url.endswith("/"):
            film_url += "/"
        return film_url[:insert_index] + "/csi" + film_url[insert_index:] + "stats/"

    @staticmethod
    def _parse_member_count(message: str) -> int:
        """
        Extract the number from a message such as
        "Watched by 1,234 members" and return it as an `int`.

        Raises `ValueError` if the message contains no digits.
        """
        # Keeping only the digits drops the surrounding words and the
        # thousands separators, and also copes with the singular "member".
        digits = "".join(ch for ch in message if ch.isdecimal())
        if not digits:
            raise ValueError(f"no member count found in {message!r}")
        return int(digits)

    @property
    def url(self):
        """The film page URL exactly as passed to the constructor."""
        return self._url

    @property
    def title(self):
        """The film title, as text taken from the film page."""
        return self._title

    @property
    def year(self):
        """The film's year, as text taken from the film page."""
        return self._year

    @property
    def page_html(self):
        """The HTML of the parsed film page."""
        return self._html.html

    def get_tabbed_attribute(self, attribute: str) -> list:
        """
        Returns data from the tabbed section of a Letterboxd film page
        where the cast, crew, details, genres, and releases info is
        (except for the Releases tab, as this section follows a different
        structure).

        The result contains each distinct value once, in the order in
        which it first appears on the page.

        Will return `["Not listed"]` (a list with that string as its
        only element) if the attribute was not found for the given film,
        whether it was a valid attribute or not. If the attribute is not
        one of `TABBED_ATTRS`, an invalid argument warning is printed
        before the search is attempted.

        Always use the full, singular form of the attribute you'd like,
        and replace each space with a `-` (ASCII 45).
        See example below:
        ```
        >>> film = LetterboxdFilm("https://letterboxd.com/film/rango")
        >>> film.get_tabbed_attribute("assistant-director")
        ['Adam Somner', 'Ian Calip']
        ```
        """
        if attribute not in TABBED_ATTRS:
            print("\nINVALID ARGUMENT ", "\"", attribute, "\" passed to LetterboxdFilm.get_tabbed_attribute()\n",
                  "\nEXPLANATION: This film information is either not in the HTML by that name,\n",
                  " or there is no method imlemented to retireve it yet. \n",
                  "SUGGESTED ACTION: If it seems like the latter is the case, \n",
                  " please open up a GitHub issue and I will work on \n",
                  " implementing a method to retrieve the information. \n",
                  " You can also submit a pull request and implement it yourself.\n",
                  sep="")

        elements = self._html.css("a[href*='/" + attribute + "/']")
        # dict.fromkeys removes duplicates while keeping page order, so
        # repeated calls return the values in the same order.
        values = list(dict.fromkeys(e.text() for e in elements))
        # Same outcome whether the attribute was invalid, or valid but
        # not found for the film.
        return values if values else ["Not listed"]

    def get_directors(self) -> list:
        """
        For many films, there are multiple directors (e.g. The Matrix (1999)),
        so this method always returns a list.
        """
        return self.get_tabbed_attribute("director")

    def get_genres(self) -> list:
        """Returns the film's genres (see `get_tabbed_attribute`)."""
        return self.get_tabbed_attribute("genre")

    def get_countries(self) -> list:
        """Returns the film's countries (see `get_tabbed_attribute`)."""
        return self.get_tabbed_attribute("country")

    def get_studios(self) -> list:
        """Returns the film's studios (see `get_tabbed_attribute`)."""
        return self.get_tabbed_attribute("studio")

    def get_actors(self) -> list:
        """Returns the film's actors (see `get_tabbed_attribute`)."""
        return self.get_tabbed_attribute("actor")

    def get_themes(self) -> list:
        """Returns the film's themes (see `get_tabbed_attribute`)."""
        return self.get_tabbed_attribute("theme")

    def get_avg_rating(self) -> Optional[float]:
        """
        Returns the film's average rating, read from the first
        space-separated token of the `twitter:data2` meta tag's content.

        Returns `None` if the page has no such meta tag.
        """
        rating_element = self._html.css("meta[name='twitter:data2']")
        if not rating_element:
            return None
        rating_text = rating_element[0].attributes['content']
        return float(rating_text.split(" ")[0])

    def get_casting(self) -> dict:
        """
        Returns a `dict` with the actor names as keys,
        and character names as values.
        """
        casting = {}
        for node in self._html.css("a[href*='/actor/']"):
            # sometimes, people link actors in reviews; those links carry
            # no `title` (the character name) and are filtered out
            try:
                character = node.attrs['title']
            except KeyError:
                continue
            casting[node.text()] = character
        return casting

    def get_watches(self) -> int:
        """
        Returns the number of members who have watched the film, parsed
        from the `title` of the stats page's "watched" link.
        """
        watches_msg = self._stats_html.css("a.icon-watched")[0].attrs['title']
        return self._parse_member_count(watches_msg)

    def get_likes(self) -> int:
        """
        Returns the number of members who have liked the film, parsed
        from the `title` of the stats page's "liked" link.
        """
        likes_msg = self._stats_html.css("a.icon-liked")[0].attrs['title']
        return self._parse_member_count(likes_msg)