Skip to content

Commit

Permalink
Merge pull request #56 from mBaratta96/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
mBaratta96 authored Aug 10, 2023
2 parents e868576 + fc657f5 commit 1a297e2
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 17 deletions.
1 change: 0 additions & 1 deletion letterboxd_stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import os
import platformdirs
import argparse
import csv

default_folder = platformdirs.user_config_dir("letterboxd_stats", "mBaratta96")

Expand Down
19 changes: 17 additions & 2 deletions letterboxd_stats/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@


def check_if_watched(df: pd.DataFrame, row: pd.Series) -> bool:
"""watched.csv hasn't the TMDB id, so comparison can be done only by title.
This creates the risk of mismatch when two movies have the same title. To avoid this,
we must retrieve the TMDB id of the watched movie.
"""

if row["Title"] in df["Name"].values:
watched_films_same_name = df[df["Name"] == row["Title"]]
for _, film in watched_films_same_name.iterrows():
Expand All @@ -23,6 +28,8 @@ def check_if_watched(df: pd.DataFrame, row: pd.Series) -> bool:


def read_watched_films(df: pd.DataFrame, path: str, name: str) -> pd.DataFrame:
"""Check which film of a director you have seen. Add a column to show on the CLI."""

df_profile = pd.read_csv(path)
df.insert(0, "watched", np.where([check_if_watched(df_profile, row) for _, row in df.iterrows()], "[X]", "[ ]"))
df["Release Date"] = pd.to_datetime(df["Release Date"])
Expand All @@ -32,10 +39,10 @@ def read_watched_films(df: pd.DataFrame, path: str, name: str) -> pd.DataFrame:


def select_film_of_person(df: pd.DataFrame) -> pd.Series | None:
    """Prompt the user to pick one film from a person's filmography.

    Args:
        df: Films of the person. Must contain a "Title" column; the index
            values are handed to the prompt as the selectable ids
            (presumably TMDB ids -- confirm against tmdb.get_person).

    Returns:
        The row of ``df`` whose index label was selected, or ``None`` when
        the prompt returns ``None``.
    """
    # Prompt exactly once, passing the index values unchanged: the selected id
    # can then be used directly as a label for ``df.loc`` without a
    # str -> int round trip.
    movie_id = cli.select_movie(df["Title"], df.index.to_series())
    if movie_id is None:
        return None
    return df.loc[movie_id]


Expand All @@ -45,6 +52,8 @@ def get_list_name(path: str) -> str:


def open_list(path: str, limit: int, acending: bool) -> str:
"""Select a list from the saved ones."""

list_names = {
get_list_name(os.path.join(path, letterboxd_list)): letterboxd_list for letterboxd_list in os.listdir(path)
}
Expand All @@ -53,6 +62,12 @@ def open_list(path: str, limit: int, acending: bool) -> str:


def open_file(filetype: str, path: str, limit, ascending, header=0) -> str:
"""There are some operations that are the same for all the .csv files. So isolate those similar operations,
and then we proceed to perform the particular operation for a certain file (watchlist, list, diary...).
FILE_OPERATIONS selects those particular operations according to the file we opened. Mainly they do
ordering and column filtering operations.
"""

df = pd.read_csv(path, header=header)
df.rename(columns={"Name": "Title", "Letterboxd URI": "Url"}, inplace=True)
df["Year"] = df["Year"].fillna(0).astype(int)
Expand Down
20 changes: 12 additions & 8 deletions letterboxd_stats/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,22 @@ def check_path(path: str):


def download_data():
    """Log in to Letterboxd and download the exported data.

    The data is the export found in the import/export section of the
    Letterboxd profile.
    """
    letterboxd_session = ws.Downloader()
    letterboxd_session.login()
    letterboxd_session.download_stats()


def get_movie_detail_from_url(letterboxd_url: str, is_diary=False):
    """Show the TMDB details of the film behind a Letterboxd url.

    Does nothing when ``letterboxd_url`` is ``None`` or when no TMDB id can
    be resolved for it.

    Args:
        letterboxd_url: Url of the selected film, or ``None``.
        is_diary: Forwarded to ``ws.get_tmdb_id``.
    """
    if letterboxd_url is None:
        return
    tmdb_id = ws.get_tmdb_id(letterboxd_url, is_diary)
    if tmdb_id is None:
        return
    tmdb.get_movie_detail(tmdb_id, letterboxd_url)


def search_person(args_search: str):
"""Search for a director, list his/her movies and check if you have watched them."""

df, name = tmdb.get_person(args_search)
path = os.path.expanduser(os.path.join(config["root_folder"], "static", "watched.csv"))
check_path(path)
df = data.read_watched_films(df, path, name)
movie = data.select_film_of_person(df)
# We want to print the link of the selected movie. This has to be retrieved from the search page.
while movie is not None:
search_film_query = f"{movie['Title']} {movie['Release Date'].year}" # type: ignore
title_url = ws.search_film(search_film_query)
Expand All @@ -59,14 +57,20 @@ def search_film(args_search_film: str):


def get_data(args_limit: int, args_ascending: bool, data_type: str):
    """Load and show on the CLI one of the .csv files downloaded with the -d flag.

    If the user selects a movie from the displayed data, its details are
    shown as well.

    Args:
        args_limit: Forwarded as the limit to the ``data`` opener.
        args_ascending: Forwarded as the sort direction to the ``data`` opener.
        data_type: Key of ``DATA_FILES``. "Lists" is opened with
            ``data.open_list``; every other value with ``data.open_file``.
    """
    path = os.path.expanduser(os.path.join(config["root_folder"], "static", DATA_FILES[data_type]))
    check_path(path)
    if data_type == "Lists":
        letterboxd_url = data.open_list(path, args_limit, args_ascending)
    else:
        letterboxd_url = data.open_file(data_type, path, args_limit, args_ascending)
    # No movie selected: nothing more to show.
    if letterboxd_url is None:
        return
    # data_type values are capitalised ("Lists", "Diary"); comparing against
    # lowercase "diary" would never flag diary links for special handling.
    tmdb_id = ws.get_tmdb_id(letterboxd_url, data_type == "Diary")
    if tmdb_id is not None:
        tmdb.get_movie_detail(tmdb_id, letterboxd_url)


def main():
Expand Down
15 changes: 13 additions & 2 deletions letterboxd_stats/tmdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@


def get_person(name: str) -> Tuple[pd.DataFrame, str]:
"""Search the director with the TMDB api. Get all the movies.
https://developer.themoviedb.org/reference/person-details
https://developer.themoviedb.org/reference/person-movie-credits
"""

print(f"Searching for '{name}'")
search_results = search.people({"query": name})
names = [result.name for result in search_results] # type: ignore
Expand Down Expand Up @@ -43,6 +48,8 @@ def get_person(name: str) -> Tuple[pd.DataFrame, str]:
)
df = df[df["Department"] == department]
df = df.drop("Department", axis=1)
# person.details provides movies without time duration. If the user wants
# (since this slows down the process) get with the movie.details API.
if config["TMDB"]["get_list_runtimes"] is True:
df["Duration"] = df.index.to_series().parallel_map(get_film_duration) # type: ignore
return df, p["name"]
Expand Down Expand Up @@ -77,9 +84,13 @@ def get_movie_detail(movie_id: int, letterboxd_url=None):
cli.print_film(selected_details)


def get_film_duration(tmdb_id: int) -> int:
    """Get the duration of a film from the TMDB API.

    https://developer.themoviedb.org/reference/movie-details

    Args:
        tmdb_id: TMDB id of the film.

    Returns:
        The ``runtime`` attribute of the movie-details response, or 0 when
        the request raises ``TMDbException``.
    """
    try:
        # int() keeps older callers that pass the id as a numeric string working.
        runtime = movie.details(int(tmdb_id)).runtime  # type: ignore
    except TMDbException:
        runtime = 0
    return runtime
28 changes: 25 additions & 3 deletions letterboxd_stats/web_scraper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import os
from zipfile import ZipFile

from numpy import who
from letterboxd_stats import config
from letterboxd_stats import cli
import requests
Expand Down Expand Up @@ -31,6 +29,7 @@
class Downloader:
def __init__(self):
    """Create the HTTP session used for every request of this downloader."""
    self.session = requests.Session()
    # Request URL once up front (the home page, per the original note) so the
    # cookies it sets are stored in the session.
    self.session.get(URL)

def login(self):
Expand All @@ -44,6 +43,9 @@ def login(self):
raise ConnectionError("Impossible to login")

def download_stats(self):
"""Download and extract data of the import/export section.
.CSV file will be extracted in the folder specified in the config file."""

res = self.session.get(DATA_PAGE)
if res.status_code != 200 or "application/zip" not in res.headers["Content-Type"]:
raise ConnectionError(f"Impossible to download data. Response headers:\n{res.headers}")
Expand All @@ -66,6 +68,8 @@ def add_film_diary(self, title: str):
if res.status_code != 200:
raise ConnectionError("It's been impossible to retireve the Letterboxd page")
movie_page = html.fromstring(res.text)
# Not the TMDB id, but the Letterboxd ID to use to add the movie to diary.
# Reference: https://letterboxd.com/film/seven-samurai/
letterboxd_film_id = movie_page.get_element_by_id("frm-sidebar-rating").get("data-film-id")
payload["filmId"] = letterboxd_film_id
payload["__csrf"] = self.session.cookies.get("com.xk72.webparts.csrf")
Expand All @@ -89,6 +93,8 @@ def remove_watchlist(self, title: str):
print("Removed to your watchlist.")

def perform_operation(self, answer: str, link: str):
    """Run the operation the user chose on the given film.

    Args:
        answer: Key of ``MOVIE_OPERATIONS``; its value is the name of the
            method of this object to call (add to diary, add to/remove from
            the watchlist).
        link: Passed as the only argument to the selected method.
    """
    method_name = MOVIE_OPERATIONS[answer]
    operation = getattr(self, method_name)
    operation(link)


Expand All @@ -97,8 +103,13 @@ def create_movie_url(title: str, operation: str) -> str:


def _get_tmdb_id_from_web(link: str, is_diary: bool) -> int:
"""Scraping the TMDB link from a Letterboxd film page.
Inspect this HTML for reference: https://letterboxd.com/film/seven-samurai/
"""

res = requests.get(link)
movie_page = html.fromstring(res.text)
# Diary links send you to a different page with no link to TMDB. Redirect to the actual page.
if is_diary:
title_link = movie_page.xpath("//span[@class='film-title-wrapper']/a")
if len(title_link) == 0:
Expand All @@ -114,6 +125,13 @@ def _get_tmdb_id_from_web(link: str, is_diary: bool) -> int:


def get_tmdb_id(link: str, is_diary=False) -> int | None:
"""Find the TMDB id from a letterboxd page.
A link to a Letterboxd film usually starts with either https://letterboxd.com/
or https://boxd.it/ (usually all .csv files have this prefix). We structure the cache dict accordingly.
The cache is meant to avoid bottleneck of constantly retrieving the Id from an HTML page.
"""

tmdb_id_cache = shelve.open(cache_path, writeback=False, protocol=5)
prefix, key = link.rsplit("/", 1)
if prefix in tmdb_id_cache and key in tmdb_id_cache[prefix]:
Expand All @@ -136,11 +154,16 @@ def select_optional_operation() -> str:


def search_film(title: str, allow_selection=False) -> str:
"""Search for a movie and get its Letterboxd link.
For reference: https://letterboxd.com/search/seven+samurai/?adult
"""

search_url = create_movie_url(title, "search")
res = requests.get(search_url)
if res.status_code != 200:
raise ConnectionError("It's been impossible to retireve the Letterboxd page")
search_page = html.fromstring(res.text)
# If we want to select movies from the search page, get more data to print the selection prompt.
if allow_selection:
movie_list = search_page.xpath("//div[@class='film-detail-content']")
if len(movie_list) == 0:
Expand All @@ -152,7 +175,6 @@ def search_film(title: str, allow_selection=False) -> str:
year = f"({year[0].text}) " if len(year := movie.xpath("./h2/span//small/a")) > 0 else ""
link = movie.xpath("./h2/span/a")[0].get("href")
title_years_directors_links[f"{title} {year}- {director}"] = link

selected_film = cli.select_value(list(title_years_directors_links.keys()), "Select your film")
title_url = title_years_directors_links[selected_film].split("/")[-2]
else:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "letterboxd_stats"
version = "0.2.9"
version = "0.2.10"
authors = [{ name = "mBaratta96" }]
description = "Get information about your Letterboxd activity."
readme = "README.md"
Expand Down

0 comments on commit 1a297e2

Please sign in to comment.