diff --git a/letterboxd_stats/__init__.py b/letterboxd_stats/__init__.py index 6d09bf9..4e18a19 100644 --- a/letterboxd_stats/__init__.py +++ b/letterboxd_stats/__init__.py @@ -2,7 +2,6 @@ import os import platformdirs import argparse -import csv default_folder = platformdirs.user_config_dir("letterboxd_stats", "mBaratta96") diff --git a/letterboxd_stats/data.py b/letterboxd_stats/data.py index d222c82..f11ff85 100644 --- a/letterboxd_stats/data.py +++ b/letterboxd_stats/data.py @@ -13,6 +13,11 @@ def check_if_watched(df: pd.DataFrame, row: pd.Series) -> bool: + """watched.csv does not contain the TMDB id, so the comparison can only be done by title. + This creates the risk of a mismatch when two movies have the same title. To avoid this, + we must retrieve the TMDB id of the watched movie. + """ + if row["Title"] in df["Name"].values: watched_films_same_name = df[df["Name"] == row["Title"]] for _, film in watched_films_same_name.iterrows(): @@ -23,6 +28,8 @@ def check_if_watched(df: pd.DataFrame, row: pd.Series) -> bool: def read_watched_films(df: pd.DataFrame, path: str, name: str) -> pd.DataFrame: + """Check which films of a director you have seen. 
Add a column to show on the CLI.""" + df_profile = pd.read_csv(path) df.insert(0, "watched", np.where([check_if_watched(df_profile, row) for _, row in df.iterrows()], "[X]", "[ ]")) df["Release Date"] = pd.to_datetime(df["Release Date"]) @@ -32,10 +39,10 @@ def read_watched_films(df: pd.DataFrame, path: str, name: str) -> pd.DataFrame: def select_film_of_person(df: pd.DataFrame) -> pd.Series | None: - movie_id = cli.select_movie(df["Title"], df.index.to_series().parallel_map(str)) + movie_id = cli.select_movie(df["Title"], df.index.to_series()) if movie_id is None: return None - movie_row = df.loc[int(movie_id)] + movie_row = df.loc[movie_id] return movie_row @@ -45,6 +52,8 @@ def get_list_name(path: str) -> str: def open_list(path: str, limit: int, acending: bool) -> str: + """Select a list from the saved ones.""" + list_names = { get_list_name(os.path.join(path, letterboxd_list)): letterboxd_list for letterboxd_list in os.listdir(path) } @@ -53,6 +62,12 @@ def open_list(path: str, limit: int, acending: bool) -> str: def open_file(filetype: str, path: str, limit, ascending, header=0) -> str: + """There are some operations that are the same for all the .csv files. So isolate those similar operations, + and then we proceed to perform the particular operation for a certain file (watchlist, list, diary...). + FILE_OPERATIONS selects those particular operations according to the file we opened. Mainly they do + ordering and column filtering operations. 
+ """ + df = pd.read_csv(path, header=header) df.rename(columns={"Name": "Title", "Letterboxd URI": "Url"}, inplace=True) df["Year"] = df["Year"].fillna(0).astype(int) diff --git a/letterboxd_stats/main.py b/letterboxd_stats/main.py index dcd103a..bfb1231 100644 --- a/letterboxd_stats/main.py +++ b/letterboxd_stats/main.py @@ -22,24 +22,22 @@ def check_path(path: str): def download_data(): + """Download exported data you find in the import/export section of your Letterboxd profile""" + downloader = ws.Downloader() downloader.login() downloader.download_stats() -def get_movie_detail_from_url(letterboxd_url: str, is_diary=False): - if letterboxd_url is not None: - id = ws.get_tmdb_id(letterboxd_url, is_diary) - if id is not None: - tmdb.get_movie_detail(id, letterboxd_url) - - def search_person(args_search: str): + """Search for a director, list his/her movies and check if you have watched them.""" + df, name = tmdb.get_person(args_search) path = os.path.expanduser(os.path.join(config["root_folder"], "static", "watched.csv")) check_path(path) df = data.read_watched_films(df, path, name) movie = data.select_film_of_person(df) + # We want to print the link of the selected movie. This has to be retrieved from the search page. 
while movie is not None: search_film_query = f"{movie['Title']} {movie['Release Date'].year}" # type: ignore title_url = ws.search_film(search_film_query) @@ -59,6 +57,8 @@ def search_film(args_search_film: str): def get_data(args_limit: int, args_ascending: bool, data_type: str): + """Load and show on the CLI different .csv files that you have downloaded with the -d flag.""" + path = os.path.expanduser(os.path.join(config["root_folder"], "static", DATA_FILES[data_type])) check_path(path) letterboxd_url = ( @@ -66,7 +66,11 @@ def get_data(args_limit: int, args_ascending: bool, data_type: str): if data_type != "Lists" else data.open_list(path, args_limit, args_ascending) ) - get_movie_detail_from_url(letterboxd_url, data_type == "Diary") + # If you select a movie, show its details. + if letterboxd_url is not None: + id = ws.get_tmdb_id(letterboxd_url, data_type == "diary") + if id is not None: + tmdb.get_movie_detail(id, letterboxd_url) def main(): diff --git a/letterboxd_stats/tmdb.py b/letterboxd_stats/tmdb.py index e2843de..d37b773 100644 --- a/letterboxd_stats/tmdb.py +++ b/letterboxd_stats/tmdb.py @@ -16,6 +16,11 @@ def get_person(name: str) -> Tuple[pd.DataFrame, str]: + """Search the director with the TMDB api. Get all the movies. + https://developer.themoviedb.org/reference/person-details + https://developer.themoviedb.org/reference/person-movie-credits + """ + print(f"Searching for '{name}'") search_results = search.people({"query": name}) names = [result.name for result in search_results] # type: ignore @@ -43,6 +48,8 @@ def get_person(name: str) -> Tuple[pd.DataFrame, str]: ) df = df[df["Department"] == department] df = df.drop("Department", axis=1) + # person.details provides movies without time duration. If the user wants + # (since this slows down the process) get with the movie.details API. 
if config["TMDB"]["get_list_runtimes"] is True: df["Duration"] = df.index.to_series().parallel_map(get_film_duration) # type: ignore return df, p["name"] @@ -77,9 +84,13 @@ def get_movie_detail(movie_id: int, letterboxd_url=None): cli.print_film(selected_details) -def get_film_duration(tmdb_id: str) -> int: +def get_film_duration(tmdb_id: int) -> int: + """Get film duration from the TMDB api. + https://developer.themoviedb.org/reference/movie-details + """ + try: - runtime = movie.details(int(tmdb_id)).runtime # type: ignore + runtime = movie.details(tmdb_id).runtime # type: ignore except TMDbException: runtime = 0 return runtime diff --git a/letterboxd_stats/web_scraper.py b/letterboxd_stats/web_scraper.py index d40e589..0ca0139 100644 --- a/letterboxd_stats/web_scraper.py +++ b/letterboxd_stats/web_scraper.py @@ -1,7 +1,5 @@ import os from zipfile import ZipFile - -from numpy import who from letterboxd_stats import config from letterboxd_stats import cli import requests @@ -31,6 +29,7 @@ class Downloader: def __init__(self): self.session = requests.Session() + # get home page to set cookies in the session. self.session.get(URL) def login(self): @@ -44,6 +43,9 @@ def login(self): raise ConnectionError("Impossible to login") def download_stats(self): + """Download and extract data of the import/export section. + .CSV file will be extracted in the folder specified in the config file.""" + res = self.session.get(DATA_PAGE) if res.status_code != 200 or "application/zip" not in res.headers["Content-Type"]: raise ConnectionError(f"Impossible to download data. Response headers:\n{res.headers}") @@ -66,6 +68,8 @@ def add_film_diary(self, title: str): if res.status_code != 200: raise ConnectionError("It's been impossible to retireve the Letterboxd page") movie_page = html.fromstring(res.text) + # Not the TMDB id, but the Letterboxd ID to use to add the movie to diary. 
+ # Reference: https://letterboxd.com/film/seven-samurai/ letterboxd_film_id = movie_page.get_element_by_id("frm-sidebar-rating").get("data-film-id") payload["filmId"] = letterboxd_film_id payload["__csrf"] = self.session.cookies.get("com.xk72.webparts.csrf") @@ -89,6 +93,8 @@ def remove_watchlist(self, title: str): print("Removed to your watchlist.") def perform_operation(self, answer: str, link: str): + """Depending on what the user has chosen, add to diary, add/remove watchlist.""" + getattr(self, MOVIE_OPERATIONS[answer])(link) @@ -97,8 +103,13 @@ def create_movie_url(title: str, operation: str) -> str: def _get_tmdb_id_from_web(link: str, is_diary: bool) -> int: + """Scrape the TMDB link from a Letterboxd film page. + Inspect this HTML for reference: https://letterboxd.com/film/seven-samurai/ + """ + res = requests.get(link) movie_page = html.fromstring(res.text) + # Diary links send you to a different page with no link to TMDB. Redirect to the actual page. if is_diary: title_link = movie_page.xpath("//span[@class='film-title-wrapper']/a") if len(title_link) == 0: @@ -114,6 +125,13 @@ def _get_tmdb_id_from_web(link: str, is_diary: bool) -> int: def get_tmdb_id(link: str, is_diary=False) -> int | None: + """Find the TMDB id from a Letterboxd page. + + A link to a Letterboxd film usually starts with either https://letterboxd.com/ + or https://boxd.it/ (usually all .csv files have this prefix). We structure the cache dict accordingly. + The cache is meant to avoid the bottleneck of constantly retrieving the id from an HTML page. + """ + tmdb_id_cache = shelve.open(cache_path, writeback=False, protocol=5) prefix, key = link.rsplit("/", 1) if prefix in tmdb_id_cache and key in tmdb_id_cache[prefix]: @@ -136,11 +154,16 @@ def select_optional_operation() -> str: def search_film(title: str, allow_selection=False) -> str: + """Search for a movie and get its Letterboxd link. 
+ For reference: https://letterboxd.com/search/seven+samurai/?adult + """ + search_url = create_movie_url(title, "search") res = requests.get(search_url) if res.status_code != 200: raise ConnectionError("It's been impossible to retireve the Letterboxd page") search_page = html.fromstring(res.text) + # If we want to select movies from the search page, get more data to print the selection prompt. if allow_selection: movie_list = search_page.xpath("//div[@class='film-detail-content']") if len(movie_list) == 0: @@ -152,7 +175,6 @@ def search_film(title: str, allow_selection=False) -> str: year = f"({year[0].text}) " if len(year := movie.xpath("./h2/span//small/a")) > 0 else "" link = movie.xpath("./h2/span/a")[0].get("href") title_years_directors_links[f"{title} {year}- {director}"] = link - selected_film = cli.select_value(list(title_years_directors_links.keys()), "Select your film") title_url = title_years_directors_links[selected_film].split("/")[-2] else: diff --git a/pyproject.toml b/pyproject.toml index ac34224..1b1ce45 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "letterboxd_stats" -version = "0.2.9" +version = "0.2.10" authors = [{ name = "mBaratta96" }] description = "Get information about your Letterboxd activity." readme = "README.md"