Merge pull request #56 from mBaratta96/develop

Develop
mBaratta96 · Aug 10, 2023 · 1a297e2 · 1a297e2
2 parents e868576 + fc657f5
commit 1a297e2
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 17 deletions.
diff --git a/letterboxd_stats/__init__.py b/letterboxd_stats/__init__.py
@@ -2,7 +2,6 @@
 import os
 import platformdirs
 import argparse
-import csv
 
 default_folder = platformdirs.user_config_dir("letterboxd_stats", "mBaratta96")
 

diff --git a/letterboxd_stats/data.py b/letterboxd_stats/data.py
@@ -13,6 +13,11 @@
 
 
 def check_if_watched(df: pd.DataFrame, row: pd.Series) -> bool:
+    """watched.csv doesn't contain the TMDB id, so comparison can be done only by title.
+    This creates the risk of mismatch when two movies have the same title. To avoid this,
+    we must retrieve the TMDB id of the watched movie.
+    """
+
     if row["Title"] in df["Name"].values:
         watched_films_same_name = df[df["Name"] == row["Title"]]
         for _, film in watched_films_same_name.iterrows():
@@ -23,6 +28,8 @@ def check_if_watched(df: pd.DataFrame, row: pd.Series) -> bool:
 
 
 def read_watched_films(df: pd.DataFrame, path: str, name: str) -> pd.DataFrame:
+    """Check which films of a director you have seen. Add a column to show on the CLI."""
+
     df_profile = pd.read_csv(path)
     df.insert(0, "watched", np.where([check_if_watched(df_profile, row) for _, row in df.iterrows()], "[X]", "[ ]"))
     df["Release Date"] = pd.to_datetime(df["Release Date"])
@@ -32,10 +39,10 @@ def read_watched_films(df: pd.DataFrame, path: str, name: str) -> pd.DataFrame:
 
 
 def select_film_of_person(df: pd.DataFrame) -> pd.Series | None:
-    movie_id = cli.select_movie(df["Title"], df.index.to_series().parallel_map(str))
+    movie_id = cli.select_movie(df["Title"], df.index.to_series())
     if movie_id is None:
         return None
-    movie_row = df.loc[int(movie_id)]
+    movie_row = df.loc[movie_id]
     return movie_row
 
 
@@ -45,6 +52,8 @@ def get_list_name(path: str) -> str:
 
 
 def open_list(path: str, limit: int, acending: bool) -> str:
+    """Select a list from the saved ones."""
+
     list_names = {
         get_list_name(os.path.join(path, letterboxd_list)): letterboxd_list for letterboxd_list in os.listdir(path)
     }
@@ -53,6 +62,12 @@ def open_list(path: str, limit: int, acending: bool) -> str:
 
 
 def open_file(filetype: str, path: str, limit, ascending, header=0) -> str:
+    """There are some operations that are the same for all the .csv files. So isolate those similar operations,
+    and then we proceed to perform the particular operation for a certain file (watchlist, list, diary...).
+    FILE_OPERATIONS selects those particular operations according to the file we opened. Mainly they do
+    ordering and column filtering operations.
+    """
+
     df = pd.read_csv(path, header=header)
     df.rename(columns={"Name": "Title", "Letterboxd URI": "Url"}, inplace=True)
     df["Year"] = df["Year"].fillna(0).astype(int)

diff --git a/letterboxd_stats/main.py b/letterboxd_stats/main.py
@@ -22,24 +22,22 @@ def check_path(path: str):
 
 
 def download_data():
+    """Download the exported data found in the import/export section of your Letterboxd profile."""
+
     downloader = ws.Downloader()
     downloader.login()
     downloader.download_stats()
 
 
-def get_movie_detail_from_url(letterboxd_url: str, is_diary=False):
-    if letterboxd_url is not None:
-        id = ws.get_tmdb_id(letterboxd_url, is_diary)
-        if id is not None:
-            tmdb.get_movie_detail(id, letterboxd_url)
-
-
 def search_person(args_search: str):
+    """Search for a director, list his/her movies and check if you have watched them."""
+
     df, name = tmdb.get_person(args_search)
     path = os.path.expanduser(os.path.join(config["root_folder"], "static", "watched.csv"))
     check_path(path)
     df = data.read_watched_films(df, path, name)
     movie = data.select_film_of_person(df)
+    # We want to print the link of the selected movie. This has to be retrieved from the search page.
     while movie is not None:
         search_film_query = f"{movie['Title']} {movie['Release Date'].year}"  # type: ignore
         title_url = ws.search_film(search_film_query)
@@ -59,14 +57,20 @@ def search_film(args_search_film: str):
 
 
 def get_data(args_limit: int, args_ascending: bool, data_type: str):
+    """Load and show on the CLI different .csv files that you have downloaded with the -d flag."""
+
     path = os.path.expanduser(os.path.join(config["root_folder"], "static", DATA_FILES[data_type]))
     check_path(path)
     letterboxd_url = (
         data.open_file(data_type, path, args_limit, args_ascending)
         if data_type != "Lists"
         else data.open_list(path, args_limit, args_ascending)
     )
-    get_movie_detail_from_url(letterboxd_url, data_type == "Diary")
+    # If you select a movie, show its details. The diary flag compares against the capitalized "Diary" key.
+    if letterboxd_url is not None:
+        tmdb_id = ws.get_tmdb_id(letterboxd_url, data_type == "Diary")
+        if tmdb_id is not None:
+            tmdb.get_movie_detail(tmdb_id, letterboxd_url)
 
 
 def main():

diff --git a/letterboxd_stats/tmdb.py b/letterboxd_stats/tmdb.py
@@ -16,6 +16,11 @@
 
 
 def get_person(name: str) -> Tuple[pd.DataFrame, str]:
+    """Search the director with the TMDB api. Get all the movies.
+    https://developer.themoviedb.org/reference/person-details
+    https://developer.themoviedb.org/reference/person-movie-credits
+    """
+
     print(f"Searching for '{name}'")
     search_results = search.people({"query": name})
     names = [result.name for result in search_results]  # type: ignore
@@ -43,6 +48,8 @@ def get_person(name: str) -> Tuple[pd.DataFrame, str]:
     )
     df = df[df["Department"] == department]
     df = df.drop("Department", axis=1)
+    # person.details provides movies without their duration. If the user enables it in the
+    # config (it slows down the process), fetch the duration with the movie.details API.
     if config["TMDB"]["get_list_runtimes"] is True:
         df["Duration"] = df.index.to_series().parallel_map(get_film_duration)  # type: ignore
     return df, p["name"]
@@ -77,9 +84,13 @@ def get_movie_detail(movie_id: int, letterboxd_url=None):
     cli.print_film(selected_details)
 
 
-def get_film_duration(tmdb_id: str) -> int:
+def get_film_duration(tmdb_id: int) -> int:
+    """Get film duration from the TMDB api.
+    https://developer.themoviedb.org/reference/movie-details
+    """
+
     try:
-        runtime = movie.details(int(tmdb_id)).runtime  # type: ignore
+        runtime = movie.details(tmdb_id).runtime  # type: ignore
     except TMDbException:
         runtime = 0
     return runtime
diff --git a/letterboxd_stats/web_scraper.py b/letterboxd_stats/web_scraper.py
@@ -1,7 +1,5 @@
 import os
 from zipfile import ZipFile
-
-from numpy import who
 from letterboxd_stats import config
 from letterboxd_stats import cli
 import requests
@@ -31,6 +29,7 @@
 class Downloader:
     def __init__(self):
         self.session = requests.Session()
+        # get home page to set cookies in the session.
         self.session.get(URL)
 
     def login(self):
@@ -44,6 +43,9 @@ def login(self):
             raise ConnectionError("Impossible to login")
 
     def download_stats(self):
+        """Download and extract data of the import/export section.
+        The .csv files are extracted into the folder specified in the config file."""
+
         res = self.session.get(DATA_PAGE)
         if res.status_code != 200 or "application/zip" not in res.headers["Content-Type"]:
             raise ConnectionError(f"Impossible to download data. Response headers:\n{res.headers}")
@@ -66,6 +68,8 @@ def add_film_diary(self, title: str):
         if res.status_code != 200:
             raise ConnectionError("It's been impossible to retireve the Letterboxd page")
         movie_page = html.fromstring(res.text)
+        # Not the TMDB id, but the Letterboxd ID to use to add the movie to diary.
+        # Reference: https://letterboxd.com/film/seven-samurai/
         letterboxd_film_id = movie_page.get_element_by_id("frm-sidebar-rating").get("data-film-id")
         payload["filmId"] = letterboxd_film_id
         payload["__csrf"] = self.session.cookies.get("com.xk72.webparts.csrf")
@@ -89,6 +93,8 @@ def remove_watchlist(self, title: str):
         print("Removed to your watchlist.")
 
     def perform_operation(self, answer: str, link: str):
+        """Depending on what the user has chosen, add to diary, add/remove watchlist."""
+
         getattr(self, MOVIE_OPERATIONS[answer])(link)
 
 
@@ -97,8 +103,13 @@ def create_movie_url(title: str, operation: str) -> str:
 
 
 def _get_tmdb_id_from_web(link: str, is_diary: bool) -> int:
+    """Scraping the TMDB link from a Letterboxd film page.
+    Inspect this HTML for reference: https://letterboxd.com/film/seven-samurai/
+    """
+
     res = requests.get(link)
     movie_page = html.fromstring(res.text)
+    # Diary links send you to a different page with no link to TMDB. Redirect to the actual page.
     if is_diary:
         title_link = movie_page.xpath("//span[@class='film-title-wrapper']/a")
         if len(title_link) == 0:
@@ -114,6 +125,13 @@ def _get_tmdb_id_from_web(link: str, is_diary: bool) -> int:
 
 
 def get_tmdb_id(link: str, is_diary=False) -> int | None:
+    """Find the TMDB id from a letterboxd page.
+
+    A link to a Letterboxd film usually starts with either https://letterboxd.com/
+    or https://boxd.it/ (usually all .csv files have this prefix). We structure the cache dict accordingly.
+    The cache is meant to avoid the bottleneck of repeatedly retrieving the id from an HTML page.
+    """
+
     tmdb_id_cache = shelve.open(cache_path, writeback=False, protocol=5)
     prefix, key = link.rsplit("/", 1)
     if prefix in tmdb_id_cache and key in tmdb_id_cache[prefix]:
@@ -136,11 +154,16 @@ def select_optional_operation() -> str:
 
 
 def search_film(title: str, allow_selection=False) -> str:
+    """Search a movie and get its Letterboxd link.
+    For reference: https://letterboxd.com/search/seven+samurai/?adult
+    """
+
     search_url = create_movie_url(title, "search")
     res = requests.get(search_url)
     if res.status_code != 200:
         raise ConnectionError("It's been impossible to retireve the Letterboxd page")
     search_page = html.fromstring(res.text)
+    # If we want to select movies from the search page, get more data to print the selection prompt.
     if allow_selection:
         movie_list = search_page.xpath("//div[@class='film-detail-content']")
         if len(movie_list) == 0:
@@ -152,7 +175,6 @@ def search_film(title: str, allow_selection=False) -> str:
             year = f"({year[0].text}) " if len(year := movie.xpath("./h2/span//small/a")) > 0 else ""
             link = movie.xpath("./h2/span/a")[0].get("href")
             title_years_directors_links[f"{title} {year}- {director}"] = link
-
         selected_film = cli.select_value(list(title_years_directors_links.keys()), "Select your film")
         title_url = title_years_directors_links[selected_film].split("/")[-2]
     else:

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "letterboxd_stats"
-version = "0.2.9"
+version = "0.2.10"
 authors = [{ name = "mBaratta96" }]
 description = "Get information about your Letterboxd activity."
 readme = "README.md"