From b5f6ee50c8c32229d7341261bbfd94c1ab959178 Mon Sep 17 00:00:00 2001
From: David Wales
Date: Wed, 18 Sep 2024 15:03:31 +1000
Subject: [PATCH] Don't assume that PMHC extracts will succeed

Previously, we assumed that every PMHC extract would eventually change
from status=Processing to status=Completed, at which point we could
download the extract file.

However, the PMHC server may encounter an error, which results in
status=Error. We did not detect this Error state, so we would loop
indefinitely in the false hope that the extract file would eventually
be created.

Now we correctly detect status=Error and raise a suitable exception.
---
 src/pmhclib/pmhc.py | 83 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 81 insertions(+), 2 deletions(-)

diff --git a/src/pmhclib/pmhc.py b/src/pmhclib/pmhc.py
index a94c352..f65adc8 100644
--- a/src/pmhclib/pmhc.py
+++ b/src/pmhclib/pmhc.py
@@ -44,6 +44,14 @@ class CouldNotFindPmhcUpload(Exception):
     """Custom error handler for when an upload cannot be found on PMHC"""
 
 
+class PmhcServerError(Exception):
+    """Custom exception for when a PMHC Server error is encountered"""
+
+
+class MaxRetriesExceeded(Exception):
+    """Custom exception for when the maximum retries is exceeded"""
+
+
 class SecureString(str):
     """Show `'***'` instead of string value in tracebacks"""
 
@@ -383,6 +391,7 @@ def download_pmhc_mds(
             "all_episode_children": int(matched_episodes),
             "spec_type": specification.term,
         }
+
         download_request = self.page.request.get(
             "https://pmhc-mds.net/api/extract/csv",
             params=params,
@@ -400,10 +409,75 @@
             )
             raise err
 
+        # Both PMHC server errors and incomplete processing extracts
+        # return the same HTTP status (400) and JSON response when
+        # trying to fetch the extract by UUID:
+        # https://pmhc-mds.net/api/extract/{download_uuid}/fetch
+        # {
+        #     "errors": {
+        #         "export_fetch": "Can not fetch extract for uuid
+        #         [123...]. Extract is not complete. Extract has
+        #         expired."
+        #     }
+        # }
+        #
+        # For this reason, it's not sufficient to simply try the
+        # download URL until we get a success code. If there is
+        # a PMHC server error, we will end up retrying forever.
+        #
+        # Instead, we need to fetch the list of extracts and filter
+        # for one with the required uuid. We can then check the
+        # extract status explicitly, which should be one of the
+        # following values:
+        #
+        # - Completed
+        # - Processing
+        # - Queued
+        # - Error
+        #
+        # If Completed, we can download the extract.
+        # If Processing or Queued, keep looping and waiting.
+        # If Error, the extract has failed. Exit.
+
         # Wait for extract to be ready
         progress.update(extract_task, description="Waiting for extract...")
+        retries = 0
+        while retries <= max_retries:
+            time.sleep(30)
+            try:
+                extracts_request = self.page.request.get(
+                    "https://pmhc-mds.net/api/extract?sort=-date"
+                )
+            except playwright.sync_api.Error as err:
+                if "Request timed out" in err.message:
+                    retries += 1
+                    logging.warning(
+                        f"Request timed out ({retries} of {max_retries}). Retrying."
+                    )
+                else:
+                    raise err
+
+            extracts = extracts_request.json()
+            extract = next(
+                filter(lambda item: item.get("uuid") == download_uuid, extracts)
+            )
+            status = extract["status"]
+
+            if status == "Completed":
+                break
+            elif status == "Error":
+                logging.error(f"PMHC extract with uuid {download_uuid} has failed.")
+                logging.error("See PMHC Server error:")
+                logging.error(extract["stash"]["error"])
+                raise PmhcServerError("The PMHC extract has failed on the server.")
+        else:
+            raise MaxRetriesExceeded(
+                f"Tried fetching PMHC extract list {retries - 1} times"
+            )
+
         # We know the URL which will give us the final download URL,
-        # as we have the uuid. However, the URL doesn't exist
-        # immediately. We loop until we get a success code.
+        # as we have the uuid. We have confirmed above that the
+        # extract is completed.
         request_ok = False
         retries = 0
         while not request_ok and retries <= max_retries:
@@ -422,6 +496,11 @@
             else:
                 raise err
 
+        if retries > max_retries:
+            raise MaxRetriesExceeded(
+                f"Tried fetching PMHC extract {retries - 1} times."
+            )
+
         download_url_json = download_url_request.json()
         download_url = download_url_json["location"]
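
Note following the patch (not part of the committed change): the short, self-contained
sketch below illustrates the status-handling decision that the diff adds to
download_pmhc_mds. It does not call the real https://pmhc-mds.net API or use playwright;
the helper names (find_extract, should_keep_waiting) and the sample records are
hypothetical, while the status values (Completed, Processing, Queued, Error) and the
stash["error"] field follow the comments in the diff above.

"""Illustrative sketch only: mirrors the Completed/Processing/Queued/Error
handling added to download_pmhc_mds, using fake extract records instead of
the live https://pmhc-mds.net/api/extract endpoint."""


class PmhcServerError(Exception):
    """Raised when the PMHC server reports status=Error for an extract."""


def find_extract(extracts: list, uuid: str) -> dict:
    """Return the extract record with the given uuid (hypothetical helper)."""
    match = next((item for item in extracts if item.get("uuid") == uuid), None)
    if match is None:
        raise LookupError(f"No extract with uuid {uuid} in the extract list")
    return match


def should_keep_waiting(extract: dict) -> bool:
    """Return True while the extract is still Queued or Processing.

    Returns False once the extract is Completed, and raises PmhcServerError
    if the server reports status=Error, instead of polling forever as the
    pre-patch code would have done.
    """
    status = extract["status"]
    if status == "Completed":
        return False
    if status == "Error":
        # The patch reads the server-side message from extract["stash"]["error"].
        message = extract.get("stash", {}).get("error", "unknown error")
        raise PmhcServerError(f"The PMHC extract has failed on the server: {message}")
    # Processing or Queued: keep looping and waiting.
    return True


if __name__ == "__main__":
    fake_extracts = [
        {"uuid": "abc-123", "status": "Processing"},
        {"uuid": "def-456", "status": "Error", "stash": {"error": "disk full"}},
    ]
    print(should_keep_waiting(find_extract(fake_extracts, "abc-123")))  # True
    try:
        should_keep_waiting(find_extract(fake_extracts, "def-456"))
    except PmhcServerError as err:
        print(err)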