Merge pull request #925 from rustyd0g/Midlothian-Council-V2

robbrad · Oct 24, 2024 · 3a89d86 · 3a89d86
2 parents c3db414 + 095328b
commit 3a89d86
Show file tree

Hide file tree

Showing 2 changed files with 120 additions and 39 deletions.
diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json
@@ -754,10 +754,12 @@
         "wiki_note": "Pass the house name/number plus the name of the street with the postcode parameter, wrapped in double quotes.  Check the address in the web site first. This version will only pick the first SHOW button returned by the search or if it is fully unique.  The search is not very predictable (e.g. house number 4 returns 14,24,4,44 etc.)."
     },
     "MidlothianCouncil": {
-        "url": "https://www.midlothian.gov.uk/directory_record/92594377/glenesk_bonnyrigg_eh19_3je",
-        "wiki_command_url_override": "https://www.midlothian.gov.uk/directory_record/XXXXXX/XXXXXX",
+        "house_number": "52",
+        "postcode": "EH19 2EB",
+        "skip_get_url": true,
+        "url": "https://www.midlothian.gov.uk/info/1054/bins_and_recycling/343/bin_collection_days",
         "wiki_name": "Midlothian Council",
-        "wiki_note": "Follow the instructions [here](https://www.midlothian.gov.uk/info/1054/bins_and_recycling/343/bin_collection_days) until you get the page that shows the weekly collections for your address then copy the URL and replace the URL in the command."
+        "wiki_note": "Pass the house name/number wrapped in double quotes along with the postcode parameter"
     },
     "MidSussexDistrictCouncil": {
         "house_number": "OAKLANDS, OAKLANDS ROAD RH16 1SS",

diff --git a/uk_bin_collection/uk_bin_collection/councils/MidlothianCouncil.py b/uk_bin_collection/uk_bin_collection/councils/MidlothianCouncil.py
@@ -1,3 +1,5 @@
+from urllib.parse import quote, urljoin
+
 from bs4 import BeautifulSoup
 
 from uk_bin_collection.uk_bin_collection.common import *
@@ -12,57 +14,134 @@ class CouncilClass(AbstractGetBinDataClass):
     implementation.
     """
 
+    BASE_URL = "https://www.midlothian.gov.uk"
+    DIRECTORY_URL = f"{BASE_URL}/site/scripts/directory_search.php?directoryID=35&keywords={{}}&search=Search"
+    BIN_TYPES = {
+        "Next recycling collection": "Recycling",
+        "Next grey bin collection": "Grey Bin",
+        "Next brown bin collection": "Brown Bin",
+        "Next food bin collection": "Food Bin",
+    }
+
     def parse_data(self, page: str, **kwargs) -> dict:
-        # Parse the HTML content using BeautifulSoup
-        soup = BeautifulSoup(page.text, features="html.parser")
 
-        # Initialize a dictionary to store the parsed bin data
+        house_identifier = kwargs.get(
+            "paon", ""
+        ).strip()  # Could be house number or name
+        postcode = kwargs.get("postcode")
+
+        # Check if both house identifier and postcode are provided
+        if not house_identifier:
+            print("Error: House identifier (number or name) must be provided.")
+            return {"bins": []}
+
+        if not postcode:
+            print("Error: Postcode must be provided.")
+            return {"bins": []}
+
+        check_postcode(postcode)
+        check_paon(house_identifier)
+
         data = {"bins": []}
+        search_url = self.DIRECTORY_URL.format(quote(postcode))
+
+        try:
+            search_results_html = requests.get(search_url)
+            search_results_html.raise_for_status()
+
+            soup = BeautifulSoup(search_results_html.text, "html.parser")
+            address_link = self._get_result_by_identifier(soup, house_identifier)
+
+            if address_link:
+                collections_url = urljoin(search_url, address_link["href"])
+                bin_collection_data = self._fetch_bin_collection_data(collections_url)
+
+                if bin_collection_data:
+                    data["bins"].extend(bin_collection_data)
+
+        except requests.RequestException as e:
+            print(f"Warning: Failed to fetch data from {search_url}. Error: {e}")
+
+        return data
 
-        # Define a mapping of bin collection labels to their corresponding types
-        bin_types = {
-            "Next recycling collection": "Recycling",
-            "Next grey bin collection": "Grey Bin",
-            "Next brown bin collection": "Brown Bin",
-            "Next food bin collection": "Food Bin",
-        }
-
-        # Locate the <ul> element with the class "data-table"
-        bin_collections = soup.find("ul", {"class": "data-table"})
-
-        # Proceed only if the <ul> element is found
-        if bin_collections:
-            # Retrieve all <li> elements within the <ul>, skipping the first two (not relevant)
-            bin_items = bin_collections.find_all("li")[2:]
-
-            # Iterate through each bin item
-            for bin in bin_items:
-                bin_type = None
-                # Retrieve the bin type from the header if it exists
-                if bin.h2 and bin.h2.text.strip() in bin_types:
-                    bin_type = bin_types[bin.h2.text.strip()]
+    def _get_result_by_identifier(self, soup, identifier: str) -> list:
+        """Extract the result link that matches the given house number or house name."""
+        try:
+            results_list = (
+                soup.find("article", class_="container")
+                .find("h2", text="Search results")
+                .find_next("ul", class_="item-list item-list__rich")
+            )
+
+            pattern = re.compile(re.escape(identifier.lower()) + r"[ ,]")
+
+            for item in results_list.find_all("li"):
+                address_link = item.find("a")
+                if address_link:
+                    link_text = address_link.text.strip().lower()
+                    if pattern.match(link_text):
+                        return address_link
+
+            print(f"Warning: No results found for identifier '{identifier}'.")
+            return None  # Return None if no match is found
+
+        except AttributeError as e:
+            print(f"Warning: Could not find the search results. Error: {e}")
+            return None  # Return None if no result found
+
+    def _fetch_bin_collection_data(self, url: str) -> list:
+        """Fetch and parse bin collection data from the given URL."""
+        try:
+            bin_collection_html = requests.get(url)
+            bin_collection_html.raise_for_status()
+
+            soup = BeautifulSoup(bin_collection_html.text, "html.parser")
+            bin_collections = soup.find("ul", class_="data-table")
+
+            if bin_collections:
+                return self._parse_bin_collection_items(
+                    bin_collections.find_all("li")[2:]  # Skip the first two items
+                )
+
+        except requests.RequestException as e:
+            print(
+                f"Warning: Failed to fetch bin collection data from {url}. Error: {e}"
+            )
+
+        return []  # Return an empty list on error
+
+    def _parse_bin_collection_items(self, bin_items: list) -> list:
+        """Parse bin collection items into a structured format."""
+        parsed_bins = []
+
+        for bin_item in bin_items:
+            bin_type = None
+            try:
+                if bin_item.h2 and bin_item.h2.text.strip() in self.BIN_TYPES:
+                    bin_type = self.BIN_TYPES[bin_item.h2.text.strip()]
 
                 bin_collection_date = None
-                # Retrieve the bin collection date from the div if it exists
-                if bin.div and bin.div.text.strip():
+                if bin_item.div and bin_item.div.text.strip():
                     try:
-                        # Parse the collection date from the div text and format it
                         bin_collection_date = datetime.strptime(
-                            bin.div.text.strip(),
-                            "%A %d/%m/%Y",
+                            bin_item.div.text.strip(), "%A %d/%m/%Y"
                         ).strftime(date_format)
                     except ValueError:
-                        # If date parsing fails, keep bin_collection_date as None
-                        pass
+                        print(
+                            f"Warning: Date parsing failed for {bin_item.div.text.strip()}."
+                        )
 
-                # If both bin type and collection date are identified, add to the data
                 if bin_type and bin_collection_date:
-                    data["bins"].append(
+                    parsed_bins.append(
                         {
                             "type": bin_type,
                             "collectionDate": bin_collection_date,
                         }
                     )
+                else:
+                    print(f"Warning: Missing data for bin item: {bin_item}")
 
-        # Return the parsed data, which may be empty if no bins were found
-        return data
+            except Exception as e:
+                print(f"Warning: An error occurred while parsing bin item. Error: {e}")
+
+        return parsed_bins