-
Notifications
You must be signed in to change notification settings - Fork 111
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1159 from harveycarpenter/Surrey-Heath-Borough-Council
Feat: Add Surrey Heath Borough Council
- Loading branch information
Showing
2 changed files
with
124 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
116 changes: 116 additions & 0 deletions
116
uk_bin_collection/uk_bin_collection/councils/SurreyHeathBoroughCouncil.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import urllib | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
from uk_bin_collection.uk_bin_collection.common import * | ||
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass | ||
|
||
|
||
class CouncilClass(AbstractGetBinDataClass):
    """
    Bin collection scraper for the Surrey Heath Borough Council portal.

    Concrete classes have to implement all abstract operations of the
    base class. They can also override some operations with a default
    implementation.
    """

    def parse_data(self, page: str, **kwargs) -> dict:
        """Look up an address on the council portal and return its collections.

        The method walks three pages in one HTTP session: the portal landing
        page (to discover the per-session "View My Collections" link), the
        address search form, and finally the collection calendar for the
        first matching address.

        Args:
            page: Not used by this implementation; the portal is fetched
                directly.
            **kwargs: Must provide ``paon`` (house name/number) and
                ``postcode``.

        Returns:
            A dict of the form
            ``{"bins": [{"type": <str>, "collectionDate": <str>}, ...]}``.

        Raises:
            ValueError: If the portal link, the address or the collection
                list cannot be found in the returned HTML, or if a collection
                date does not match ``date_format``.
        """
        # `import urllib` alone does not guarantee that the `parse` submodule
        # is loaded, so import what is needed explicitly.
        from urllib.parse import urlparse

        requests.packages.urllib3.disable_warnings()
        root_url = "https://asjwsw-wrpsurreyheathmunicipal-live.whitespacews.com/"

        # Get the house number and postcode supplied by the caller
        user_paon = kwargs.get("paon")
        user_postcode = kwargs.get("postcode")
        check_postcode(user_postcode)

        # Headers that emulate a browser submitting the address search form
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "en-GB,en;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "https://asjwsw-wrpsurreyheathmunicipal-live.whitespacews.com",
            "Pragma": "no-cache",
            "Referer": "https://asjwsw-wrpsurreyheathmunicipal-live.whitespacews.com/",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 OPR/98.0.0.0",
            "sec-ch-ua": '"Chromium";v="112", "Not_A Brand";v="24", "Opera GX";v="98"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
        }
        # Form fields for the address search: house name/number and postcode
        form_data = {
            "address_name_number": user_paon,
            "address_street": "",
            "street_town": "",
            "address_postcode": user_postcode,
        }

        # One session is shared by all three requests; the context manager
        # makes sure its connections are released even if a step fails.
        with requests.Session() as session:
            landing_page = session.get(root_url)

            # Find the link to the "View My Collections" portal, which
            # carries a unique service ID in its query string
            start = BeautifulSoup(landing_page.text, features="html.parser")
            portal_anchors = start.select(
                "#menu-content > div > div:nth-child(1) > p.govuk-body.govuk-\\!-margin-bottom-0.colorblue.lineheight15 > a"
            )
            base_link = portal_anchors[0].attrs.get("href") if portal_anchors else None
            if not base_link:
                raise ValueError(
                    "Could not find the 'View My Collections' link on the "
                    "council portal landing page"
                )

            # The address search URL needs the two query parameters of the
            # unique URL in swapped order, followed by a sequence number
            query_parts = urlparse(base_link).query.split("&")
            if len(query_parts) < 2:
                raise ValueError(
                    f"Unexpected portal link format (needs two query parameters): {base_link}"
                )
            portal_path = base_link.split("?")[0]
            addr_link = (
                f"{portal_path}/mop.php?{query_parts[1]}&{query_parts[0]}&seq=2"
            )

            # Submit the address search as a POST, emulating browser behaviour
            addr_page = session.post(addr_link, headers=headers, data=form_data)
            addr = BeautifulSoup(addr_page.text, features="html.parser")

            # This page should only have one address, but regardless, select
            # the first one and request its calendar page
            address_anchors = addr.select("#property_list > ul > li > a")
            if not address_anchors:
                raise ValueError(
                    f"No address found for '{user_paon}' at postcode '{user_postcode}'"
                )
            cal_link = root_url + address_anchors[0].attrs.get("href")
            cal_page = session.get(cal_link)

        # Parse the calendar page
        soup = BeautifulSoup(cal_page.text, features="html.parser")

        # NOTE(review): the tag name really is "u1" (digit one), kept exactly
        # as in the original code; presumably it mirrors the portal's own
        # markup - confirm before "correcting" it to "ul".
        collection_list = soup.find(
            "u1",
            {
                "class": "displayinlineblock justifycontentleft alignitemscenter margin0 padding0"
            },
        )
        if collection_list is None:
            raise ValueError("Could not find the collection list on the calendar page")

        # The list's text is a run of lines alternating date, bin type, date,
        # bin type, ... once the empty lines produced by splitting on \n are
        # dropped.
        items = [line for line in collection_list.text.split("\n") if line != ""]

        bins = []
        # Pair each date with the bin type that follows it; zip() ignores a
        # dangling trailing item that has no partner.
        for raw_date, raw_type in zip(items[0::2], items[1::2]):
            # Round-tripping through strptime validates the date text
            # (date_format is presumably provided by the common module's
            # wildcard import)
            collection_date = datetime.strptime(raw_date, date_format).strftime(
                date_format
            )
            # Change the formatting of the purple bins to replace the hyphens
            # with slashes
            if raw_type == "Batteries-small electricals-textiles":
                bin_type = raw_type.replace("-", "/")
            else:
                bin_type = raw_type

            bins.append({"type": bin_type, "collectionDate": collection_date})

        return {"bins": bins}