# Extracted from patch 246eb56bcc1dc6a3c2a216cff22a96cfaf43d40a
# From: Struan Donald
# Date: Mon, 31 Jul 2023 17:13:48 +0100
# Subject: [PATCH] WIP national data import script
# New file: crowdsourcer/management/commands/import_national_data.py
import re

from django.conf import settings
from django.core.management.base import BaseCommand

import pandas as pd

from crowdsourcer.models import Option, PublicAuthority, Question

# Matches scores written as e.g. `3 out of 5` (optionally preceded by a double
# quote) and captures the leading digit. Compiled once rather than per row.
SCORE_OUT_OF_RE = re.compile(r"\"?(\d) out of \d")


class Command(BaseCommand):
    """Read per-council scores from the national data workbook.

    Each entry in ``sheets`` describes one worksheet and the question it
    relates to. For every sheet the command looks up the question, creates
    any options declared in the sheet config, then walks the rows and looks
    up the authority and the option matching each score.

    NOTE(review): this is WIP - matched options are only printed, nothing
    is stored as an answer yet.
    """

    help = "import questions"

    question_file = settings.BASE_DIR / "data" / "national_data.xlsx"

    # Keyed by worksheet name. Recognised keys in each entry:
    #   section      - section title used to find the question (required)
    #   number       - question number (required)
    #   number_part  - question part, e.g. "a" (optional)
    #   header_row   - 0-based row holding the column headers (default 0)
    #   council_col  - column holding the authority name
    #   gss_col      - column holding the authority code; if set it is used
    #                  instead of council_col
    #   score_col    - column holding the score
    #   type         - "yes_no" or "select_one"; controls option creation
    #   options      - list of {"desc", "score"} dicts for "select_one"
    # Sheets with neither council_col nor gss_col are not imported.
    sheets = {
        "Planning Q10B (pivot table) - Renewable Energy": {
            "section": "Planning & Land Use",
            "number": 10,
            "number_part": "b",
            # "council_col": "Planning Authority",
            # "score_col": "COUNTA of Ref ID for Planning Applications",
        },
        # XXX - what?
        # "Planning Q10B (pivot table) - Renewable Energy": {
        #     "section": "Planning & Land Use",
        #     "number": 11,
        # },
        "Recycling": {
            "section": "Waste Reduction & Food",
            "number": 8,
            "header_row": 2,
            "council_col": "Local Authority 2020/21",
            "score_col": "Mark",
        },
        "Residual Waste": {
            "section": "Waste Reduction & Food",
            "number": 9,
            "header_row": 1,
            "council_col": "Local Authority 2020/21",
            "score_col": "Mark",
        },
        "Transport Q4 - 20mph": {
            "section": "Transport",
            "number": 4,
            "council_col": "Council name",
            "score_col": "Award point - only 1 tier",
            "type": "yes_no",
        },
        "Transport Q6 - Active Travel England scores": {
            "section": "Transport",
            "number": 6,
            "header_row": 1,
            "council_col": "Local Authority",
            "score_col": "Front end to show",
        },
        "Transport 8B - Bus Ridership": {
            "section": "Transport",
            "number": 8,
            "number_part": "b",
            "header_row": 2,
            "council_col": "Local Authority",
            "score_col": "Front end to show",
            # XXX - may be tiered
            "type": "select_one",
            "options": [
                {"desc": "Criteria not met", "score": 0},
                {"desc": "75 journeys per head of population", "score": 1},
                {"desc": "150 journeys per head of population", "score": 2},
            ],
        },
        # "still to add": {"section": "Transport", "number": 10},
        "Transport 12a - Air Quality NO2": {
            "section": "Transport",
            "number": 12,
            "number_part": "a",
        },
        "Transport 12b - Air Quality PM2.5": {
            "section": "Transport",
            "number": 12,
            "number_part": "b",
        },
        "Biodiversity Q4 - Wildlife Sites ": {"section": "Biodiversity", "number": 4},
        "Biodiversity Q7 - Green Flag Awards (pivot table)": {
            "section": "Biodiversity",
            "number": 7,
        },
        "Gov&Fin Q11a": {
            "section": "Governance & Finance",
            "number": 11,
            "number_part": "a",
            "council_col": "Council",
            "score_col": "Score",
        },
        "Gov&Fin Q11b": {
            "section": "Governance & Finance",
            "number": 11,
            "number_part": "b",
            "council_col": "Council",
            "score_col": "Score",
        },
        "Gov&Fin Q4": {
            "section": "Governance & Finance",
            "number": 4,
            "header_row": 5,
            "gss_col": "Local Authority Code",
            "score_col": "Score",
        },
        "EPC": {
            "section": "Buildings & Heating",
            "number": 7,
            "header_row": 1,
            "gss_col": "Local Authority Code",
            "score_col": "Tiered mark",
        },
    }

    def add_arguments(self, parser):
        """Register the ``-q/--quiet`` flag on the command's argument parser."""
        parser.add_argument(
            "-q", "--quiet", action="store_true", help="Silence progress bars."
        )

    def add_options(self, q, details):
        """Create the options for question ``q`` declared by the sheet config.

        ``"yes_no"`` sheets get a Yes (score 1) and a No (score 0) option;
        ``"select_one"`` sheets get one option per entry in
        ``details["options"]``. Sheets without a ``type`` create nothing.
        """
        q_type = details.get("type")
        if q_type == "yes_no":
            Option.objects.update_or_create(question=q, score=1, description="Yes")
            Option.objects.update_or_create(question=q, score=0, description="No")
        elif q_type == "select_one":
            for option in details["options"]:
                Option.objects.update_or_create(
                    question=q, score=option["score"], description=option["desc"]
                )

    def get_df(self, sheet, details):
        """Load worksheet ``sheet`` into a DataFrame, dropping all-empty rows.

        The configured name is cut to its first 31 characters before the
        lookup (presumably because Excel limits sheet names to 31
        characters - confirm against the workbook).
        """
        df = pd.read_excel(
            self.question_file,
            sheet_name=sheet[0:31],
            header=details.get("header_row", 0),
        )

        return df.dropna(axis="index", how="all")

    def get_question(self, details):
        """Return the Question described by ``details``, or ``None``.

        The lookup uses the section title and question number, plus the
        question part when ``number_part`` is configured. A message is
        printed when no question matches.
        """
        args = {
            "section__title": details["section"],
            "number": details["number"],
        }
        if details.get("number_part") is not None:
            args["number_part"] = details["number_part"]

        try:
            return Question.objects.get(**args)
        except Question.DoesNotExist:
            print("did not find question")
            return None

    def get_score(self, q, row, details):
        """Extract the score for one spreadsheet row.

        Strings of the form ``N out of M`` are reduced to the integer ``N``.
        For ``"yes_no"`` sheets the result is 1 when the cell is exactly
        ``"Yes"`` and 0 for anything else. Other values are returned as
        read from the sheet. ``q`` is accepted for interface compatibility
        but not used.
        """
        score = row[details["score_col"]]

        if isinstance(score, str):
            match = SCORE_OUT_OF_RE.match(score)
            if match:
                # Return a number, like every other score path, not a string.
                score = int(match.group(1))

        if details.get("type", "") == "yes_no":
            score = 1 if score == "Yes" else 0

        return score

    def import_answers(self, df, q, details):
        """Match each row of ``df`` to an authority and an option of ``q``.

        Does nothing when the sheet config names neither ``gss_col`` nor
        ``council_col``. Rows whose authority cannot be found, or whose
        score cell is empty, are reported and skipped.
        """
        council_col = details.get("gss_col", details.get("council_col"))
        if council_col is None:
            return

        # A GSS code column takes precedence and is matched on unique_id;
        # otherwise the council name is matched on name.
        lookup_field = "unique_id" if details.get("gss_col") is not None else "name"

        for _, row in df.iterrows():
            args = {lookup_field: row[council_col]}
            try:
                authority = PublicAuthority.objects.get(**args)
            except PublicAuthority.DoesNotExist:
                print("no authority found for ", args)
                continue

            score = self.get_score(q, row, details)
            print(authority.name, score)

            # An empty cell comes back from pandas as NaN, which cannot
            # match any option, so skip it rather than query with it.
            if pd.isna(score):
                print(f"No score found for {authority.name}")
                continue

            try:
                option = Option.objects.get(question=q, score=score)
                print(option)
            except Option.DoesNotExist:
                print(f"No option found for {q.number}, {score}")

    def handle(self, quiet: bool = False, *args, **kwargs):
        """Process every configured sheet in turn.

        Sheets whose question cannot be found are skipped, so that no
        options are created against, and no rows matched to, a missing
        question. ``quiet`` is accepted but currently has no effect.
        """
        for sheet, details in self.sheets.items():
            print(details["section"], details["number"])
            df = self.get_df(sheet, details)
            q = self.get_question(details)
            if q is None:
                # get_question has already reported the miss.
                continue
            self.add_options(q, details)
            self.import_answers(df, q, details)