WIP national data import script

mysociety · Jul 31, 2023 · 246eb56 · 246eb56
1 parent c6e0eae
commit 246eb56
Showing 1 changed file with 210 additions and 0 deletions.
diff --git a/crowdsourcer/management/commands/import_national_data.py b/crowdsourcer/management/commands/import_national_data.py
@@ -0,0 +1,210 @@
+import re
+
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+import pandas as pd
+
+from crowdsourcer.models import Option, PublicAuthority, Question
+
+
+class Command(BaseCommand):
+    """Work-in-progress importer for the national data workbook.
+
+    For every entry in ``sheets`` the command looks up the matching
+    ``Question``, creates any configured ``Option`` rows and then matches each
+    spreadsheet row to a ``PublicAuthority`` and an ``Option``. The matches
+    are currently only printed.
+    """
+
+    help = "Import national data scores from the national data spreadsheet"
+
+    # Workbook the sheets below are read from.
+    question_file = settings.BASE_DIR / "data" / "national_data.xlsx"
+
+    # Maps a worksheet name to the settings used to import it:
+    #   section      - title of the section the question belongs to
+    #   number       - question number within that section
+    #   number_part  - optional question part, e.g. "a" or "b"
+    #   header_row   - passed to pandas as the header row (defaults to 0)
+    #   council_col  - column matched against PublicAuthority.name
+    #   gss_col      - column matched against PublicAuthority.unique_id; used
+    #                  in preference to council_col
+    #   score_col    - column holding the score for each authority
+    #   type         - "yes_no" or "select_one"; controls which options exist
+    #   options      - the desc/score pairs created for "select_one" questions
+    # Sheets with neither council_col nor gss_col have no rows imported.
+    sheets = {
+        "Planning Q10B (pivot table) - Renewable Energy": {
+            "section": "Planning & Land Use",
+            "number": 10,
+            "number_part": "b",
+            # "council_col": "Planning Authority",
+            # "score_col": "COUNTA of Ref ID for Planning Applications",
+        },
+        # XXX - what?
+        # "Planning Q10B (pivot table) - Renewable Energy": {
+        # "section": "Planning & Land Use",
+        # "number": 11,
+        # },
+        "Recycling": {
+            "section": "Waste Reduction & Food",
+            "number": 8,
+            "header_row": 2,
+            "council_col": "Local Authority 2020/21",
+            "score_col": "Mark",
+        },
+        "Residual Waste": {
+            "section": "Waste Reduction & Food",
+            "number": 9,
+            "header_row": 1,
+            "council_col": "Local Authority 2020/21",
+            "score_col": "Mark",
+        },
+        "Transport Q4 - 20mph": {
+            "section": "Transport",
+            "number": 4,
+            "council_col": "Council name",
+            "score_col": "Award point - only 1 tier",
+            "type": "yes_no",
+        },
+        "Transport Q6 - Active Travel England scores": {
+            "section": "Transport",
+            "number": 6,
+            "header_row": 1,
+            "council_col": "Local Authority",
+            "score_col": "Front end to show",
+        },
+        "Transport 8B - Bus Ridership": {
+            "section": "Transport",
+            "number": 8,
+            "number_part": "b",
+            "header_row": 2,
+            "council_col": "Local Authority",
+            "score_col": "Front end to show",
+            # XXX - may be tiered
+            "type": "select_one",
+            "options": [
+                {"desc": "Criteria not met", "score": 0},
+                {"desc": "75 journeys per head of population", "score": 1},
+                {"desc": "150 journeys per head of population", "score": 2},
+            ],
+        },
+        # "still to add": {"section": "Transport", "number": 10},
+        "Transport 12a - Air Quality NO2": {
+            "section": "Transport",
+            "number": 12,
+            "number_part": "a",
+        },
+        "Transport 12b - Air Quality PM2.5": {
+            "section": "Transport",
+            "number": 12,
+            "number_part": "b",
+        },
+        "Biodiversity Q4 - Wildlife Sites ": {"section": "Biodiversity", "number": 4},
+        "Biodiversity Q7 - Green Flag Awards (pivot table)": {
+            "section": "Biodiversity",
+            "number": 7,
+        },
+        "Gov&Fin Q11a": {
+            "section": "Governance & Finance",
+            "number": 11,
+            "number_part": "a",
+            "council_col": "Council",
+            "score_col": "Score",
+        },
+        "Gov&Fin Q11b": {
+            "section": "Governance & Finance",
+            "number": 11,
+            "number_part": "b",
+            "council_col": "Council",
+            "score_col": "Score",
+        },
+        "Gov&Fin Q4": {
+            "section": "Governance & Finance",
+            "number": 4,
+            "header_row": 5,
+            "gss_col": "Local Authority Code",
+            "score_col": "Score",
+        },
+        "EPC": {
+            "section": "Buildings & Heating",
+            "number": 7,
+            "header_row": 1,
+            "gss_col": "Local Authority Code",
+            "score_col": "Tiered mark",
+        },
+    }
+
+    def add_arguments(self, parser):
+        """Register this command's command-line options on ``parser``."""
+        quiet_flags = ("-q", "--quiet")
+        parser.add_argument(
+            *quiet_flags,
+            action="store_true",
+            help="Silence progress bars.",
+        )
+
+    def add_options(self, q, details):
+        """Create the answer options configured for a sheet on question ``q``.
+
+        ``details["type"]`` selects what is created: ``"yes_no"`` gives a
+        "Yes" (score 1) and a "No" (score 0) option, ``"select_one"`` gives one
+        option per entry of ``details["options"]``. Any other type, or no type
+        at all, creates nothing.
+        """
+        q_type = details.get("type")
+        if q_type == "yes_no":
+            choices = ((1, "Yes"), (0, "No"))
+        elif q_type == "select_one":
+            # Lazy on purpose: each entry is only read right before it is saved.
+            choices = ((opt["score"], opt["desc"]) for opt in details["options"])
+        else:
+            return
+
+        for points, label in choices:
+            Option.objects.update_or_create(
+                question=q,
+                score=points,
+                description=label,
+            )
+
+    def get_df(self, sheet, details):
+        """Read one worksheet of ``question_file`` into a DataFrame.
+
+        ``details["header_row"]`` (default 0) is handed to pandas as the header
+        row. Rows in which every cell is empty are dropped from the result.
+        """
+        # Only the first 31 characters of the configured name are used for the
+        # lookup - presumably because Excel caps worksheet names at that length.
+        worksheet = sheet[:31]
+        frame = pd.read_excel(
+            self.question_file,
+            sheet_name=worksheet,
+            header=details.get("header_row", 0),
+        )
+        return frame.dropna(axis="index", how="all")
+
+    def get_question(self, details):
+        """Return the ``Question`` a sheet refers to, or ``None`` if there is none.
+
+        The question is looked up by section title and number, plus the
+        question part when ``details`` supplies a ``number_part``. A failed
+        lookup is reported on stdout.
+        """
+        lookup = {
+            "section__title": details["section"],
+            "number": details["number"],
+        }
+        part = details.get("number_part")
+        if part is not None:
+            lookup["number_part"] = part
+
+        try:
+            return Question.objects.get(**lookup)
+        except Question.DoesNotExist:
+            print("did not find question")
+            return None
+
+    def get_score(self, q, row, details):
+        """Extract the score for one spreadsheet row.
+
+        Args:
+            q: the question being imported (not used at present).
+            row: mapping of column name to cell value for the row.
+            details: the sheet settings; ``score_col`` names the column to
+                read and ``type`` optionally marks a ``"yes_no"`` question.
+
+        Returns:
+            For ``"yes_no"`` questions, 1 when the cell says "Yes" (ignoring
+            case and surrounding whitespace) and 0 for anything else. For
+            other questions, the leading number of an "N out of M" string as
+            an ``int``, or the cell value unchanged when it has another form.
+
+        Raises:
+            KeyError: if ``details`` has no ``score_col`` or the row lacks
+                that column.
+        """
+        q_type = details.get("type", "")
+        score = row[details["score_col"]]
+
+        if q_type == "yes_no":
+            is_yes = isinstance(score, str) and score.strip().lower() == "yes"
+            return 1 if is_yes else 0
+
+        if isinstance(score, str):
+            # Cells may read e.g. '2 out of 3', optionally with a stray leading
+            # quote. \d+ keeps multi-digit marks such as '10 out of 10' working.
+            match = re.match(r"\"?(\d+) out of \d+", score)
+            if match:
+                score = int(match.group(1))
+
+        return score
+
+    def import_answers(self, df, q, details):
+        """Match every row of a sheet to an authority and an answer option.
+
+        Rows are matched to a ``PublicAuthority`` by ``unique_id`` when the
+        sheet has a ``gss_col`` and by ``name`` when it only has a
+        ``council_col``. Sheets with neither column are skipped. For each
+        matched authority the row's score is used to find the ``Option`` of
+        ``q``; the outcome is printed, nothing is stored yet.
+
+        Args:
+            df: DataFrame holding the sheet's rows.
+            q: the question the sheet answers, or ``None`` if it was not found.
+            details: the sheet settings.
+        """
+        gss_col = details.get("gss_col")
+        council_col = gss_col if gss_col is not None else details.get("council_col")
+        if council_col is None:
+            return
+
+        if q is None:
+            # Without a question there is nothing to match options against.
+            print("no question to import answers for")
+            return
+
+        # The lookup field is the same for every row, so work it out once.
+        lookup_field = "unique_id" if gss_col is not None else "name"
+
+        for _, row in df.iterrows():
+            args = {lookup_field: row[council_col]}
+            try:
+                authority = PublicAuthority.objects.get(**args)
+            except PublicAuthority.DoesNotExist:
+                print("no authority found for ", args)
+                continue
+
+            score = self.get_score(q, row, details)
+            print(authority.name, score)
+
+            # A blank score cell comes through as NaN and cannot match an option.
+            if pd.isna(score):
+                print(f"No score for {authority.name}")
+                continue
+
+            try:
+                option = Option.objects.get(question=q, score=score)
+            except Option.DoesNotExist:
+                print(f"No option found for {q.number}, {score}")
+            else:
+                print(option)
+
+    def handle(self, quiet: bool = False, *args, **kwargs):
+        """Run the import for every sheet configured in ``sheets``.
+
+        Sheets whose question cannot be found are skipped, so that no options
+        or answers are processed against a missing question. ``quiet`` is
+        accepted from the command line but is not used at present.
+        """
+        for sheet, details in self.sheets.items():
+            print(details["section"], details["number"])
+
+            q = self.get_question(details)
+            if q is None:
+                # get_question() has already reported the failed lookup; skip
+                # the sheet rather than reading it for nothing.
+                continue
+
+            df = self.get_df(sheet, details)
+            self.add_options(q, details)
+            self.import_answers(df, q, details)