Skip to content

Commit

Permalink
WIP national data import script
Browse files Browse the repository at this point in the history
  • Loading branch information
struan committed Jul 31, 2023
1 parent c6e0eae commit 246eb56
Showing 1 changed file with 210 additions and 0 deletions.
210 changes: 210 additions & 0 deletions crowdsourcer/management/commands/import_national_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
import re

from django.conf import settings
from django.core.management.base import BaseCommand

import pandas as pd

from crowdsourcer.models import Option, PublicAuthority, Question


class Command(BaseCommand):
    """Read per-authority scores for selected questions from a spreadsheet.

    Every entry in ``sheets`` names a worksheet of ``question_file`` and
    describes which question it belongs to and which columns hold the
    authority identifier and the score. For each worksheet the command looks
    up the question, creates any configured options, and then, row by row,
    resolves the authority and the option matching the score.

    NOTE: work in progress -- the matched authority/option pairs are only
    printed; nothing in this class stores them as answers yet.
    """

    help = "import questions"

    # Workbook read by ``get_df``; one worksheet per configured question.
    question_file = settings.BASE_DIR / "data" / "national_data.xlsx"

    # Worksheet name -> import settings. Recognised keys:
    #   section      Question lookup: ``section__title``.
    #   number       Question lookup: ``number``.
    #   number_part  Optional question lookup: ``number_part``.
    #   header_row   Passed to ``pd.read_excel`` as ``header`` (default 0).
    #   council_col  Column matched against ``PublicAuthority.name``.
    #   gss_col      Column matched against ``PublicAuthority.unique_id``;
    #                takes precedence over ``council_col`` when present.
    #   score_col    Column holding the score for the row.
    #   type         "yes_no" or "select_one"; controls option creation and,
    #                for "yes_no", how the score cell is interpreted.
    #   options      For "select_one": list of {"desc", "score"} dicts.
    # Entries without ``council_col``/``gss_col`` are read but no rows are
    # processed for them.
    sheets = {
        "Planning Q10B (pivot table) - Renewable Energy": {
            "section": "Planning & Land Use",
            "number": 10,
            "number_part": "b",
            # "council_col": "Planning Authority",
            # "score_col": "COUNTA of Ref ID for Planning Applications",
        },
        # XXX - what?
        # "Planning Q10B (pivot table) - Renewable Energy": {
        #     "section": "Planning & Land Use",
        #     "number": 11,
        # },
        "Recycling": {
            "section": "Waste Reduction & Food",
            "number": 8,
            "header_row": 2,
            "council_col": "Local Authority 2020/21",
            "score_col": "Mark",
        },
        "Residual Waste": {
            "section": "Waste Reduction & Food",
            "number": 9,
            "header_row": 1,
            "council_col": "Local Authority 2020/21",
            "score_col": "Mark",
        },
        "Transport Q4 - 20mph": {
            "section": "Transport",
            "number": 4,
            "council_col": "Council name",
            "score_col": "Award point - only 1 tier",
            "type": "yes_no",
        },
        "Transport Q6 - Active Travel England scores": {
            "section": "Transport",
            "number": 6,
            "header_row": 1,
            "council_col": "Local Authority",
            "score_col": "Front end to show",
        },
        "Transport 8B - Bus Ridership": {
            "section": "Transport",
            "number": 8,
            "number_part": "b",
            "header_row": 2,
            "council_col": "Local Authority",
            "score_col": "Front end to show",
            # XXX - may be tiered
            "type": "select_one",
            "options": [
                {"desc": "Criteria not met", "score": 0},
                {"desc": "75 journeys per head of population", "score": 1},
                {"desc": "150 journeys per head of population", "score": 2},
            ],
        },
        # "still to add": {"section": "Transport", "number": 10},
        "Transport 12a - Air Quality NO2": {
            "section": "Transport",
            "number": 12,
            "number_part": "a",
        },
        "Transport 12b - Air Quality PM2.5": {
            "section": "Transport",
            "number": 12,
            "number_part": "b",
        },
        "Biodiversity Q4 - Wildlife Sites ": {"section": "Biodiversity", "number": 4},
        "Biodiversity Q7 - Green Flag Awards (pivot table)": {
            "section": "Biodiversity",
            "number": 7,
        },
        "Gov&Fin Q11a": {
            "section": "Governance & Finance",
            "number": 11,
            "number_part": "a",
            "council_col": "Council",
            "score_col": "Score",
        },
        "Gov&Fin Q11b": {
            "section": "Governance & Finance",
            "number": 11,
            "number_part": "b",
            "council_col": "Council",
            "score_col": "Score",
        },
        "Gov&Fin Q4": {
            "section": "Governance & Finance",
            "number": 4,
            "header_row": 5,
            "gss_col": "Local Authority Code",
            "score_col": "Score",
        },
        "EPC": {
            "section": "Buildings & Heating",
            "number": 7,
            "header_row": 1,
            "gss_col": "Local Authority Code",
            "score_col": "Tiered mark",
        },
    }

    def add_arguments(self, parser):
        """Register the ``-q``/``--quiet`` flag on the command's parser."""
        parser.add_argument(
            "-q", "--quiet", action="store_true", help="Silence progress bars."
        )

    def add_options(self, q, details):
        """Ensure the options configured for a sheet exist on question ``q``.

        Does nothing when ``details`` has no ``type``. For ``"yes_no"`` a
        "Yes" (score 1) and a "No" (score 0) option are created; for
        ``"select_one"`` one option is created per entry of
        ``details["options"]``. Any other type is ignored.
        """
        q_type = details.get("type", None)
        if q_type is None:
            return

        if q_type == "yes_no":
            Option.objects.update_or_create(question=q, score=1, description="Yes")
            Option.objects.update_or_create(question=q, score=0, description="No")
        elif q_type == "select_one":
            for option in details["options"]:
                Option.objects.update_or_create(
                    question=q, score=option["score"], description=option["desc"]
                )

    def get_df(self, sheet, details):
        """Load worksheet ``sheet`` of ``question_file`` into a DataFrame.

        The header row comes from ``details["header_row"]`` (default 0).
        Rows in which every cell is empty are dropped.
        """
        header_row = details.get("header_row", 0)
        df = pd.read_excel(
            self.question_file,
            # Configured names can be longer than the worksheet name stored
            # in the workbook, so only the first 31 characters are used
            # (presumably Excel's sheet-name length limit -- TODO confirm).
            sheet_name=sheet[0:31],
            header=header_row,
        )

        return df.dropna(axis="index", how="all")

    def get_question(self, details):
        """Return the Question described by ``details``, or ``None``.

        The lookup uses the section title and number, plus ``number_part``
        when one is configured. A message is printed when no question
        matches.
        """
        args = {
            "section__title": details["section"],
            "number": details["number"],
        }
        if details.get("number_part", None) is not None:
            args["number_part"] = details["number_part"]

        try:
            return Question.objects.get(**args)
        except Question.DoesNotExist:
            print("did not find question")
            return None

    def get_score(self, q, row, details):
        """Extract the score for one spreadsheet ``row``.

        The raw value is read from ``details["score_col"]``. Text of the
        form ``N out of M`` (optionally preceded by a double quote) is
        reduced to the integer ``N``. For ``"yes_no"`` sheets the result is
        1 when the cell is exactly ``"Yes"`` and 0 otherwise. Any other
        value is returned unchanged. ``q`` is accepted for symmetry with
        the other helpers but is not used.
        """
        q_type = details.get("type", "")
        score = row[details["score_col"]]

        if isinstance(score, str):
            match = re.match(r"\"?(\d) out of \d", score)
            if match:
                # Convert so callers always get a number, not the digit
                # as a string.
                score = int(match.group(1))

        if q_type == "yes_no":
            score = 1 if score == "Yes" else 0

        return score

    def import_answers(self, df, q, details):
        """Resolve the authority and matching option for every row of ``df``.

        Sheets that configure neither ``gss_col`` nor ``council_col`` are
        skipped. Rows whose authority cannot be found are reported and
        skipped; a missing option is reported as well. Results are printed
        only.
        """
        # Both values are fixed for the whole sheet, so work them out once.
        council_col = details.get("gss_col", details.get("council_col", None))
        if council_col is None:
            return
        by_code = details.get("gss_col", None) is not None
        lookup_field = "unique_id" if by_code else "name"

        for _, row in df.iterrows():
            args = {lookup_field: row[council_col]}
            try:
                authority = PublicAuthority.objects.get(**args)
            except PublicAuthority.DoesNotExist:
                print("no authority found for ", args)
                continue

            score = self.get_score(q, row, details)
            print(authority.name, score)

            try:
                option = Option.objects.get(question=q, score=score)
                print(option)
            except Option.DoesNotExist:
                print(f"No option found for {q.number}, {score}")

    def handle(self, quiet: bool = False, *args, **kwargs):
        """Process every configured worksheet in turn.

        ``quiet`` is supplied by the ``--quiet`` flag but is not consulted
        by the current implementation.
        """
        for sheet, details in self.sheets.items():
            print(details["section"], details["number"])
            df = self.get_df(sheet, details)
            q = self.get_question(details)
            if q is None:
                # get_question has already reported the miss. Without a
                # question there is nothing to attach options to or to look
                # options up against, so move on to the next sheet.
                continue
            self.add_options(q, details)
            self.import_answers(df, q, details)

0 comments on commit 246eb56

Please sign in to comment.