Skip to content

Commit

Permalink
feat: classifier (#5)
Browse files Browse the repository at this point in the history
* docs: update docs

* feat: classifier
  • Loading branch information
JohnOlushola authored Jul 25, 2022
1 parent 3b94787 commit 4e30dbc
Show file tree
Hide file tree
Showing 6 changed files with 351 additions and 38 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,8 @@ Create a system to classify data spending of political parties with pdf invoices
On what services is money spent at elections? This is the primary question behind this project. We know that just over £50 million was spent at the last general election, but strikingly little about how it was spent. Parties have to report their spending to the Electoral Commission under broad categories (e.g. ‘advertising’ and ‘market research and canvassing’), but this provides very little detail. They do, however, have to provide invoices for any spend over £200, so there is a vast resource available to find out more.

## Setup
For notebooks and other scripts to run successfully set the variables in `config.json` accordingly.
- For notebooks and other scripts to run successfully set the variables in `config.json` accordingly.
- No environment requirements file is provided yet. Ensure all required packages are installed, following each package's own installation instructions.

## Results
All results are provided in notebooks
Empty file removed notebooks/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@
# Base URL for invoice lookups on the Electoral Commission search API
# (presumably an invoice identifier is appended by the caller -- confirm).
invoices_base_url = 'http://search.electoralcommission.org.uk/Api/Spending/Invoices/'
# NOTE(review): the two paths below are absolute paths on one developer's
# machine; change them to match your own checkout before running anything.
default_path_to_csv = '/Users/temiloluwaolushola/Documents/Sussex/political_spending_uk/data/results.csv'
# Directory the classifier reads `out.<type>` from and writes
# `predictions.<type>` to. It is joined by plain string concatenation, so the
# trailing slash is required.
output_path = '/Users/temiloluwaolushola/Documents/Sussex/political_spending_uk/data/v2/'
# Default file format for saved output; the classifier accepts
# 'csv', 'json' or 'excel'.
output_filetype = 'json'

# Candidate labels handed to the zero-shot classifier; each label is inserted
# into the hypothesis template "This text is about {}.".
# NOTE(review): several entries end with a trailing space (e.g.
# 'Fundraising ', 'Production Services ') and upper-case headings such as
# 'ADVERTISING AND PRESS' sit alongside their sub-categories -- confirm this
# is intended, since these exact strings end up in the saved predictions.
labels = ['Transport', 'Miscellaneous', 'Catering', 'Accommodation', 'Expenses claimed by provider ', 'Completely unclear', 'Ambiguous and needs discussion ', 'ADVERTISING AND PRESS', 'Merchandise', 'Newspaper or magazine advertising', 'Radio advertising', 'Social media advertising', 'Online advertising (not social media, i.e. web advertising but not online newspapers or social media)', 'Other forms of advertising (billboards, advans, digital posters outside)', 'PR', 'Campaign materials', 'Design services', 'Campaign material printing ', 'Direct Mail/ Leaflet delivery/ postage ', 'Infrastructure and equipment', 'Telecommunications services', 'Physical Security', 'Event costs/ Production/ Venue hire', 'Mobile application services', 'Email services', 'Website services', 'Search Engine Optimization', 'Recruitment services/staffing costs', 'Creative content owned by a third party (e.g. Getty images, PA images, demo music)', 'Translation/Braile/British Sign Language services', 'Campaign activity', 'GOTV', 'Fundraising ', 'Data and infrastructure', 'Campaign database or CRM (including SQL)', 'Data Services and analysis', 'IT infrastructure and support', 'Office supplies (staples, paperclips, IT equipment, envelopes)', 'Production Services ', 'Video editing/ production', 'Audio editing/production', 'Photos editing/production', 'Consultancy', 'Communication consultants', 'Design consultants', 'Ad strategy and consultancy', 'Social media strategy and consultancy', 'Data consultancy', 'Legal advice', 'Research', 'Polling', 'Focus groups', 'Ordinance survey data', 'Message testing', 'Archival research', 'Other forms of research', 'Social/Digital listening']
75 changes: 75 additions & 0 deletions src/models/classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from calendar import day_abbr
import os
import pandas as pd
from transformers import pipeline
from src.config import output_path, output_filetype, labels

class Classifier:
    """Zero-shot classifier that assigns a spending label to invoice lines.

    On construction the instance loads the ``facebook/bart-large-mnli``
    zero-shot pipeline and reads ``out.<data_filetype>`` from ``output_path``,
    keeping only the first three columns (used by :meth:`classify` as id,
    description and amount). :meth:`classify` labels every row and saves the
    result next to the input as ``predictions.<ext>``.

    Args:
        data_filetype: Format of the input file: "json", "csv" or "excel".
        labels: Candidate labels offered to the model.
        output_filetype: Format of the saved predictions: "csv", "json" or
            "excel".

    Raises:
        ValueError: If ``data_filetype`` is not one of the supported formats.
    """

    # File extension written for each supported output type. pandas selects
    # the Excel writer engine from the extension, so "excel" has to be saved
    # as ".xlsx"; a ".excel" file cannot be written.
    _OUTPUT_EXTENSIONS = {"csv": "csv", "json": "json", "excel": "xlsx"}

    def __init__(self, data_filetype="json", labels=labels, output_filetype=output_filetype):
        self.data_filetype = data_filetype

        self.data_points = None
        # Copy so edits to this instance's labels never leak into the shared
        # module-level default list.
        self.labels = list(labels)
        self.output_filetype = output_filetype
        self.hypothesis_template = "This text is about {}."

        self.predictions = []
        self.predictions_as_df = None

        self._init_model()
        self._read_data()

    def _init_model(self):
        """Load the zero-shot pipeline and expose its tokenizer."""
        self.model = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
        # `_tokenizer` relies on this attribute; reuse the pipeline's own
        # tokenizer so both always match the loaded model.
        self.tokenizer = self.model.tokenizer

    def _read_data(self):
        """Read the input file and keep its first three columns.

        Raises:
            ValueError: If ``self.data_filetype`` is unsupported.
        """
        readers = {
            "json": pd.read_json,
            "csv": pd.read_csv,
            "excel": pd.read_excel,
        }
        reader = readers.get(self.data_filetype)
        if reader is None:
            raise ValueError("Data file type unsupported")

        # NOTE(review): the input name is kept as `out.<data_filetype>`
        # (including `out.excel`) so files produced elsewhere are still found.
        data_file = os.path.join(output_path, 'out.' + self.data_filetype)
        data_points = reader(data_file)

        self.data_points = data_points.iloc[:, :3]

    def _tokenizer(self, text):
        """Tokenize ``text`` into PyTorch tensors, truncated to 500 tokens."""
        return self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=500)

    def _classify_data_point(self, data_point):
        """Return the top-ranked label for one description.

        Args:
            data_point: The text to classify.

        Returns:
            The first label in the pipeline's result, or ``None`` when
            ``data_point`` is missing, not a string, or blank -- there is
            nothing for the model to score in that case.
        """
        if not isinstance(data_point, str) or not data_point.strip():
            return None

        prediction = self.model(
            data_point,
            self.labels,
            hypothesis_template=self.hypothesis_template,
            multi_label=True,
        )
        return prediction['labels'][0]

    def _save_predictions(self):
        """Write ``self.predictions_as_df`` to ``output_path`` (best effort).

        Failures are reported on stdout rather than raised, so the in-memory
        predictions from a long classification run are never lost.
        """
        extension = self._OUTPUT_EXTENSIONS.get(self.output_filetype)
        if extension is None:
            print(f"Save unsuccesful: {self.output_filetype} is unsupported")
            return

        file_path = os.path.join(output_path, "predictions." + extension)

        try:
            os.makedirs(output_path, exist_ok=True)
            if self.output_filetype == "csv":
                self.predictions_as_df.to_csv(file_path)
            elif self.output_filetype == "json":
                self.predictions_as_df.to_json(file_path)
            else:
                self.predictions_as_df.to_excel(file_path)
        except Exception as exc:
            print("Save unsuccesful: something went wrong. View predictions at classifier_instance.predictions")
            # Surface the cause instead of silently swallowing it.
            print(f"Reason: {exc!r}")

    def classify(self):
        """Classify every loaded row, then save the predictions.

        Populates ``self.predictions`` with ``(id, description, amount,
        label)`` tuples and ``self.predictions_as_df`` with the same data as
        a DataFrame.
        """
        print(f"Classifying {len(self.data_points)} data points")

        # Start from a clean slate so re-running does not duplicate rows.
        self.predictions = []

        # itertuples unpacks strictly by position, independent of how the
        # columns happen to be named in the input file.
        rows = self.data_points.itertuples(index=False, name=None)
        for record_id, description, amount in rows:
            label = self._classify_data_point(data_point=description)
            self.predictions.append((record_id, description, amount, label))

        print("Saving predictions")
        self.predictions_as_df = pd.DataFrame(self.predictions)
        self._save_predictions()
232 changes: 232 additions & 0 deletions src/notebooks/classification.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Classification"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"\n",
"module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from src.models.classifier import Classifier\n",
"\n",
"classifier = Classifier()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View data points"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>66836</td>\n",
" <td>GE 2019 Statics ALL .08</td>\n",
" <td>2.081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>66836</td>\n",
" <td>2 GE2019 - AB Polling Day</td>\n",
" <td>4,941.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>66836</td>\n",
" <td>3 GE2019 - GOTV FS per 4 GE2019 - GOTV NHS per...</td>\n",
" <td>1,945.59</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>66836</td>\n",
" <td>5 GE2019 - JC - Jessi GOTV</td>\n",
" <td>5,006.62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>66836</td>\n",
" <td>6 GE2019 - JC - Jessi story 7 GE2049 - JC - ri...</td>\n",
" <td>58,828.99</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14713</th>\n",
" <td>68003</td>\n",
" <td>CCHQ Boris Johnson GE Campaign 2019 1 20 To th...</td>\n",
" <td>98,153.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14714</th>\n",
" <td>68003</td>\n",
" <td>Payment details: Carriage:</td>\n",
" <td>£0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14715</th>\n",
" <td>68003</td>\n",
" <td>CCHQ Boris Johnson GE Campaign 2019 1 20 To th...</td>\n",
" <td>98,153.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14716</th>\n",
" <td>68003</td>\n",
" <td>Payment details: Carriage:</td>\n",
" <td>£0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14717</th>\n",
" <td>68003</td>\n",
" <td></td>\n",
" <td>£96,187.50</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>14718 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2\n",
"0 66836 GE 2019 Statics ALL .08 2.081\n",
"1 66836 2 GE2019 - AB Polling Day 4,941.61\n",
"2 66836 3 GE2019 - GOTV FS per 4 GE2019 - GOTV NHS per... 1,945.59\n",
"3 66836 5 GE2019 - JC - Jessi GOTV 5,006.62\n",
"4 66836 6 GE2019 - JC - Jessi story 7 GE2049 - JC - ri... 58,828.99\n",
"... ... ... ...\n",
"14713 68003 CCHQ Boris Johnson GE Campaign 2019 1 20 To th... 98,153.10\n",
"14714 68003 Payment details: Carriage: £0.00\n",
"14715 68003 CCHQ Boris Johnson GE Campaign 2019 1 20 To th... 98,153.10\n",
"14716 68003 Payment details: Carriage: £0.00\n",
"14717 68003 £96,187.50\n",
"\n",
"[14718 rows x 3 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.data_points"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Classifying 14718 data points\n"
]
}
],
"source": [
"classifier.classify()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"classifier.predictions_as_df\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('nlp')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "bfc06090c28f9a2b1bc5cad670ac112515b1c0f123b001add159cd7414fa589d"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 4e30dbc

Please sign in to comment.