feat: classifier (#5)

* docs: update docs * feat: classifier
JohnOlushola · Jul 25, 2022 · 4e30dbc · 4e30dbc
1 parent 3b94787
commit 4e30dbc
Show file tree

Hide file tree

Showing 6 changed files with 351 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -12,5 +12,8 @@ Create a system to classify data spending of political parties with pdf invoices
 On what services is money spent at elections? This is the primary question behind this project. We know that just over £50 million was spent at the last general election, but strikingly little about how. Parties have to report their spending to the Electoral Commission under broad categories (e.g. ‘advertising’ and ‘market research and canvassing’), but this provides very little detail. They do, however, have to provide invoices for any spend over £200, so there is a vast resource available to find out more.
 
 ## Setup
-For notebooks and other scripts to run successfully set the variables in `config.json` accordingly.
+- For notebooks and other scripts to run successfully set the variables in `config.json` accordingly.
+- No environment/requirements file is provided yet. Install the packages the code imports (e.g. `pandas`, `transformers`) manually, following each package's own installation instructions.
 
+## Results
+All results are provided in notebooks
diff --git a/notebooks/__init__.py b/notebooks/__init__.py
diff --git a/src/config.py b/src/config.py
@@ -3,3 +3,6 @@
 invoices_base_url = 'http://search.electoralcommission.org.uk/Api/Spending/Invoices/'
 default_path_to_csv = '/Users/temiloluwaolushola/Documents/Sussex/political_spending_uk/data/results.csv'
+# Directory the classifier reads `out.<filetype>` from and writes its
+# predictions file to.
+# NOTE(review): this and `default_path_to_csv` are absolute paths into one
+# developer's home directory; they must be edited on any other machine.
 output_path = '/Users/temiloluwaolushola/Documents/Sussex/political_spending_uk/data/v2/'
+# Default format for saved predictions; the classifier handles 'csv', 'json' and 'excel'.
+output_filetype = 'json'
+
+# Candidate labels handed to the zero-shot classifier.
+# NOTE(review): several entries end with a trailing space (e.g. 'Fundraising ')
+# and are inserted verbatim into the hypothesis template — confirm this is intended.
+labels = ['Transport', 'Miscellaneous', 'Catering', 'Accommodation', 'Expenses claimed by provider ', 'Completely unclear', 'Ambiguous and needs discussion ', 'ADVERTISING AND PRESS', 'Merchandise', 'Newspaper or magazine advertising', 'Radio advertising', 'Social media advertising', 'Online advertising (not social media,  i.e. web advertising but not online newspapers or social media)', 'Other forms of advertising (billboards, advans, digital posters outside)', 'PR', 'Campaign materials', 'Design services', 'Campaign material printing ', 'Direct Mail/ Leaflet delivery/ postage ', 'Infrastructure and equipment', 'Telecommunications services', 'Physical Security', 'Event costs/ Production/ Venue hire', 'Mobile application services', 'Email services', 'Website services', 'Search Engine Optimization', 'Recruitment services/staffing costs', 'Creative content owned by a third party (e.g. Getty images, PA images, demo music)', 'Translation/Braile/British Sign Language services', 'Campaign activity', 'GOTV', 'Fundraising ', 'Data and infrastructure', 'Campaign database or CRM (including SQL)', 'Data Services and analysis', 'IT infrastructure and support', 'Office supplies (staples, paperclips, IT equipment, envelopes)', 'Production Services ', 'Video editing/ production', 'Audio editing/production', 'Photos editing/production', 'Consultancy', 'Communication consultants', 'Design consultants', 'Ad strategy and consultancy', 'Social media strategy and consultancy', 'Data consultancy', 'Legal advice', 'Research', 'Polling', 'Focus groups', 'Ordinance survey data', 'Message testing', 'Archival research', 'Other forms of research', 'Social/Digital listening']
diff --git a/src/models/classifier.py b/src/models/classifier.py
@@ -0,0 +1,75 @@
+from calendar import day_abbr
+import os
+import pandas as pd
+from transformers import pipeline
+from src.config import output_path, output_filetype, labels
+
+class Classifier:
+    def __init__(self, data_filetype="json", labels=labels, output_filetype=output_filetype):
+        self.data_filetype = data_filetype
+
+        self.data_points = None
+        self.labels = labels
+        self.output_filetype = output_filetype
+        self.hypothesis_template = "This text is about {}."
+
+        self.predictions = []
+        self.predictions_as_df = None
+
+        self._init_model()
+        self._read_data()
+
+    def _init_model(self):
+        self.model = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
+
+    def _read_data(self):
+        data_file = output_path + 'out.' + self.data_filetype
+
+        if self.data_filetype == "json":
+            data_points = pd.read_json(data_file)
+        elif self.data_filetype == "csv":
+            data_points = pd.read_csv(data_file)
+        elif self.data_filetype == "excel":
+            data_points = pd.read_excel(data_file)
+        else:
+            raise ValueError("Data file type unsupported")
+
+        self.data_points = data_points.iloc[: , :3]
+
+    def _tokenizer(self, text):
+        return self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=500)
+
+    def _classify_data_point(self, data_point):
+        prediction = self.model(data_point, self.labels, hypothesis_template=self.hypothesis_template, multi_label=True)
+        return prediction['labels'][0]
+
+    def _save_predictions(self):
+        os.makedirs(output_path, exist_ok=True)
+        file_path = output_path + "predictions." + self.output_filetype
+
+        try:
+            if self.output_filetype == "csv":
+                self.predictions_as_df.to_csv(file_path)
+            elif self.output_filetype == "json":
+                self.predictions_as_df.to_json(file_path)
+            elif self.output_filetype == "excel":
+                self.predictions_as_df.to_excel(file_path)
+            else:
+                print(f"Save unsuccesful: {self.output_filetype} is unsupported")
+        except:
+            print("Save unsuccesful: something went wrong. View predictions at classifier_instance.predictions")
+
+    def classify(self):
+        print(f"Classifying {len(self.data_points)} data points")
+
+        for index, data_point in self.data_points.iterrows():
+            id = data_point[0]
+            description = data_point[1]
+            amount = data_point[2]
+            label = self._classify_data_point(data_point=description)
+
+            self.predictions.append((id, description, amount, label))
+
+        print(f"Saving predictions")
+        self.predictions_as_df = pd.DataFrame(self.predictions)
+        self.save_predictions()
diff --git a/src/notebooks/classification.ipynb b/src/notebooks/classification.ipynb
@@ -0,0 +1,232 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.models.classifier import Classifier\n",
+    "\n",
+    "classifier = Classifier()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View data points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>66836</td>\n",
+       "      <td>GE 2019 Statics ALL .08</td>\n",
+       "      <td>2.081</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>66836</td>\n",
+       "      <td>2 GE2019 - AB Polling Day</td>\n",
+       "      <td>4,941.61</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>66836</td>\n",
+       "      <td>3 GE2019 - GOTV FS per 4 GE2019 - GOTV NHS per...</td>\n",
+       "      <td>1,945.59</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>66836</td>\n",
+       "      <td>5 GE2019 - JC - Jessi GOTV</td>\n",
+       "      <td>5,006.62</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>66836</td>\n",
+       "      <td>6 GE2019 - JC - Jessi story 7 GE2049 - JC - ri...</td>\n",
+       "      <td>58,828.99</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14713</th>\n",
+       "      <td>68003</td>\n",
+       "      <td>CCHQ Boris Johnson GE Campaign 2019 1 20 To th...</td>\n",
+       "      <td>98,153.10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14714</th>\n",
+       "      <td>68003</td>\n",
+       "      <td>Payment details: Carriage:</td>\n",
+       "      <td>£0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14715</th>\n",
+       "      <td>68003</td>\n",
+       "      <td>CCHQ Boris Johnson GE Campaign 2019 1 20 To th...</td>\n",
+       "      <td>98,153.10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14716</th>\n",
+       "      <td>68003</td>\n",
+       "      <td>Payment details: Carriage:</td>\n",
+       "      <td>£0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14717</th>\n",
+       "      <td>68003</td>\n",
+       "      <td></td>\n",
+       "      <td>£96,187.50</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>14718 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           0                                                  1           2\n",
+       "0      66836                            GE 2019 Statics ALL .08       2.081\n",
+       "1      66836                          2 GE2019 - AB Polling Day    4,941.61\n",
+       "2      66836  3 GE2019 - GOTV FS per 4 GE2019 - GOTV NHS per...    1,945.59\n",
+       "3      66836                         5 GE2019 - JC - Jessi GOTV    5,006.62\n",
+       "4      66836  6 GE2019 - JC - Jessi story 7 GE2049 - JC - ri...   58,828.99\n",
+       "...      ...                                                ...         ...\n",
+       "14713  68003  CCHQ Boris Johnson GE Campaign 2019 1 20 To th...   98,153.10\n",
+       "14714  68003                         Payment details: Carriage:       £0.00\n",
+       "14715  68003  CCHQ Boris Johnson GE Campaign 2019 1 20 To th...   98,153.10\n",
+       "14716  68003                         Payment details: Carriage:       £0.00\n",
+       "14717  68003                                                     £96,187.50\n",
+       "\n",
+       "[14718 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "classifier.data_points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Classifying 14718 data points\n"
+     ]
+    }
+   ],
+   "source": [
+    "classifier.classify()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "classifier.predictions_as_df\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('nlp')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "bfc06090c28f9a2b1bc5cad670ac112515b1c0f123b001add159cd7414fa589d"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}