Arranged Jupyter notebooks
fer-aguirre committed Jan 17, 2024
1 parent 6f34eec commit 168d80b
Showing 5 changed files with 2,663 additions and 2,437 deletions.
253 changes: 67 additions & 186 deletions _notebooks/1.0-analyze.ipynb
@@ -2,261 +2,142 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import spacy\n",
"import re\n",
"import plotly.express as px\n",
"\n",
"nlp = spacy.load('es_core_news_sm')"
"from tqdm import tqdm\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"55374\n"
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
],
"source": [
"df = pd.read_csv('../data/raw/elecciones_argentina.csv')\n",
"\n",
"# Convert the 'date_local' column to datetime format\n",
"df['date_local'] = pd.to_datetime(df['date_local'])\n",
"\n",
"print(len(df))"
"model_name = \"piubamas/beto-contextualized-hate-speech\"\n",
"# Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name)"
]
},
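The new cell above loads the piubamas/beto-contextualized-hate-speech checkpoint. Below is a minimal sketch of using the loaded tokenizer/model pair directly, assuming a multi-label head (consistent with the logits > 0 threshold introduced later in this diff); the example text and variable names are illustrative, not from the commit.

import torch

text = "texto de ejemplo"  # hypothetical input, not from the dataset
enc = tokenizer(text, return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = model(**enc).logits[0]
probs = torch.sigmoid(logits)  # independent per-label probabilities
for i, p in enumerate(probs.tolist()):
    print(model.config.id2label[i], round(p, 3))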
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [#DebatePresidencial2023]\n",
"1 NaN\n",
"2 NaN\n",
"Name: hashtags, dtype: object"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"55374\n"
]
}
],
"source": [
"df['hashtags'] = df['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'#\\w+', x)) == 0 else re.findall(r'#\\w+', x))\n",
"df = pd.read_csv('../data/raw/elecciones_argentina.csv')\n",
"\n",
"df['hashtags'].head(3)"
"print(len(df))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 #DebatePresidencial2023\n",
"1 NaN\n",
"2 NaN\n",
"Name: hashtags, dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"52476\n"
]
}
],
"source": [
"df['hashtags'] = df['hashtags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)\n",
"filtered_df = df.dropna(subset=['text'])\n",
"\n",
"df['hashtags'].head(3)\n"
"print(len(filtered_df))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"#MileiPresidente 52\n",
"#Debate2023 51\n",
"#MassaPresidente 40\n",
"#EleccionesArgentina2023 32\n",
"#DebatePresidencial2023 26\n",
"#DebatePresidencial 25\n",
"#Milei 24\n",
"#Elecciones2023 21\n",
"#MileiPresidente2023 21\n",
"#MassaPresidente2023 21\n",
"#Argentina 20\n",
"#Massa 20\n",
"#Viviana1079 18\n",
"#ElClubDelMoro 17\n",
"#MileiVillarruel2023 16\n",
"#MileiNo 16\n",
"#MassaNoVasASerPresidente 15\n",
"#EleccionesArgentina 13\n",
"#GranHermano 12\n",
"#PatoPresidente 12\n",
"Name: count, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# convert dataframe column to list\n",
"hashtags = df['hashtags'].unique()\n",
"\n",
"# remove nan items from list\n",
"hashtags = [x for x in hashtags if not pd.isna(x)]\n",
"\n",
"# split items into a list based on a delimiter\n",
"hashtags = [x.split(',') for x in hashtags]\n",
"id2label = [model.config.id2label[k] for k in range(len(model.config.id2label))]\n",
"\n",
"# flatten list of lists\n",
"hashtags = [item for sublist in hashtags for item in sublist]\n",
"def predict(*args):\n",
" try:\n",
" encoding = tokenizer.encode_plus(*args)\n",
"\n",
"# remove whitespaces\n",
"hashtags = list(map(lambda x: x.replace(' ', ''), hashtags))\n",
" inputs = {\n",
" k: torch.LongTensor(encoding[k]).reshape(1, -1) for k in {\"input_ids\", \"attention_mask\", \"token_type_ids\"}\n",
" }\n",
"\n",
"# count items on list\n",
"hashtags_count = pd.Series(hashtags).value_counts()\n",
" output = model.forward(\n",
" **inputs\n",
" )\n",
"\n",
"# return first n rows in descending order\n",
"top_hashtags = hashtags_count.nlargest(20)\n",
" chars = list(zip(id2label, list(output.logits[0].detach().cpu().numpy() > 0)))\n",
"\n",
"top_hashtags"
" return [char for char, pred in chars if pred]\n",
" \n",
" except Exception as e:\n",
" # Handle the error appropriately\n",
" print(\"An error occurred:\", str(e))\n",
" return np.nan"
]
},
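The predict() helper added above encodes and scores one text per call, which is why the labelling loop later in this diff reports roughly 13 it/s over 52,476 rows. A hedged alternative, not part of the commit: batch texts through the tokenizer with padding and keep the same logits > 0 decision rule. The function name, batch size, and max_length below are arbitrary choices.

def predict_batch(texts, batch_size=32):
    # Batched variant of predict(): same threshold, far fewer forward passes.
    labels = []
    model.eval()
    for i in range(0, len(texts), batch_size):
        enc = tokenizer(texts[i:i + batch_size], padding=True,
                        truncation=True, max_length=256, return_tensors="pt")
        with torch.no_grad():
            logits = model(**enc).logits
        for row in (logits > 0):
            labels.append([id2label[j] for j, hit in enumerate(row) if hit])
    return labels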
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [SergioMassa, myriambregman, Letra_P]\n",
"1 NaN\n",
"2 NaN\n",
"Name: mentions, dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 52476/52476 [1:06:43<00:00, 13.11it/s]\n",
"/tmp/ipykernel_13246/4286658429.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" filtered_df['label'] = outputs\n"
]
}
],
"source": [
"df['mentions'] = df['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'@(\\w+)', x)) == 0 else re.findall(r'@(\\w+)', x))\n",
"texts = filtered_df['text'].to_list()\n",
"\n",
"df['mentions'].head(3)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 SergioMassa, myriambregman, Letra_P\n",
"1 NaN\n",
"2 NaN\n",
"Name: mentions, dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['mentions'] = df['mentions'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)\n",
"outputs = []\n",
"for text in tqdm(texts):\n",
" output = predict(text)\n",
" outputs.append(output)\n",
"\n",
"df['mentions'].head(3)"
"filtered_df['label'] = outputs"
]
},
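The SettingWithCopyWarning captured in the cell output above comes from adding a column to a frame derived from a slice. The usual fix, shown here with the same names the diff uses, is to take an explicit copy before assigning:

filtered_df = df.dropna(subset=['text']).copy()  # own the data outright
filtered_df['label'] = outputs  # plain column assignment, no warning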
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"edufeiok 1011\n",
"JonatanViale 880\n",
"JMilei 486\n",
"vivicanosaok 480\n",
"PRossiOficial 435\n",
"majulluis 433\n",
"lanacionmas 357\n",
"SergioMassa 281\n",
"PatoBullrich 263\n",
"Gatosylvestre 235\n",
"C5N 224\n",
"guadavazquez 213\n",
"fantinofantino 208\n",
"luisnovaresio 201\n",
"todonoticias 195\n",
"alfleuco 176\n",
"rialjorge 174\n",
"VickyVillarruel 148\n",
"LANACION 136\n",
"trebuquero 134\n",
"Name: count, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# convert dataframe column to list\n",
"mentions = df['mentions'].unique()\n",
"\n",
"# remove nan items from list\n",
"mentions = [x for x in mentions if not pd.isna(x)]\n",
"\n",
"# split items into a list based on a delimiter\n",
"mentions = [x.split(',') for x in mentions]\n",
"\n",
"# flatten list of lists\n",
"mentions = [item for sublist in mentions for item in sublist]\n",
"\n",
"# remove whitespaces\n",
"mentions = list(map(lambda x: x.replace(' ', ''), mentions))\n",
"\n",
"# count items on list\n",
"mentions_count = pd.Series(mentions).value_counts()\n",
"\n",
"# return first n rows in descending order\n",
"top_mentions = mentions_count.nlargest(20)\n",
"\n",
"top_mentions"
"filtered_df.to_csv('../data/raw/datos_etiquetados.csv', index=False)"
]
}
],
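One caveat about the final export: the label column holds Python lists, which to_csv() writes out as strings such as "['SOME_LABEL']" (label name illustrative). A sketch of recovering the lists on reload, assuming the file written by the last cell; the labeled variable name is illustrative and ast is in the standard library.

import ast
import pandas as pd

labeled = pd.read_csv('../data/raw/datos_etiquetados.csv')
labeled['label'] = labeled['label'].apply(
    lambda v: ast.literal_eval(v) if isinstance(v, str) else v)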