Arranged Jupyter notebooks
fer-aguirre committed Jan 17, 2024
1 parent 6f34eec commit 168d80b
Showing 5 changed files with 2,663 additions and 2,437 deletions.
253 changes: 67 additions & 186 deletions _notebooks/1.0-analyze.ipynb
@@ -2,261 +2,142 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import spacy\n",
"import re\n",
"import plotly.express as px\n",
"\n",
"nlp = spacy.load('es_core_news_sm')"
"from tqdm import tqdm\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"55374\n"
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
],
"source": [
"df = pd.read_csv('../data/raw/elecciones_argentina.csv')\n",
"\n",
"# Convert the 'date_local' column to datetime format\n",
"df['date_local'] = pd.to_datetime(df['date_local'])\n",
"\n",
"print(len(df))"
"model_name = \"piubamas/beto-contextualized-hate-speech\"\n",
"# Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name)"
]
},
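The new cell above loads the piubamas/beto-contextualized-hate-speech checkpoint. Below is a minimal sketch of using the loaded tokenizer/model pair directly, assuming a multi-label head (consistent with the logits > 0 threshold introduced later in this diff); the example text and variable names are illustrative, not from the commit.

import torch

text = "texto de ejemplo"  # hypothetical input, not from the dataset
enc = tokenizer(text, return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = model(**enc).logits[0]
probs = torch.sigmoid(logits)  # independent per-label probabilities
for i, p in enumerate(probs.tolist()):
    print(model.config.id2label[i], round(p, 3))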
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [#DebatePresidencial2023]\n",
"1 NaN\n",
"2 NaN\n",
"Name: hashtags, dtype: object"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"55374\n"
]
}
],
"source": [
"df['hashtags'] = df['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'#\\w+', x)) == 0 else re.findall(r'#\\w+', x))\n",
"df = pd.read_csv('../data/raw/elecciones_argentina.csv')\n",
"\n",
"df['hashtags'].head(3)"
"print(len(df))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 #DebatePresidencial2023\n",
"1 NaN\n",
"2 NaN\n",
"Name: hashtags, dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"52476\n"
]
}
],
"source": [
"df['hashtags'] = df['hashtags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)\n",
"filtered_df = df.dropna(subset=['text'])\n",
"\n",
"df['hashtags'].head(3)\n"
"print(len(filtered_df))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"#MileiPresidente 52\n",
"#Debate2023 51\n",
"#MassaPresidente 40\n",
"#EleccionesArgentina2023 32\n",
"#DebatePresidencial2023 26\n",
"#DebatePresidencial 25\n",
"#Milei 24\n",
"#Elecciones2023 21\n",
"#MileiPresidente2023 21\n",
"#MassaPresidente2023 21\n",
"#Argentina 20\n",
"#Massa 20\n",
"#Viviana1079 18\n",
"#ElClubDelMoro 17\n",
"#MileiVillarruel2023 16\n",
"#MileiNo 16\n",
"#MassaNoVasASerPresidente 15\n",
"#EleccionesArgentina 13\n",
"#GranHermano 12\n",
"#PatoPresidente 12\n",
"Name: count, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# convert dataframe column to list\n",
"hashtags = df['hashtags'].unique()\n",
"\n",
"# remove nan items from list\n",
"hashtags = [x for x in hashtags if not pd.isna(x)]\n",
"\n",
"# split items into a list based on a delimiter\n",
"hashtags = [x.split(',') for x in hashtags]\n",
"id2label = [model.config.id2label[k] for k in range(len(model.config.id2label))]\n",
"\n",
"# flatten list of lists\n",
"hashtags = [item for sublist in hashtags for item in sublist]\n",
"def predict(*args):\n",
" try:\n",
" encoding = tokenizer.encode_plus(*args)\n",
"\n",
"# remove whitespaces\n",
"hashtags = list(map(lambda x: x.replace(' ', ''), hashtags))\n",
" inputs = {\n",
" k: torch.LongTensor(encoding[k]).reshape(1, -1) for k in {\"input_ids\", \"attention_mask\", \"token_type_ids\"}\n",
" }\n",
"\n",
"# count items on list\n",
"hashtags_count = pd.Series(hashtags).value_counts()\n",
" output = model.forward(\n",
" **inputs\n",
" )\n",
"\n",
"# return first n rows in descending order\n",
"top_hashtags = hashtags_count.nlargest(20)\n",
" chars = list(zip(id2label, list(output.logits[0].detach().cpu().numpy() > 0)))\n",
"\n",
"top_hashtags"
" return [char for char, pred in chars if pred]\n",
" \n",
" except Exception as e:\n",
" # Handle the error appropriately\n",
" print(\"An error occurred:\", str(e))\n",
" return np.nan"
]
},
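The predict() helper added above encodes and scores one text per call, which is why the labelling loop later in this diff reports roughly 13 it/s over 52,476 rows. A hedged alternative, not part of the commit: batch texts through the tokenizer with padding and keep the same logits > 0 decision rule. The function name, batch size, and max_length below are arbitrary choices.

def predict_batch(texts, batch_size=32):
    # Batched variant of predict(): same threshold, far fewer forward passes.
    labels = []
    model.eval()
    for i in range(0, len(texts), batch_size):
        enc = tokenizer(texts[i:i + batch_size], padding=True,
                        truncation=True, max_length=256, return_tensors="pt")
        with torch.no_grad():
            logits = model(**enc).logits
        for row in (logits > 0):
            labels.append([id2label[j] for j, hit in enumerate(row) if hit])
    return labels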
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [SergioMassa, myriambregman, Letra_P]\n",
"1 NaN\n",
"2 NaN\n",
"Name: mentions, dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 52476/52476 [1:06:43<00:00, 13.11it/s]\n",
"/tmp/ipykernel_13246/4286658429.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" filtered_df['label'] = outputs\n"
]
}
],
"source": [
"df['mentions'] = df['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'@(\\w+)', x)) == 0 else re.findall(r'@(\\w+)', x))\n",
"texts = filtered_df['text'].to_list()\n",
"\n",
"df['mentions'].head(3)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 SergioMassa, myriambregman, Letra_P\n",
"1 NaN\n",
"2 NaN\n",
"Name: mentions, dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['mentions'] = df['mentions'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)\n",
"outputs = []\n",
"for text in tqdm(texts):\n",
" output = predict(text)\n",
" outputs.append(output)\n",
"\n",
"df['mentions'].head(3)"
"filtered_df['label'] = outputs"
]
},
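The SettingWithCopyWarning captured in the cell output above comes from adding a column to a frame derived from a slice. The usual fix, shown here with the same names the diff uses, is to take an explicit copy before assigning:

filtered_df = df.dropna(subset=['text']).copy()  # own the data outright
filtered_df['label'] = outputs  # plain column assignment, no warning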
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"edufeiok 1011\n",
"JonatanViale 880\n",
"JMilei 486\n",
"vivicanosaok 480\n",
"PRossiOficial 435\n",
"majulluis 433\n",
"lanacionmas 357\n",
"SergioMassa 281\n",
"PatoBullrich 263\n",
"Gatosylvestre 235\n",
"C5N 224\n",
"guadavazquez 213\n",
"fantinofantino 208\n",
"luisnovaresio 201\n",
"todonoticias 195\n",
"alfleuco 176\n",
"rialjorge 174\n",
"VickyVillarruel 148\n",
"LANACION 136\n",
"trebuquero 134\n",
"Name: count, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# convert dataframe column to list\n",
"mentions = df['mentions'].unique()\n",
"\n",
"# remove nan items from list\n",
"mentions = [x for x in mentions if not pd.isna(x)]\n",
"\n",
"# split items into a list based on a delimiter\n",
"mentions = [x.split(',') for x in mentions]\n",
"\n",
"# flatten list of lists\n",
"mentions = [item for sublist in mentions for item in sublist]\n",
"\n",
"# remove whitespaces\n",
"mentions = list(map(lambda x: x.replace(' ', ''), mentions))\n",
"\n",
"# count items on list\n",
"mentions_count = pd.Series(mentions).value_counts()\n",
"\n",
"# return first n rows in descending order\n",
"top_mentions = mentions_count.nlargest(20)\n",
"\n",
"top_mentions"
"filtered_df.to_csv('../data/raw/datos_etiquetados.csv', index=False)"
]
}
],
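One caveat about the final export: the label column holds Python lists, which to_csv() writes out as strings such as "['SOME_LABEL']" (label name illustrative). A sketch of recovering the lists on reload, assuming the file written by the last cell; the labeled variable name is illustrative and ast is in the standard library.

import ast
import pandas as pd

labeled = pd.read_csv('../data/raw/datos_etiquetados.csv')
labeled['label'] = labeled['label'].apply(
    lambda v: ast.literal_eval(v) if isinstance(v, str) else v)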