From 55d98dcca97f222751d80d504ed59a6d47a29576 Mon Sep 17 00:00:00 2001 From: Juliana Resplande Date: Wed, 23 Aug 2023 10:01:21 -0300 Subject: [PATCH] Add files via upload --- process.ipynb | 1615 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1615 insertions(+) create mode 100644 process.ipynb diff --git a/process.ipynb b/process.ipynb new file mode 100644 index 0000000..25490cc --- /dev/null +++ b/process.ipynb @@ -0,0 +1,1615 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install matplotlib_venn gensim matplotlib pandas numpy tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "zIUmmW67vs3H" + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "from functools import reduce\n", + "import os\n", + "\n", + "\n", + "from matplotlib_venn import venn2_unweighted\n", + "from gensim.utils import deaccent\n", + "import matplotlib.pyplot as plt\n", + "from tqdm.auto import tqdm\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "tqdm.pandas()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "64j4lYuov-a5" + }, + "outputs": [], + "source": [ + "datasets = dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "RAW_DATA_DIR = \"raw_data\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/mnt/d/Pesquisa/Fact br eval/raw_data\n" + ] + } + ], + "source": [ + "%cd {RAW_DATA_DIR}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A - Baixar dados" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://github.com/kamplus/FakeNewsSet/raw/master/Dataset/News_fake.csv\n", + "!wget 
https://github.com/kamplus/FakeNewsSet/raw/master/Dataset/News_notFake.csv\n", + "!wget https://zenodo.org/record/5191798/files/central_de_fatos.csv\n", + "!wget https://github.com/Gabriel-Lino-Garcia/FakeRecogna/raw/master/dataset/FakeRecogna.xlsx\n", + "!git clone https://github.com/roneysco/Fake.br-Corpus\n", + "!gdown --id 1Xx_4bw37cgrEkEhQ1toLsIbAj2pLgvDx -O fact-check_tweet_dataset.zip\n", + "!unzip fact-check_tweet_dataset.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## B - Importar dados" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "7ChABOSnvx5X" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11647\n", + "10551\n" + ] + } + ], + "source": [ + "datasets[\"Central de fatos\"] = pd.read_csv(\"central_de_fatos.csv\", sep=\";\")\n", + "print(datasets[\"Central de fatos\"].shape[0])\n", + "datasets[\"Central de fatos\"][\"rating\"] = datasets[\"Central de fatos\"][\"rating\"].str.lower().apply(eval).apply(set)\n", + "datasets[\"Central de fatos\"] = datasets[\"Central de fatos\"][datasets[\"Central de fatos\"][\"rating\"].apply(len) == 1]\n", + "print(datasets[\"Central de fatos\"].shape[0])\n", + "datasets[\"Central de fatos\"][\"categories\"] = datasets[\"Central de fatos\"][\"categories\"] \\\n", + " .apply(lambda c: eval(c) if isinstance(c, str) and \"[\" in c else c) \\\n", + " .apply(lambda c: c[0] if isinstance(c, list) and len(c) > 0 else c) \\\n", + " .apply(lambda c: \"\" if isinstance(c, list) and len(c) == 0 else c)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "fake_news_set_fake = pd.read_csv(\"News_fake.csv\")\n", + "fake_news_set_fake[\"alternative_name\"] = \"fake\"\n", + "\n", + "fake_news_set_not_fake = pd.read_csv(\"News_notFake.csv\")\n", + "fake_news_set_not_fake[\"alternative_name\"] = \"not_fake\"\n", + "\n", + "fake_news_set = pd.concat([fake_news_set_fake, 
fake_news_set_not_fake])\n", + "datasets[\"FakeNewsSet\"] = fake_news_set\n", + "datasets[\"FakeNewsSet\"][\"tweet_ids\"] = datasets[\"FakeNewsSet\"][\"tweet_ids\"].str.split(\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "600\n", + "27059\n" + ] + } + ], + "source": [ + "print(datasets[\"FakeNewsSet\"].shape[0])\n", + "print(datasets[\"FakeNewsSet\"][\"tweet_ids\"].apply(len).sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ecc093250e6c4bbab6fff9ae039dad66", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3600 [00:00 0).sum(axis=1).sort_values(ascending=False).index]\n", + "domain_count = domain_count[[\"FakeRecogna\", \"Central de fatos\", \"Fact-check_tweet\", \"FakeNewsSet\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FakeRecognaCentral de fatosFact-check_tweetFakeNewsSet
domain
globo.com236681876279
uol.com.br49501825175138
afp.com49504910
aosfatos.org01431100143
boatos.org24745523950
projetocomprova.com.br38836120
apublica.org00411
estadao.com.br0593620
e-farsas.com787000
gov.br215000
observador.pt00210
r7.com00012
ricmais.com.br0001
sapo.pt00630
\n", + "
" + ], + "text/plain": [ + " FakeRecogna Central de fatos Fact-check_tweet \\\n", + "domain \n", + "globo.com 2366 818 76 \n", + "uol.com.br 4950 1825 175 \n", + "afp.com 495 0 49 \n", + "aosfatos.org 0 1431 100 \n", + "boatos.org 2474 5523 95 \n", + "projetocomprova.com.br 388 361 2 \n", + "apublica.org 0 0 4 \n", + "estadao.com.br 0 593 62 \n", + "e-farsas.com 787 0 0 \n", + "gov.br 215 0 0 \n", + "observador.pt 0 0 21 \n", + "r7.com 0 0 0 \n", + "ricmais.com.br 0 0 0 \n", + "sapo.pt 0 0 63 \n", + "\n", + " FakeNewsSet \n", + "domain \n", + "globo.com 279 \n", + "uol.com.br 138 \n", + "afp.com 10 \n", + "aosfatos.org 143 \n", + "boatos.org 0 \n", + "projetocomprova.com.br 0 \n", + "apublica.org 11 \n", + "estadao.com.br 0 \n", + "e-farsas.com 0 \n", + "gov.br 0 \n", + "observador.pt 0 \n", + "r7.com 12 \n", + "ricmais.com.br 1 \n", + "sapo.pt 0 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "domain_count" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ax = domain_count.T.plot(kind='bar', stacked=True, color=[plt.cm.Paired(i) for i in range(20)])\n", + "_ = plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_ = venn2_unweighted([review_urls[\"FakeRecogna\"], review_urls[\"Central de fatos\"]], set_labels=[\"FakeRecogna\", \"Central de fatos\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "categories_1 = datasets[\"FakeRecogna\"].value_counts(subset=\"Categoria\").reset_index(name=\"FakeRecogna\")\n", + "categories_1[\"Categoria\"] = categories_1[\"Categoria\"].str.lower()\n", + "\n", + "categories_2 = datasets[\"Central de fatos\"][\"categories\"].value_counts().reset_index().dropna()\n", + "categories_2 = categories_2.rename(columns={\"index\": \"Categoria\", \"categories\": \"Central de fatos\"})\n", + "categories_2[\"Categoria\"] = categories_2[\"Categoria\"].str.lower()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoriaCentral de fatosFakeRecogna
0política15693951
1brasil1081904
2saúde6604456
3mundo555580
4entretenimento4641409
5ciência79602
\n", + "
" + ], + "text/plain": [ + " Categoria Central de fatos FakeRecogna\n", + "0 política 1569 3951\n", + "1 brasil 1081 904\n", + "2 saúde 660 4456\n", + "3 mundo 555 580\n", + "4 entretenimento 464 1409\n", + "5 ciência 79 602" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "categories_2.merge(categories_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_ = venn2_unweighted([review_urls[\"Fact-check_tweet\"], review_urls[\"FakeNewsSet\"]], set_labels=[\"Fact-check_tweet\", \"FakeNewsSet\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "all_review_urls = reduce(lambda a, b: a.union(b), review_urls.values())\n", + "\n", + "review_urls_count = list()\n", + "for review_url in all_review_urls:\n", + " item = {\"review_url\": review_url}\n", + " item.update({dataset: review_url in review_urls[dataset] for dataset in review_urls.keys()})\n", + " review_urls_count.append(item)\n", + "\n", + "review_urls_count = pd.DataFrame(review_urls_count).set_index(\"review_url\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20028" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_review_urls)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "23467" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "domain_count.sum().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FakeRecognaCentral de fatosFact-check_tweetFakeNewsSet0
0TrueFalseFalseFalse8841
1FalseTrueFalseFalse7298
6FalseTrueTrueFalse192
3FalseFalseFalseTrue312
\n", + "
" + ], + "text/plain": [ + " FakeRecogna Central de fatos Fact-check_tweet FakeNewsSet 0\n", + "0 True False False False 8841\n", + "1 False True False False 7298\n", + "6 False True True False 192\n", + "3 False False False True 312" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_urls_count.value_counts().reset_index().loc[[0,1,6,3]]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FakeRecognaCentral de fatosFact-check_tweetFakeNewsSet0
2TrueTrueFalseFalse2675
8TrueFalseTrueFalse42
10TrueFalseFalseTrue5
5FalseTrueFalseTrue250
4FalseFalseTrueFalse274
11FalseFalseTrueTrue3
\n", + "
" + ], + "text/plain": [ + " FakeRecogna Central de fatos Fact-check_tweet FakeNewsSet 0\n", + "2 True True False False 2675\n", + "8 True False True False 42\n", + "10 True False False True 5\n", + "5 False True False True 250\n", + "4 False False True False 274\n", + "11 False False True True 3" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_urls_count.value_counts().reset_index().loc[[2,8,10,5,4,11]]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FakeRecognaCentral de fatosFact-check_tweetFakeNewsSet0
7TrueTrueTrueFalse112
9FalseTrueTrueTrue24
\n", + "
" + ], + "text/plain": [ + " FakeRecogna Central de fatos Fact-check_tweet FakeNewsSet 0\n", + "7 True True True False 112\n", + "9 False True True True 24" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_urls_count.value_counts().reset_index().loc[[7,9]]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "review_count = review_urls_count.sum(axis=1).sort_values(ascending=False).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 16725\n", + "2 3167\n", + "3 136\n", + "Name: 0, dtype: int64" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_count[0].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "tweet_ids = dict()\n", + "tweet_ids[\"Fact-check_tweet\"] = set(datasets[\"Fact-check_tweet\"]['tweet_id'])\n", + "tweet_ids[\"FakeNewsSet\"] = reduce(lambda a, b: set(a).union(set(b)),datasets[\"FakeNewsSet\"]['tweet_ids'])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_ = venn2_unweighted([tweet_ids[\"Fact-check_tweet\"], tweet_ids[\"FakeNewsSet\"]], set_labels=[\"Fact-check_tweet\", \"FakeNewsSet\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "categories_1_fixed = datasets[\"FakeRecogna\"][\"URL\"].apply(lambda url: \"//\".join(url.split(\"//\")[1:]).replace(\"www.\",\"\").split(\"/\")[1])\n", + "categories_1_fixed = categories_1_fixed[categories_1_fixed.str.len() < 30].dropna().value_counts().reset_index(name=\"FakeRecogna\").rename(columns={\"index\": \"Categoria\"})\n", + "categories_2[\"Categoria\"] = categories_2[\"Categoria\"].apply(deaccent)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoriaCentral de fatosFakeRecogna
02418503
1politica15691549
2brasil1081341
3saude6601312
4mundo555395
5tecnologia549295
6entretenimento464167
7religiao271101
8esporte215276
9ciencia7939
10opiniao432
11eleicoes30185
\n", + "
" + ], + "text/plain": [ + " Categoria Central de fatos FakeRecogna\n", + "0 2418 503\n", + "1 politica 1569 1549\n", + "2 brasil 1081 341\n", + "3 saude 660 1312\n", + "4 mundo 555 395\n", + "5 tecnologia 549 295\n", + "6 entretenimento 464 167\n", + "7 religiao 271 101\n", + "8 esporte 215 276\n", + "9 ciencia 79 39\n", + "10 opiniao 43 2\n", + "11 eleicoes 30 185" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "categories_2.merge(categories_1_fixed,on=\"Categoria\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## E - Unificar" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "find_domain = lambda url: norm_domain(treat_url(url).split(\"/\")[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "url_idx = pd.DataFrame(review_urls_count.index).reset_index()\n", + "url_idx[\"index\"] = url_idx[\"index\"].apply(lambda idx: f\"review_{idx:05d}\")\n", + "url_idx = url_idx.set_index(\"review_url\")[\"index\"].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "raw_df = datasets[\"FakeRecogna\"]\n", + "\n", + "fakeRecogna = pd.DataFrame()\n", + "\n", + "fakeRecogna[\"review_text\"] = raw_df[\"Titulo\"] + \"\\n\" + raw_df[\"Subtitulo\"].fillna(\"\") + \"\\n\" + raw_df[\"Noticia\"]\n", + "fakeRecogna[\"review_text\"] = fakeRecogna[\"review_text\"].str.strip()\n", + "\n", + "fakeRecogna[\"review_author\"] = raw_df[\"Autor\"].fillna(\"\").str.strip()\n", + "fakeRecogna[\"review_author\"] = fakeRecogna[\"review_author\"].apply(lambda a: a if \"/20\" not in a else \"\")\n", + "fakeRecogna[\"review_author\"] = fakeRecogna[\"review_author\"].fillna(\"\").str.replace(\"Por \", \"\")\n", + "fakeRecogna[\"review_author\"] = fakeRecogna[\"review_author\"].apply(lambda a: a if len(a.split(\" 
\")) <= 22 else \"\")\n", + "\n", + "fakeRecogna[\"review_url\"] = raw_df[\"URL\"].fillna(\"\").str.strip()\n", + "fakeRecogna[\"review_domain\"] = fakeRecogna[\"review_url\"].apply(find_domain)\n", + "\n", + "fakeRecogna[\"review_date\"] = raw_df[\"Data\"].fillna(\"\").str.strip()\n", + "fakeRecogna[\"review_date\"] = fakeRecogna[\"review_date\"].apply(lambda d: d if isinstance(d, str) and len(d) >= 10 else \"\")\n", + "fakeRecogna[\"review_date\"] = fakeRecogna[\"review_date\"].apply(lambda d: d[:10] if isinstance(d, str) and len(d) > 10 else d)\n", + "fakeRecogna[\"review_date\"] = fakeRecogna[\"review_date\"].fillna(\"\").str.strip()\n", + "\n", + "fakeRecogna[\"category\"] = raw_df[\"Categoria\"]\n", + "fakeRecogna[\"is_fake\"] = raw_df[\"Classe\"].map({0.0: 1, 1.0: -1})\n", + "\n", + "review_id = fakeRecogna[\"review_url\"].apply(lambda url: \"//\".join(url.split(\"//\")[1:]).replace(\"www.\",\"\")).map(url_idx)\n", + "fakeRecogna.insert(0, \"review_id\", review_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "FAKE_LABELS = {\"boato\", \"falso\", \"fake\", \"enganoso\", \"fora de contexto\", \"distorcido\", \"exagerado\", \"contraditório\", \"impreciso\", \"insustentável\", \"contexto errado\"}\n", + "\n", + "def map_rating(rating: set):\n", + " if rating.intersection(FAKE_LABELS):\n", + " return 1\n", + " elif rating.intersection({\"verdadeiro, mas\", \"ainda é cedo para dizer\"}):\n", + " return 0\n", + " \n", + " return -1\n", + "\n", + "raw_df = datasets['Central de fatos']\n", + "\n", + "\n", + "central_de_fatos = pd.DataFrame()\n", + "\n", + "central_de_fatos[\"review_text\"] = raw_df[\"title\"] + \"\\n\" + raw_df[\"subtitle\"].fillna(\"\") + \"\\n\" + raw_df[\"text_news\"]\n", + "central_de_fatos[\"review_text\"] = central_de_fatos[\"review_text\"].str.strip()\n", + "\n", + "central_de_fatos[\"review_author\"] = \"\"\n", + "central_de_fatos[\"review_url\"] = 
raw_df[\"url\"].str.strip()\n", + "central_de_fatos[\"review_domain\"] = central_de_fatos[\"review_url\"].apply(find_domain)\n", + "central_de_fatos[\"review_date\"] = raw_df[\"publication_date\"].fillna(\"\").str.strip()\n", + "central_de_fatos[\"category\"] = raw_df[\"categories\"].str.lower()\n", + "central_de_fatos[\"is_fake\"] = raw_df[\"rating\"].map(map_rating)\n", + "\n", + "review_id = central_de_fatos[\"review_url\"].apply(lambda url: \"//\".join(url.split(\"//\")[1:]).replace(\"www.\",\"\")).map(url_idx)\n", + "central_de_fatos.insert(0, \"review_id\", review_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "fact_check_tt_pt = pd.DataFrame()\n", + "raw_df = datasets['Fact-check_tweet']\n", + "fact_check_tt_pt[\"review_url\"] = raw_df[\"article_url\"]\n", + "fact_check_tt_pt[\"claim_ids\"] = raw_df[\"tweet_id\"].apply(lambda x: [x])\n", + "fact_check_tt_pt[\"is_fake\"] = raw_df[\"label\"].map({0: -1, 1:1})\n", + "review_id = fact_check_tt_pt[\"review_url\"].apply(lambda url: \"//\".join(url.split(\"//\")[1:]).replace(\"www.\",\"\")).map(url_idx)\n", + "fact_check_tt_pt.insert(0, \"review_id\", review_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "fakeNewsSet = pd.DataFrame()\n", + "raw_df = datasets[\"FakeNewsSet\"]\n", + "fakeNewsSet[\"review_url\"] = raw_df[\"news_url\"]\n", + "fakeNewsSet[\"claim_ids\"] = raw_df[\"tweet_ids\"]\n", + "fakeNewsSet[\"is_fake\"] = raw_df[\"alternative_name\"].map({\"fake\": 1, \"not_fake\":-1})\n", + "review_id = fakeNewsSet[\"review_url\"].apply(lambda url: \"//\".join(url.split(\"//\")[1:]).replace(\"www.\",\"\")).map(url_idx)\n", + "fakeNewsSet.insert(0, \"review_id\", review_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "fakebr = pd.DataFrame()\n", + "raw_df = datasets['Fake.br']\n", + "fakebr[\"claim_text\"] = 
raw_df[\"text\"]\n", + "fakebr[\"claim_author\"] = raw_df[\"author\"]\n", + "fakebr[\"claim_url\"] = raw_df[\"link\"]\n", + "fakebr[\"claim_date\"] = raw_df[\"date of publication\"]\n", + "fakebr[\"category\"] = raw_df[\"category\"]\n", + "fakebr[\"is_fake\"] = raw_df[\"label\"].map({\"fake\": 1, \"true\": -1})" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "final_dataset = {\n", + " \"fact_check_tweet_pt\": fact_check_tt_pt,\n", + " \"FakeNewsSet\": fakeNewsSet,\n", + " \"FakeRecogna\": fakeRecogna,\n", + " \"central_de_fatos\": central_de_fatos,\n", + " \"fake_br\": fakebr,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "review_urls_count.columns = [\"FakeRecogna\", \"central_de_fatos\", \"fact_check_tweet_pt\", 'FakeNewsSet']" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4b18af638921437b803f6099346ada8b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3303 [00:00 1]\n", + "for url, row in tqdm(intersection_urls.iterrows(), total=intersection_urls.shape[0]):\n", + " intersection_label = set()\n", + "\n", + " review_id = url_idx[url]\n", + " for dataset in row[row == True].index:\n", + " dataset_label = final_dataset[dataset][final_dataset[dataset][\"review_id\"] == review_id][\"is_fake\"].iloc[0]\n", + " intersection_label.add(dataset_label)\n", + "\n", + " if len(intersection_label) > 1:\n", + " diff_reviews.add(review_id)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3303 240\n" + ] + } + ], + "source": [ + "print(intersection_urls.shape[0], len(diff_reviews))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + 
"outputs": [], + "source": [ + "for dataset in final_dataset:\n", + " if not \"review_id\" in final_dataset[dataset].columns:\n", + " continue\n", + "\n", + " final_dataset[dataset] = final_dataset[dataset][~final_dataset[dataset][\"review_id\"].isin(diff_reviews)]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/mnt/d/Pesquisa/Fact br eval\n" + ] + } + ], + "source": [ + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "ename": "FileExistsError", + "evalue": "[Errno 17] File exists: 'data'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileExistsError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[46], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m os\u001b[39m.\u001b[39;49mmakedirs(\u001b[39m\"\u001b[39;49m\u001b[39mdata\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", + "File \u001b[0;32m:225\u001b[0m, in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n", + "\u001b[0;31mFileExistsError\u001b[0m: [Errno 17] File exists: 'data'" + ] + } + ], + "source": [ + "os.makedirs(\"data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "for dataset in final_dataset:\n", + " final_dataset[dataset].to_csv(os.path.join(\"data\",f\"{dataset}.tsv\"), sep=\"\\t\", index=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "updating: data/ (stored 0%)\n", + "updating: data/central_de_fatos.tsv (deflated 68%)\n", + "updating: data/fact_check_tweet_pt.tsv (deflated 72%)\n", + "updating: data/FakeNewsSet.tsv (deflated 61%)\n", + "updating: data/FakeRecogna.tsv (deflated 65%)\n", + "updating: 
data/fake_br.tsv (deflated 63%)\n" + ] + } + ], + "source": [ + "!zip FactChecksbr.zip -r data" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}