diff --git a/notebooks/00_preprocessing.ipynb b/notebooks/00_preprocessing.ipynb index c72f1aa..e86462b 100644 --- a/notebooks/00_preprocessing.ipynb +++ b/notebooks/00_preprocessing.ipynb @@ -24,10 +24,7 @@ "import gc # Módulo para realizar coleta de lixo e gerenciamento de memória\n", "\n", "import numpy as np # Módulo para trabalhar com matrizes e funções matemáticas\n", - "import pandas as pd # Módulo para trabalhar com dataframes e séries em Python\n", - "\n", - "import nltk # Módulo para processamento de linguagem natural\n", - "\n" + "import pandas as pd # Módulo para trabalhar com dataframes e séries em Python\n" ] }, { @@ -44,20 +41,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# caminho das queries \n", - "query_path = '../data/emails/mini_newsgroups/misc.forsale/'\n", - "\n", - "# caminho dos documentos\n", - "docs_path = '../data/emails/20_newsgroups/misc.forsale/'" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -73,23 +57,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "35" + "0" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# caminho das queries \n", + "query_path = '../data/emails/mini_newsgroups/misc.forsale/'\n", + "\n", + "# caminho dos documentos\n", + "docs_path = '../data/emails/20_newsgroups/misc.forsale/'\n", + "\n", "# Import das bases\n", - "database_docs = read_files(query_path)\n", + "database_docs = read_files(docs_path)\n", "database_query = read_files(query_path)\n", "\n", "base_docs = pd.DataFrame(database_docs)\n", @@ -101,6 +91,7 @@ "\n", "# junção das bases \n", "base = pd.concat([base_docs, base_query])\n", + "base.reset_index(drop=True, inplace=True)\n", "\n", "del base_docs, base_query, database_docs, 
database_query\n", "gc.collect()\n" @@ -124,69 +115,123 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "import ir " + "# (\\[a-z]): para encontrar todos os caracteres que começam com uma barra invertida () seguida por uma letra minúscula (a-z);\n", + "# ([^\\w\\]): para encontrar todos os caracteres que não são letras, números ou barras invertidas ();\n", + "# (\\S+\\d\\S+): para encontrar todos os trechos de texto que contêm um ou mais caracteres não brancos (\\S), \n", + "# seguidos por um dígito (\\d), seguidos por mais um ou mais caracteres não brancos (\\S).\n", + "base['post'] = base['text'].replace(r'(\\\\[a-z])|([^\\w\\\\])|(\\S+\\d\\S+)', ' ', regex=True)\n", + "\n", + "\n", + "# Aplicando as funções str.lower() e str.strip() simultaneamente\n", + "base['post'] = base['post'].apply(lambda x: x.lower().strip())\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "### Tokenização e Lemmatizer\n", + "\n", + "
" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "0 path cantaloupe srv c cmu edu rochester udel g...\n", + "1 path from myoakam ci ohio state edu micah r yo...\n", + "2 path from maureen l eagle newsgroup misc forsa...\n", + "3 newsgroup misc forsale path from mike diack mi...\n", + "4 path from jvinson xsoft xerox com jeffrey a vi...\n", + " ... \n", + "1095 xref cantaloupe srv c cmu edu newsgroup misc w...\n", + "1096 newsgroup misc forsale subject want lcd overhe...\n", + "1097 newsgroup ingr forsale hsv forsale misc forsal...\n", + "1098 newsgroup misc forsale path cantaloupe srv c c...\n", + "1099 xref cantaloupe srv c cmu edu path from scott ...\n", + "Name: post, Length: 1100, dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from ir.preprocessing import lemmatize_word\n", + "\n", + "base['post'].apply(lambda x: ' '.join([lemmatize_word(word.lower()) for word in x.split()]))" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "# (\\[a-z]): para encontrar todos os caracteres que começam com uma barra invertida () seguida por uma letra minúscula (a-z);\n", - "# ([^\\w\\]): para encontrar todos os caracteres que não são letras, números ou barras invertidas ();\n", - "# (\\S+\\d\\S+): para encontrar todos os trechos de texto que contêm um ou mais caracteres não brancos (\\S), seguidos por um dígito (\\d), seguidos por mais um ou mais caracteres não brancos (\\S).\n", - "base['post'] = base['text'].replace(r'(\\\\[a-z])|([^\\w\\\\])|(\\S+\\d\\S+)', ' ', regex=True)\n", - "\n", + "from ir import tf_idf\n", "\n", - "# Aplicando as funções str.lower() e str.strip() simultaneamente\n", - "base['post'] = base['post'].apply(lambda x: x.lower().strip())\n" + "weight = tf_idf.tfidf(base, 
'post').iloc[1:]\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Identificação das query / docs" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ - "from ir.preprocessing import PreProcessing as pp " + "d_index = base.query('tag==\"doc\"').index\n", + "q_index = base.query('tag==\"query\"').index" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 145, "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "type object 'PreProcessing' has no attribute 'tfidf'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[34], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m pp\u001b[39m.\u001b[39;49mtfidf(base, \u001b[39m'\u001b[39m\u001b[39mpost\u001b[39m\u001b[39m'\u001b[39m)\n", - "\u001b[1;31mAttributeError\u001b[0m: type object 'PreProcessing' has no attribute 'tfidf'" - ] - } - ], + "outputs": [], "source": [ - "pp.tfidf(base, 'post')" + "import itertools\n", + "\n", + "similarity = dict()\n", + "index_matrix = dict()\n", + "rank_matrix = dict()\n", + "\n", + "for j in q_index:\n", + " for i in d_index: \n", + " numerator = np.sum( weight.loc[:,i] * weight.loc[: , j])\n", + " denominator = np.linalg.norm(weight.loc[:,i])*np.linalg.norm(weight.loc[: , j])\n", + " similarity[i] = numerator/denominator\n", + "\n", + "\n", + " rank_matrix[j] = pd.DataFrame(similarity.values(), columns=['rank']).sort_values(by='rank', ascending=False).head(10).values.tolist()\n", + " index_matrix[j] = pd.DataFrame(similarity.values(), columns=['rank']).sort_values(by='rank', ascending=False).head(10).index.to_list()\n", + "\n", + " \n" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 153, "metadata": {},
"outputs": [ { @@ -210,574 +255,370 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", - " 2\n", - " 3\n", - " 4\n", - " 5\n", - " 6\n", - " 7\n", - " 8\n", - " 9\n", + " 1000\n", + " 1001\n", + " 1002\n", + " 1003\n", + " 1004\n", + " 1005\n", + " 1006\n", + " 1007\n", + " 1008\n", + " 1009\n", " ...\n", - " 90\n", - " 91\n", - " 92\n", - " 93\n", - " 94\n", - " 95\n", - " 96\n", - " 97\n", - " 98\n", - " 99\n", + " 1090\n", + " 1091\n", + " 1092\n", + " 1093\n", + " 1094\n", + " 1095\n", + " 1096\n", + " 1097\n", + " 1098\n", + " 1099\n", " \n", " \n", " \n", " \n", - " \n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " ...\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " \n", - " \n", - " edu\n", - " 0.769037\n", - " 0.769037\n", - " 0.553274\n", - " 0.612645\n", - " 0.702170\n", - " 0.702170\n", - " 0.476731\n", - " 0.476731\n", - " 0.702170\n", - " 0.368849\n", + " 0\n", + " [0.6422584945103841]\n", + " [0.668743974268336]\n", + " [1.008422835773131]\n", + " [0.6661363151210968]\n", + " [0.8642602181331219]\n", + " [0.7417012424961686]\n", + " [0.7315442686011049]\n", + " [0.9460833202189584]\n", + " [0.9746037822064916]\n", + " [0.6869123604148133]\n", " ...\n", - " 0.661155\n", - " 0.476731\n", - " 0.702170\n", - " 0.368849\n", - " 0.368849\n", - " 0.553274\n", - " 0.661155\n", - " 0.702170\n", - " 0.661155\n", - " 0.184425\n", + " [1.1523833456270243]\n", + " [0.7956243694544436]\n", + " [1.7802942925566625]\n", + " [0.5158809769289813]\n", + " [0.5234239788184921]\n", + " [0.8038496213979783]\n", + " [0.6360279187397326]\n", + " [1.1259044435676868]\n", + " [1.0616162379284384]\n", + " [0.6784846202202287]\n", " \n", " \n", - " wpi\n", - " 23.817975\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 
0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 1\n", + " [0.2873172889001637]\n", + " [0.17456679249116952]\n", + " [0.6332470445844255]\n", + " [0.09678526898308248]\n", + " [0.537824913541413]\n", + " [0.2820702175096787]\n", + " [0.6513139955405113]\n", + " [0.3597970619791958]\n", + " [0.24900044829809295]\n", + " [0.14368626326554504]\n", " ...\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " [0.42456045151374616]\n", + " [0.13839258426540738]\n", + " [0.1920094056436893]\n", + " [0.4608108432879023]\n", + " [0.46754863207460967]\n", + " [0.635942460180357]\n", + " [0.2673113797648495]\n", + " [0.09064837068978648]\n", + " [0.19426228087688888]\n", + " [0.17427577670397354]\n", " \n", " \n", - " to\n", - " 0.392922\n", - " 0.000000\n", - " 0.578730\n", - " 0.152003\n", - " 0.456009\n", - " 0.152003\n", - " 0.152003\n", - " 0.304006\n", - " 0.392922\n", - " 0.152003\n", + " 2\n", + " [0.19584221062188678]\n", + " [0.1319897299803032]\n", + " [0.08491363433325547]\n", + " [0.08737476357264361]\n", + " [0.47799964699339453]\n", + " [0.20691840972537448]\n", + " [0.0713300455225047]\n", + " [0.18078764107798664]\n", + " [0.11093369090167131]\n", + " [0.09580043962698802]\n", " ...\n", - " 0.304006\n", - " 0.392922\n", - " 0.544925\n", - " 0.000000\n", - " 0.000000\n", - " 0.152003\n", - " 0.152003\n", - " 0.152003\n", - " 0.504943\n", - " 0.152003\n", + " [0.39006994534248846]\n", + " [0.09497709201044909]\n", + " [0.17705994384084703]\n", + " [0.1911406606344065]\n", + " [0.19471247373710213]\n", + " [0.6126542869398587]\n", + " [0.06788233871189489]\n", + " [0.07823702305427702]\n", + " [0.17319361461646068]\n", + " [0.07447944169226609]\n", " \n", " \n", - " cs\n", - " 1.023461\n", - " 0.395929\n", - " 0.395929\n", - " 0.395929\n", - " 0.791857\n", - " 0.395929\n", - " 
0.395929\n", - " 0.000000\n", - " 0.791857\n", - " 0.395929\n", + " 3\n", + " [0.13903116434341475]\n", + " [0.10146093652625415]\n", + " [0.08181050400129704]\n", + " [0.07523129301857605]\n", + " [0.18045913119908524]\n", + " [0.20260530333012666]\n", + " [0.054813355262948726]\n", + " [0.16776960486431058]\n", + " [0.10031171790840715]\n", + " [0.07826399750895882]\n", " ...\n", - " 0.395929\n", - " 0.000000\n", - " 0.791857\n", - " 0.000000\n", - " 0.000000\n", - " 0.791857\n", - " 0.395929\n", - " 0.791857\n", - " 0.395929\n", - " 0.395929\n", + " [0.2509003350905366]\n", + " [0.08889171346193057]\n", + " [0.16005754765559338]\n", + " [0.053302833660593664]\n", + " [0.053302833660593664]\n", + " [0.2514441707424639]\n", + " [0.06610690581400588]\n", + " [0.07732220562999899]\n", + " [0.17148587761263784]\n", + " [0.04757751873987604]\n", " \n", " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", + " 4\n", + " [0.1129173777348391]\n", + " [0.08606053698831793]\n", + " [0.07547569629425901]\n", + " [0.06970411933742698]\n", + " [0.17135272355687187]\n", + " [0.20091765863852995]\n", + " [0.05469328989879712]\n", + " [0.16743762657989103]\n", + " [0.09811418008672529]\n", + " [0.07488273104320195]\n", " ...\n", + " [0.1456676671253771]\n", + " [0.08031528425526144]\n", + " [0.1591723994058402]\n", + " [0.05187299077727413]\n", + " [0.04876597436921601]\n", + " [0.24540764374264745]\n", + " [0.064867860088572]\n", + " [0.07458882712067232]\n", + " [0.1268769662336863]\n", + " [0.04675221931850179]\n", " \n", " \n", - " msrp\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 5\n", + " [0.11038217708080131]\n", + " 
[0.08212202043548315]\n", + " [0.07432402899651668]\n", + " [0.06957535331034335]\n", + " [0.11414112796082569]\n", + " [0.09766529639479365]\n", + " [0.05342228639423025]\n", + " [0.11293055713592527]\n", + " [0.09499508593239105]\n", + " [0.07351917900522721]\n", " ...\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 6.643856\n", + " [0.1356468036645224]\n", + " [0.07846954905194828]\n", + " [0.15849239426755599]\n", + " [0.04462161696830591]\n", + " [0.04472544383210234]\n", + " [0.2051558258313172]\n", + " [0.06082789393903154]\n", + " [0.06855422701443233]\n", + " [0.08964124013628569]\n", + " [0.043706874358644336]\n", " \n", " \n", - " scotts\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 6\n", + " [0.10833574384967115]\n", + " [0.08201334401786602]\n", + " [0.07412645164725896]\n", + " [0.06759708576024981]\n", + " [0.09130509649851562]\n", + " [0.08716787810262448]\n", + " [0.0497899252125642]\n", + " [0.09487498299853164]\n", + " [0.08477817264550953]\n", + " [0.06935710726422203]\n", " ...\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 6.643856\n", + " [0.13395484290588885]\n", + " [0.07651073480529516]\n", + " [0.14400058142958222]\n", + " [0.04329534404644248]\n", + " [0.04462161696830591]\n", + " [0.10677052218571517]\n", + " [0.05830025911923057]\n", + " [0.06718300838215913]\n", + " [0.08292563259802246]\n", + " [0.04288337427490202]\n", " \n", " \n", - " bbking\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 7\n", + " [0.09458970277908005]\n", + " 
[0.08147133735902357]\n", + " [0.07118734960890093]\n", + " [0.06755116098175724]\n", + " [0.08423578168208135]\n", + " [0.08564850087282809]\n", + " [0.04712198065394311]\n", + " [0.08900364470414873]\n", + " [0.0845410509094711]\n", + " [0.06696048135598477]\n", " ...\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 6.643856\n", + " [0.12832106795670173]\n", + " [0.0742473891955823]\n", + " [0.1337598250032541]\n", + " [0.03987922316585672]\n", + " [0.04329534404644248]\n", + " [0.08913041300626738]\n", + " [0.05705502660231897]\n", + " [0.06310045914518085]\n", + " [0.07445114613146177]\n", + " [0.04255318648575154]\n", " \n", " \n", - " sherman\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 8\n", + " [0.06683185055384143]\n", + " [0.08107329389309223]\n", + " [0.06992507760853946]\n", + " [0.0665144106585746]\n", + " [0.07732918671239193]\n", + " [0.07767329982049115]\n", + " [0.04496152399683175]\n", + " [0.07886393345966285]\n", + " [0.08159272098702083]\n", + " [0.0655222722939256]\n", " ...\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 6.643856\n", + " [0.12560417304820823]\n", + " [0.07411198929338968]\n", + " [0.1331408833577327]\n", + " [0.03857993879037075]\n", + " [0.038711425869337984]\n", + " [0.08897777442428674]\n", + " [0.05560594182068467]\n", + " [0.0626845924656044]\n", + " [0.07237739511197822]\n", + " [0.041613779283231046]\n", " \n", " \n", - " mpd\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 9\n", + " [0.06139453131422424]\n", + " [0.07948227337943198]\n", + 
" [0.06689020592169981]\n", + " [0.05955785852513209]\n", + " [0.07517277782794342]\n", + " [0.07765247541664737]\n", + " [0.04318953773752061]\n", + " [0.07531732464870161]\n", + " [0.08098184954929952]\n", + " [0.06458414852590263]\n", " ...\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 6.643856\n", + " [0.12278663409161311]\n", + " [0.07365677105558269]\n", + " [0.13062404036287226]\n", + " [0.03855871951859559]\n", + " [0.03855871951859559]\n", + " [0.08461722594983144]\n", + " [0.053320319118708596]\n", + " [0.06258022212397257]\n", + " [0.07103356433858925]\n", + " [0.04145958334073781]\n", " \n", " \n", "\n", - "

3654 rows × 100 columns

\n", + "

10 rows × 100 columns

\n", "" ], "text/plain": [ - " 0 1 2 3 4 5 \n", - " 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \\\n", - "edu 0.769037 0.769037 0.553274 0.612645 0.702170 0.702170 \n", - "wpi 23.817975 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "to 0.392922 0.000000 0.578730 0.152003 0.456009 0.152003 \n", - "cs 1.023461 0.395929 0.395929 0.395929 0.791857 0.395929 \n", - "... ... ... ... ... ... ... \n", - "msrp 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "scotts 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "bbking 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "sherman 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "mpd 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + " 1000 1001 1002 \n", + "0 [0.6422584945103841] [0.668743974268336] [1.008422835773131] \\\n", + "1 [0.2873172889001637] [0.17456679249116952] [0.6332470445844255] \n", + "2 [0.19584221062188678] [0.1319897299803032] [0.08491363433325547] \n", + "3 [0.13903116434341475] [0.10146093652625415] [0.08181050400129704] \n", + "4 [0.1129173777348391] [0.08606053698831793] [0.07547569629425901] \n", + "5 [0.11038217708080131] [0.08212202043548315] [0.07432402899651668] \n", + "6 [0.10833574384967115] [0.08201334401786602] [0.07412645164725896] \n", + "7 [0.09458970277908005] [0.08147133735902357] [0.07118734960890093] \n", + "8 [0.06683185055384143] [0.08107329389309223] [0.06992507760853946] \n", + "9 [0.06139453131422424] [0.07948227337943198] [0.06689020592169981] \n", "\n", - " 6 7 8 9 ... 90 91 \n", - " 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \\\n", - "edu 0.476731 0.476731 0.702170 0.368849 ... 0.661155 0.476731 \n", - "wpi 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", - "to 0.152003 0.304006 0.392922 0.152003 ... 0.304006 0.392922 \n", - "cs 0.395929 0.000000 0.791857 0.395929 ... 0.395929 0.000000 \n", - "... ... ... ... ... ... ... ... \n", - "msrp 0.000000 0.000000 0.000000 0.000000 ... 
0.000000 0.000000 \n", - "scotts 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", - "bbking 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", - "sherman 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", - "mpd 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", + " 1003 1004 1005 \n", + "0 [0.6661363151210968] [0.8642602181331219] [0.7417012424961686] \\\n", + "1 [0.09678526898308248] [0.537824913541413] [0.2820702175096787] \n", + "2 [0.08737476357264361] [0.47799964699339453] [0.20691840972537448] \n", + "3 [0.07523129301857605] [0.18045913119908524] [0.20260530333012666] \n", + "4 [0.06970411933742698] [0.17135272355687187] [0.20091765863852995] \n", + "5 [0.06957535331034335] [0.11414112796082569] [0.09766529639479365] \n", + "6 [0.06759708576024981] [0.09130509649851562] [0.08716787810262448] \n", + "7 [0.06755116098175724] [0.08423578168208135] [0.08564850087282809] \n", + "8 [0.0665144106585746] [0.07732918671239193] [0.07767329982049115] \n", + "9 [0.05955785852513209] [0.07517277782794342] [0.07765247541664737] \n", "\n", - " 92 93 94 95 96 97 98 \n", - " 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \\\n", - "edu 0.702170 0.368849 0.368849 0.553274 0.661155 0.702170 0.661155 \n", - "wpi 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "to 0.544925 0.000000 0.000000 0.152003 0.152003 0.152003 0.504943 \n", - "cs 0.791857 0.000000 0.000000 0.791857 0.395929 0.791857 0.395929 \n", - "... ... ... ... ... ... ... ... 
\n", - "msrp 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "scotts 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "bbking 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "sherman 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "mpd 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + " 1006 1007 1008 \n", + "0 [0.7315442686011049] [0.9460833202189584] [0.9746037822064916] \\\n", + "1 [0.6513139955405113] [0.3597970619791958] [0.24900044829809295] \n", + "2 [0.0713300455225047] [0.18078764107798664] [0.11093369090167131] \n", + "3 [0.054813355262948726] [0.16776960486431058] [0.10031171790840715] \n", + "4 [0.05469328989879712] [0.16743762657989103] [0.09811418008672529] \n", + "5 [0.05342228639423025] [0.11293055713592527] [0.09499508593239105] \n", + "6 [0.0497899252125642] [0.09487498299853164] [0.08477817264550953] \n", + "7 [0.04712198065394311] [0.08900364470414873] [0.0845410509094711] \n", + "8 [0.04496152399683175] [0.07886393345966285] [0.08159272098702083] \n", + "9 [0.04318953773752061] [0.07531732464870161] [0.08098184954929952] \n", "\n", - " 99 \n", - " 0.000000 \n", - "edu 0.184425 \n", - "wpi 0.000000 \n", - "to 0.152003 \n", - "cs 0.395929 \n", - "... ... \n", - "msrp 6.643856 \n", - "scotts 6.643856 \n", - "bbking 6.643856 \n", - "sherman 6.643856 \n", - "mpd 6.643856 \n", + " 1009 ... 1090 1091 \n", + "0 [0.6869123604148133] ... [1.1523833456270243] [0.7956243694544436] \\\n", + "1 [0.14368626326554504] ... [0.42456045151374616] [0.13839258426540738] \n", + "2 [0.09580043962698802] ... [0.39006994534248846] [0.09497709201044909] \n", + "3 [0.07826399750895882] ... [0.2509003350905366] [0.08889171346193057] \n", + "4 [0.07488273104320195] ... [0.1456676671253771] [0.08031528425526144] \n", + "5 [0.07351917900522721] ... [0.1356468036645224] [0.07846954905194828] \n", + "6 [0.06935710726422203] ... 
[0.13395484290588885] [0.07651073480529516] \n", + "7 [0.06696048135598477] ... [0.12832106795670173] [0.0742473891955823] \n", + "8 [0.0655222722939256] ... [0.12560417304820823] [0.07411198929338968] \n", + "9 [0.06458414852590263] ... [0.12278663409161311] [0.07365677105558269] \n", "\n", - "[3654 rows x 100 columns]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tf_idf.tfidf(base.query('tag==\"doc\"'), 'post').fillna(0)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "### Tokenização e Lemmatizer\n", - "\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 path cantaloupe srv cs cmu edu rochester udel ...\n", - "1 path cantaloupe srv cs cmu edu da new harvard ...\n", - "2 newsgroup misc forsale path cantaloupe srv cs ...\n", - "3 path cantaloupe srv cs cmu edu rochester corne...\n", - "4 xref cantaloupe srv cs cmu edu path cantaloupe...\n", - " ... \n", - "95 xref cantaloupe srv cs cmu edu newsgroup misc ...\n", - "96 newsgroup misc forsale subject want lcd overhe...\n", - "97 newsgroup ingr forsale hsv forsale misc forsal...\n", - "98 newsgroup misc forsale path cantaloupe srv cs ...\n", - "99 xref cantaloupe srv cs cmu edu path from scott...\n", - "Name: post, Length: 200, dtype: object" + " 1092 1093 1094 \n", + "0 [1.7802942925566625] [0.5158809769289813] [0.5234239788184921] \\\n", + "1 [0.1920094056436893] [0.4608108432879023] [0.46754863207460967] \n", + "2 [0.17705994384084703] [0.1911406606344065] [0.19471247373710213] \n", + "3 [0.16005754765559338] [0.053302833660593664] [0.053302833660593664] \n", + "4 [0.1591723994058402] [0.05187299077727413] [0.04876597436921601] \n", + "5 [0.15849239426755599] [0.04462161696830591] [0.04472544383210234] \n", + "6 [0.14400058142958222] [0.04329534404644248] [0.04462161696830591] \n", + "7 [0.1337598250032541] [0.03987922316585672] [0.04329534404644248] \n", + "8 [0.1331408833577327] [0.03857993879037075] [0.038711425869337984] \n", + "9 [0.13062404036287226] [0.03855871951859559] [0.03855871951859559] \n", + "\n", + " 1095 1096 1097 \n", + "0 [0.8038496213979783] [0.6360279187397326] [1.1259044435676868] \\\n", + "1 [0.635942460180357] [0.2673113797648495] [0.09064837068978648] \n", + "2 [0.6126542869398587] [0.06788233871189489] [0.07823702305427702] \n", + "3 [0.2514441707424639] [0.06610690581400588] [0.07732220562999899] \n", + "4 [0.24540764374264745] [0.064867860088572] [0.07458882712067232] \n", + "5 [0.2051558258313172] 
[0.06082789393903154] [0.06855422701443233] \n", + "6 [0.10677052218571517] [0.05830025911923057] [0.06718300838215913] \n", + "7 [0.08913041300626738] [0.05705502660231897] [0.06310045914518085] \n", + "8 [0.08897777442428674] [0.05560594182068467] [0.0626845924656044] \n", + "9 [0.08461722594983144] [0.053320319118708596] [0.06258022212397257] \n", + "\n", + " 1098 1099 \n", + "0 [1.0616162379284384] [0.6784846202202287] \n", + "1 [0.19426228087688888] [0.17427577670397354] \n", + "2 [0.17319361461646068] [0.07447944169226609] \n", + "3 [0.17148587761263784] [0.04757751873987604] \n", + "4 [0.1268769662336863] [0.04675221931850179] \n", + "5 [0.08964124013628569] [0.043706874358644336] \n", + "6 [0.08292563259802246] [0.04288337427490202] \n", + "7 [0.07445114613146177] [0.04255318648575154] \n", + "8 [0.07237739511197822] [0.041613779283231046] \n", + "9 [0.07103356433858925] [0.04145958334073781] \n", + "\n", + "[10 rows x 100 columns]" ] }, - "execution_count": 26, + "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "base['post'].apply(lambda x: ' '.join([lemmatize_word(word.lower()) for word in x.split()]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def lemmatize_text(df, input_col):\n", - " vowels = ['a', 'e', 'i', 'o', 'u']\n", - "\n", - "a def lemmatize_word(word):\n", - " if len(word) <= 2:\n", - " return word\n", - " \n", - " if word.endswith('ns'):\n", - " return word[:-2]\n", - " \n", - " if word.endswith('s'):\n", - " return word[:-1]\n", - " \n", - " if word.endswith('ing') and len(word) > 5:\n", - " if word[-4] == word[-5] and word[-5] not in vowels:\n", - " return word[:-4] + word[-3:]\n", - " elif word[-3] in vowels:\n", - " return word[:-3]\n", - " else:\n", - " return word[:-2]\n", - " \n", - " if word.endswith('ly') and len(word) > 4:\n", - " return word[:-2]\n", - " \n", - " if word.endswith('ed') and len(word) > 3:\n", - " if 
word[-3] == word[-4] and word[-4] not in vowels:\n", - " return word[:-3] + word[-2:]\n", - " else:\n", - " return word[:-2]\n", - " \n", - " return word\n", - "\n", - " df[output_col] = df[input_col].apply(lambda x: ' '.join([lemmatize_word(word.lower()) for word in x.split()]))\n", - " return df\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package wordnet to\n", - "[nltk_data] C:\\Users\\kevin\\AppData\\Roaming\\nltk_data...\n", - "[nltk_data] Package wordnet is already up-to-date!\n", - "[nltk_data] Downloading package punkt to\n", - "[nltk_data] C:\\Users\\kevin\\AppData\\Roaming\\nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] - } - ], - "source": [ - "nltk.download('wordnet') # faz o download do recurso 'wordnet' do nltk\n", - "nltk.download('punkt') # faz o download do recurso 'punkt' do nltk\n", - "\n", - "# Cria um objeto 'w_tokenizer' da classe 'WhitespaceTokenizer' do nltk para tokenizar o texto por espaços em branco\n", - "w_tokenizer = nltk.tokenize.WhitespaceTokenizer()\n", - "\n", - "# Cria um objeto 'lemmatizer' da classe 'WordNetLemmatizer' do nltk para realizar a lematização das palavras\n", - "lemmatizer = nltk.WordNetLemmatizer()\n", - "\n", - "# Define a função 'lemmatizer_text' que recebe um texto como entrada, tokeniza o texto em palavras e lematiza cada palavra\n", - "def lemmatizer_text(text): \n", - " return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]\n", - "\n", - "# Cria uma nova coluna 'tokens' na tabela 'base_inicial' que contém uma lista de tokens lematizados para cada texto\n", - "base_inicial['tokens'] = base_inicial['text'].map(lemmatizer_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "### Remoção de Stopwords\n", - "\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to\n", - "[nltk_data] C:\\Users\\kevin\\AppData\\Roaming\\nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - } - ], - "source": [ - "# Faz o download do recurso 'stopwords' do nltk\n", - "nltk.download('stopwords')\n", - "\n", - "# Define a lista de stopwords em inglês usando o módulo stopwords do nltk\n", - "stopwords = stopwords.words('english')\n", - "\n", - "# Aplica a função lambda em cada linha da coluna 'text' da tabela 'base_inicial'\n", - "# A função lambda realiza a tokenização do texto, transforma as palavras em minúsculas e remove as stopwords\n", - "base_inicial['text'] = base_inicial['text'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stopwords))\n", - "base_inicial['tokens'] = base_inicial['tokens'].apply(lambda words: [word.lower() for word in words if word not in stopwords])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# # Cria uma lista de palavras a partir da lista de tokens\n", - "# w = [j for i in list(itertools.chain(base_inicial['tokens'])) for j in i]\n", - "\n", - "# # Instancia um objeto SpellChecker para correção ortográfica\n", - "# spell = SpellChecker()\n", - "\n", - "# if !os.path.isfile(): \n", - "# # Cria um dicionário com as palavras únicas da lista, faz a correção ortográfica e associa com a palavra original\n", - "# spell_checked = {word: spell.correction(word) for word in pd.Series(w).unique()}\n", - "\n", - "# # Define o caminho do arquivo que irá armazenar o dicionário serializado\n", - "# path = '../references/spellcheck.pickle'\n", - "\n", - "# # Abre o arquivo para gravação em modo binário e escreve o objeto serializado\n", - "# with open(path, 'wb') as file: \n", - "# 
pickle.dump(spell_checked, file)\n", "# else: \n", "# pass" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Export da base" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "path = '../data/processed/base_processed.parquet.gzip'\n", "\n", "if os.path.isfile(path): \n", " answer = input('File already exists, do you want to overwrite? (y/n)')\n", " if answer.lower() in ['s', 'y']:\n", " base_inicial.to_parquet(path, compression='gzip')\n", " else:\n", " raise FileExistsError('File already exists')\n", "else: \n", " base_inicial.to_parquet(path, compression='gzip')" ] } ], diff --git a/notebooks/ir/tf_idf.py b/notebooks/ir/tf_idf.py index 5b6f737..5473a4b 100644 --- a/notebooks/ir/tf_idf.py +++ b/notebooks/ir/tf_idf.py @@ -15,7 +15,7 @@ def log_freq(doc): """ words = doc.split(' ') # Separar o documento em palavras freq = pd.Series(words).value_counts() # Contar a frequência de cada palavra - log_freq = 1 + np.log2(freq) # Calcular o log na base 2 da frequência + log_freq = 1 + np.log10(freq) # Calcular o log na base 10 da frequência return log_freq # Função que aplica a função log_freq a cada documento do dataframe