Skip to content

Commit

Permalink
Updates
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinsrq committed May 23, 2023
1 parent bcd1b89 commit a13a25c
Show file tree
Hide file tree
Showing 3 changed files with 273 additions and 43 deletions.
254 changes: 234 additions & 20 deletions notebooks/information_retrieval_emails.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -41,23 +41,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def read_files(doc_dir):\n",
" # Use a list comprehension to get a list of file paths\n",
" database = [{'filepath': doc_dir,\n",
" 'filename': filename,\n",
" 'text': open(os.path.join(doc_dir, filename), 'r').read().strip()}\n",
" for filename in os.listdir(doc_dir)]\n",
"\n",
" return database\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 31,
"metadata": {},
"outputs": [
{
Expand All @@ -66,17 +50,93 @@
"0"
]
},
"execution_count": 3,
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# caminho das queries \n",
"query_path = '../data/emails/mini_newsgroups/misc.forsale/'\n",
"query_path = '../data/emails/mini_newsgroups/'\n",
"\n",
"# caminho dos documentos\n",
"docs_path = '../data/emails/20_newsgroups/'\n",
"\n",
"# Iterate over each file in the directory and its subdirectories\n",
"def process_files(doc_dir: str): \n",
" \n",
" database = [] \n",
" \n",
" for filepath in os.listdir(doc_dir): \n",
" \n",
" for filename in os.listdir(f'{doc_dir}{filepath}'):\n",
"\n",
" # Open each file individually and read its contents\n",
" with open(os.path.join(doc_dir, filepath, filename), 'r') as f:\n",
" text_data = f.read().strip()\n",
"\n",
" # Split the header and body of the email\n",
" try:\n",
" header, body = text_data.split('\\n\\n', maxsplit=1)\n",
" except:\n",
" continue\n",
"\n",
" # Convert header to a dictionary\n",
" # header_dict = {}\n",
" # for line in header.split('\\n'):\n",
" # try:\n",
" # # Split the key and value in each header field and store them in a dictionary\n",
" # key, value = line.strip().split(': ', maxsplit=1)\n",
" # header_dict[key] = value\n",
" # except:\n",
" # # If a header field cannot be split properly, skip it and continue\n",
" # continue\n",
"\n",
" # Append the processed data to the list\n",
"\n",
" database.append({'filepath': filepath, \n",
" 'filename': filename,\n",
" 'body': body, \n",
" # **header_dict,\n",
" # 'text': text_data\n",
" })\n",
" return database\n",
"\n",
"# transformation from dict -> dataframe\n",
"base_doc = pd.DataFrame(process_files(docs))\n",
"\n",
"base_doc = pd.DataFrame(process_files(doc_dir))\n",
"\n",
"# remove database from memory\n",
"gc.collect()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "PermissionError",
"evalue": "[Errno 13] Permission denied: '../data/emails/20_newsgroups/alt.atheism'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mPermissionError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[5], line 8\u001b[0m\n\u001b[0;32m 5\u001b[0m docs_path \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m../data/emails/20_newsgroups/\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 7\u001b[0m \u001b[39m# Import das bases\u001b[39;00m\n\u001b[1;32m----> 8\u001b[0m database_docs \u001b[39m=\u001b[39m read_files(docs_path)\n\u001b[0;32m 9\u001b[0m database_query \u001b[39m=\u001b[39m read_files(query_path)\n\u001b[0;32m 11\u001b[0m base_docs \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mDataFrame(database_docs)\n",
"Cell \u001b[1;32mIn[2], line 3\u001b[0m, in \u001b[0;36mread_files\u001b[1;34m(doc_dir)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mread_files\u001b[39m(doc_dir):\n\u001b[0;32m 2\u001b[0m \u001b[39m# Use a list comprehension to get a list of file paths\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m database \u001b[39m=\u001b[39m [{\u001b[39m'\u001b[39;49m\u001b[39mfilepath\u001b[39;49m\u001b[39m'\u001b[39;49m: doc_dir,\n\u001b[0;32m 4\u001b[0m \u001b[39m'\u001b[39;49m\u001b[39mfilename\u001b[39;49m\u001b[39m'\u001b[39;49m: filename,\n\u001b[0;32m 5\u001b[0m \u001b[39m'\u001b[39;49m\u001b[39mtext\u001b[39;49m\u001b[39m'\u001b[39;49m: \u001b[39mopen\u001b[39;49m(os\u001b[39m.\u001b[39;49mpath\u001b[39m.\u001b[39;49mjoin(doc_dir, filename), \u001b[39m'\u001b[39;49m\u001b[39mr\u001b[39;49m\u001b[39m'\u001b[39;49m)\u001b[39m.\u001b[39;49mread()\u001b[39m.\u001b[39;49mstrip()}\n\u001b[0;32m 6\u001b[0m \u001b[39mfor\u001b[39;49;00m filename \u001b[39min\u001b[39;49;00m os\u001b[39m.\u001b[39;49mlistdir(doc_dir)]\n\u001b[0;32m 8\u001b[0m \u001b[39mreturn\u001b[39;00m database\n",
"Cell \u001b[1;32mIn[2], line 5\u001b[0m, in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mread_files\u001b[39m(doc_dir):\n\u001b[0;32m 2\u001b[0m \u001b[39m# Use a list comprehension to get a list of file paths\u001b[39;00m\n\u001b[0;32m 3\u001b[0m database \u001b[39m=\u001b[39m [{\u001b[39m'\u001b[39m\u001b[39mfilepath\u001b[39m\u001b[39m'\u001b[39m: doc_dir,\n\u001b[0;32m 4\u001b[0m \u001b[39m'\u001b[39m\u001b[39mfilename\u001b[39m\u001b[39m'\u001b[39m: filename,\n\u001b[1;32m----> 5\u001b[0m \u001b[39m'\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m'\u001b[39m: \u001b[39mopen\u001b[39;49m(os\u001b[39m.\u001b[39;49mpath\u001b[39m.\u001b[39;49mjoin(doc_dir, filename), \u001b[39m'\u001b[39;49m\u001b[39mr\u001b[39;49m\u001b[39m'\u001b[39;49m)\u001b[39m.\u001b[39mread()\u001b[39m.\u001b[39mstrip()}\n\u001b[0;32m 6\u001b[0m \u001b[39mfor\u001b[39;00m filename \u001b[39min\u001b[39;00m os\u001b[39m.\u001b[39mlistdir(doc_dir)]\n\u001b[0;32m 8\u001b[0m \u001b[39mreturn\u001b[39;00m database\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\IPython\\core\\interactiveshell.py:282\u001b[0m, in \u001b[0;36m_modified_open\u001b[1;34m(file, *args, **kwargs)\u001b[0m\n\u001b[0;32m 275\u001b[0m \u001b[39mif\u001b[39;00m file \u001b[39min\u001b[39;00m {\u001b[39m0\u001b[39m, \u001b[39m1\u001b[39m, \u001b[39m2\u001b[39m}:\n\u001b[0;32m 276\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 277\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIPython won\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt let you open fd=\u001b[39m\u001b[39m{\u001b[39;00mfile\u001b[39m}\u001b[39;00m\u001b[39m by default \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 278\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 279\u001b[0m \u001b[39m\"\u001b[39m\u001b[39myou can use builtins\u001b[39m\u001b[39m'\u001b[39m\u001b[39m open.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 280\u001b[0m )\n\u001b[1;32m--> 282\u001b[0m \u001b[39mreturn\u001b[39;00m io_open(file, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
"\u001b[1;31mPermissionError\u001b[0m: [Errno 13] Permission denied: '../data/emails/20_newsgroups/alt.atheism'"
]
}
],
"source": [
"# caminho das queries \n",
"query_path = '../data/emails/mini_newsgroups/'\n",
"\n",
"# caminho dos documentos\n",
"docs_path = '../data/emails/20_newsgroups/misc.forsale/'\n",
"docs_path = '../data/emails/20_newsgroups/'\n",
"\n",
"# Import das bases\n",
"database_docs = read_files(docs_path)\n",
Expand All @@ -97,6 +157,160 @@
"gc.collect()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filepath</th>\n",
" <th>filename</th>\n",
" <th>text</th>\n",
" <th>tag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>../data/emails/20_newsgroups/misc.forsale/</td>\n",
" <td>70337</td>\n",
" <td>Path: cantaloupe.srv.cs.cmu.edu!rochester!udel...</td>\n",
" <td>doc</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>../data/emails/20_newsgroups/misc.forsale/</td>\n",
" <td>74150</td>\n",
" <td>Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....</td>\n",
" <td>doc</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>../data/emails/20_newsgroups/misc.forsale/</td>\n",
" <td>74720</td>\n",
" <td>Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...</td>\n",
" <td>doc</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>../data/emails/20_newsgroups/misc.forsale/</td>\n",
" <td>74721</td>\n",
" <td>Newsgroups: misc.forsale\\nPath: cantaloupe.srv...</td>\n",
" <td>doc</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>../data/emails/20_newsgroups/misc.forsale/</td>\n",
" <td>74722</td>\n",
" <td>Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....</td>\n",
" <td>doc</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1095</th>\n",
" <td>../data/emails/mini_newsgroups/misc.forsale/</td>\n",
" <td>76927</td>\n",
" <td>Xref: cantaloupe.srv.cs.cmu.edu misc.wanted:31...</td>\n",
" <td>query</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1096</th>\n",
" <td>../data/emails/mini_newsgroups/misc.forsale/</td>\n",
" <td>76936</td>\n",
" <td>Newsgroups: misc.forsale\\nSubject: WANTED LCD ...</td>\n",
" <td>query</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1097</th>\n",
" <td>../data/emails/mini_newsgroups/misc.forsale/</td>\n",
" <td>76937</td>\n",
" <td>Newsgroups: ingr.forsale,hsv.forsale,misc.fors...</td>\n",
" <td>query</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1098</th>\n",
" <td>../data/emails/mini_newsgroups/misc.forsale/</td>\n",
" <td>76940</td>\n",
" <td>Newsgroups: misc.forsale\\nPath: cantaloupe.srv...</td>\n",
" <td>query</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1099</th>\n",
" <td>../data/emails/mini_newsgroups/misc.forsale/</td>\n",
" <td>76945</td>\n",
" <td>Xref: cantaloupe.srv.cs.cmu.edu comp.sys.mac.h...</td>\n",
" <td>query</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1100 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" filepath filename \n",
"0 ../data/emails/20_newsgroups/misc.forsale/ 70337 \\\n",
"1 ../data/emails/20_newsgroups/misc.forsale/ 74150 \n",
"2 ../data/emails/20_newsgroups/misc.forsale/ 74720 \n",
"3 ../data/emails/20_newsgroups/misc.forsale/ 74721 \n",
"4 ../data/emails/20_newsgroups/misc.forsale/ 74722 \n",
"... ... ... \n",
"1095 ../data/emails/mini_newsgroups/misc.forsale/ 76927 \n",
"1096 ../data/emails/mini_newsgroups/misc.forsale/ 76936 \n",
"1097 ../data/emails/mini_newsgroups/misc.forsale/ 76937 \n",
"1098 ../data/emails/mini_newsgroups/misc.forsale/ 76940 \n",
"1099 ../data/emails/mini_newsgroups/misc.forsale/ 76945 \n",
"\n",
" text tag \n",
"0 Path: cantaloupe.srv.cs.cmu.edu!rochester!udel... doc \n",
"1 Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.... doc \n",
"2 Path: cantaloupe.srv.cs.cmu.edu!das-news.harva... doc \n",
"3 Newsgroups: misc.forsale\\nPath: cantaloupe.srv... doc \n",
"4 Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.... doc \n",
"... ... ... \n",
"1095 Xref: cantaloupe.srv.cs.cmu.edu misc.wanted:31... query \n",
"1096 Newsgroups: misc.forsale\\nSubject: WANTED LCD ... query \n",
"1097 Newsgroups: ingr.forsale,hsv.forsale,misc.fors... query \n",
"1098 Newsgroups: misc.forsale\\nPath: cantaloupe.srv... query \n",
"1099 Xref: cantaloupe.srv.cs.cmu.edu comp.sys.mac.h... query \n",
"\n",
"[1100 rows x 4 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base"
]
},
{
"attachments": {},
"cell_type": "markdown",
Expand Down
7 changes: 0 additions & 7 deletions notebooks/information_retrieval_movies.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -417,13 +417,6 @@
"map_result = mean_average_precision(resultados_relevantes, resultados_sistema)\n",
"map_result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
55 changes: 39 additions & 16 deletions notebooks/ir/io_file.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,47 @@
import os

def walklevel(some_dir, level=1):
# Remove trailing path separator
some_dir = some_dir.rstrip(os.path.sep)
def read_files(doc_dir: str) -> list:
    """Read every regular file directly inside *doc_dir* into a list of records.

    Args:
        doc_dir: Path to a directory whose entries are the files to read.

    Returns:
        list: One dict per file with keys ``'filepath'`` (the directory
        passed in), ``'filename'`` (the entry name) and ``'text'``
        (the stripped file contents).

    Raises:
        FileNotFoundError: If *doc_dir* does not exist.
    """
    database = []
    for filename in os.listdir(doc_dir):
        path = os.path.join(doc_dir, filename)
        # Skip subdirectories: opening one raises (this is the
        # PermissionError seen when doc_dir pointed at 20_newsgroups/,
        # whose entries are newsgroup folders, not files).
        if not os.path.isfile(path):
            continue
        # Context manager guarantees the handle is closed; the original
        # list comprehension leaked one open file per entry.
        with open(path, 'r') as f:
            database.append({'filepath': doc_dir,
                             'filename': filename,
                             'text': f.read().strip()})
    return database

# Count the number of path separators in the directory path
num_sep = some_dir.count(os.path.sep)
def process_files(doc_dir: str) -> list:
    """Process the email files found in the immediate subdirectories of *doc_dir*.

    Each file is split into a header and a body at the first blank line
    (the RFC 822-style ``'\\n\\n'`` separator); files without that
    separator are skipped.

    Args:
        doc_dir: Path to the directory whose subdirectories contain the files.

    Returns:
        list: One dict per parseable file with keys ``'filepath'``
        (the subdirectory name), ``'filename'`` and ``'body'``.
    """
    database = []  # Accumulates one record per successfully parsed file

    for subdir in os.listdir(doc_dir):
        # os.path.join instead of f'{doc_dir}{subdir}': the f-string
        # silently produced a wrong path whenever doc_dir lacked a
        # trailing separator.
        subdir_path = os.path.join(doc_dir, subdir)
        # Only descend into real subdirectories; a stray file at the top
        # level would make the inner os.listdir() raise.
        if not os.path.isdir(subdir_path):
            continue

        for filename in os.listdir(subdir_path):
            file_path = os.path.join(subdir_path, filename)
            if not os.path.isfile(file_path):
                continue

            # Read the whole message; the handle is closed on exit.
            with open(file_path, 'r') as f:
                text_data = f.read().strip()

            try:
                # Split header from body at the first blank line.
                header, body = text_data.split('\n\n', maxsplit=1)
            except ValueError:
                # No blank-line separator: not a parseable email, skip it.
                continue

            database.append({
                'filepath': subdir,
                'filename': filename,
                'body': body,
            })

    return database

0 comments on commit a13a25c

Please sign in to comment.