
Commit

Merge branch 'developer'
Erik172 committed Jul 4, 2024
2 parents 48a4fc7 + 6c39961 commit 51c0067
Showing 42 changed files with 110 additions and 259,698 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ logs/*
bogota_apartments/spiders/gojom.py
.vscode/
data/interim/*.csv
images/

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
69 changes: 39 additions & 30 deletions ETL/04_data_save.py
@@ -5,53 +5,62 @@
import logging
import os

# Load environment variables from the .env file
load_dotenv()

filename = f'logs/04_data_save.log'
# Configure logging
filename = 'logs/04_data_save.log'
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', filename=filename)

if os.getcwd().split('/')[-1] == 'ETL':
logging.info('Cambiando directorio de trabajo')
# Change the working directory if necessary
if os.path.basename(os.getcwd()) == 'ETL':
logging.info('Changing working directory')
os.chdir('..')

# Start the process and log the start time
logging.info(f'Process started at {datetime.now()}')
# Connect to MongoDB
logging.info('Connecting to MongoDB')
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client[os.getenv('MONGO_DATABASE')]
collection = db[os.getenv('MONGO_COLLECTION_PROCESSED')]

PREOCESSED_DATA = 'data/processed/apartments.csv'

# Read the processed data
logging.info('Reading the processed data')
try:
df = pd.read_csv(PREOCESSED_DATA, low_memory=False)
# Connect to MongoDB
logging.info('Connecting to MongoDB')
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client[os.getenv('MONGO_DATABASE')]
collection = db['scrapy_bogota_apartments_processed']

# Path to the processed data file
PROCESSED_DATA = 'data/processed/apartments.csv'

# Read the processed data from the CSV file
logging.info('Reading the processed data')
df = pd.read_csv(PROCESSED_DATA, low_memory=False)
logging.info('Processed data read successfully')
except Exception as error:
logging.error(error)
exit(1)

# Save the processed data to MongoDB
logging.info('Saving the processed data to MongoDB')
# Read each row and look it up; if it exists and is identical do nothing, if it differs update it, and if it does not exist insert it
try:
# Save the processed data to MongoDB
logging.info('Saving the processed data to MongoDB')
for index, row in df.iterrows():
apartment = collection.find_one({'codigo': row['codigo']})
if apartment is None:
collection.insert_one(row.to_dict())
else:
if apartment:
if apartment != row.to_dict():
collection.update_one({'codigo': row['codigo']}, {'$set': row.to_dict()})
else:
collection.insert_one(row.to_dict())

logging.info('Processed data saved successfully')

except Exception as error:
logging.error(error)
exit(1)
except FileNotFoundError as e:
logging.error(f'File not found: {e}')

except pd.errors.EmptyDataError as e:
logging.error(f'Empty data error: {e}')

except Exception as e:
logging.error(f'An error occurred: {e}')

finally:
# Close the MongoDB connection
if 'client' in locals():
logging.info('Closing the connection to MongoDB')
client.close()

# Close the connection to MongoDB
logging.info('Closing the connection to MongoDB')
client.close()
logging.info(f'Process finished at {datetime.now()}')

logging.info(f'Process finished at {datetime.now()}')
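
Editor's note: the insert-or-update loop in this file can also be expressed with pymongo's replace_one and upsert=True. The sketch below is illustrative only, not the script's code; it assumes the same collection and df objects, and unlike the loop above it rewrites a document even when nothing has changed:

# Illustrative alternative to the find/compare/insert loop above.
# replace_one(..., upsert=True) inserts the document when no match exists
# and replaces it when one does.
for _, row in df.iterrows():
    doc = row.to_dict()
    collection.replace_one({'codigo': doc['codigo']}, doc, upsert=True)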
File renamed without changes.
2 changes: 2 additions & 0 deletions bogota_apartments/items.py
@@ -154,6 +154,8 @@ class ApartmentsItem(scrapy.Item):

imagenes = scrapy.Field()

# imagenes_paths = scrapy.Field()

website = scrapy.Field(output_processor = TakeFirst())

datetime = scrapy.Field(output_processor = TakeFirst())
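
Editor's note: for context on the fields above, output_processor=TakeFirst() makes an ItemLoader collapse the collected values to a single one, while a field like imagenes (no processor) keeps a list. A minimal illustrative sketch follows; the values are made up and it assumes the bogota_apartments package is importable:

from scrapy.loader import ItemLoader
from bogota_apartments.items import ApartmentsItem

loader = ItemLoader(item=ApartmentsItem())
loader.add_value('website', 'metrocuadrado.com')          # TakeFirst() -> single string
loader.add_value('imagenes', ['img_1.jpg', 'img_2.jpg'])  # no processor -> list is kept
item = loader.load_item()
# item['website'] == 'metrocuadrado.com'; item['imagenes'] == ['img_1.jpg', 'img_2.jpg']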
2 changes: 1 addition & 1 deletion bogota_apartments/pipelines.py
@@ -176,4 +176,4 @@ def process_item(self, item, spider):
return item

self.db[self.collection].insert_one(data)
return item
return item
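
Editor's note: the lines above are only the tail of the MongoDB pipeline's process_item. For readers without the full file, a hedged sketch of the general pattern follows; the class and attribute names are illustrative, not necessarily the project's:

import pymongo

class MongoDBPipelineSketch:
    """Illustrative sketch of a Scrapy-to-MongoDB pipeline; not the repo's actual class."""

    def __init__(self, mongo_uri, mongo_db, collection):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.collection = collection

    @classmethod
    def from_crawler(cls, crawler):
        # Connection values come from settings.py (the MONGO_* keys touched in this commit)
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
            collection=crawler.settings.get('MONGO_COLLECTION_RAW'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection].insert_one(dict(item))
        return item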
37 changes: 11 additions & 26 deletions bogota_apartments/settings.py
@@ -1,13 +1,5 @@
# Scrapy settings for bogota_apartments project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from dotenv import load_dotenv
from datetime import datetime
import os

load_dotenv()
@@ -19,28 +11,21 @@

VERSION = '2.0.0'

# Splash settings
SPLASH_URL = 'http://localhost:8050/' # send requests to render web pages and execute JavaScript code.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' # dupe filter is a mechanism that prevents Scrapy from making duplicate requests to a website.
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' # stores the cache on the local file system

# Database settings - uncomment if you want to use MongoDB
MONGO_URI = os.getenv('MONGO_URI')
MONGO_DATABASE = os.getenv('MONGO_DATABASE')

if not os.getenv('MONGO_COLLECTION_RAW') or not os.getenv('MONGO_COLLECTION_PROCESSED'):
MONGO_COLLECTION_RAW = 'scrapy_bogota_apartments'
MONGO_COLLECTION_PROCESSED = 'scrapy_bogota_apartments_processed'

else:
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED')

# Conditional assignment with default values
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW', 'scrapy_bogota_apartments')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED', 'scrapy_bogota_apartments_processed')

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "bogota_apartments (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -75,8 +60,8 @@
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723, # This middleware handles cookies in requests made to Splash, and it is assigned the priority of 723
'scrapy_splash.SplashMiddleware': 725, # This middleware provides the integration between Scrapy and Splash and is assigned the priority of 725.
# 'scrapy_splash.SplashCookiesMiddleware': 723, # This middleware handles cookies in requests made to Splash, and it is assigned the priority of 723
# 'scrapy_splash.SplashMiddleware': 725, # This middleware provides the integration between Scrapy and Splash and is assigned the priority of 725.
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, # This middleware is responsible for handling HTTP compression, and it is assigned the priority of 810.
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
}
@@ -120,6 +105,6 @@
FEED_EXPORT_ENCODING = 'utf-8'

# Logging settings
# LOG_STDOUT = True
# LOG_FILE = f'logs/scrapy_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
# LOG_LEVEL = 'DEBUG'
LOG_STDOUT = True
LOG_FILE = f'logs/scrapy_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
LOG_LEVEL = 'DEBUG'
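
Editor's note: the collection settings above now fall back to defaults through os.getenv(key, default), and each crawl writes its own timestamped log file. An illustrative sketch of how those values resolve; it assumes python-dotenv is installed and a .env file is optional:

import os
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()  # reads .env if present; missing keys simply stay unset

# Hard-coded defaults are used when the .env keys are absent
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW', 'scrapy_bogota_apartments')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED', 'scrapy_bogota_apartments_processed')

# One log file per run, e.g. logs/scrapy_20240704_103000.log (timestamp illustrative)
LOG_FILE = f'logs/scrapy_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'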
2 changes: 2 additions & 0 deletions bogota_apartments/spiders/habi.py
@@ -1,3 +1,5 @@
# Author: Erik Garcia (@erik172)
# Version: Stable
from fake_useragent import UserAgent
from datetime import datetime
import json
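
Editor's note: the spider now imports fake_useragent. A typical usage pattern is sketched below for illustration; the spider name, URL, and callback are placeholders and are not taken from habi.py:

import scrapy
from fake_useragent import UserAgent

class ExampleSpider(scrapy.Spider):
    name = 'example'  # placeholder, not the habi spider

    def start_requests(self):
        ua = UserAgent()
        # Send a random, realistic User-Agent header with the request
        yield scrapy.Request(
            'https://example.com',
            headers={'User-Agent': ua.random},
            callback=self.parse,
        )

    def parse(self, response):
        yield {'status': response.status}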