
Commit

Merge branch 'developer'
Erik172 committed Jul 4, 2024
2 parents 48a4fc7 + 6c39961 commit 51c0067
Showing 42 changed files with 110 additions and 259,698 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ logs/*
bogota_apartments/spiders/gojom.py
.vscode/
data/interim/*.csv
images/

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
69 changes: 39 additions & 30 deletions ETL/04_data_save.py
@@ -5,53 +5,62 @@
import logging
import os

# Load environment variables from the .env file
load_dotenv()

filename = f'logs/04_data_save.log'
# Configure logging
filename = 'logs/04_data_save.log'
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', filename=filename)

if os.getcwd().split('/')[-1] == 'ETL':
logging.info('Cambiando directorio de trabajo')
# Change the working directory if necessary
if os.path.basename(os.getcwd()) == 'ETL':
logging.info('Changing working directory')
os.chdir('..')

# Start the process and log the start time
logging.info(f'Process started at {datetime.now()}')
# Connect to MongoDB
logging.info('Connecting to MongoDB')
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client[os.getenv('MONGO_DATABASE')]
collection = db[os.getenv('MONGO_COLLECTION_PROCESSED')]

PREOCESSED_DATA = 'data/processed/apartments.csv'

# Read the processed data
logging.info('Reading the processed data')
try:
df = pd.read_csv(PREOCESSED_DATA, low_memory=False)
# Connect to MongoDB
logging.info('Connecting to MongoDB')
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client[os.getenv('MONGO_DATABASE')]
collection = db['scrapy_bogota_apartments_processed']

# Path to the processed data file
PROCESSED_DATA = 'data/processed/apartments.csv'

# Read the processed data from the CSV file
logging.info('Reading the processed data')
df = pd.read_csv(PROCESSED_DATA, low_memory=False)
logging.info('Processed data read successfully')
except Exception as error:
logging.error(error)
exit(1)

# Save the processed data to MongoDB
logging.info('Saving the processed data to MongoDB')
# Read each row and look it up; if it exists and is identical do nothing, if it differs update it, and if it does not exist insert it
try:
# Save the processed data to MongoDB
logging.info('Saving the processed data to MongoDB')
for index, row in df.iterrows():
apartment = collection.find_one({'codigo': row['codigo']})
if apartment is None:
collection.insert_one(row.to_dict())
else:
if apartment:
if apartment != row.to_dict():
collection.update_one({'codigo': row['codigo']}, {'$set': row.to_dict()})
else:
collection.insert_one(row.to_dict())

logging.info('Processed data saved successfully')

except Exception as error:
logging.error(error)
exit(1)
except FileNotFoundError as e:
logging.error(f'File not found: {e}')

except pd.errors.EmptyDataError as e:
logging.error(f'Empty data error: {e}')

except Exception as e:
logging.error(f'An error occurred: {e}')

finally:
# Close the MongoDB connection
if 'client' in locals():
logging.info('Closing the connection to MongoDB')
client.close()

# Close the connection to MongoDB
logging.info('Closing the connection to MongoDB')
client.close()
logging.info(f'Process finished at {datetime.now()}')

logging.info(f'Process finished at {datetime.now()}')
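
Editor's note: the insert-or-update loop in this file can also be expressed with pymongo's replace_one and upsert=True. The sketch below is illustrative only, not the script's code; it assumes the same collection and df objects, and unlike the loop above it rewrites a document even when nothing has changed:

# Illustrative alternative to the find/compare/insert loop above.
# replace_one(..., upsert=True) inserts the document when no match exists
# and replaces it when one does.
for _, row in df.iterrows():
    doc = row.to_dict()
    collection.replace_one({'codigo': doc['codigo']}, doc, upsert=True)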
File renamed without changes.
2 changes: 2 additions & 0 deletions bogota_apartments/items.py
@@ -154,6 +154,8 @@ class ApartmentsItem(scrapy.Item):

imagenes = scrapy.Field()

# imagenes_paths = scrapy.Field()

website = scrapy.Field(output_processor = TakeFirst())

datetime = scrapy.Field(output_processor = TakeFirst())
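
Editor's note: for context on the fields above, output_processor=TakeFirst() makes an ItemLoader collapse the collected values to a single one, while a field like imagenes (no processor) keeps a list. A minimal illustrative sketch follows; the values are made up and it assumes the bogota_apartments package is importable:

from scrapy.loader import ItemLoader
from bogota_apartments.items import ApartmentsItem

loader = ItemLoader(item=ApartmentsItem())
loader.add_value('website', 'metrocuadrado.com')          # TakeFirst() -> single string
loader.add_value('imagenes', ['img_1.jpg', 'img_2.jpg'])  # no processor -> list is kept
item = loader.load_item()
# item['website'] == 'metrocuadrado.com'; item['imagenes'] == ['img_1.jpg', 'img_2.jpg']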
2 changes: 1 addition & 1 deletion bogota_apartments/pipelines.py
@@ -176,4 +176,4 @@ def process_item(self, item, spider):
return item

self.db[self.collection].insert_one(data)
return item
return item
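
Editor's note: the lines above are only the tail of the MongoDB pipeline's process_item. For readers without the full file, a hedged sketch of the general pattern follows; the class and attribute names are illustrative, not necessarily the project's:

import pymongo

class MongoDBPipelineSketch:
    """Illustrative sketch of a Scrapy-to-MongoDB pipeline; not the repo's actual class."""

    def __init__(self, mongo_uri, mongo_db, collection):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.collection = collection

    @classmethod
    def from_crawler(cls, crawler):
        # Connection values come from settings.py (the MONGO_* keys touched in this commit)
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
            collection=crawler.settings.get('MONGO_COLLECTION_RAW'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection].insert_one(dict(item))
        return item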
37 changes: 11 additions & 26 deletions bogota_apartments/settings.py
@@ -1,13 +1,5 @@
# Scrapy settings for bogota_apartments project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from dotenv import load_dotenv
from datetime import datetime
import os

load_dotenv()
@@ -19,28 +11,21 @@

VERSION = '2.0.0'

# Splash settings
SPLASH_URL = 'http://localhost:8050/' # send requests to render web pages and execute JavaScript code.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' # dupe filter is a mechanism that prevents Scrapy from making duplicate requests to a website.
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' # stores the cache on the local file system

# Database settings - uncomment if you want to use MongoDB
MONGO_URI = os.getenv('MONGO_URI')
MONGO_DATABASE = os.getenv('MONGO_DATABASE')

if not os.getenv('MONGO_COLLECTION_RAW') or not os.getenv('MONGO_COLLECTION_PROCESSED'):
MONGO_COLLECTION_RAW = 'scrapy_bogota_apartments'
MONGO_COLLECTION_PROCESSED = 'scrapy_bogota_apartments_processed'

else:
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED')

# Conditional assignment with default values
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW', 'scrapy_bogota_apartments')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED', 'scrapy_bogota_apartments_processed')

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "bogota_apartments (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -75,8 +60,8 @@
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723, # This middleware handles cookies in requests made to Splash, and it is assigned the priority of 723
'scrapy_splash.SplashMiddleware': 725, # This middleware provides the integration between Scrapy and Splash and is assigned the priority of 725.
# 'scrapy_splash.SplashCookiesMiddleware': 723, # This middleware handles cookies in requests made to Splash, and it is assigned the priority of 723
# 'scrapy_splash.SplashMiddleware': 725, # This middleware provides the integration between Scrapy and Splash and is assigned the priority of 725.
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, # This middleware is responsible for handling HTTP compression, and it is assigned the priority of 810.
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
}
@@ -120,6 +105,6 @@
FEED_EXPORT_ENCODING = 'utf-8'

# Logging settings
# LOG_STDOUT = True
# LOG_FILE = f'logs/scrapy_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
# LOG_LEVEL = 'DEBUG'
LOG_STDOUT = True
LOG_FILE = f'logs/scrapy_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
LOG_LEVEL = 'DEBUG'
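
Editor's note: the collection settings above now fall back to defaults through os.getenv(key, default), and each crawl writes its own timestamped log file. An illustrative sketch of how those values resolve; it assumes python-dotenv is installed and a .env file is optional:

import os
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()  # reads .env if present; missing keys simply stay unset

# Hard-coded defaults are used when the .env keys are absent
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW', 'scrapy_bogota_apartments')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED', 'scrapy_bogota_apartments_processed')

# One log file per run, e.g. logs/scrapy_20240704_103000.log (timestamp illustrative)
LOG_FILE = f'logs/scrapy_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'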
2 changes: 2 additions & 0 deletions bogota_apartments/spiders/habi.py
@@ -1,3 +1,5 @@
# Author: Erik Garcia (@erik172)
# Version: Stable
from fake_useragent import UserAgent
from datetime import datetime
import json
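
Editor's note: the spider now imports fake_useragent. A typical usage pattern is sketched below for illustration; the spider name, URL, and callback are placeholders and are not taken from habi.py:

import scrapy
from fake_useragent import UserAgent

class ExampleSpider(scrapy.Spider):
    name = 'example'  # placeholder, not the habi spider

    def start_requests(self):
        ua = UserAgent()
        # Send a random, realistic User-Agent header with the request
        yield scrapy.Request(
            'https://example.com',
            headers={'User-Agent': ua.random},
            callback=self.parse,
        )

    def parse(self, response):
        yield {'status': response.status}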