Enable the AutoThrottle extension, increase crawl depth, and reduce HTTP cache to 24 hours.
Nathan Malcolm committed Aug 28, 2017
1 parent 95cf175 commit 7ba7ddc
Showing 2 changed files with 8 additions and 8 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -64,7 +64,7 @@ $ scrapy crawl inventus -a domain=facebook.com -t csv -o Facebook.csv

# Configuration

-Configurations can be made to how Inventus behaves. For example, by default Inventus will ignore robots.txt, has a 30 second timeout, caches crawl data for a week, and has a 0.25 second delay between requests. These and more can all be changed by editing the `inventus_spider/settings.py` file. Scrapy's settings are [well documented](https://doc.scrapy.org/en/latest/topics/settings.html#aws-access-key-id) too.
+Configurations can be made to how Inventus behaves. By default Inventus will ignore robots.txt, has a 30 second timeout, caches crawl data for 24 hours, has a crawl depth of 5, and uses Scrapy's AutoThrottle extension. These and more can all be changed by editing the `inventus_spider/settings.py` file. Scrapy's settings are [well documented](https://doc.scrapy.org/en/latest/topics/settings.html#aws-access-key-id) too.

# Bugs / Suggestions / Feedback

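Everything the README paragraph above mentions lives in `inventus_spider/settings.py`, but Scrapy also accepts per-run overrides on the command line via its `-s NAME=VALUE` option, so a one-off crawl can diverge from the defaults without editing the file. A minimal sketch (the domain and override values are illustrative):

$ scrapy crawl inventus -a domain=example.com -s DEPTH_LIMIT=10 -s HTTPCACHE_ENABLED=False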
14 changes: 7 additions & 7 deletions inventus_spider/settings.py
@@ -70,32 +70,32 @@

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = True
# The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 0.25
# The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 10
# The average number of requests Scrapy should be sending in parallel to
# each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_TARGET_CONCURRENCY = 5.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
-HTTPCACHE_EXPIRATION_SECS = 604800 # 1 week
+HTTPCACHE_EXPIRATION_SECS = 86400 # 1 day
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# The maximum depth that will be allowed to crawl for any site.
# If zero, no limit will be imposed.
-DEPTH_LIMIT = 3
+DEPTH_LIMIT = 5

# The amount of time (in secs) that the downloader should wait before downloading consecutive pages
# from the same website.
-DOWNLOAD_DELAY = 0.25
+DOWNLOAD_DELAY = 0.2

# The amount of time (in secs) that the downloader will wait before timing out.
DOWNLOAD_TIMEOUT = 30
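Per Scrapy's documentation, the new values above interact: AutoThrottle opens at the 0.25 second AUTOTHROTTLE_START_DELAY and adapts the delay toward an average of 5 concurrent requests per remote server, never dropping below the 0.2 second DOWNLOAD_DELAY floor or exceeding the 10 second AUTOTHROTTLE_MAX_DELAY ceiling. The cache lifetime of 86400 seconds is 24 × 60 × 60, so responses older than a day are re-fetched, and DEPTH_LIMIT = 5 stops the crawl five link hops from the start page.

Scrapy can also scope any of these values to a single spider through the `custom_settings` class attribute, which takes precedence over the project-wide `settings.py`. A minimal sketch (the class name and override values are illustrative, not taken from this repository):

import scrapy

class InventusSpider(scrapy.Spider):
    # The spider name used with `scrapy crawl inventus`.
    name = 'inventus'

    # Per-spider overrides; these win over inventus_spider/settings.py.
    custom_settings = {
        'DEPTH_LIMIT': 10,      # illustrative: crawl deeper for this spider only
        'DOWNLOAD_DELAY': 1.0,  # illustrative: a higher delay floor for fragile hosts
    }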
