diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 00000000..027fdb9b
Binary files /dev/null and b/.DS_Store differ
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 6309ea12..31a02377 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
diff --git a/README.md b/README.md
index 517b8f8b..b2f88f7d 100644
--- a/README.md
+++ b/README.md
@@ -60,9 +60,36 @@ You can check all the available equities/funds you can retrieve data from in Inv
 
 (**NOTE**: you will need an active Internet connection in order to get the scraper working.)
 
-## Performance Analysis and Case Study
+## Performance Analysis
 
-Detailed in Jupyter Notebook
+In this section I explain the case study carried out while developing the package, and the scraping options that were considered, so you can see which approach proved the most efficient for building a historical data scraper, based on my research over the past weeks.
+
+Let's start with the first step before scraping a website, in this case [investing](https://es.investing.com/): downloading the page or sending a POST request to it. There are two main Python tools used to retrieve the HTML code of a website:
+* [urllib3](https://pypi.org/project/urllib3/): urllib3 is a powerful, sanity-friendly HTTP client for Python. Much of the Python ecosystem already uses urllib3 and you should too. urllib3 brings many critical features that are missing from the Python standard libraries.
+* [requests](https://pypi.org/project/requests/): Requests allows you to send organic, grass-fed HTTP/1.1 requests, without the need for manual labor. There's no need to manually add query strings to your URLs, or to form-encode your POST data. Keep-alive and HTTP connection pooling are 100% automatic, thanks to urllib3.
+
+These tests were run over a stable Internet connection and repeated 500 times each, so we have a wide range of results from which to draw a sound conclusion.
+Both libraries behave very similarly, but there is a significant difference in efficiency when measuring the time needed to download a website's HTML code via a POST request, as shown in the graph:
+
+![urllib3 vs requests](https://github.com/alvarob96/investpy/blob/master/statistic%20plots/urllib3-requests.png)
+
+If we analyse the graph, we can see that the mean time per POST request is lower with **requests** than with **urllib3**, and that **requests** is also more stable and consistent.
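+
+As an illustration, below is a minimal sketch of how this timing comparison can be reproduced; note that the endpoint and form data are placeholders, not the exact ones used in the tests:
+
+```python
+import time
+
+import requests
+import urllib3
+
+# Placeholder endpoint and form data, for illustration only.
+URL = 'https://es.investing.com/instruments/HistoricalDataAjax'
+PAYLOAD = {'interval_sec': 'Daily'}
+RUNS = 500
+
+http = urllib3.PoolManager()
+requests_times, urllib3_times = [], []
+
+for _ in range(RUNS):
+    # Time a POST request sent with requests.
+    start = time.time()
+    requests.post(URL, data=PAYLOAD)
+    requests_times.append(time.time() - start)
+
+    # Time the equivalent POST request sent with urllib3
+    # (urlencoded body, to match what requests sends).
+    start = time.time()
+    http.request('POST', URL, fields=PAYLOAD, encode_multipart=False)
+    urllib3_times.append(time.time() - start)
+
+print('requests mean:', sum(requests_times) / RUNS)
+print('urllib3 mean:', sum(urllib3_times) / RUNS)
+```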
+
+Once we have the HTML code returned in response to the POST request, we need to scrape the data out of it and load it into a pandas.DataFrame, so we are looking for a fast HTML parsing tool that can extract large amounts of data quickly, keeping the waiting time short for the user of the package.
+The main Python packages used for HTML parsing are:
+* [bs4](https://pypi.org/project/beautifulsoup4/): Beautiful Soup is a library that makes it easy to scrape information from web pages. It sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.
+* [lxml](https://pypi.org/project/lxml/): lxml is a Pythonic, mature binding for the libxml2 and libxslt libraries. It provides safe and convenient access to these libraries using the ElementTree API. It extends the ElementTree API significantly to offer support for XPath, RelaxNG, XML Schema, XSLT, C14N and much more.
+
+These tests were also run over a stable Internet connection and repeated 500 times each, so we again have a wide range of results from which to draw a sound conclusion.
+To determine which offers the better time performance, we parse an HTML file containing historical data from the last 10 years, to see which package handles large amounts of data better, as shown in the graph:
+
+![bs4 vs lxml](https://github.com/alvarob96/investpy/blob/master/statistic%20plots/bs4-lxml.png)
+
+We can clearly see that **lxml** outperforms **bs4**, with much better times when retrieving large amounts of data from an HTML file; it is also more stable, with fewer fluctuations, making it more consistent.
+
+To sum up, the best combination for this use case is **requests** to handle the POST/GET requests and download the HTML code, and **lxml** to parse the retrieved HTML (data extraction), since it outperforms every other Python HTML parser tested; a minimal sketch of this combination is shown below.
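+
+For illustration, the following sketch downloads a page with requests and parses it with lxml.html.fromstring (the same parser function the package itself imports); the URL and XPath expression are hypothetical placeholders:
+
+```python
+import requests
+from lxml.html import fromstring
+
+# Hypothetical URL and XPath expression, for illustration only.
+response = requests.get('https://es.investing.com/',
+                        headers={'User-Agent': 'investpy example'})
+
+# Parse the downloaded HTML and extract the cells of every table row.
+root = fromstring(response.text)
+for row in root.xpath("//table[@id='curr_table']/tbody/tr"):
+    print([cell.text_content().strip() for cell in row.xpath('td')])
+```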
+
+If there is any other package you would like to compare with the ones used in this case, feel free to send me an email at alvarob96@usal.es and I will try my best to answer fast.
 
 ## Future Work
@@ -81,3 +108,5 @@ For further information or any question feel free to contact me via email at alv
 ## Disclaimer
 
 This Python Package has been made for research purposes in order to fit the needs that Investing.com does not cover, so this package works like an API for Investing.com developed in an altruistic way. To conclude, I am not related at all with Investing.com or any similar company, so I contacted Investing.com via mail and they gave me permission to develop this scraper on the condition of mentioning the source I retrieve the data from.
+
+To clear up any doubt about whether this is legal or not, I will quote literally what *Enrique from Investing.com Support* answered me when I asked them for permission to develop this scraper: "[...] *thank you for contacting and choosing us (as the reliable source to get the data from)* [...] *you can use and retrieve all the data that Investing.com offers to the users as far as you specify which is the source you get the data from* [...]".
diff --git a/investpy/__init__.py b/investpy/__init__.py
index 0e3332ba..db746e60 100644
--- a/investpy/__init__.py
+++ b/investpy/__init__.py
@@ -8,8 +8,8 @@
 import requests
 from lxml.html import fromstring
 
-from investpy_test import user_agent as ua, equities as ts, funds as fs
-from investpy_test.Data import Data  # TypeError: 'module' object is not callable
+from investpy import user_agent as ua, equities as ts, funds as fs
+from investpy.Data import Data  # TypeError: 'module' object is not callable
 
 
 def get_recent_data(equity, as_json=False, order='ascending'):
diff --git a/investpy/equities.py b/investpy/equities.py
index 0e44a328..8d2d0be1 100644
--- a/investpy/equities.py
+++ b/investpy/equities.py
@@ -5,7 +5,7 @@
 from bs4 import BeautifulSoup
 import pkg_resources
 
-from investpy_test import user_agent as ua
+from investpy import user_agent as ua
 
 
 def get_equity_names():
diff --git a/investpy/funds.py b/investpy/funds.py
index 0de213f6..12489215 100644
--- a/investpy/funds.py
+++ b/investpy/funds.py
@@ -5,7 +5,7 @@
 from bs4 import BeautifulSoup
 import pkg_resources
 
-from investpy_test import user_agent as ua
+from investpy import user_agent as ua
 
 
 def get_fund_names():
diff --git a/setup.py b/setup.py
index 6ebc2b45..ee9940f4 100644
--- a/setup.py
+++ b/setup.py
@@ -11,10 +11,10 @@ def readme():
 setup(
     name='investpy',
-    version='0.5',
+    version='0.6',
     packages=find_packages(),
     url='',
-    download_url='https://github.com/alvarob96/investpy/archive/0.5.tar.gz',
+    download_url='https://github.com/alvarob96/investpy/archive/0.6.tar.gz',
     license='MIT License',
     author='Alvaro Bartolome',
     author_email='alvarob96@usal.es',
diff --git a/statistic plots/bs4-lxml.png b/statistic plots/bs4-lxml.png
new file mode 100644
index 00000000..baea86ff
Binary files /dev/null and b/statistic plots/bs4-lxml.png differ
diff --git a/statistic plots/urllib3-requests.png b/statistic plots/urllib3-requests.png
new file mode 100644
index 00000000..44803362
Binary files /dev/null and b/statistic plots/urllib3-requests.png differ