From b1f368851712bcccf0e7b6dc487e0a44afdf8c3d Mon Sep 17 00:00:00 2001 From: asepscareer Date: Wed, 24 Jul 2024 15:59:07 +0700 Subject: [PATCH] update version to 1.0.5 - Excluded pandas dependency to streamline the library and reduce external dependencies. --- CHANGELOG.rst | 4 + README.md | 56 ++++++++--- requirements.txt | 1 - setup.py | 18 ++-- test.py | 59 ++++++++++-- ycnbc/base.py | 160 ++++++++++++++++--------------- ycnbc/uri.py | 6 +- ycnbc/utils.py | 242 +++++++++++++++++++++++++++-------------------- ycnbc/version.py | 2 +- 9 files changed, 330 insertions(+), 218 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index caa261c..bcd07d2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,10 @@ Change Log =========== +1.0.5 +------- +- Excluded pandas dependency to streamline the library and reduce external dependencies. + 1.0.4 ------- - Remapping Query Data diff --git a/README.md b/README.md index e331e75..8746ee3 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,6 @@ ycnbc is **not** affiliated, endorsed, or vetted by CNBC, It's an open source to ### Requirements - Python >=3.5+ -- pandas>=0.24.0 - requests>=2.23.0 - lxml>=4.5.1 @@ -43,16 +42,49 @@ import ycnbc data = ycnbc.News() -# get trending news -trending_ = data.trending() # return DataFrame - -# get latest news -latest_ = data.latest() # return DataFrame - -# get news by category -economy_ = data.economy() # return DataFrame - -# etc. +# Get trending news +trending_ = data.trending() + +# Get latest news +latest_ = data.latest() + +# Get news by category +economy_ = data.economy() +jobs_ = data.jobs() +white_house_ = data.white_house() +hospitals_ = data.hospitals() +transportation_ = data.transportation() +media_ = data.media() +internet_ = data.internet() +congress_ = data.congress() +policy_ = data.policy() +finance_ = data.finance() +life_ = data.life() +defense_ = data.defense() +europe_politics_ = data.europe_politics() +china_politics_ = data.china_politics() +asia_politics_ = data.asia_politics() +world_politics_ = data.world_politics() +equity_opportunity_ = data.equity_opportunity() +politics_ = data.politics() +wealth_ = data.wealth() +world_economy_ = data.world_economy() +central_banks_ = data.central_banks() +real_estate_ = data.real_estate() +health_science_ = data.health_science() +small_business_ = data.small_business() +lifehealth_insurance_ = data.lifehealth_insurance() +business_ = data.business() +energy_ = data.energy() +industrials_ = data.industrials() +retail_ = data.retail() +cybersecurity_ = data.cybersecurity() +mobile_ = data.mobile() +technology_ = data.technology() +cnbc_disruptors_ = data.cnbc_disruptors() +tech_guide_ = data.tech_guide() +social_media_ = data.social_media() +climate_ = data.climate() ``` Note: @@ -70,6 +102,6 @@ the [LICENSE.txt](./LICENSE.txt) file in the release for details. ### P.S. -Please drop me an note with any feedback you have. +Please drop me a note with any feedback you have. **Asep Saputra** diff --git a/requirements.txt b/requirements.txt index 94444b7..4a3f892 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ -pandas>=0.24.0 requests>=2.23.0 lxml>=4.5.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 5f7a973..3565d28 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,9 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- -# -# ycnbc - CNBC data downloader -# https://github.com/asepscareer/yfinance -"""ycnbc - cnbc data downloader""" +"""ycnbc - CNBC data downloader""" from setuptools import setup, find_packages -# from codecs import open import io from os import path @@ -18,7 +14,6 @@ version = line.replace("version = ", "").replace('"', '') # --- /get version --- - here = path.abspath(path.dirname(__file__)) # Get the long description from the README file @@ -38,8 +33,6 @@ classifiers=[ 'License :: OSI Approved :: Apache Software License', 'Development Status :: 5 - Production/Stable', - - 'Operating System :: OS Independent', 'Intended Audience :: Developers', 'Topic :: Internet :: WWW/HTTP :: Dynamic Content :: News/Diary', @@ -48,7 +41,6 @@ 'Topic :: Scientific/Engineering :: Interface Engine/Protocol Translator', 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', @@ -59,9 +51,11 @@ platforms=['any'], keywords='scrape news, cnbc library, cnbc python, cnbc api', packages=find_packages(exclude=['contrib', 'docs', 'tests', 'examples']), - install_requires=['pandas>=0.24.0','requests>=2.23.0','lxml>=4.5.1'], + install_requires=['requests>=2.23.0', 'lxml>=4.5.1'], entry_points={ - 'console_scripts': ['sample=sample:main',], + 'console_scripts': [ + # 'sample=sample:main', + ], }, ) @@ -70,4 +64,4 @@ NOTE: ycnbc is not affiliated, endorsed, or vetted by CNBC. You should refer to CNBC!'s terms of use for details on your rights to use the actual data downloaded. """ -) \ No newline at end of file +) diff --git a/test.py b/test.py index c743897..f493f96 100644 --- a/test.py +++ b/test.py @@ -2,14 +2,57 @@ import unittest data = ycnbc.News() + + class TestData(unittest.TestCase): - def test_trending(self): - assert(data.trending().empty is False) - assert(data.latest().empty is False) - assert(data.economy().empty is False) - assert(data.health_science().empty is False) - assert(data.finance().empty is False) - + def test_cnbc_news(self): + methods = [ + 'latest', + 'trending', + 'economy', + 'jobs', + 'white_house', + 'hospitals', + 'transportation', + 'media', + 'internet', + 'congress', + 'policy', + 'finance', + 'life', + 'defense', + 'europe_politics', + 'china_politics', + 'asia_politics', + 'world_politics', + 'equity_opportunity', + 'politics', + 'wealth', + 'world_economy', + 'central_banks', + 'real_estate', + 'health_science', + 'small_business', + 'lifehealth_insurance', + 'business', + 'energy', + 'industrials', + 'retail', + 'cybersecurity', + 'mobile', + 'technology', + 'cnbc_disruptors', + 'tech_guide', + 'social_media', + 'climate' + ] + + for method_name in methods: + with self.subTest(method=method_name): + method = getattr(data, method_name) + response = method() + self.assertNotIn("error", response, f"{method_name} returned an error") + if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/ycnbc/base.py b/ycnbc/base.py index 6c79e9d..ad3e315 100644 --- a/ycnbc/base.py +++ b/ycnbc/base.py @@ -19,125 +19,123 @@ # limitations under the License. # -from __future__ import print_function +from .utils import CNBCNews -from .utils import getnews, latest, trending -class News(): +class News: + def __init__(self): + self.news = CNBCNews() def latest(self): - return latest() + return self.news.latest() def trending(self): - return trending() + return self.news.trending() def economy(self): - return getnews('economy') + return self.news.by_category('economy') - def jobs(self): - return getnews('jobs') + def jobs(self): + return self.news.by_category('jobs') - def white_house(self): - return getnews('white-house') + def white_house(self): + return self.news.by_category('white-house') - def hospitals(self): - return getnews('hospitals') + def hospitals(self): + return self.news.by_category('hospitals') - def transportation(self): - return getnews('transportation') + def transportation(self): + return self.news.by_category('transportation') - def jobs(self): - return getnews('jobs') + def media(self): + return self.news.by_category('media') - def climate(self): - return getnews('climate') + def internet(self): + return self.news.by_category('internet') - def media(self): - return getnews('media') + def congress(self): + return self.news.by_category('congress') - def internet(self): - return getnews('internet') + def policy(self): + return self.news.by_category('policy') - def congress(self): - return getnews('congress') + def finance(self): + return self.news.by_category('finance') - def policy(self): - return getnews('policy') + def life(self): + return self.news.by_category('life') - def finance(self): - return getnews('finance') + def defense(self): + return self.news.by_category('defense') - def life(self): - return getnews('life') - - def defense(self): - return getnews('defense') - - def europe_politics(self): - return getnews('europe-politics') - - def china_politics(self): - return getnews('china-politics') - - def asia_politics(self): - return getnews('asia-politics') - - def world_politics(self): - return getnews('world-politics') - - def equity_opportunity(self): - return getnews('equity-opportunity') - - def politics(self): - return getnews('politics') + def europe_politics(self): + return self.news.by_category('europe-politics') - def wealth(self): - return getnews('wealth') + def china_politics(self): + return self.news.by_category('china-politics') - def world_economy(self): - return getnews('world-economy') + def asia_politics(self): + return self.news.by_category('asia-politics') - def central_banks(self): - return getnews('central-banks') + def world_politics(self): + return self.news.by_category('world-politics') - def real_estate(self): - return getnews('real-estate') + def equity_opportunity(self): + return self.news.by_category('equity-opportunity') - def health_science(self): - return getnews('health-and-science') + def politics(self): + return self.news.by_category('politics') - def small_business(self): - return getnews('small-business') + def wealth(self): + return self.news.by_category('wealth') + + def world_economy(self): + return self.news.by_category('world-economy') + + def central_banks(self): + return self.news.by_category('central-banks') + + def real_estate(self): + return self.news.by_category('real-estate') + + def health_science(self): + return self.news.by_category('health-and-science') + + def small_business(self): + return self.news.by_category('small-business') def lifehealth_insurance(self): - return getnews('life-and-health-insurance') + return self.news.by_category('life-and-health-insurance') def business(self): - return getnews('business') - + return self.news.by_category('business') + def energy(self): - return getnews('energy') + return self.news.by_category('energy') def industrials(self): - return getnews('industrials') + return self.news.by_category('industrials') def retail(self): - return getnews('retail') - + return self.news.by_category('retail') + def cybersecurity(self): - return getnews('cybersecurity') - - def mobile(self): - return getnews('mobile') + return self.news.by_category('cybersecurity') def mobile(self): - return getnews('technology') - + return self.news.by_category('mobile') + + def technology(self): + return self.news.by_category('technology') + def cnbc_disruptors(self): - return getnews('cnbc-disruptors') - + return self.news.by_category('cnbc-disruptors') + def tech_guide(self): - return getnews('tech-guide') - + return self.news.by_category('tech-guide') + def social_media(self): - return getnews('social-media') \ No newline at end of file + return self.news.by_category('social-media') + + def climate(self): + return self.news.by_category('climate') diff --git a/ycnbc/uri.py b/ycnbc/uri.py index 4331382..a522230 100644 --- a/ycnbc/uri.py +++ b/ycnbc/uri.py @@ -21,8 +21,10 @@ _BASE_URL_ = 'https://www.cnbc.com' _HEADERS_ = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,' + 'application/signed-exchange;v=b3', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/39.0.2171.95 Safari/537.36' } \ No newline at end of file diff --git a/ycnbc/utils.py b/ycnbc/utils.py index 5693188..dd0592f 100644 --- a/ycnbc/utils.py +++ b/ycnbc/utils.py @@ -18,107 +18,147 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from .uri import _BASE_URL_, _HEADERS_ -from requests import get +from __future__ import print_function from lxml import html -from pandas import DataFrame - -def trending(): - try: - page = get(_BASE_URL_, headers=_HEADERS_) - tree = html.fromstring(page.content) - except Exception: - pass - - trending_news = tree.xpath("//li[contains(@class, 'TrendingNowItem')]") - assert len(trending_news) > 0, 'Data Not Found' - title, source = [], [] - - for i in trending_news: - text = i.xpath(".//a/text()") - link = list(i.iterlinks())[0][2] - - title.append(' '.join(text)) - source.append(link) - - data = { - 'Title': title, - 'Link': source - } - df = build_df(data) - return df - - -def latest(): - try: - page = get(_BASE_URL_, headers=_HEADERS_) - tree = html.fromstring(page.content) - except Exception: - pass - - source, title, posttime = [], [], [] - - links = tree.xpath("//a[contains(@class, 'LatestNews')]") - assert len(links) > 0, 'Data Not Found' - - latest_news = tree.xpath("//ul[contains(@class, 'LatestNews')]") - assert len(latest_news) > 0, 'Data Not Found' - - for i in links: - source.append(list(i.iterlinks())[0][2]) - for i in latest_news: - el = i.xpath("li") - for rs in el: - text = rs.xpath(".//a/text()") - posttime_ = rs.xpath(".//span/time/text()") - - title.append(' '.join(text)) - posttime.append(' '.join(posttime_)) - - data = { - 'Headline': title, - 'Post Time': posttime, - 'Link': source - } - df = build_df(data) - return df - -def getnews(category): - try: - page = get("{}/{}".format(_BASE_URL_, category), headers=_HEADERS_) - tree = html.fromstring(page.content) - - source, title, posttime = [], [], [] - news = tree.xpath("//div[contains(@class, 'Card-titleContainer')]") - assert len(news)>0, 'Data Not Found' - - posttime_news = tree.xpath("//span[contains(@class, 'Card-time')]") - assert len(posttime_news)>0, 'Data Not Found' - - for i in posttime_news: - text = i.xpath(".//text()") - posttime.append(' '.join(text)) - for i in news: - text = i.xpath("..//div/text()") - - source.append(list(i.iterlinks())[0][2]) - title.append(' '.join(text)) - - data = { - 'Headline': title, - 'Post Time': posttime, - 'Link': source - } +from requests import get +from .uri import _BASE_URL_, _HEADERS_ - return build_df(data) - except: - msg = { - 'data': [None], - 'msg': ['This page or category contains news with PRO tags.'] - } - return build_df(msg) -def build_df(values): - df = DataFrame(data=values) - df = df.convert_dtypes() - return df \ No newline at end of file +class CNBCNews: + def __init__(self): + self.base_url = _BASE_URL_ + self.headers = _HEADERS_ + + def _fetch_page(self, endpoint=""): + """ + Fetches and parses the web page content. + + Args: + endpoint (str): The specific endpoint to fetch data from. + + Returns: + html.Element: Parsed HTML tree if successful, otherwise an error dictionary. + """ + try: + url = f"{self.base_url}/{endpoint}" if endpoint else self.base_url + page = get(url, headers=self.headers) + page.raise_for_status() # Ensure we raise an error for bad HTTP responses + return html.fromstring(page.content) + except Exception as e: + return {"error": str(e)} + + def trending(self): + """ + Fetches trending news. + + Returns: + dict: Dictionary containing titles and links of trending news, or an error message. + """ + try: + tree = self._fetch_page() + if "error" in tree: + return tree + + trending_news = tree.xpath("//li[contains(@class, 'TrendingNowItem')]") + if not trending_news: + return {"error": "Data Not Found"} + + title, source = [], [] + for i in trending_news: + text = i.xpath(".//a/text()") + link = list(i.iterlinks())[0][2] if list(i.iterlinks()) else None + title.append(' '.join(text)) + source.append(link) + + return { + 'Title': title, + 'Link': source + } + except Exception as e: + return {"error": str(e)} + + def latest(self): + """ + Fetches the latest news. + + Returns: + dict: Dictionary containing headlines, post times, and links of the latest news, or an error message. + """ + try: + tree = self._fetch_page() + if "error" in tree: + return tree + + source, title, posttime = [], [], [] + + links = tree.xpath("//a[contains(@class, 'LatestNews')]") + if not links: + return {"error": "No Latest News links found"} + + latest_news = tree.xpath("//ul[contains(@class, 'LatestNews')]") + if not latest_news: + return {"error": "No Latest News list found"} + + for i in links: + link = list(i.iterlinks())[0][2] if list(i.iterlinks()) else None + source.append(link) + + for i in latest_news: + el = i.xpath("li") + for rs in el: + text = rs.xpath(".//a/text()") + posttime_ = rs.xpath(".//span/time/text()") + + title.append(' '.join(text)) + posttime.append(' '.join(posttime_)) + + return { + 'Headline': title, + 'Post Time': posttime, + 'Link': source + } + except Exception as e: + return {"error": str(e)} + + def by_category(self, category): + """ + Fetches news based on the category. + + Args: + category (str): The news category to fetch. + + Returns: dict: Dictionary containing headlines, post times, and links for the specified category, or an error + message. + """ + try: + tree = self._fetch_page(category) + if "error" in tree: + return tree + + source, title, posttime = [], [], [] + + news = tree.xpath("//div[contains(@class, 'Card-titleContainer')]") + if not news: + return {"error": "No news items found"} + + posttime_news = tree.xpath("//span[contains(@class, 'Card-time')]") + if not posttime_news: + return {"error": "No post time found"} + + for i in posttime_news: + text = i.xpath(".//text()") + posttime.append(' '.join(text)) + + for i in news: + text = i.xpath("..//div/text()") + link = list(i.iterlinks())[0][2] if list(i.iterlinks()) else None + source.append(link) + title.append(' '.join(text)) + + return { + 'Headline': title, + 'Post Time': posttime, + 'Link': source + } + except Exception as e: + return {"error": str(e)} diff --git a/ycnbc/version.py b/ycnbc/version.py index 2745891..4920851 100644 --- a/ycnbc/version.py +++ b/ycnbc/version.py @@ -1 +1 @@ -version = "1.0.4" \ No newline at end of file +version = "1.0.5"