From 911f5033cf74c49707642766b4c33d614703bc6e Mon Sep 17 00:00:00 2001 From: Andrei Paraschiv Date: Wed, 17 Jan 2024 00:58:41 +0200 Subject: [PATCH] Documentation changes v0.9.2 (#604) * feat(doc): :memo: adding evaluation results * feat(doc): :rocket: Documentation Update. Added Examples, documented new features --- README.md | 48 ++-- docs/user_guide/advanced.rst | 114 +++++++++- docs/user_guide/api_reference.rst | 23 ++ docs/user_guide/assets/chatgpt_chat200x75.png | Bin 0 -> 10461 bytes ...chatgpt_chat.png => chatgpt_chat75x75.png} | Bin docs/user_guide/examples.rst | 210 +++++++++++++++++- newspaper/api.py | 21 +- newspaper/article.py | 6 +- newspaper/configuration.py | 28 ++- newspaper/exceptions.py | 2 +- newspaper/mthreading.py | 24 +- newspaper/source.py | 15 +- tests/test_source.py | 6 + 13 files changed, 435 insertions(+), 62 deletions(-) create mode 100644 docs/user_guide/assets/chatgpt_chat200x75.png rename docs/user_guide/assets/{chatgpt_chat.png => chatgpt_chat75x75.png} (100%) diff --git a/README.md b/README.md index 1d75baa..b05834c 100755 --- a/README.md +++ b/README.md @@ -4,12 +4,15 @@ [![Coverage status](https://coveralls.io/repos/github/AndyTheFactory/newspaper4k/badge.svg?branch=master)](https://coveralls.io/github/AndyTheFactory/newspaper4k) [![Documentation Status](https://readthedocs.org/projects/newspaper4k/badge/?version=latest)](https://newspaper4k.readthedocs.io/en/latest/) -At the moment the Newspaper4k Project is a fork of the well known newspaper3k by [codelucas](https://github.com/codelucas/newspaper) which was not updated since Sept 2020. The initial goal of this fork is to keep the project alive and to add new features and fix bugs. +At the moment the Newspaper4k Project is a fork of the well known newspaper3k by [codelucas](https://github.com/codelucas/newspaper) which was not updated since September 2020. The initial goal of this fork is to keep the project alive and to add new features and fix bugs. 
I have duplicated all issues on the original project and will try to fix them. If you have any issues or feature requests please open an issue here. -**Experimental ChatGPT helper bot for Newspaper4k:** -[![ChatGPT helper](docs/user_guide/assets/chatgpt_chat.png)](https://chat.openai.com/g/g-OxSqyKAhi-newspaper-4k-gpt) +| | | +|-------------|-------------| +| **Experimental ChatGPT helper bot for Newspaper4k:** | [![ChatGPT helper](docs/user_guide/assets/chatgpt_chat200x75.png)](https://chat.openai.com/g/g-OxSqyKAhi-newspaper-4k-gpt)| + + ## Python compatibility - Recommended: Python 3.8+ @@ -29,10 +32,10 @@ You can start directly from the command line, using the included CLI: python -m newspaper --url="https://edition.cnn.com/2023/11/17/success/job-seekers-use-ai/index.html" --language=en --output-format=json --output-file=article.json ``` - +More information about the CLI can be found in the [CLI documentation](https://newspaper4k.readthedocs.io/en/latest/user_guide/cli_reference.html). 
## Using the Python API -Alternatively, you can use the Python API: +Alternatively, you can use Newspaper4k in Python: ### Processing one article / url at a time @@ -82,22 +85,22 @@ import newspaper cnn_paper = newspaper.build('http://cnn.com', number_threads=3) print(cnn_paper.category_urls()) -> ['https://cnn.com', 'https://money.cnn.com', 'https://arabic.cnn.com', -> 'https://cnnespanol.cnn.com', 'http://edition.cnn.com', -> 'https://edition.cnn.com', 'https://us.cnn.com', 'https://www.cnn.com'] +>> ['https://cnn.com', 'https://money.cnn.com', 'https://arabic.cnn.com', +>> 'https://cnnespanol.cnn.com', 'http://edition.cnn.com', +>> 'https://edition.cnn.com', 'https://us.cnn.com', 'https://www.cnn.com'] article_urls = [article.url for article in cnn_paper.articles] print(article_urls[:3]) -> ['https://arabic.cnn.com/middle-east/article/2023/10/30/number-of-hostages-held-in-gaza-now-up-to-239-idf-spokesperson', -> 'https://arabic.cnn.com/middle-east/video/2023/10/30/v146619-sotu-sullivan-hostage-negotiations', -> 'https://arabic.cnn.com/middle-east/article/2023/10/29/norwegian-pm-israel-gaza'] +>> ['https://arabic.cnn.com/middle-east/article/2023/10/30/number-of-hostages-held-in-gaza-now-up-to-239-idf-spokesperson', +>> 'https://arabic.cnn.com/middle-east/video/2023/10/30/v146619-sotu-sullivan-hostage-negotiations', +>> 'https://arabic.cnn.com/middle-east/article/2023/10/29/norwegian-pm-israel-gaza'] article = cnn_paper.articles[0] article.download() article.parse() print(article.title) -> المتحدث باسم الجيش الإسرائيلي: عدد الرهائن المحتجزين في غزة يصل إلى +>> المتحدث باسم الجيش الإسرائيلي: عدد الرهائن المحتجزين في غزة يصل إلى ``` Or if you want to get bulk articles from the website (have in mind that this could take a long time and could get your IP blocked by the newssite): @@ -130,7 +133,7 @@ article.download() article.parse() print(article.title) -> 晶片大战:台湾厂商助攻华为突破美国封锁? +>> 晶片大战:台湾厂商助攻华为突破美国封锁? 
if article.config.use_meta_language: # If we use the autodetected language, this config attribute will be true @@ -138,7 +141,7 @@ if article.config.use_meta_language: else: print(article.config.language) -> zh +>> zh ``` # Docs @@ -158,8 +161,25 @@ detailed guides using newspaper. - Autoatic article text summarization - Author extraction from text - Easy to use Command Line Interface (`python -m newspaper....`) +- Output in various formats (json, csv, text) - Works in 10+ languages (English, Chinese, German, Arabic, \...) +# Evaluation + +## Evaluation Results + + +Using the dataset from [ScrapingHub](https://github.com/scrapinghub/article-extraction-benchmark) I created an [evaluator script](tests/evaluation/evaluate.py) that compares the performance of newspaper against its previous versions. This way we can see how newspaper updates improve or worsen the performance of the library. + +| Version | Corpus BLEU Score | Corpus Precision Score | Corpus Recall Score | Corpus F1 Score | +|--------------------|-------------------|------------------------|---------------------|-----------------| +| Newspaper3k 0.2.8 | 0.8660 | 0.9128 | 0.9071 | 0.9100 | +| Newspaper4k 0.9.0 | 0.9212 | 0.8992 | 0.9336 | 0.9161 | +| Newspaper4k 0.9.1 | 0.9224 | 0.8895 | 0.9242 | 0.9065 | +| Newspaper4k 0.9.2 | 0.9426 | 0.9070 | 0.9087 | 0.9078 | + +Precision, Recall and F1 are computed using overlap of shingles with n-grams of size 4. The corpus BLEU score is computed using the [nltk's bleu_score](https://www.nltk.org/api/nltk.translate.bleu). 
+ # Requirements and dependencies Following system packages are required: diff --git a/docs/user_guide/advanced.rst b/docs/user_guide/advanced.rst index bbc2a83..d224332 100755 --- a/docs/user_guide/advanced.rst +++ b/docs/user_guide/advanced.rst @@ -11,7 +11,7 @@ Multi-threading article downloads **Downloading articles one at a time is slow.** But spamming a single news source like cnn.com with tons of threads or with ASYNC-IO will cause rate limiting -and also doing that is very mean. +and also doing that can lead to your IP being blocked by the site. We solve this problem by allocating 1-2 threads per news source to both greatly speed up the download time while being respectful. @@ -19,22 +19,50 @@ speed up the download time while being respectful. .. code-block:: python import newspaper - from newspaper import news_pool + from newspaper.mthreading import fetch_news slate_paper = newspaper.build('http://slate.com') tc_paper = newspaper.build('http://techcrunch.com') espn_paper = newspaper.build('http://espn.com') papers = [slate_paper, tc_paper, espn_paper] - news_pool.set(papers, threads_per_source=2) # (3*2) = 6 threads total - news_pool.join() + results = fetch_news(papers, threads=4) + #At this point, you can safely assume that download() has been #called on every single article for all 3 sources. - print(slate_paper.articles[10].html) + print(slate_paper.articles[10].title) #' ...' + +In addition to :any:`Source` objects, :any:`fetch_news` also accepts :any:`Article` objects or simple urls. + +.. 
code-block:: python + + article_urls = [f'https://abcnews.go.com/US/x/story?id={i}' for i in range(106379500, 106379520)] + articles = [Article(url=u) for u in article_urls] + + results = fetch_news(articles, threads=4) + + urls = [ + "https://www.foxnews.com/media/homeowner-new-florida-bill-close-squatting-loophole-return-some-fairness", + "https://edition.cnn.com/2023/12/27/middleeast/dutch-diplomat-humanitarian-aid-gaza-sigrid-kaag-intl/index.html", + ] + + results = fetch_news(urls, threads=4) + + # or everything at once + papers = [slate_paper, tc_paper, espn_paper] + papers.extend(articles) + papers.extend(urls) + + results = fetch_news(papers, threads=4) + + +**Note:** in previous versions of newspaper, this could be done with the ``news_pool`` call, but it was not very robust +and was replaced with a ThreadPoolExecutor implementation. + Keeping just the Html of the main body article ------------------------------------------------ @@ -191,12 +219,84 @@ The full available options are available under the :any:`Configuration` section Caching ------- -TODO +The Newspaper4k library provides a simple caching mechanism that can be used to avoid repeatedly downloading the same article. Additionally, when building an :any:`Source` object, the category url detection is cached for 24 hours. + +Both mechanisms are enabled by default. The article caching is controlled by the ``memoize_articles`` parameter in the :any:`newspaper.build()` function or, alternatively, when creating an :any:`Source` object, the ``memoize_articles`` parameter in the constructor. Setting it to ``False`` will disable the caching mechanism. + +The category detection caching is controlled by `utils.cache_disk.enabled` setting. This disables the caching decorator on the ``Source._get_category_urls(..)`` method. + +For example: + +.. 
code-block:: python + + import newspaper + from newspaper import utils + + cbs_paper = newspaper.build('http://cbs.com') + + # Disable category url caching + utils.cache_disk.enabled = False + + cbs_paper2 = newspaper.build('http://cbs.com') # The categories will be re-detected + + # Enable category url caching + utils.cache_disk.enabled = True + + cbs_paper3 = newspaper.build('http://cbs.com') # The cached category urls will be loaded + + Proxy Usage -------------- -TODO +Oftentimes websites block repeated access from a single IP address. Or, some websites might limit access from certain geographic locations (due to legal reasons, etc.). To bypass these restrictions, you can use a proxy. Newspaper supports using a proxy by passing the ``proxies`` parameter to the :any:`Article` object's constructor or :any:`Source` object's constructor. The ``proxies`` parameter should be a dictionary, as required by the ``requests`` library, with the following format: + +.. code-block:: python + + from newspaper import Article + + # Define your proxy + proxies = { + 'http': 'http://your_http_proxy:port', + 'https': 'https://your_https_proxy:port' + } + + # URL of the article you want to scrape + url = 'https://abcnews.go.com/Technology/wireStory/indonesias-mount-marapi-erupts-leading-evacuations-reported-casualties-106358667' + + # Create an Article object, passing the proxies parameter + article = Article(url, proxies=proxies) + + # Download and parse the article + article.download() + article.parse() + + # Access the article's title and text + print("Title:", article.title) + print("Text:", article.text) + +or the shorter version: + +.. 
code-block:: python + + from newspaper import article + + # Define your proxy + proxies = { + 'http': 'http://your_http_proxy:port', + 'https': 'https://your_https_proxy:port' + } + + # URL of the article you want to scrape + url = 'https://abcnews.go.com/Technology/wireStory/indonesias-mount-marapi-erupts-leading-evacuations-reported-casualties-106358667' + + # Create an Article object, + article = article(url, proxies=proxies) + + # Access the article's text, keywords, and summary + print("Title:", article.title) + print("Text:", article.text) + Cookie Usage (simulate logged in user) -------------------------------------- diff --git a/docs/user_guide/api_reference.rst b/docs/user_guide/api_reference.rst index 4f7d7da..13b39f0 100755 --- a/docs/user_guide/api_reference.rst +++ b/docs/user_guide/api_reference.rst @@ -6,6 +6,20 @@ Newspaper API .. autosummary:: :toctree: generated +Function calls +-------------- + +.. autofunction:: newspaper.article + +.. autofunction:: newspaper.build + +.. autofunction:: newspaper.mthreading.fetch_news + +.. autofunction:: newspaper.hot + +.. autofunction:: newspaper.languages + + Configuration ------------- @@ -44,7 +58,9 @@ Source .. automethod:: newspaper.Source.purge_articles() .. automethod:: newspaper.Source.feeds_to_articles() .. automethod:: newspaper.Source.categories_to_articles() +.. automethod:: newspaper.Source.generate_articles() .. automethod:: newspaper.Source.download_articles() +.. automethod:: newspaper.Source.download() .. automethod:: newspaper.Source.size() Category @@ -55,3 +71,10 @@ Category Feed ---- .. autoclass:: newspaper.source.Feed + + +Exceptions +---------- +.. autoclass:: newspaper.ArticleException + +.. 
autoclass:: newspaper.ArticleBinaryDataException diff --git a/docs/user_guide/assets/chatgpt_chat200x75.png b/docs/user_guide/assets/chatgpt_chat200x75.png new file mode 100644 index 0000000000000000000000000000000000000000..3eac3040a9d17ec4ddc277ff3d61d981ecfec735 GIT binary patch literal 10461 zcmV<3C?eO1P)Px#1ZP1_K>z@;j|==^1poj532;bRa{vGi!~g&e!~vBn4jTXfC~`?eK~#8N?VSgl z74_Bk|HH!4cM*`Ph>D1U*bp0nD9u=6*VrY-mdDp5K6!ld#8;zl^2BKJYD}?fjG`g( z#2QgVuwkc&6g!HAfTHYD7k2qS=R0%H-LvD|duN+~-XEXCoxN@5&Ybf<{fsN3h$3<$ z$UJ`SD4Tdyi9Bp8)K6%~%yw6y4nn-PUTCGzQlZ5{??RO)UC!S{6wz3W5b7-SkA zk%X^RKtw3I$ipz9aY6?OAsFq2(&7uDH-(-Rnk@9P2uzhcHWr+yPC{O25l`LpNOwTKtwPOl83W}z9ZC~=b8v_2~8CG zs|boc0rkOtLN7*}Ay`p(?$hwETKq}q!e}!T?DwP1A~r*uCUkeSnFWj6+-S3ofUFa0 z8*P3S%_<-w7-))1g$@p%Xg;hK`kT2Jw26YY<`1f0u_Vh1!KrY#OkFTqN4$iRjO~uo~n;q3xng_o&d3(PoH38l;ub zfkL|pwG?_o=oO(#Aq9>he%MB#2YJF^h)^FPe4Gb(!e9@f9imP5kIW<3Mu-K7<5?xd zKBovN*cba`wS1B%1SbjYB7{l$oX`hCNf;`GX|=#c60rSQ**RwfWjOw4gyPsoh<$KfIj5tA_+9vmw+ktH3h^BNBEPGzkb>_} zXFQ(<*-MD?$#0(8WC9|B(M=vW*dxLxHV^z#gyjEbwsXM>$o~_v@5g6C3xa$CGE(Tz zLIc9TYJnefvCyROugJNFp}_G%{}XM7m?1>q0vs;%ShN`e<{lMohVQ{=x?G5bC=M&w zKNkA`(WVAO2N~V}&?pai4U)5c_y4{3`-N!|`JxVM<>k1U3kjMw=mS5;`y1biIW>i8e!| z5fB!e>CtA0K}}*}iEyw2uu^PJ!T~OmV|zfTW9IKd_*lrUNcc~Mz-l~ANQDEY=OTh$ zLiY=?`T@*N#i5x;z$&f60qkpNCN)#J;0Oo8rNV*MgE3UIuzDte$X{GIz#oJVaK*_& zDjYC98VN@hY%PS4aZGXSDs+dC6%Ifo=^-Jz81wr$PY6e~5DSzp7DS);EiiE$Xi^o9 zIQA7%;rNSCJjhZA@QC$Ae=0P;(E=jO$AyJKlnLq$&d^G+EGoSZ^=uR4XrU+MI8ph9 zz(rvNRG`LBj5c!un^*B8A($7fbSh6|@SmAS)keSyLEKL}B=ZP(4U-M%=LcmT!I?rA z2puN`6Cy0RaI+9hwzH6p{hyn81Vnm%Cxqa`tYd{Rn=owv`6^z^g&zv>eq~mGDP{Lz z&lwt(bH#asDFF*c(&D82ipD!k=-1(25zwd#BBu2PLWzk6&}?YK#yRQ2bc93yDU^o! 
zvfwHr2$eM(T;+zfZcWX4S1!-?YSF^2+gKewS6N*p0<*EsveO8#ay}-0q28+ zq?#o_FIp0KKOsfHO8)z3GlX4@U^IY#X1aCdR)2R(MoEZ0_fDq3xs1 z5Y|VrxJwA%;B}$5g{&!wiKKur@FU4LS*U;jYuj7=n-I)}#<@l4S|Nol2z*u-0IUv< zM`6#IHCOS=6cdCn3m+G{f+qwg2w@rmjzM8Rmk-wosR_&XBIF?&EFi*c`~aLb5^Sl# z4l6`35QP8ewv(H@_c8vhL(laK$6e@Vjr*ZnH2MPn!hxsy8+O>&@7c1AtEgTl-luO0 zLBR%{B*&f01hGJitc^Cq6%LJ%1iO;<5aKhI6gb}sE`UOrMup(E%p*`HvcjXEwZR9X zclbm2IV@PYuq^Yacp>u$xQ20)=DR|y&@RK!818 zHkYNrk~pltq)=gBE`+bBu-70T(EI>@Nd+xLg9JpFj4ZIcgyXjU!~HcQPjdsh_Xz~i zi-L_}PSuDISaj`B?#|e2R4{k=8GirLc5cP`)sgw7B;QvlpQ-z)f;d|QJgY+U00E&y z0ib=70uY#03QPqACQd}W1`8?B0(!^@focdYCMJi5V|gvfyi)K!9AkWtr+Fd+gjr3u ze9*8lV5QX>H}-=w4mbyddNGTrSg0?oNvkD1_M8#UPC92CTiSq+4ZvV(5;n#!=Dgbd z6WQZjCE*8|M?jTY-O)VtiTp1t2cw(u4I*{a&m`EI>-2qug;-9Nn7cl}GlPWP`Ibf$l5 z=m{QSJFTp{2%h*WVz$-847)~tM_QF&|FxW|I zr<@(&b6m1@KmY5Irv$A^BJ=gbFaG6!_uAy(51+~TK$8R|(W7V*C;o|R+1CAI;GzDz z1NRSF1i9t7s(OQabLnFLmj%xS*UNX6v}`S05Txg z`3;)~VZ~J^moO*cR~&H?rG!g)jSHyz0BEqdEe$gwkO-m`JV0cLS|SC}D1r%GSeOA2 z*3>AB#{6v_1i#RRmzO+IiP;90=_NJIh_ig+G`g+b3H zAOLVxeKiP)w0cFXR>P_-TDS}LIMk_dyuIurKW^Hc!5@~t@5)-Wb!Fm=mzA_~om$B2 zEw~1O8!wt+n=*ElUxr(YHrl=Sj={BuobLW-@L|42v{Hpn<@c(+fVoke>DZq52CH&aZpyJtP+e>>v=fBlOO z`3L6B@b7*8sTYLb(Xyn(9Xn`PFm0b>z1z4!jOr9Rgm`?rHZ~KR1Xg}B$CySi8P&el zfux**b7Lb*dR{pQ7&fljiPl&Lo6}`0iftAh#03g-fK?h`cdM<Jm_yf1w)palH9PaDx*-!a17EBE~ zwQ3h~&l>rEgZL2J%iq85Kg^HoKO`t`8wFIVf@gxitssdkU!jgYrkH4) zD|-#}=ZJRdRo*rGTke}P-JddlYEa&)O&T%zftXr)Qdq?r15_joi^d?Lf@wt_G#9j9 zO`}s5`Wh=-uJ6EkCJM&q5nAkT3u|H#Siu1$e2VFWnPcn3;^22s34n&uUStxP2# zA{@l~XA9-cA@F4t!t5uO^>F__c$D9xcfYL90ajJ5b;mt^lba%1rIYv-D{AE0%i|4w zhxp@nIl%QS?~*;9`Aa_Vx6OMxxPF=VUSfXnK2R=?FgETVAlb4;3M?4&65eRzS#U*bH`4AxS{KF-U7-ku6Rk9P8Y# z`|Rm&8-;l^ppI~WWh=k*Q`UUx+94dG9WL$G-@Sdv4?H;!6^@U-T;{KN{vUqO)3*oL ze=duc_$N5+!%N${{YA^L56m&Z0d%IcM1kG`EpS}Dg`yU;2Y!g`rgmI_7J=4!yvMA)}H5dqy12^^I>GJKZJSUqzF1x9?wU2#@BHM=!KI651tn+=d2d${rU|=_@(+zW-#sdH z_b!L}_F`tSKT0|I;!g73$5$yBArDu7&e`<9MWKfZ|ENTWem zVe=KIg+9m0@ny0Q}!4cbZXndb#B+uowny8!NSq!`Kv`! 
zt*u@khLA%p5KVH&;yFRZ+9+JVRl71bs;s;0E0yImS{8!(^u})~xL$~)7eq9zIY6Ny z-%o_>8)aUBGy-BPOTPlb6gcFUh|i&#W5wE4?#IvE8(h2e&A^p*aM?UYMm`iUHuS*z zvx8T}FL5nJS&0et!!CW@8wZc~KOJ^#uuZ3}v!>UQishj!Y`ayr;L>5o1#=HR+h5pa zJ6Ew`t$dMp&#kR+ODd!Hl(cB!`nKQ7b8IPqlW6k{-y{(6g2GGH48UpT`zi1E#vzS> zBR@=J4y>`s9|DCg+^7pLx$ZduH0MyLMJSd_Mcvp%eEhZBo-f8=u=&w8U z3^%Cf_TkHzUK3_d^E-*@HA3jY`7ij@>!W}K8TRWAJv02wxVGiatzYFTtJY+XqixI5 z>~;#+evBfD2$5GnGG<3!^U8^zF{al5Uz%~q7cF1nfAZnXptMz+I(hyGz7E5N@*@AW zE-CSU88p^U9CNN4*>8`mb5EFe(t;WOw7HK4RpM~JBj(gGa~=)GPr1##_`!T%vvH$i zzkG(z-nrZ1zGG?ItQfA`u+C@0!i7Qhxrid_ppXV34o;js5g{iE|MlyvNnGbX-REA0A{aNPCLSDLL-y_|bdyUHGKUS`b!t5F`h;2rGwkoH zK6eMqm=OGI`UL;h(#7F-mdR(&+Iw`cbN4>s&em6NaBr>rT-=^CN;1UGvLFsIV%Z9RAQhcBoODjEPM+-#_wNz*nem8pWH)Y zO05x7icI?Mz54|>j5;gKu#awCCOcjqI_1|cdnXt$?Y7{C7a#FUE58VL$@{^|s%PQ)XqLO3a{K5+P3~zCz?_;_J`&VN-4jezfT2 z;Q94y+|i*ab-R1%g9X02rY5YpI&`PKg8Rpw>+T&m+S7fg+4PGSy%ZcIpMCP}nOV8) z(!MPC{h(24b0L62IW2c6qKFWM1SD^ZYUn8llZw-vWy~J!#94-MlMn>Luw*Y3k-S{@ z@+0o#x&I8_7oUT~Uzx1>om#bZ6IU&B`#*bUFzwyi+P(6&9o+Z!7#X}W`h5SpzC(Oi z`CC#N`X^&vxj&#rEAv)2rvH%Or`-p5O5sz0m~%s#f{JJwP+<&hRwyJOS`U8oY_kx#)kM{%0 zx@9YIThtM!uR;%^Qk?fApS(4gH1|2rHei28_uM(uOyyX5LU%2A&Jp1PJ9pbAI6;nQ zRl0mg3OOpGnPEEwH#rIkfSsO^h?0_)?w8^$(*VWa?MvIZ{~SCzWLWw#&`sFgZU=RdrmB?kS9d| zb_pKY^Qg!{_BC#XINjr3`$ur$)H}nz4&Fxy7vSHI=-FSSqa>`af{i&S?GM?tO}X2? zQ}^hL1yu9WZTQZ*$}mq7M;Km~@AE5W@II=35lD`O&oF?FSy((z zla+$;t^9sSg=&+bkbvZxERhAA+fkh4pMN|jc)0cczi935 ztoYOodG4;@7tj1HY<9}L)g3eFz~I9%7x@3)b`QU%dVMGuZAEbI95B*9d)S3;&uw;! 
zD;L+gK=_tAEk!#@SPBE1t>A*tNHB2`1_1PCGdoHq-1|X|EPT{m`+r_LdLps#8U=s41f5v$=rE)N&TO52^ox0qI7h58DoA zQ)CwrgF*sAp$y@RIgqjJ zLSUNp7GcyPN3eLMm|F!jj^LBZlP*s?XsjW?9#oqUTiK8(^2`iFn>-gQY)Y3f0LP4Lnf(eg0tJk&#U?OV#9d&IdP@+MSs-v-c8PbNyb%zlB z_e?OcJVr!Er<-hYc-!hekU%!MSODReGi@Y=9q1O!l_6Amr#C+w6D zOdH!rub;6aN8{7lnX6^nW$k+v^!DT1+x>?T9OWmS-*`V*n^t^iK>Z^SRzUQDUm zUwgtYUmaO5+jr?1TrvElVAk+6J!S1T?=;+hcKDCnetmZhFMN$S)AXU!GJDkQhXW`6 zNUMxPUkG$5_HEV9oidOSP!Zn#>?8lTiY2an;qQn~5@`;cBV;E#GCqh2AdL0FBultL#!TG${*QrS)}Eop7wg5k!m6KMp*-lrii=s8zx(?LvG zao&kio%7Z+!4Xf~>>gYAvR}KQDtr#d`}$F5x^ssfCVobg=zC|`$NsYE_xk;2+!Nfl zQWhl2?+%wAS6l%786Co$_%m7 zLCh9OVbg{Ph^>f7YZ2J8o1~#{wf1x^>~l%`$#wZ^BPyF2(g=ufSmNY|x)7%Su(Eap z?XKNqu@M1*F)15)Ni@OXvmOf0d-4wV>f$&3Mic5KJO1aF|LOZryEV9a>03e2QhdIR z)oz!PR&L^+NBfaG?jAA)8F?nY1AC@7$a|0Ow2fOha=ah1?ao;Sn*7$Q{#yAyW^NS# zv!R+|%uHIKF|fsL@&k2p2X>*IWt$Y8iUTKH9ccj5+A*F z2}pbl5J3TrP{Hu^xj%=^KP3s?CPP615kG`buMQK-jTWukGfUqOC^H8*+1G44*jH~@ z7cP0!SCxyP++F!EH}v_tgR7pI=x2TWPB`Cc%#@phAHMfO5M%_#Z`|l^?02A_KK6Wn z;^5)g9{1UwzU#k4Q;0COlJCyrbbW-0KZ{LbbM z@p9n_fm$couADXmXzvA2jsb_B+(W*>+ZKdkiC*DU+kwrD;Yk|BgQg|1}sE{uNZMspu$0py1vv(*5&4Z|v{g8gK$zB^E<*g2vsmE( zBpeAH$DL^eo`8cD4v^v!gi$jzQe4QJGwhxU*;Tp_M2m2O^RBXdS%sKX69$e7Cy_+Y zinb8veahQ2{XHMd3LaP`?u-aZh?16We0eW-X8-;De%tMmo%+uEr@k}GU-{N^!JMj< z4xgkfbEp-fX?CDMa_j}3X^oVv-2cY2{{F>tgGrTQnV>a9kkVY04DJx^0YiOVV12+T zXVt`XsheH>DyUOpvY{Pl&1YF_lu6?ngp&o~*AQ(=@;j^sVJVS4%3xm}s-DNJr}hw$PD3;cBpo(>)qUxo4%CNQdL78XBlSmUM) zJ;9IezjyfEOgriG_#Z-Zs=WwKU4ttX4iEvcb8)vsBne0%SOGyulkP44tw(AB0U{*W zqj}8{zK~m{)rgS%X~zTnDT7CZGpz~%f91cP|GT?&+1r6DZ5K{@SNnpcaveU4uW^%h zJIo)q+pz3$GsybKi(U=NMVRWLH55}tIQUIlI;?=O>ZWP@vI0V`D_6zuw;J_YaH}Rw zf4}t!BT=hyJF^a2iz1W~`VADP(@GnbybDB&1l?P; zb1m1c4Q8&b5P?|ls>B(Gszn?w>eAQUvd=M|2`1qO1ZbNR=S>OPig2WLree5Iep4Y6 zXG;KuXL?syOsX`uVWXW|!^V`N*xO$hkwLvQ2oNno(uo_G6jaGD?Fyn(VKr|XcWQ^O zu5Y_euC4e8tHr7QWZg>llif#Uqh0_r>*E!l`Hw3;r4RAhoiMHv{GB4l_D^I{rHWbX8BUTb?f$w5DK^% zP8@`e-}&qlf8)$2+?flf1*=4u8Ymp#MiCBrF^VW6FB+sl6rxFp)e{qM6woItLnqw3 
zlgoR!1H1I`eL8k^?OK(Fy&NB}`og{R`QqTN3aS^p>yRm7X@FS;2$$|H0+Lj0S40tY z&?o@`A|#X*G|7!w!fZS_5;E+eDVBj~9ukUZ91P|L3WnmlA{?}>7Ewe&G+ID_2ni7s zd&}g_6*q`**mlt(ipYxwMn~&Hn2rYglZ2YHWwo)OD%sw7_1g}z@sy1~O~MCF2s$D; zF8XCQDVjusC`6M`HjVRZOQ$ynP&P441UYT^GY&%91BJQ^Q8z?@f~r_bG~%F4B7R}1 zQ_>q@S}@`!Z6uJ|DoQ}ofiF!V_c6b=$;^Mt51qU@Vt1{4*-Y%o1jIvBcO!G7p1u|#x%ZaZ2| zx$UxoO4ay#bJ1NwX+T{WbxdT*DURfr?-xRtDS}K|6qEYLn+lB?DO$J$qjBI|aBMlF zxlOKGg%PWTm{LKb|E)rsvd0*||5Q01%&YnWL7+?<&5;X~W80hZGbAn7z;8?%`ZKr( zIm6Wk(U5EgMgzC<*e?I}#TNQjX!2nu}w zNhU^ff?(H~atuks*9rmCgFE{w=z%y<2vdu3G^l-;k<#^iw@lzOXw>mSWUCcd3DGTx zX`_AsL)6e#3d+n8BnZJOR6fEyEXD{a&`#_ZA?EWMxkFInXig2HXvkpj<`xhZG?4Qe zHmMJi58|ItqQW>ym;oOpK5nxqf#x_vgu`~gr3L-sc7Q`Z`@EYF$x^Lj;)L5i49dK9 zbz({&Qv?;1dBijMG^Np;1{mU;HsXfg!vciA0T9NdJA+Y}$?;%*Ss#Qx5t^wGX08pj zSxhyt0&Bs0Fco2XzK`;j_(DXR??poTdocI3yD?u+i}gV$f5GoNR0va++!fPQFy(WB za(znX=|<$(Fi+zXSXi`aI-3;!6@i6>MU~HFyDq;r0kIHa;X?95=vSaOf`f&bSTVTR z*oCRfSYn{b=9C6%4uGGa^Pez>>JxCtCm;Yu(w=Mj(V4VpVFlzmAv>QcA5IZs;F^MB zQmxd@1uGyh3JX~LhOKC?zebxOh*UC9m|#pC2o*jXivTSV`KIB(z%+m*!3qfeBbraI zhk|jcb`BSSm+`k~*Np>#`mvB!qBEkEImStVZ&<`bY!U%Uf(Q))p)*2R`R!2Uw4iC{ z1Q8CV1U3{zigXTC5)60ZHC}8R>0p7;#e$*?7c3;V#1=5 z1OghE#vwMF__r|i2BFoE{MnFBGFHrk8YvuTFgsYy4wGZzIgupH5ljUoR;Dz-M^I2E zkYv7OS}jA^?Lx4vqwF@vWG0>FF-GofTXPgn&^Vr29w zRsk!u0zVZ0jrZ5*7#Xh3cOg`Ev9kUa+bSGPjA0*CH*-Dl9~C+h6?0Pi`V~6vv`%;F z=9VB*)VG?1R^SmL9E}}U2}4-e&@ihagB374m_}|X7lx*ZnBAZxS^gCi{Lq!bla+I+3XBp1c?iBEmZ2TWWBMDSe*m7PL~ zFzUXnHu#;8?t{e^A5x(?6?*XEZO9xORA+JC(gg$?_1{D=m`c(V{ma&00IWm`gazl4 zG5<-UbCSTV0I;7KP7U~7IQ--zlF(2AvHj{vqeTlLo+GmDy96TQzyu+LODLRZo4w_M zo$+c1JJ=CNX<;Wd0))&mcG5`HxCMc)qcgfN;kk2jYHY3ok_Oyo;y@9MT;p*X0^;I! 
z_<(H+!zUmTHByKfRcU}3g-{WZVuj{{+7&I1`Ur)EU6mUmAn^q$*8)WpjB_+m2>*ys zI1#2?Vd91eg+-E&Y!^Ze16wU;gnx zvUa2nIR_oJV?h=@-_W-tD3f9)>NcTJd@82FVBW<|A%ztXyUOQ-6;=WPHWGzEYM30T zU|0dMV^#stP`z0U)`6Y1rRN*7Fv*nMVl)ULyW>PCju0a8)0`kdW)p)%pNQVj_kuHs zsYJ;}bD*IDf-lC3uHa&mgd}oApaAEZyad4vYp>S_wlEW6QS;_>q4;gr6xQ^^Ji{O3 z{3Q+LP6EG!iU6(w!4Nx_jfkR}aQ1nw9Yg4d$dQ<~Hv@_%eC@yvdYXvX*=XbDiL!}T zO%(D%?+KkRlxo0k69NIH4vW>S5ZP=vxeoi9H3!mwq?~@%-We(_2s_*TClMh|KpsS& ziznS+Fn#S8JdpuHrq4rYE&De_HeNwOF#bA}Z{{vOp1zOBmHnKh_1P)Q~gWe~*qbNMMb?bv7w4UQ;%lV2tpejv@l_Dec<4R}gnqfcd_UztMHEp)5k(YHL=i Source: - """Returns a constructed source object without + """Returns a constructed :any:`Source` object without downloading or parsing the articles + + Args: + url (str): The url of the source (news website) to build. For example, + `https://www.cnn.com`. + dry (bool): If true, the source object will be constructed but not + downloaded or parsed. + config (Configuration): A configuration object to use for the source. + kwargs: Any other keyword arguments to pass to the Source constructor. + If you omit the config object, you can add any configuration + options here. + + Returns: + Source: The constructed :any:`Source` object. 
+ """ config = config or Configuration() config.update(**kwargs) @@ -40,11 +55,11 @@ def build_article(url="", config=None, **kwargs) -> Article: def languages(): - """Returns a list of the supported languages""" + """Prints a list of the supported languages""" print_available_languages() -def popular_urls(): +def popular_urls() -> List[str]: """Returns a list of pre-extracted popular source urls""" with open(POPULAR_URLS, encoding="utf-8") as f: urls = ["http://" + u.strip() for u in f.readlines()] diff --git a/newspaper/article.py b/newspaper/article.py index 22da870..89f7e86 100755 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -140,9 +140,9 @@ class Article: def __init__( self, url: str, - title: str = "", - source_url: str = "", - read_more_link: str = "", + title: Optional[str] = "", + source_url: Optional[str] = "", + read_more_link: Optional[str] = "", config: Optional[Configuration] = None, **kwargs: Dict[str, Any], ): diff --git a/newspaper/configuration.py b/newspaper/configuration.py index c882d57..2dba63e 100755 --- a/newspaper/configuration.py +++ b/newspaper/configuration.py @@ -31,7 +31,9 @@ class Configuration: """Modifies Article / Source properties. + Attributes: + min_word_count (int): minimum number of word tokens in an article text min_sent_count (int): minimum number of sentences in an article text max_title (int): :any:`Article.title` max number of chars. ``title`` @@ -60,9 +62,9 @@ class Configuration: memorize_articles (bool): If True, it will cache and save articles run between runs. The articles are *NOT* cached. It will save the parsed article urls between different - `Source`.`generate_articles()` runs. default True. - disable_category_cache (bool): If True, it will not cache the `Source` - category urls. default False. + :any:`Source.generate_articles()` runs. default True. + disable_category_cache (bool): If True, it will not cache + the :any:`Source` category urls. default False. 
fetch_images (bool): If False, it will not download images to verify if they obide by the settings in top_image_settings. default True. @@ -72,7 +74,7 @@ class Configuration: from the article body html. Affected property is :any:`Article.article_html`. Default True. - http_success_only (bool): if True, it will raise an ``ArticleException`` + http_success_only (bool): if True, it will raise an :any:`ArticleException` if the html status_code is >= 400 (e.g. 404 page). default True. stopwords_class (obj): unique stopword classes for oriental languages, don't toggle @@ -88,13 +90,13 @@ class Configuration: and could hang the process due to huge binary files (such as movies) default False. ignored_content_types_defaults (dict): dictionary of content-types - and a default stub content. - These content type will not be downloaded. - **Note:** - If `allow_binary_content` is False, - binary content will lead to `ArticleBinaryDataException` for - `Article.download()` and will be skipped in `Source.build()`. This - will override the defaults in :any:`ignored_content_types_defaults` + and a default stub content. These content type will not be downloaded. + + **Note:** If :any:`allow_binary_content` is False, + binary content will lead to :any:`ArticleBinaryDataException` for + :any:`Article.download()` and will be skipped in + :any:`Source.build()`. This will override the defaults + in :any:`ignored_content_types_defaults` if these match binary files. use_cached_categories (bool): if set to False, the cached categories will be ignored and a the :any:`Source` will recompute the category @@ -206,8 +208,9 @@ def __init__(self): def update(self, **kwargs): """Update the configuration object with the given keyword arguments. + Arguments: - **kwargs: The keyword arguments to update. + **kwargs: The keyword arguments to update. 
""" for key, value in kwargs.items(): @@ -292,6 +295,7 @@ def language(self, value: str): def use_meta_language(self): """Read-only property that indicates whether the meta language read from the website was used or the language was explicitly set. + Returns: bool: True if the meta language was used, False if the language was explicitly set. diff --git a/newspaper/exceptions.py b/newspaper/exceptions.py index 1893fce..66e57d0 100755 --- a/newspaper/exceptions.py +++ b/newspaper/exceptions.py @@ -4,7 +4,7 @@ class ArticleBinaryDataException(Exception): """Exception raised for binary data in urls. - will be raised if allow_binary_content is False. + will be raised if :any:`Configuration.allow_binary_content` is False. """ diff --git a/newspaper/mthreading.py b/newspaper/mthreading.py index e846e8b..db7fb33 100755 --- a/newspaper/mthreading.py +++ b/newspaper/mthreading.py @@ -21,17 +21,19 @@ def fetch_news( If there is a problem in detecting the language of the urls, then instantiate the `Article` object yourself with the language parameter and pass it in. - Arguments: - news_list {List[Union[str, Article, Source]]} -- List of sources, - articles, urls or a mix of them. - - threads {int} -- Number of threads to use for fetching. This affects - how many items from the news_list are fetched at once. In order to control - how many threads are used in a `Source` object, use the - `Configuration`.`number_threads` setting. This could result in - a high number of threads. Maximum number of threads would be - `threads` * `Configuration`.`number_threads`. - + Args: + news_list(List[Union[str, Article, Source]]): List of sources, + articles, urls or a mix of them. + + threads(int): Number of threads to use for fetching. This affects + how many items from the news_list are fetched at once. In order to + control + how many threads are used in a `Source` object, use the + `Configuration`.`number_threads` setting. This could result in + a high number of threads. 
Maximum number of threads would be + `threads` * `Configuration`.`number_threads`. + Returns: + List[Union[Article, Source]]: List of articles or sources. """ def get_item(item: Union[str, Article, Source]) -> Union[Article, Source]: diff --git a/newspaper/source.py b/newspaper/source.py index c76386c..10080cb 100755 --- a/newspaper/source.py +++ b/newspaper/source.py @@ -244,7 +244,7 @@ def set_description(self): self.description = metadata["description"] def download(self): - """Downloads html of source""" + """Downloads html of source, i.e. the news site homepage""" self.html = network.get_html(self.url, self.config) def download_categories(self): @@ -408,14 +408,23 @@ def _generate_articles(self): return list(uniq.values()) def generate_articles(self, limit=5000): - """Saves all current articles of news source, filter out bad urls""" + """Creates the :any:`Source.articles` List of :any:`Article` objects. + It gets the URLs from all detected categories and RSS feeds, checks + them for plausibility based on their URL (using some heuristics defined + in the ``urls.valid_url`` function). These can be further + downloaded using :any:`Source.download_articles()` + + Args: + limit (int, optional): The maximum number of articles to generate. + Defaults to 5000. + """ articles = self._generate_articles() self.articles = articles[:limit] log.debug("%d articles generated and cutoff at %d", len(articles), limit) def download_articles(self) -> List[Article]: """Starts the ``download()`` for all :any:`Article` objects - from the ``articles`` property. It can run single threaded or + in the :any:`Source.articles` property. It can run single threaded or multi-threaded. Returns: List[:any:`Article`]: A list of downloaded articles. 
diff --git a/tests/test_source.py b/tests/test_source.py index 3d2a75a..c7affff 100755 --- a/tests/test_source.py +++ b/tests/test_source.py @@ -110,6 +110,8 @@ def test_empty_url_source(self): with pytest.raises(ValueError): Source(url=None) + # Skip if GITHUB_ACTIONS. It can fail because of internet access + @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS") def test_build_source(self, cnn_source): source = Source(cnn_source["url"], verbose=False, memorize_articles=False) source.clean_memo_cache() @@ -130,6 +132,8 @@ def test_build_source(self, cnn_source): # assert sorted(source.category_urls()) == sorted(cnn_source["category_urls"]) # assert sorted(source.feed_urls()) == sorted(cnn_source["feeds"]) + # Skip if GITHUB_ACTIONS. It can fail because of internet access + @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS") def test_memorize_articles(self, cnn_source): source = Source(cnn_source["url"], verbose=False, memorize_articles=True) source.clean_memo_cache() @@ -184,6 +188,8 @@ def stub_func(_, domain): with pytest.raises(Exception): stub_func(None, source.domain) + # Skip if GITHUB_ACTIONS. It can fail because of internet access + @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS") def test_get_feeds(self, feed_sources): for feed_source in feed_sources: source = Source(feed_source["url"])