From 911f5033cf74c49707642766b4c33d614703bc6e Mon Sep 17 00:00:00 2001
From: Andrei Paraschiv <andrei@thephpfactory.com>
Date: Wed, 17 Jan 2024 00:58:41 +0200
Subject: [PATCH] Documentation changes v0.9.2 (#604)

* feat(doc): :memo: adding evaluation results

* feat(doc): :rocket: Documentation Update. Added Examples, documented new features
---
 README.md                                     |  48 ++--
 docs/user_guide/advanced.rst                  | 114 +++++++++-
 docs/user_guide/api_reference.rst             |  23 ++
 docs/user_guide/assets/chatgpt_chat200x75.png | Bin 0 -> 10461 bytes
 ...chatgpt_chat.png => chatgpt_chat75x75.png} | Bin
 docs/user_guide/examples.rst                  | 210 +++++++++++++++++-
 newspaper/api.py                              |  21 +-
 newspaper/article.py                          |   6 +-
 newspaper/configuration.py                    |  28 ++-
 newspaper/exceptions.py                       |   2 +-
 newspaper/mthreading.py                       |  24 +-
 newspaper/source.py                           |  15 +-
 tests/test_source.py                          |   6 +
 13 files changed, 435 insertions(+), 62 deletions(-)
 create mode 100644 docs/user_guide/assets/chatgpt_chat200x75.png
 rename docs/user_guide/assets/{chatgpt_chat.png => chatgpt_chat75x75.png} (100%)

diff --git a/README.md b/README.md
index 1d75baa..b05834c 100755
--- a/README.md
+++ b/README.md
@@ -4,12 +4,15 @@
 [![Coverage status](https://coveralls.io/repos/github/AndyTheFactory/newspaper4k/badge.svg?branch=master)](https://coveralls.io/github/AndyTheFactory/newspaper4k)
 [![Documentation Status](https://readthedocs.org/projects/newspaper4k/badge/?version=latest)](https://newspaper4k.readthedocs.io/en/latest/)
 
-At the moment the Newspaper4k Project is a fork of the well known newspaper3k  by [codelucas](https://github.com/codelucas/newspaper) which was not updated since Sept 2020. The initial goal of this fork is to keep the project alive and to add new features and fix bugs.
+At the moment the Newspaper4k Project is a fork of the well known newspaper3k  by [codelucas](https://github.com/codelucas/newspaper) which was not updated since September 2020. The initial goal of this fork is to keep the project alive and to add new features and fix bugs.
 
 I have duplicated all issues on the original project and will try to fix them. If you have any issues or feature requests please open an issue here.
 
-**Experimental ChatGPT helper bot for Newspaper4k:**
-[![ChatGPT helper](docs/user_guide/assets/chatgpt_chat.png)](https://chat.openai.com/g/g-OxSqyKAhi-newspaper-4k-gpt)
+| <!-- -->    | <!-- -->    |
+|-------------|-------------|
+| **Experimental ChatGPT helper bot for Newspaper4k:**         | [![ChatGPT helper](docs/user_guide/assets/chatgpt_chat200x75.png)](https://chat.openai.com/g/g-OxSqyKAhi-newspaper-4k-gpt)|
+
+
 
 ## Python compatibility
     - Recommended: Python 3.8+
@@ -29,10 +32,10 @@ You can start directly from the command line, using the included CLI:
 python -m newspaper --url="https://edition.cnn.com/2023/11/17/success/job-seekers-use-ai/index.html" --language=en --output-format=json --output-file=article.json
 
 ```
-
+More information about the CLI can be found in the [CLI documentation](https://newspaper4k.readthedocs.io/en/latest/user_guide/cli_reference.html).
 ## Using the Python API
 
-Alternatively, you can use the Python API:
+Alternatively, you can use Newspaper4k in Python:
 
 ### Processing one article / url at a time
 
@@ -82,22 +85,22 @@ import newspaper
 
 cnn_paper = newspaper.build('http://cnn.com', number_threads=3)
 print(cnn_paper.category_urls())
-> ['https://cnn.com', 'https://money.cnn.com', 'https://arabic.cnn.com',
-> 'https://cnnespanol.cnn.com', 'http://edition.cnn.com',
-> 'https://edition.cnn.com', 'https://us.cnn.com', 'https://www.cnn.com']
+>> ['https://cnn.com', 'https://money.cnn.com', 'https://arabic.cnn.com',
+>> 'https://cnnespanol.cnn.com', 'http://edition.cnn.com',
+>> 'https://edition.cnn.com', 'https://us.cnn.com', 'https://www.cnn.com']
 
 article_urls = [article.url for article in cnn_paper.articles]
 print(article_urls[:3])
-> ['https://arabic.cnn.com/middle-east/article/2023/10/30/number-of-hostages-held-in-gaza-now-up-to-239-idf-spokesperson',
-> 'https://arabic.cnn.com/middle-east/video/2023/10/30/v146619-sotu-sullivan-hostage-negotiations',
-> 'https://arabic.cnn.com/middle-east/article/2023/10/29/norwegian-pm-israel-gaza']
+>> ['https://arabic.cnn.com/middle-east/article/2023/10/30/number-of-hostages-held-in-gaza-now-up-to-239-idf-spokesperson',
+>> 'https://arabic.cnn.com/middle-east/video/2023/10/30/v146619-sotu-sullivan-hostage-negotiations',
+>> 'https://arabic.cnn.com/middle-east/article/2023/10/29/norwegian-pm-israel-gaza']
 
 article = cnn_paper.articles[0]
 article.download()
 article.parse()
 
 print(article.title)
-> المتحدث باسم الجيش الإسرائيلي: عدد الرهائن المحتجزين في غزة يصل إلى
+>> المتحدث باسم الجيش الإسرائيلي: عدد الرهائن المحتجزين في غزة يصل إلى
 
 ```
 Or if you want to get bulk articles from the website (have in mind that this could take a long time and could get your IP blocked by the newssite):
@@ -130,7 +133,7 @@ article.download()
 article.parse()
 
 print(article.title)
-> 晶片大战：台湾厂商助攻华为突破美国封锁？
+>> 晶片大战：台湾厂商助攻华为突破美国封锁？
 
 if article.config.use_meta_language:
   # If we use the autodetected language, this config attribute will be true
@@ -138,7 +141,7 @@ if article.config.use_meta_language:
 else:
   print(article.config.language)
 
-> zh
+>> zh
 ```
 
 # Docs
@@ -158,8 +161,25 @@ detailed guides using newspaper.
 -   Autoatic article text summarization
 -   Author extraction from text
 -   Easy to use Command Line Interface (`python -m newspaper....`)
+-   Output in various formats (json, csv, text)
 -   Works in 10+ languages (English, Chinese, German, Arabic, \...)
 
+# Evaluation
+
+## Evaluation Results
+
+
+Using the dataset from [ScrapingHub](https://github.com/scrapinghub/article-extraction-benchmark) I created an [evaluator script](tests/evaluation/evaluate.py) that compares the performance of newspaper against its previous versions. This way we can see how newspaper updates improve or worsen the performance of the library.
+
+| Version            | Corpus BLEU Score | Corpus Precision Score | Corpus Recall Score | Corpus F1 Score |
+|--------------------|-------------------|------------------------|---------------------|-----------------|
+| Newspaper3k 0.2.8  | 0.8660            | 0.9128                 | 0.9071              | 0.9100          |
+| Newspaper4k 0.9.0  | 0.9212            | 0.8992                 | 0.9336              | 0.9161          |
+| Newspaper4k 0.9.1  | 0.9224            | 0.8895                 | 0.9242              | 0.9065          |
+| Newspaper4k 0.9.2  | 0.9426            | 0.9070                 | 0.9087              | 0.9078          |
+
+Precision, Recall and F1 are computed using the overlap of shingles with n-grams of size 4. The corpus BLEU score is computed using [nltk's bleu_score](https://www.nltk.org/api/nltk.translate.bleu).
+
 # Requirements and dependencies
 
 Following system packages are required:
diff --git a/docs/user_guide/advanced.rst b/docs/user_guide/advanced.rst
index bbc2a83..d224332 100755
--- a/docs/user_guide/advanced.rst
+++ b/docs/user_guide/advanced.rst
@@ -11,7 +11,7 @@ Multi-threading article downloads
 
 **Downloading articles one at a time is slow.** But spamming a single news source
 like cnn.com with tons of threads or with ASYNC-IO will cause rate limiting
-and also doing that is very mean.
+and doing that can also lead to your IP being blocked by the site.
 
 We solve this problem by allocating 1-2 threads per news source to both greatly
 speed up the download time while being respectful.
@@ -19,22 +19,50 @@ speed up the download time while being respectful.
 .. code-block:: python
 
     import newspaper
-    from newspaper import news_pool
+    from newspaper.mthreading import fetch_news
 
     slate_paper = newspaper.build('http://slate.com')
     tc_paper = newspaper.build('http://techcrunch.com')
     espn_paper = newspaper.build('http://espn.com')
 
     papers = [slate_paper, tc_paper, espn_paper]
-    news_pool.set(papers, threads_per_source=2) # (3*2) = 6 threads total
-    news_pool.join()
+    results = fetch_news(papers, threads=4)
+
 
     #At this point, you can safely assume that download() has been
     #called on every single article for all 3 sources.
 
-    print(slate_paper.articles[10].html)
+    print(slate_paper.articles[10].title)
     #'<html> ...'
 
+
+In addition to :any:`Source` objects, :any:`fetch_news` also accepts :any:`Article` objects or simple urls.
+
+.. code-block:: python
+
+    article_urls = [f'https://abcnews.go.com/US/x/story?id={i}' for i in range(106379500, 106379520)]
+    articles = [Article(url=u) for u in article_urls]
+
+    results = fetch_news(articles, threads=4)
+
+    urls = [
+        "https://www.foxnews.com/media/homeowner-new-florida-bill-close-squatting-loophole-return-some-fairness",
+        "https://edition.cnn.com/2023/12/27/middleeast/dutch-diplomat-humanitarian-aid-gaza-sigrid-kaag-intl/index.html",
+    ]
+
+    results = fetch_news(urls, threads=4)
+
+    # or everything at once
+    papers = [slate_paper, tc_paper, espn_paper]
+    papers.extend(articles)
+    papers.extend(urls)
+
+    results = fetch_news(papers, threads=4)
+
+
+**Note:** in previous versions of newspaper, this could be done with the ``news_pool`` call, but it was not very robust
+and was replaced with a ThreadPoolExecutor implementation.
+
 Keeping just the Html of the  main body article
 ------------------------------------------------
 
@@ -191,12 +219,84 @@ The full available options are available under the :any:`Configuration` section
 Caching
 -------
 
-TODO
+The Newspaper4k library provides a simple caching mechanism that can be used to avoid repeatedly downloading the same article. Additionally, when building a :any:`Source` object, the category url detection is cached for 24 hours.
+
+Both mechanisms are enabled by default. The article caching is controlled by the ``memoize_articles`` parameter in the :any:`newspaper.build()` function or, alternatively, when creating a :any:`Source` object, the ``memoize_articles`` parameter in the constructor. Setting it to ``False`` will disable the caching mechanism.
+
+The category detection caching is controlled by the ``utils.cache_disk.enabled`` setting. Setting it to ``False`` disables the caching decorator on the ``Source._get_category_urls(..)`` method.
+
+For example:
+
+.. code-block:: python
+
+    import newspaper
+    from newspaper import utils
+
+    cbs_paper = newspaper.build('http://cbs.com')
+
+    # Disable category url caching
+    utils.cache_disk.enabled = False
+
+    cbs_paper2 = newspaper.build('http://cbs.com') # The categories will be re-detected
+
+    # Re-enable category url caching
+    utils.cache_disk.enabled = True
+
+    cbs_paper3 = newspaper.build('http://cbs.com') # The cached category urls will be loaded
+
+
 
 Proxy Usage
 --------------
 
-TODO
+Websites often block repeated access from a single IP address, and some limit access from certain geographic locations (for legal reasons, etc.). To bypass these restrictions, you can use a proxy. Newspaper supports using a proxy by passing the ``proxies`` parameter to the :any:`Article` object's constructor or :any:`Source` object's constructor. The ``proxies`` parameter should be a dictionary, as required by the ``requests`` library, with the following format:
+
+.. code-block:: python
+
+    from newspaper import Article
+
+    # Define your proxy
+    proxies = {
+        'http': 'http://your_http_proxy:port',
+        'https': 'https://your_https_proxy:port'
+    }
+
+    # URL of the article you want to scrape
+    url = 'https://abcnews.go.com/Technology/wireStory/indonesias-mount-marapi-erupts-leading-evacuations-reported-casualties-106358667'
+
+    # Create an Article object, passing the proxies parameter
+    article = Article(url, proxies=proxies)
+
+    # Download and parse the article
+    article.download()
+    article.parse()
+
+    # Access the article's title and text
+    print("Title:", article.title)
+    print("Text:", article.text)
+
+or the shorter version:
+
+.. code-block:: python
+
+    from newspaper import article
+
+    # Define your proxy
+    proxies = {
+        'http': 'http://your_http_proxy:port',
+        'https': 'https://your_https_proxy:port'
+    }
+
+    # URL of the article you want to scrape
+    url = 'https://abcnews.go.com/Technology/wireStory/indonesias-mount-marapi-erupts-leading-evacuations-reported-casualties-106358667'
+
+    # Create an Article object via the article() helper
+    article = article(url, proxies=proxies)
+
+    # Access the article's title and text
+    print("Title:", article.title)
+    print("Text:", article.text)
+
 
 Cookie Usage (simulate logged in user)
 --------------------------------------
diff --git a/docs/user_guide/api_reference.rst b/docs/user_guide/api_reference.rst
index 4f7d7da..13b39f0 100755
--- a/docs/user_guide/api_reference.rst
+++ b/docs/user_guide/api_reference.rst
@@ -6,6 +6,20 @@ Newspaper API
 .. autosummary::
    :toctree: generated
 
+Function calls
+--------------
+
+.. autofunction:: newspaper.article
+
+.. autofunction:: newspaper.build
+
+.. autofunction:: newspaper.mthreading.fetch_news
+
+.. autofunction:: newspaper.hot
+
+.. autofunction:: newspaper.languages
+
+
 Configuration
 -------------
 
@@ -44,7 +58,9 @@ Source
 .. automethod:: newspaper.Source.purge_articles()
 .. automethod:: newspaper.Source.feeds_to_articles()
 .. automethod:: newspaper.Source.categories_to_articles()
+.. automethod:: newspaper.Source.generate_articles()
 .. automethod:: newspaper.Source.download_articles()
+.. automethod:: newspaper.Source.download()
 .. automethod:: newspaper.Source.size()
 
 Category
@@ -55,3 +71,10 @@ Category
 Feed
 ----
 .. autoclass:: newspaper.source.Feed
+
+
+Exceptions
+----------
+.. autoclass:: newspaper.ArticleException
+
+.. autoclass:: newspaper.ArticleBinaryDataException
diff --git a/docs/user_guide/assets/chatgpt_chat200x75.png b/docs/user_guide/assets/chatgpt_chat200x75.png
new file mode 100644
index 0000000000000000000000000000000000000000..3eac3040a9d17ec4ddc277ff3d61d981ecfec735
GIT binary patch
literal 10461
zcmV<3C?eO1P)<h;3K|Lk000e1NJLTq0077U002t}1^@s6HGIhE00001b5ch_0Itp)
z=>Px#1ZP1_K>z@;j|==^1poj532;bRa{vGi!~g&e!~vBn4jTXfC~`?eK~#8N?VSgl
z74_Bk|HH!4cM*`Ph>D1U*bp0nD9u=6*VrY-mdDp5K6!ld#8;zl^2BKJYD}?fjG`g(
z#2QgVuwkc&6g!HAfTHYD7k2qS=R0%H-LvD|duN+~-XEXCoxN@5&Ybf<{fsN3h$3<$
z$UJ`SD4Tdyi9Bp8)K6%~%yw6y4nn-PUTCGzQlZ5{??RO)UC!S{6wz3W5b7-Sk<e>A
zk%X^RKtw3I$ipz9aY6?OAsFq2(&7uDH-(-Rnk@9P2uzhcHWr+yPC{O25l<wbwb1TD
zyysP(C}Ok5K%uRoO}9koy=XH8i!h5X_)>`LpNOwTKtwPOl83W}z9ZC~=b8v_2~8CG
zs|boc0rkOtLN7*}Ay`p(?$hwETKq}q!e}!T?DwP1A~r*uCUkeSnFWj6+-S3ofUFa0
z8*P3S%_<-w7-))1g$@p%Xg;hK`kT<TA}9z=eegY@d!x-P?h`sC+RUOTAX^UJ5yEu0
z*js2;v{^?$Ku$Smk$G%Vh+qs9dPFucgPL170A||vg=Wcqe<M^LKHG30G?;=#Y<c)E
zp}25>2Jw26YY<`1f0u_Vh1!KrY#OkFTqN4$iRjO~uo~n;q3xng_o&d3(PoH38l;ub
zfkL|pwG?_o=oO(#Aq9>he%MB#2YJF^h)^FPe4Gb(!e9@f9imP5kIW<3Mu-K7<5?xd
zKBovN*cba`wS1B%1SbjYB7{l$oX`hCNf;`GX<R0Rolc;HW9HZ`IPU&JoR61;FjvkO
zBH+T`>|=#c60rSQ**RwfWjOw4gyPsoh<$KfIj5tA_+9vmw+ktH3h^BNBEPGzkb>_}
zXFQ(<*-MD?$#0(8WC9|B(M=vW*dxLxHV^z#gyjEbwsXM>$o~_v@5g6C3xa$CGE(Tz
zLIc9TYJnefvCyROugJNFp}_G%{}XM7m?1>q0vs;%ShN`e<{lMohVQ{=x?G5bC=M&w
zKNkA`(WV<N^c$i5!@p{Ak<cHb%@F+NUkdFR{#6&B3GsU{K^2@IOz8zezZJS#2wqUY
zXh#V>AO2N~V}&?pai4U)5c_y4{3`-N!|`JxVM<>k1U3kjMw=mS5;`y1biIW>i8e!|
z5fB!e>CtA0K}}*}iEyw2uu^PJ!T~OmV|zfTW9IKd_*lrUNcc~Mz-l~ANQDEY=OTh$
zLiY=?`T@*N#i5x;z$&f60qkpNCN)#J;0Oo8rNV*MgE3UIuzDte$X{GIz#oJVaK*_&
zDjYC98VN@hY%PS4aZGXSDs+dC6%Ifo=^-Jz81wr$PY6e~5DSzp7DS);EiiE$Xi^o9
zIQA7%;rNSCJjhZA@QC$Ae=0P;(E=jO$AyJKlnLq$&d^G+EGoSZ^=uR4XrU+MI8ph9
zz(rvNRG`LBj5c!un^*B8A($7fbSh6|@SmAS)keSyLEKL}B=ZP(4U-M%=LcmT!I?rA
z2puN`6Cy0RaI+9hwzH6p{hyn81Vnm%Cxqa`tYd{Rn=owv`6^z^g&zv>eq~mGDP{Lz
z&lwt(bH#asDFF*c(&D82ipD!k=-1(25zwd#BBu2PLWzk6&}?YK#yRQ2bc93yDU^o!
zvfwHr2$eM(T;+zfZcWX4S1!-?YSF^2+gKewS6N*p0<*EsveO8#ay}-<n={^>0q28+
zq?#o_FIp0KKOsfHO8)z3GlX4@U^IY<g)I+QP<Dzo>#X1aCdR)2R(MoEZ0_fDq3xs1
z5Y|VrxJwA%;B}$5g{&!wiKKur@FU4LS*U;jYuj7=n-I)}#<@l4S|Nol2z*u-0IUv<
zM`6#IHCOS=6cdCn3m+G{f+qwg2w@rmjzM8Rmk-wosR_&XBIF?&EFi*c`~aLb5^Sl#
z4l6`35QP8ewv(H@_c8vhL(laK$6e@Vjr*ZnH2MPn!hxsy8+O>&@7c1AtEgTl-luO0
zLBR%{B*&f01hGJitc^Cq6%LJ%1iO;<5aKhI6gb}sE`UOrMup(E%p*`HvcjXEwZR9X
zclbm2IV@PYuq^Yacp>u$xQ20)=DR|y&<GZ0))_(y)o5`n%seVy&O8Eq0$>@RK!818
zHkYNrk~pltq)=gBE`+bBu-70T(EI>@Nd+xLg9JpFj4<loh3b{`s}Wiu41aOg?cKtW
z-}hG^a+(`AV4t9Ww{3$SWnEnNj$66DU3vuj_Z=8qwBM1zl(FZ#y9SK(8#iur6&q_B
zF3q)o*=~$B6QOd5GipsOoZTc?vl78;6C(xRr-++b2o@Rx{8fmH40w(6tbPNZTN7=D
z&gvK;%aE8@_$OBT<%4xnfmSpiWC#<BAEqIKLDUGdr@pZJPyxieQ{+2mae-FEwF15e
zJ}@9+i0Mec1G8U!Cx8idXSA6U_)B#Ju%SE$lj%XF0>ZIcgyXjU!~HcQPjdsh_Xz~i
zi-L_}PSuDISaj`B?#|e2R4{k=8GirLc5cP`)sgw7B;QvlpQ-z)f;d|QJgY+U00E&y
z0ib=70uY#03QPqACQd}W1`8?B0(!^@focdYCMJi5V|gvfyi)K!9AkWtr+Fd+gjr3u
ze9*8lV5QX>H}-=w4mbyddNGTrSg0?oNvkD1_M8#UPC92CTiSq+4ZvV(5;n#!=Dgbd
z6WQZjCE*8|M?j<!-!nwL1w@!@ygcMGITFE6R&QA6?$~*R|NfA%L21h<2(qE3+Rggt
zZGYp-|MWk7>TY-O)VtiTp1t2cw(u4I*{a&m`EI>-2qug;-9Nn7cl}GlPWP`Ibf$l5
z=m{QSJFTp{2%h*WVz$-847)~tM<D^nO|n|Vnn?<Ku~~tM)5j^EFF_F0RAR*<Y6O(Y
zRX{LFY=FR;W)^52m;sHhfJx$j(XAPUR!}JOBFt(s6I%V55aC&VC)EmtK<L)un9L*4
zeb~GUd77kirteJy&M*EFeig_0FCk1Nw5?q`D}*$Y9tWl^Q9i{DnaBE>_QF&|FxW|I
zr<@(&b6m1@KmY5Irv$A^BJ=gbFaG6!_uAy(51+~TK$8R|(W7V*C;o|R+1CAI;GzDz
z1NRSF1i9t7s(OQabLnFLmj%xS*UNX6v}`S05Tx<WrdOVHId(oJB{+WIKZLAf2{2>g
z`3;)~VZ~J^moO*cR~&H?rG!g)jSHyz0BEqdEe$gwkO-m`JV0cLS|SC}D1r%GSeOA2
z*3>AB#{6v_1i<VZBaCT(r~YoPtD1u(9yu1iA4Vr!O_epkoYP={{r!LtE4d7`5Z|l7
z-_RmYw5@7Yj*q|q+8Q$spASEPkhmrd$xWy>#RRmzO+IiP;90=_NJIh_ig+G`g+b3H
zAOLVxeKiP)w0cFXR>P_-TDS}LIMk_dyuIurKW^Hc!5@~t@5)-Wb!Fm=mzA_~om$B2
zEw~1O8!wt+n=*ElUxr(YHrl=Sj={BuobLW-@L|42v{Hpn<<yT9fwzulE|4_K2b-)z
zyTaU|MJWVBUM8d_iy}!IC^K+T=R})X#LX+N8~FgLn-Ns}78MS@$A)kf924a;2#6I9
zaz84pp9`r5gNgWl1P@_Q;fN!?7M&%;_gmp0d1))4?K%625c^<XtnwrXFA=I2pXC5T
zUI?05z+@poso)s*PAwq9OsoKe6jOnEgi6r_)$1zVl|6RxJ9h0I^6ZyuR=SH{oD{q%
zK18RIC<)3zkONsE&y8(U?xqes-v4>@c(+fVoke>DZq52CH&aZpyJtP+e>>v=fBlOO
z`3L6B@b7*8sTYLb(Xyn(9Xn`PFm0b>z1z4!jOr9Rgm`?rHZ~KR1Xg}B$CySi8P&el
zfux**b7Lb*dR{pQ7&fljiPl&Lo6}`0iftAh#03g-fK?h`cdM<<v8u@y2Z^|Njs%~F
zAb^eb5Z1O4E(DK#k=-u@7|dp(`CWv4ZOINm;4twxcM2^{CxlJZOn+ZvkWWC=fwxId
zOe-5Ezc|H08$kpF8<%)<UcebEFp<Q0{z2#M+_>Jm_yf1w)palH9PaDx*-!a17EBE~
zwQ3h~&l>rEgZL2J%iq85Kg^HoKO`t`8wFIV<vU*faH0R>f@gxitssdkU!jgYrkH4)
zD|-#}=ZJRdRo*rGTke}P-JddlYEa&)O&T%zftXr)Qdq?r15_joi^d?Lf@wt_G#9j9
zO`}s5`Wh=-uJ6EkCJM&q5nAkT3u|H#Siu1$e2VFWnPcn3;^22s34n&uU<J$s+2ci+
zN!kH?A$%zknM9<LG+7+Hk9}d{a6GvLgLBGxux3yipb7b$jc6&*QfSRMSTu>StxP2#
zA{@l~XA9-cA@F4t!t5uO^>F__c$D9xcfYL90ajJ5b;mt^lba%1rIYv-D{AE0%i|4w
zhxp@nIl%QS?~*;9`Aa_Vx6OMxxPF=VUSfXnK2R=?Fg<VN7gtrUbNh(ax%Z$`eZQ?E
zEwxrO&N)xs?(SW=%$0>ETVAlb4;3M?4&65eRzS#U*bH`4AxS{KF-U7-ku6Rk9P8Y#
z`|Rm&8-;l^ppI~WWh=k*Q`UUx+94dG9WL$G-@Sdv4?H;!6^@U-T;{KN{vUqO)3*oL
ze=duc_$N5+!%N${{YA^L56m&Z0d<nsU#(i@E_&hb0p=TMTiV)<@4ugS2w-7Qy-6Wl
z5k-6j1qDR32kOR#>%IcM1kG`EpS}Dg`yU;2Y!g`rgmI_7J=4!yvMA)}H5<gd6w_#p
z2*>dqy12^^I>GJKZJSUqzF1x9?wU2#@BHM=!KI651tn+=d2d${rU|=_@(+zW-#sdH
z_b!L}_F`tSKT0|I;!g73$5$<L4=sEtYf<gqbBAC+>yBArDu7&e`<9MWKfZ|ENTWem
zVe=KIg+9m0<vrbnLk<(I5t%ffud47rfBHT@dd6MB7i*)HJqY!TKp4MIn=;q7Riu=r
zzdOet_QXx@tcBBpwJ@{@MNQ2HcZ~?#>@ny0Q}!4cbZXndb#B+uowny8!NSq!`Kv`!
zt*u@khLA%p5KVH&;yFRZ+9+JVRl71bs;s;0E0yImS{8!(^u})~xL$~)7eq9zIY6Ny
z-%o_>8)aUBGy-BPOTPlb6gcFUh|i&#W5wE4?#IvE8(h2e&A^p*aM?UYMm`iUHuS*z
zvx8T}FL5nJS&0et!!CW@8wZc~KOJ^#uuZ3}v!>UQishj!Y`ayr;L>5o1#=HR+h5pa
zJ6Ew`t$dMp&#kR+ODd!Hl(cB!`nKQ7b8IPqlW6k{-y{(6g2GGH48UpT`zi1E#vzS>
z<ZV4&CE8>BR@=J4y>`s9|DCg+^7pLx$ZduH0MyLMJSd_Mcvp%eEhZBo-f8=u=&w8U
z3^%Cf_TkHzUK3_d^E-*@HA3jY`7ij@>!W}K8TRWAJv02wxVGiatzYFTtJY+XqixI5
z>~;#+evBfD2$5GnGG<3!^U8^zF{al5Uz%~q7cF1nfAZnXptMz+I(hyGz7E5N@*@AW
zE-CSU88p^U9CNN4*>8`mb5EFe(t;WOw7HK4RpM~JBj(gGa~=)GPr1##_`!T%vvH$i
zzkG(z-nrZ1zGG?ItQfA`u+C@0!i7Qhxrid_ppXV34o;js5g{iE|MlyvNnGb<FL^JF
zI<>X-REA0A{aNPCLSDLL-y_|bdyUHGKUS`b!t5<Yvz#+zTrg+Uh5p>F`h;2rGwkoH
zK6eMqm=OGI`UL;h(#7F-mdR(&+Iw`cbN4>s&em6NaBr>rT-=^CN;1UGvLF<U2^aD0
zLLu&<V9u5x*FmTuf)ZEWrq;iGcg3>sIV%Z9RAQhcBoODjEPM+-#_wNz*nem8pWH)Y
zO05x7icI?Mz54|>j5;gKu#awCCOcjqI_1|cdnXt$?Y7{C7a#FUE58VL$@{^|s<r<4
z)yrk)sRaZnuUBtN5#M|i;+~w^1E>%PQ)XqLO3a{K5+P3~zCz?_;_J`&VN-4jezfT2
z;Q94y+|i*ab-R1%g9X02rY5YpI&`PKg8Rpw>+T&m+S7fg+4PGSy%ZcIpMCP}nOV8)
z(!MPC{h(24b0L62IW2c6qKFWM1SD^ZYUn8llZw-vWy~J!#94-MlMn>Luw*Y3k-S{@
z@+0o#x&I8_7oUT~Uzx1>om#bZ6IU&B`#*bUFzwyi+P(6&9o+Z!7#X}W`h5SpzC(Oi
z`CC#N`X^&vxj&#rEAv)2rvH%Or`-p5O5sz0m~%s#f{JJwP+<&hRwyJOS`U<nkiA71
zKCY}SXBpgWJ9(~dHLd19dUos_{H*t2Pbjx46MpsbkDhc)rZQ1PDq1|2iFTmOWDUND
z0F#SsJlnlHbq#)Uz|p}Q2cPZ7w#o2yt9I_#xswA;OcLrnb-)4PF{X%da*#RA$s(E+
z40wQvplmP3=72&1Qt0*2#Cc%}tqBnb&mL3O!&S)2u0ley6zJT(%>8oY_kx#)kM{%0
zx@9YIThtM!uR;%^Qk?fApS(4gH1|2rHei28_uM(uOyyX5LU%2A&Jp1PJ9pbAI6;nQ
zRl0mg3OOpGnPEEwH#rIkfSsO^h?0_)?w8^$(*VWa?MvIZ{~SCzWLWw#&<t13pB6mv
z)@y#<hV^06aOideg7b$S7O=wO$S}ZxjkuVE2Pdn4b;e}F0e?rd$q92G4SqEJ?yx8k
zv@U5WA4M055)qB_+s_vTnlNnBs<j)|rB7sv<p(<{_<NfOw3GfHA^N0g1Fa1MMxX(S
z?mXM|r=S;|*BHlR!FzdL$F7q2qzQ}O3i`H_Kw}a^-4x`!elEmh5bcxC1;YQ7Ah8c^
zQ%%yqFg)+mf<`Kd`3(IcRNC^q9X3I`Fu&h6VQn%L5|Di5&>`sFgZU=RdrmB?kS9d|
zb_pKY^Qg!{_BC#XINjr3`$ur$)H}nz4&Fxy7vSHI=-FSSqa>`af{i&S?GM?tO}X2?
zQ}^hL1yu9WZTQZ*$}mq7M;Km~@A<T%*6iy;<5e!ds1`UXY1&E~1G?L2qM}=l5kWNN
z=#Vqb!|?NIkfMJhKAsdjEW~TJM}*)4Jrq1ow<4wOIBx$fM7JJ&7q-s<AOMVPVyGL9
zV~pH+PKX9EhRIRAoomJxB`ge;<2%_0lSC*pp~r=3C>E5W@II=35lD`O&oF?FSy((z
zla+$;t^9sSg=&+bkbvZxERhAA+fkh4pMN|jc<jwrvW$GhuKNXZMx5nO>)0cczi935
ztoYOodG4;@7tj1HY<9}L)g3eFz~I9%7x@3)b`QU%dVMGuZAEbI95B*9d)S3;&uw;!
zD;L+gK=_tAEk!#@SPBE1t>A*tNHB2`1_1PCGdoH<?FwBB#aGTn3}_7IU&X;HPOEMb
z5QwBX1Na8<(X9+xu$_0d@r(WxggRGsGy-Y0WudV=w(n$<pwLC3OR3<}8(uMC*cbm*
z_ZA)GFK!t#-aWSW(f+_v@%L+pIJI_HEm;usn|e!d^DC1X&JYSppU&NbpB;33Fmw1B
z{?Gk}`8P+M>q-1|X|EPT{m`+r_LdLps#8U=s41f5v$=rE)N&TO52^ox0qI7h58DoA
zQ)CwrgF*sA<PoM-lgGBVGTksKQbd#k?ZL1IK5qrdPDG^YnMsv^Mk;#lnXEzt(B6-O
z*6duMZi}9F=6TUY$pjNSPL!@loo=FS$eRpl1mufovjD{0*`urW$4Aw?m_StQ9yVZT
z@bH-P+#Lf($OX?_a}lWX-+ew9`^3$`gY#y1t-{?)w8aJcekVX+_#0n>p$y@RIgqjJ
zLSUNp7GcyPN3eLMm|F!jj^LBZlP*s?XsjW?9#oqUTi<qP&J@b~8x0Em5EeB23K$We
z>K8(^2`iFn>-gQY)Y3f0LP4Lnf(eg0tJk&#U?OV#9d&IdP@+MSs-v-c8PbNyb%zlB
z_e?OcJVr!Er<-hY<UQHKFif~eJ%Xi2``SeGqy;nFWl!JdKUlsbyzoS@&fa@W@V;n<
zD|!u#{1fpzX0Kb}j$QCnuyE;zbr^&`^hXxF<a<s2V=!;ohlN(n5aExhsd5+h+SPaK
z80Gv349r-*I3U_q0E~@kZs2bKn1>c-!hekU%!MSODReGi@Y=9q1O!l_6Amr#C+w6D
zOdH!rub;6aN8{7lnX6^nW$k+v^!DT1<U>+x>?T9OWmS-*`V*n^t^iK>Z^SRzUQDUm
zUwgtYUmaO5+jr?1TrvElVAk+6J!S1T?=;+hcKDCnetmZhFMN$S)AXU!GJDkQhXW`6
zNUMxPUkG$5_HEV9oidOSP!Zn#>?8lTiY2an;qQn~5@`;cBV;E#GCqh2AdL0F<Vu57
zi$x69x3aUb6iMHiG!>BultL#!TG${*QrS)}Eop7wg5k!m6KMp*-lrii=s8zx(?LvG
zao&kio%7Z+!4Xf~>>gYAvR}KQDtr#d`}$F5x^ssfCVobg=zC|`$NsYE_xk;2+!Nfl
zQWhl2<wes3F36tPTxVD_^!M*~yziYcub9+$&%zm@d6h!M>?+%wAS6l%786Co$_%m7
zLCh9OVbg{Ph^>f7YZ2J8o1~#{wf1x^>~l%`$#wZ^BPyF2(g=ufSmNY|x)7%Su(Eap
z?XKNqu@M1*F)15)Ni@OXvmOf0d-4wV>f$&3Mic5KJO1aF|LOZryEV9a>03e2QhdIR
z)oz!PR&L^+NBfaG?jAA)8F?nY1AC@7$a|0Ow2fOha=ah1?ao;Sn*7$Q{#yAyW^NS#
zv!R+|%uHIKF|fsL<VdF>@&k2p2X>*IWt$Y8iUTKH9ccj5+A*F<jEkeK(KUcSV#lQ>
z2}pbl5J3TrP{Hu^xj%=^KP3s?CPP615kG`buMQK-jTWukGfUqOC^H8*+1G44*jH~@
z7cP0!SCxyP++F!EH}v_tgR7pI=x2TWPB`Cc%#@phAHMfO5M%_#Z`|l^?02A_KK6Wn
z;^5)g9{1UwzU#k4Q;0COlJC<R$;12a<0p(gH=OqyJ|TEv(R_d8>yrbbW-0KZ{LbbM
z@p9n_fm$couADXmXzvA2jsb_B+(W*>+ZKdkiC*DU+kwrD;Yk|BgQg<yK=O`zYuNj4
zaLgL+x3?!?ur)ao%r26Yb~631GLLCteI^p2%*y3B?BcLVkY%WRD4Tc{2c7qRe}re4
zkxTA_@Gj30$vXY1JA%j8d=U<bV`=0C-n$A$C%X-k%WGjGPfVgKdJXcY4;t=v?Aj}P
zY&2orJ?~j}#i#RwHlhv4ykFS4kNf?ov$9%9LF8=mo3Hqz<~|<wg6No_0ytd+<etoS
zbAfqOx&q^np(U&%OR`QwrI=xMjK2lpRUJI7=LmBsO|%iF(?9`<!a@!+fcd9Jn;{TB
ztD)lHJDFpkK<lYsV^->|1}s<s(c<xHY7&d_sghqS(wJC)6O`9R57|_h@z(jzX0=Gy
z4&}iu!%y|6mi2Ti)~^XCv9n~dQ_LQ^4#Vh@2-bO9ZR>E{uNZMspu$0p<knXo_dAHA
zf5j*Bg7)&6&?pz-8s5F1x580Xz23e4$$S3NY4^CJ<~%CGQIb|TunX;tFPjUDBqegi
z$sxI;!hz;MAZ#9iX@vMBkv$oK+gnpIl52>y1vv(*5&4Z|v{g8gK$zB^E<*g2vsmE(
zBpeAH$DL^eo`8cD4v^v!gi$jzQe4QJGwhxU*;Tp_M2m2O^RBXdS%sKX69$e7Cy_+Y
zinb8veahQ2{XHMd3LaP`?u-aZh?16We0eW-X8-;De%tMmo%+uEr@k}GU-{N^!JMj<
z4xgkfbEp-fX?CDMa_j}3X^oVv-2cY2{{F>tgGrTQnV>a9kkVY04DJx^0YiOVV12+T
zXVt`XsheH>DyUOpvY{Pl&1YF_lu6?ngp&o~*AQ(=@;j^sVJ<D-LHL=eh^*!aNFiW0
zq8Z6qShFw=olRWpq$Y!N!=P~NV$3cZN!ujIGI1ef+EK}T=I!0a+o}?c@bDgo`y+SV
zzs~oQ+gQ5t3;*R>VS4%3xm}s-DNJr}hw$PD3;cBpo(>)qUxo4%CNQdL78XBlSmUM)
zJ;9IezjyfEOgriG_#Z-Zs=WwKU4ttX4iEvcb8)vsBne0%SOGyulkP44tw(AB0U{*W
zqj}8{zK~m{)rgS%X~zTnDT7CZGpz~%f91cP|GT?&+1r6DZ5K{@SNnpcaveU4uW^%h
zJIo)q+pz3$GsybKi(U=NMVRWLH55}tIQUIlI;?=O>ZWP@vI0V`D_6zuw;J_YaH}Rw
zf4}t!BT=hyJF^a2i<q<&27hq*qG0CYg|0(&4bw(k>z1W~`VADP(@GnbybDB&1l?P;
zb1m1c4Q8&b5P?|ls>B(Gszn?w>eAQUvd=M|2`1qO1ZbNR=S>OPig2WLree5Iep4Y6
zXG;KuXL?syOsX`uVWXW|!^V`N*xO$hkwLvQ2oNno(uo_G6jaGD?Fyn(VKr|XcWQ^O
zu5Y_euC4e8tHr7QWZg>llif#Uqh0_r>*E!l`Hw3;r<FJCFWIKNYd9><I^_(vKH~Wa
z!3QEhDV^~+rik{4_f%}jAa6nTn))W2ugC}d8%mv-LSL|Q``d_m3kVP)p_YRTd%+Bx
zf*a|CU4CJr4|a}rO=RlfJ&*7Qi)PS?B!%$odvpECvmOqXh%j{$p;1$;9^fPR9%=7X
zRK&Lw4e)70Q){u9THK2weq)rJC>4RAhoiMHv{GB4l_D^I{rHWbX8BUTb?f$w5DK^%
zP8@`e-}&qlf8)$2+?flf1*=4u8Ymp#MiCBrF^VW6FB+sl6rxFp)e{qM6woItLnqw3
zlgoR!1H1I`eL8k^?OK(Fy&NB}`og{R`QqTN3aS^p>yRm7X@FS;2$$|H0+Lj0S40tY
z&?o@`A|#X*G|7!w!fZS_5;E+eDVBj~9ukUZ91P|L3WnmlA{?}>7Ewe&G+ID_2ni7s
zd&}g_6*q`**mlt(ipYxwMn~&Hn2rYglZ2YHWwo)OD%sw7_1g}z@sy1~O~MCF2s$D;
zF8XCQDVjusC`6M`HjVRZOQ$ynP&P441UYT^GY&%91BJQ^Q8z?@f~r_bG~%F4B7R}1
zQ_>q@S}@`!Z6uJ|DoQ}of<z$SuSrBIvh4y!B`za=fZn5ha{%|F(H<m{(wj6C-=r&t
zFd(g)-0&TfYje>iF!V_c6b=$;^Mt51qU@Vt1{4*-Y%o1jIvBcO!G7p1u|#x%ZaZ2|
zx$UxoO4ay#bJ1NwX+T{WbxdT*DURfr?-xRtDS}K|6qEYLn+lB?DO$J$qjBI|aBMlF
zxlOKGg%PWTm{LKb|E)rsvd0*||5Q01%&YnWL7+?<&5;X~W80hZGbAn7z;8?%`ZKr(
zIm6<b1I)3aFC&*}a)A&<r*SlwfM9ylY^TE->Wk(U5EgMgzC<*e?I}#TNQjX!2nu}w
zNhU^ff?(H~atuks*9rmCgFE{w=z%y<2vdu3G^l-;k<#^iw@lzOXw>mSWUCcd3DGTx
zX`_AsL)6e#3d+n8BnZJOR6fEyEXD{a&`#_ZA?EWMxkFInXig2HXvkpj<`xhZG?4Qe
zHmMJi58|ItqQW>ym;oOpK5nxqf#x_vgu`~gr3L-sc7Q`Z`@EYF$x^Lj;)L5i49dK9
zbz({&Qv?;1dBijMG^Np;1{mU;HsXfg!vciA0T9NdJA+Y}$?;%*Ss#Qx5t^wGX08pj
zSxhyt0&Bs0Fco2XzK`;j_(DXR??poTdocI3yD?u+i}gV$f5GoNR0va++!fPQFy(WB
za(znX=|<$(Fi+zXSXi`aI-3;!6@i6>MU~HFyDq;r0kIHa;X?95=vSaOf`f&bSTVTR
z*oCRfSYn{b=9C6%4uGGa^Pez>>JxCtCm;Yu(w=Mj(V4VpVFlzmAv>QcA5IZs;F^MB
zQmxd@1uGyh3JX~LhOKC?zebxOh*UC9m|#pC2o*jXivTSV`KIB(z%+m*!3qfeBbraI
zhk|jcb`BSSm+`k~*Np>#`mvB!qBEkEImStVZ&<`bY!U%Uf(Q))p)*2R`R!2Uw4iC{
z1Q8CV1U3{zigXTC5)<b`J^_ip#=;O^p{;=6+oTOk<MS!Qp#5~vTRvC;$py}*^{E~e
z(qRo8J6elzMOwJZbNnu}SH4xAMIu*3&I(9Apq>60ZHC}8R>0p7;#e$*?7c3;V#1=5
z1OghE#vwMF__r|i2BFoE{MnFBGFHrk8YvuTFgsYy4wGZzIgupH5ljUoR;Dz-M^I2E
zkYv7OS}jA^?Lx4<?ez%-ThIefK!Vv_D%6-am>vqwF@vWG0>FF-GofTXPgn&^Vr29w
zRsk!u0zVZ0jrZ5*7#Xh3cOg`Ev9kUa+bSGPjA0*CH*-Dl9~C+h6?0Pi`V~6vv`%;F
z=9VB*)VG?1R^SmL9E}}U2}4-e&@ihagB374m_}|X7l<ffvN53A&b?ywOF}~h#2$A;
zf*}MPSO8e{@uzI>x*ZnBAZxS^gCi{Lq!bla+I+3XBp1c?iBEmZ2TWWBMDSe*m7PL~
zFzUXnHu#;8?t{e^A5x(?6?*XEZO9xORA+JC(gg$?_1{D=m`c(V{ma&00IWm`gazl4
zG5<-UbCSTV0I;7KP7U~7IQ--zlF(2AvHj{vqeTlLo+GmDy96TQzyu+LODLRZo4w_M
zo$+c1JJ=CNX<;Wd0))&mcG5`HxCMc)qcgfN;kk2jYHY3ok_Oyo;y@9MT;p*X0^;I!
z_<(H+!zUmTHByKfRcU}3g-{WZVuj{{+7&I1`Ur)EU6mUmAn^q$*8)WpjB_+m2>*ys
zI1#2?Vd91eg+-E&Y!^Ze16wU;g<DZ*Bx+7<t^$%TctDiuGNIuj9L+ILcY5X#G`4n&
zDdz|k;aUW_konWV_Lk5zP!OhO%3*zwPh;z~;E^DL<}=9Ua61Q9@AzmSg`Jq#5G3(|
zhO&Wb6RQF7f#R^w#Z4;^A7PZmmZm`{?bt^I<5xn_@U5nY&5Yyd5mJn!p^RJz4b>nx
zvUa2nIR_oJV?h=@-_W-tD3f9)>NcTJd@82FVBW<|A%ztXyUOQ-6;=WPHWGzEYM30T
zU|0dMV^#stP`z0U)`6Y1rRN*7Fv*nMVl)ULyW>PCju0a8)0`kdW)p)%pNQVj_kuHs
zsYJ;}bD*IDf-lC3uHa&mgd}oApaAEZyad4vYp>S_wlEW6QS;_>q4;gr6xQ^^Ji{O3
z{3Q+LP6EG!iU6(w!4Nx_jfkR}aQ1nw9Yg4d$dQ<~Hv@_%eC@yvdYXvX*=XbDiL!}T
zO%(D%?+KkRlxo0k69NIH4vW>S5ZP=vxeoi9H3!mwq?~@%-We(_2s_*TClMh|KpsS&
ziznS+Fn#S8JdpuHrq4rYE&De_HeNwOF#bA}Z{{vOp1zOBmHnKh_<Xi$LP_saN~*`h
zH;lip&b=HrR}Dhs0sDV<(HJ)gHP)7`FE~+X32b>1P)Q~gWe~*qbNMMb?b<wOs0LBM
z{Dh(7VOT)GfH0|+d*nls?GF$ndt;rhLI_PRvyGA$f)boh1==p{^(0yruR$YYk8h{<
zc2tlD!90#%D<YFDHc1r`Xq0%_3&)p5t`kv2U6AD`qD8)eC<-fEzEPXlybYZztM+nZ
z=hEVs64BxsbG%8(IrarZB{3{Tjl7mrjZLDES~?PhIxYw;&%To0&ueHC765WBg?u=|
zv3!>v7w4UQ;%lV2tpejv@l_Dec<4R}gnqfcd_UztMHEp)5k(YHL=i<a7o7WFYzcd<
T^<r`|00000NkvXXu0mjfm=ma3

literal 0
HcmV?d00001

diff --git a/docs/user_guide/assets/chatgpt_chat.png b/docs/user_guide/assets/chatgpt_chat75x75.png
similarity index 100%
rename from docs/user_guide/assets/chatgpt_chat.png
rename to docs/user_guide/assets/chatgpt_chat75x75.png
diff --git a/docs/user_guide/examples.rst b/docs/user_guide/examples.rst
index 90343f0..68aa4f5 100755
--- a/docs/user_guide/examples.rst
+++ b/docs/user_guide/examples.rst
@@ -3,20 +3,214 @@
 Examples and Tutorials
 ======================
 
-Building and Crawling a News Source
------------------------------------
 
+1. Building and Crawling News Sources using a Multithreaded approach
+----------------------------------------------------------------------
+Building and crawling news websites can require handling multiple sources simultaneously and processing a large volume of articles. You can significantly improve the performance of this process by using multiple threads when crawling. Even if Python is not truly multithreaded (due to the GIL), I/O requests can be handled in parallel.
 
 
-Getting Articles with Scrapy
-----------------------------
+.. code-block:: python
 
 
+    from newspaper import Source
+    from newspaper.mthreading import fetch_news
+    import threading
 
-Using Playwright to Scrape Websites built with Javascript
----------------------------------------------------------
+    class NewsCrawler:
 
+        def __init__(self, source_urls, config=None):
+            self.sources = [Source(url, config=config) for url in source_urls]
+            self.articles = []
 
+        def build_sources(self):
+            # Multithreaded source building
+            threads = [threading.Thread(target=source.build) for source in self.sources]
+            for thread in threads:
+                thread.start()
+            for thread in threads:
+                thread.join()
 
-Using Playwright to Scrape Websites that require login
-------------------------------------------------------
+        def crawl_articles(self):
+            # Multithreaded article downloading
+            self.articles = fetch_news(self.sources, threads=4)
+
+        def extract_information(self):
+            # Extract information from each article
+            for source in self.sources:
+                print(f"Source {source.url}")
+                for article in source.articles[:10]:
+                    article.parse()
+                    print(f"Title: {article.title}")
+                    print(f"Authors: {article.authors}")
+                    print(f"Text: {article.text[:150]}...")  # Printing first 150 characters of text
+                    print("-------------------------------")
+
+    if __name__ == "__main__":
+        source_urls = ['https://slate.com', 'https://time.com']  # Add your news source URLs here
+        crawler = NewsCrawler(source_urls)
+        crawler.build_sources()
+        crawler.crawl_articles()
+        crawler.extract_information()
+
+
+2. Getting Articles with Scrapy
+--------------------------------
+
+Install Necessary Packages
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+    pip install scrapy
+    pip install newspaper4k
+
+Create the scrapy project:
+
+.. code-block:: bash
+
+    scrapy startproject news_scraper
+
+This command creates a new folder news_scraper with the necessary Scrapy files.
+
+
+Code the Scrapy Spider
+^^^^^^^^^^^^^^^^^^^^^^
+Navigate to the news_scraper/spiders folder and create a new spider. For example, news_spider.py:
+
+    .. code-block:: python
+
+        import scrapy
+        import newspaper
+
+        class NewsSpider(scrapy.Spider):
+            name = 'news'
+            start_urls = ['https://abcnews.go.com/elections']  # Replace with your target URLs
+
+            def parse(self, response):
+                # Extract URLs from the response and yield Scrapy Requests
+                for href in response.css('a::attr(href)'):
+                    yield response.follow(href, self.parse_article)
+
+            def parse_article(self, response):
+                # Use Newspaper4k to parse the article
+                article = newspaper.article(response.url, language='en', input_html=response.text)
+                article.parse()
+                article.nlp()
+
+                # Extracted information
+                yield {
+                    'url': response.url,
+                    'title': article.title,
+                    'authors': article.authors,
+                    'text': article.text,
+                    'publish_date': article.publish_date,
+                    'keywords': article.keywords,
+                    'summary': article.summary,
+                }
+
+
+Run the Spider
+^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+    scrapy crawl news -o output.json
+
+
+3. Using Playwright to Scrape Websites built with Javascript
+-------------------------------------------------------------
+
+Install Necessary Packages
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+    pip install newspaper4k
+    pip install playwright
+    playwright install
+
+Scrape with Playwright
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: python
+
+    from playwright.sync_api import sync_playwright
+    import newspaper
+    import time
+
+    def scrape_with_playwright(url):
+        # Using Playwright to render JavaScript
+        with sync_playwright() as p:
+            browser = p.chromium.launch()
+            page = browser.new_page()
+            page.goto(url)
+            time.sleep(1) # Allow the javascript to render
+            content = page.content()
+            browser.close()
+
+        # Using Newspaper4k to parse the page content
+        article = newspaper.article(url, input_html=content, language='en')
+
+        return article
+
+    # Example URL
+    url = 'https://ec.europa.eu/commission/presscorner/detail/en/ac_24_84'  # Replace with the URL of your choice
+
+    # Scrape and process the article
+    article = scrape_with_playwright(url)
+    article.nlp()
+
+    print(f"Title: {article.title}")
+    print(f"Authors: {article.authors}")
+    print(f"Publication Date: {article.publish_date}")
+    print(f"Summary: {article.summary}")
+    print(f"Keywords: {article.keywords}")
+
+
+4. Using Playwright to Scrape Websites that require login
+----------------------------------------------------------
+
+
+.. code-block:: python
+
+    from playwright.sync_api import sync_playwright
+    import newspaper
+
+    def login_and_fetch_article(url, login_url, username, password):
+        # Using Playwright to handle login and fetch article
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)  # Set headless=False to watch the browser actions
+            page = browser.new_page()
+
+            # Automating login
+            page.goto(login_url)
+            page.fill('input[name="log"]', username)  # Adjust the selector as per the site's HTML
+            page.fill('input[name="pwd"]', password)  # Adjust the selector as per the site's HTML
+            page.click('input[type="submit"][value="Login"]')  # Adjust the selector as per the site's HTML
+
+            # Wait for navigation after login
+            page.wait_for_url('/')
+            # Navigating to the article
+            page.goto(url)
+            content = page.content()
+            browser.close()
+
+        # Using Newspaper4k to parse the page content
+        article = newspaper.article(url, input_html=content, language='en')
+
+        return article
+
+    # Example URLs and credentials
+    login_url = 'https://www.undercurrentnews.com/login/'  # Replace with the actual login URL
+    article_url = 'https://www.undercurrentnews.com/2024/01/08/editors-choice-farmed-shrimp-output-to-drop-in-2024-fallout-from-us-expanded-russia-ban/'  # Replace with the URL of the article you want to scrape
+    username = 'tester_news'  # Replace with your username
+    password = 'test'  # Replace with your password
+
+    # Fetch and process the article
+    article = login_and_fetch_article(article_url, login_url, username, password)
+    article.nlp()
+    print(f"Title: {article.title}")
+    print(f"Authors: {article.authors}")
+    print(f"Publication Date: {article.publish_date}")
+    print(f"Summary: {article.summary}")
+    print(f"Keywords: {article.keywords}")
diff --git a/newspaper/api.py b/newspaper/api.py
index 5aaadc5..4e16eb6 100755
--- a/newspaper/api.py
+++ b/newspaper/api.py
@@ -3,6 +3,7 @@
 # Copyright (c) Lucas Ou-Yang (codelucas)
 
 
+from typing import List
 import feedparser
 
 from .article import Article
@@ -14,8 +15,22 @@
 
 
 def build(url="", dry=False, config=None, **kwargs) -> Source:
-    """Returns a constructed source object without
+    """Returns a constructed :any:`Source` object without
     downloading or parsing the articles
+
+    Args:
+        url (str): The url of the source (news website) to build. For example,
+            `https://www.cnn.com`.
+        dry (bool): If true, the source object will be constructed but not
+            downloaded or parsed.
+        config (Configuration): A configuration object to use for the source.
+        kwargs: Any other keyword arguments to pass to the Source constructor.
+            If you omit the config object, you can add any configuration
+            options here.
+
+    Returns:
+        Source: The constructed :any:`Source` object.
+
     """
     config = config or Configuration()
     config.update(**kwargs)
@@ -40,11 +55,11 @@ def build_article(url="", config=None, **kwargs) -> Article:
 
 
 def languages():
-    """Returns a list of the supported languages"""
+    """Prints a list of the supported languages"""
     print_available_languages()
 
 
-def popular_urls():
+def popular_urls() -> List[str]:
     """Returns a list of pre-extracted popular source urls"""
     with open(POPULAR_URLS, encoding="utf-8") as f:
         urls = ["http://" + u.strip() for u in f.readlines()]
diff --git a/newspaper/article.py b/newspaper/article.py
index 22da870..89f7e86 100755
--- a/newspaper/article.py
+++ b/newspaper/article.py
@@ -140,9 +140,9 @@ class Article:
     def __init__(
         self,
         url: str,
-        title: str = "",
-        source_url: str = "",
-        read_more_link: str = "",
+        title: Optional[str] = "",
+        source_url: Optional[str] = "",
+        read_more_link: Optional[str] = "",
         config: Optional[Configuration] = None,
         **kwargs: Dict[str, Any],
     ):
diff --git a/newspaper/configuration.py b/newspaper/configuration.py
index c882d57..2dba63e 100755
--- a/newspaper/configuration.py
+++ b/newspaper/configuration.py
@@ -31,7 +31,9 @@
 
 class Configuration:
     """Modifies Article / Source properties.
+
     Attributes:
+
         min_word_count (int): minimum number of word tokens in an article text
         min_sent_count (int): minimum number of sentences in an article text
         max_title (int): :any:`Article.title` max number of chars. ``title``
@@ -60,9 +62,9 @@ class Configuration:
         memorize_articles (bool): If True, it will cache and save
             articles run between runs. The articles are *NOT* cached.
             It will save the parsed article urls between different
-            `Source`.`generate_articles()` runs. default True.
-        disable_category_cache (bool): If True, it will not cache the `Source`
-            category urls. default False.
+            :any:`Source.generate_articles()` runs. default True.
+        disable_category_cache (bool): If True, it will not cache
+            the :any:`Source` category urls. default False.
         fetch_images (bool): If False, it will not download images
             to verify if they obide by the settings in top_image_settings.
             default True.
@@ -72,7 +74,7 @@ class Configuration:
             from the article body html.
             Affected property is :any:`Article.article_html`.
             Default True.
-        http_success_only (bool): if True, it will raise an ``ArticleException``
+        http_success_only (bool): if True, it will raise an :any:`ArticleException`
              if the html status_code is >= 400 (e.g. 404 page). default True.
         stopwords_class (obj): unique stopword classes for oriental languages,
             don't toggle
@@ -88,13 +90,13 @@ class Configuration:
             and could hang the process due to huge binary files (such as movies)
             default False.
         ignored_content_types_defaults (dict): dictionary of content-types
-            and a default stub content.
-            These content type will not be downloaded.
-            **Note:**
-             If `allow_binary_content` is False,
-            binary content will lead to `ArticleBinaryDataException` for
-            `Article.download()` and will be skipped in `Source.build()`. This
-            will override the defaults in :any:`ignored_content_types_defaults`
+            and a default stub content. These content types will not be downloaded.
+
+            **Note:** If :any:`allow_binary_content` is False,
+            binary content will lead to :any:`ArticleBinaryDataException` for
+            :any:`Article.download()` and will be skipped in
+            :any:`Source.build()`. This will override the defaults
+            in :any:`ignored_content_types_defaults`
             if these match binary files.
         use_cached_categories (bool): if set to False, the cached categories
             will be ignored and a the :any:`Source` will recompute the category
@@ -206,8 +208,9 @@ def __init__(self):
 
     def update(self, **kwargs):
         """Update the configuration object with the given keyword arguments.
+
         Arguments:
-                **kwargs: The keyword arguments to update.
+            **kwargs: The keyword arguments to update.
         """
 
         for key, value in kwargs.items():
@@ -292,6 +295,7 @@ def language(self, value: str):
     def use_meta_language(self):
         """Read-only property that indicates whether the meta language
         read from the website was used or the language was explicitly set.
+
         Returns:
             bool: True if the meta language was used, False if the language
             was explicitly set.
diff --git a/newspaper/exceptions.py b/newspaper/exceptions.py
index 1893fce..66e57d0 100755
--- a/newspaper/exceptions.py
+++ b/newspaper/exceptions.py
@@ -4,7 +4,7 @@
 
 class ArticleBinaryDataException(Exception):
     """Exception raised for binary data in urls.
-    will be raised if allow_binary_content is False.
+    will be raised if :any:`Configuration.allow_binary_content` is False.
     """
 
 
diff --git a/newspaper/mthreading.py b/newspaper/mthreading.py
index e846e8b..db7fb33 100755
--- a/newspaper/mthreading.py
+++ b/newspaper/mthreading.py
@@ -21,17 +21,19 @@ def fetch_news(
     If there is a problem in detecting the language of the urls, then instantiate
     the `Article` object yourself with the language parameter and pass it in.
 
-    Arguments:
-        news_list {List[Union[str, Article, Source]]} -- List of sources,
-        articles, urls or a mix of them.
-
-        threads {int} -- Number of threads to use for fetching. This affects
-        how many items from the news_list are fetched at once. In order to control
-        how many threads are used in a `Source` object, use the
-        `Configuration`.`number_threads` setting. This could result in
-        a high number of threads. Maximum number of threads would be
-        `threads` * `Configuration`.`number_threads`.
-
+    Args:
+        news_list(List[Union[str, Article, Source]]): List of sources,
+            articles, urls or a mix of them.
+
+        threads(int):  Number of threads to use for fetching. This affects
+            how many items from the news_list are fetched at once. In order to
+            control
+            how many threads are used in a `Source` object, use the
+            `Configuration`.`number_threads` setting. This could result in
+            a high number of threads. Maximum number of threads would be
+            `threads` * `Configuration`.`number_threads`.
+    Returns:
+        List[Union[Article, Source]]: List of articles or sources.
     """
 
     def get_item(item: Union[str, Article, Source]) -> Union[Article, Source]:
diff --git a/newspaper/source.py b/newspaper/source.py
index c76386c..10080cb 100755
--- a/newspaper/source.py
+++ b/newspaper/source.py
@@ -244,7 +244,7 @@ def set_description(self):
         self.description = metadata["description"]
 
     def download(self):
-        """Downloads html of source"""
+        """Downloads html of source, i.e. the news site homepage"""
         self.html = network.get_html(self.url, self.config)
 
     def download_categories(self):
@@ -408,14 +408,23 @@ def _generate_articles(self):
         return list(uniq.values())
 
     def generate_articles(self, limit=5000):
-        """Saves all current articles of news source, filter out bad urls"""
+        """Creates the :any:`Source.articles` List of :any:`Article` objects.
+        It gets the URLs from all detected categories and RSS feeds, checks
+        them for plausibility based on their URL (using some heuristics defined
+        in the ``urls.valid_url`` function). These can be further
+        downloaded using :any:`Source.download_articles()`
+
+        Args:
+            limit (int, optional): The maximum number of articles to generate.
+                Defaults to 5000.
+        """
         articles = self._generate_articles()
         self.articles = articles[:limit]
         log.debug("%d articles generated and cutoff at %d", len(articles), limit)
 
     def download_articles(self) -> List[Article]:
         """Starts the ``download()`` for all :any:`Article` objects
-        from the ``articles`` property. It can run single threaded or
+        in the :any:`Source.articles` property. It can run single threaded or
         multi-threaded.
         Returns:
             List[:any:`Article`]: A list of downloaded articles.
diff --git a/tests/test_source.py b/tests/test_source.py
index 3d2a75a..c7affff 100755
--- a/tests/test_source.py
+++ b/tests/test_source.py
@@ -110,6 +110,8 @@ def test_empty_url_source(self):
         with pytest.raises(ValueError):
             Source(url=None)
 
+    # Skip if GITHUB_ACTIONS. It can fail because of internet access
+    @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS")
     def test_build_source(self, cnn_source):
         source = Source(cnn_source["url"], verbose=False, memorize_articles=False)
         source.clean_memo_cache()
@@ -130,6 +132,8 @@ def test_build_source(self, cnn_source):
         # assert sorted(source.category_urls()) == sorted(cnn_source["category_urls"])
         # assert sorted(source.feed_urls()) == sorted(cnn_source["feeds"])
 
+    # Skip if GITHUB_ACTIONS. It can fail because of internet access
+    @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS")
     def test_memorize_articles(self, cnn_source):
         source = Source(cnn_source["url"], verbose=False, memorize_articles=True)
         source.clean_memo_cache()
@@ -184,6 +188,8 @@ def stub_func(_, domain):
         with pytest.raises(Exception):
             stub_func(None, source.domain)
 
+    # Skip if GITHUB_ACTIONS. It can fail because of internet access
+    @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS")
     def test_get_feeds(self, feed_sources):
         for feed_source in feed_sources:
             source = Source(feed_source["url"])