scraper_faz.py
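"""Scraper for faz.net: collects links to articles from the FAZ search
results for a given date range, then parses each article page into a
dict of metadata (permalink id, title, author, keywords, paragraphs)."""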
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
from dateutil import parser
import time
import random
import pymongo
from dotenv import load_dotenv
import os

def request_articles(execution, param_from=None, param_to=None,
                     url="https://www.faz.net/suche/?query=&ct=article&ct=blog&ct=storytelling&author=&from=&to="):
    """Collect article links from the FAZ search results between param_from
    and param_to (both defaulting to yesterday), following the result
    pagination recursively. Run metadata is written into `execution`."""
    # Resolve the date defaults at call time; a default of datetime.today()
    # in the signature would be evaluated only once, at import time.
    if param_from is None:
        param_from = datetime.today() - timedelta(days=1)
    if param_to is None:
        param_to = datetime.today() - timedelta(days=1)
    now = datetime.now()
    # Fill the from/to query parameters (dd.mm.yyyy, shifted back one day).
    url = re.sub(r"from=.*&", "from=" + (param_from - timedelta(days=1)).strftime("%d.%m.%Y") + "&", url)
    url = re.sub(r"to=.*", "to=" + (param_to - timedelta(days=1)).strftime("%d.%m.%Y"), url)
    print("Requesting: ", url)
    page = urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    links = soup.find_all("a")
    # Result pages after the first are addressed as .../sN.html.
    try:
        page = int(re.search(r"/s(\d+)\.html", url).groups()[0])
    except AttributeError:
        page = 1
    links_articles = [link.get("href") for link in links
                      if link.get("href") and re.match(r"https://.*/suche/.{6,}html", link.get("href"))]
    print("Articles count: ", len(links_articles))
    # Follow the link to the next results page, if there is one.
    regex = r"https://.*/suche/s" + str(page + 1) + r"\.html.*"
    links_pages = [link for link in links if link.get("href") and re.match(regex, link.get("href"))]
    if len(links_pages) > 0:
        print("Found a next page. Requesting: ", links_pages[0].get("href"))
        time.sleep(random.uniform(2, 5))  # throttle between result pages
        links_articles.extend(request_articles(execution, param_from=param_from,
                                               param_to=param_to, url=links_pages[0].get("href")))
    execution["ts"] = now
    execution["source"] = "https://www.faz.net"
    execution["from"] = param_from
    execution["to"] = param_to
    execution["articles_count"] = len(links_articles)
    return links_articles
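
# Example call (dates are illustrative):
#   execution = {}
#   links = request_articles(execution, param_from=datetime(2021, 3, 1),
#                            param_to=datetime(2021, 3, 1))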

def parse_article_author(article, soup):
    """Extract the author (or, failing that, the agency source) into `article`."""
    try:
        article["author"] = soup.find("a", attrs="atc-MetaAuthorLink").contents[1]
        article["hrefAuthor"] = "https://www.faz.net" + soup.find("a", attrs="atc-MetaAuthorLink").get("href")
    except (AttributeError, IndexError):
        try:
            article["author"] = soup.find("span", attrs="atc-MetaAuthor").contents[1]
        except (AttributeError, IndexError):
            try:
                article["source"] = soup.find("span", attrs="atc-Footer_Quelle").text
            except AttributeError:
                print("unable to find an author")

def parse_article_premium(article, soup):
    """Flag paywalled articles (marked with a FAZ+ icon next to the headline)."""
    try:
        article["premiumArticle"] = bool(soup.find("span", attrs="ico-Base atc-HeadlineIcon").find("span", attrs="o-Icon ico-Base_FazPlus"))
    except AttributeError:
        article["premiumArticle"] = False

def parse_article_keywords(article, soup):
    """Merge the keywords and news_keywords meta tags into a deduplicated list."""
    try:
        keywords = set()
        keywords.update(keyword.strip() for keyword in soup.find("meta", {"name": "keywords"}).get("content").split(","))
        keywords.update(keyword.strip() for keyword in soup.find("meta", {"name": "news_keywords"}).get("content").split(","))
        article["keywords"] = list(keywords)
    except AttributeError:
        print("unable to parse keywords")

def parse_article(url):
    """Fetch an article page and parse it into a dict keyed by its permalink id."""
    page = urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    # Multi-page articles offer a link to a single-page view; prefer that.
    multipage = soup.find("div", attrs="atc-ContainerFunctions_Navigation").find("a", attrs="btn-Base_Link")
    if multipage:
        print("multipage article found. Requesting: ", multipage.get("href"))
        time.sleep(random.uniform(0.5, 2))
        page = urlopen(multipage.get("href"))
        soup = BeautifulSoup(page, "html.parser")
    article = {}
    permalink = soup.find("span", attrs="js-sharing-link-select-trigger js-sharing-link lay-Sharing_PermalinkUrl").text
    article["_id"] = permalink.split("/")[3].split("?")[0]
    parse_article_keywords(article, soup)
    parse_article_premium(article, soup)
    article["hrefPermalink"] = permalink
    article["lastUpdate"] = parser.parse(soup.find("time", attrs="atc-MetaTime").get("datetime"))
    article["title"] = soup.title.string
    article["introText"] = soup.find("p", attrs="atc-IntroText").string.strip()
    article["paragraphs"] = [paragraph.text for paragraph in soup.find_all("p", attrs="atc-TextParagraph")]
    parse_article_author(article, soup)
    return article
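
# --- Usage sketch ---
# A minimal, hypothetical example of how the functions above might be wired
# together with the pymongo / dotenv / os imports at the top of this file;
# the original script does not show this step. The MONGO_URI environment
# variable and the "faz", "executions", and "articles" names are assumptions.
if __name__ == "__main__":
    load_dotenv()  # read MONGO_URI (assumed variable name) from a .env file
    client = pymongo.MongoClient(os.getenv("MONGO_URI"))
    db = client["faz"]  # hypothetical database name

    execution = {}
    links = request_articles(execution)
    db["executions"].insert_one(execution)

    for link in links:
        article = parse_article(link)
        # Upsert by permalink id so repeated runs do not create duplicates.
        db["articles"].replace_one({"_id": article["_id"]}, article, upsert=True)
        time.sleep(random.uniform(2, 5))  # throttle between article requests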