# This script scrapes the abgeordnetenwatch.de profile page of each deputy listed in
# `data/deputies_20190702.json` in order to extract the links to social media platforms;
# it saves the result in `data/deputies_custom_links_20190702.csv`.
#
# December 2018, Markus Konrad <markus.konrad@wzb.eu>
#

library(jsonlite)
library(rvest)
library(dplyr)

# data from members of the 19th German Bundestag
# obtained from https://www.abgeordnetenwatch.de/api/parliament/bundestag/deputies.json
deputies <- fromJSON('data/deputies_20190702.json')
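
# Note: the nested structure accessed below (`deputies$profiles$meta` with `uuid` and
# `url` columns) is what the rest of this script relies on; the sketch of the JSON
# shape here is an assumption inferred from that access pattern, not verified against
# the current API:
#   {"profiles": [{"meta": {"uuid": "...", "url": "https://www.abgeordnetenwatch.de/profile/..."}}, ...]}
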
sleep_sec <- 10   # delay between requests in seconds, according to robots.txt

# get profile URLs
n_profiles <- nrow(deputies$profiles)
print(paste('Num. profiles:', n_profiles))
prof_urls <- deputies$profiles$meta %>% select(uuid, url)
# prof_urls <- prof_urls %>% head(10)   # uncomment to test on the first 10 profiles only

# function to fetch HTML from profile page and extract "further links" section
# ("Weiterführende Links von ...") on the page
fetch_urls <- function(profile_row) {
  print(paste('fetching profile page at', profile_row$url))

  # wait and fetch HTML
  Sys.sleep(sleep_sec)
  html <- read_html(profile_row$url)

  # extract links
  links <- html_nodes(html, 'div.deputy__custom-links ul.link-list li a')
  urls <- html_attr(links, 'href')

  if (length(urls) == 0) {
    urls <- NA
  }

  # return data frame for this deputy which will be concatenated to a single data frame
  # of all deputies
  data.frame(profile_row, custom_links = urls, stringsAsFactors = FALSE)
}
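
# quick sanity check (a hypothetical usage example, not part of the original script;
# left commented out so it does not issue an extra request):
# fetch_urls(prof_urls[1, ])
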
# apply fetch_urls to each profile
prof_urls_complete <- prof_urls %>% rowwise() %>% do(fetch_urls(.))
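
# note: `rowwise() %>% do()` was the idiomatic approach at the time this was written;
# in more recent dplyr versions do() is superseded. A roughly equivalent alternative
# (a sketch, not part of the original script) would be:
# prof_urls_complete <- purrr::map_dfr(seq_len(nrow(prof_urls)),
#                                      function(i) fetch_urls(prof_urls[i, ]))
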
# save result
write.csv(prof_urls_complete, 'data/deputies_custom_links_20190702.csv', row.names = FALSE)
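
# Optional robustness sketch (an addition, not part of the original script): wrapping
# the fetch in tryCatch lets a long scraping run survive a single unreachable profile
# page instead of aborting. `fetch_urls_safe` is a hypothetical helper name; to use it,
# substitute it for `fetch_urls` in the rowwise pipeline above.
fetch_urls_safe <- function(profile_row) {
  tryCatch(fetch_urls(profile_row), error = function(e) {
    # emit a warning and record the deputy with no links instead of failing
    warning(paste('failed to fetch', profile_row$url, '-', conditionMessage(e)))
    data.frame(profile_row, custom_links = NA, stringsAsFactors = FALSE)
  })
}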