#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
News scraper: scrapes articles from newspapers, then adds them to a database.
"""
__title__ = 'NewsScraper'
__author__ = 'Ivan Fernando Galaviz Mendoza'

from datetime import datetime
from time import mktime
import json
import logging

from pymongo import MongoClient
from newspaper import Article, build
import feedparser as fp

import constants
from classifier import Classifier
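
# The constants module is not shown in this file; based on how it is used
# below, it is assumed to expose roughly the following names (the values here
# are illustrative placeholders, not the real ones):
#     LOG_FILENAME = 'news_scraper.log'
#     NEWSPAPERS_PATH = 'newspapers.json'
#     ARTICLES_TO_DOWNLOAD = 50
#     NEWSPAPER, TITLE, TEXT, TAGS, LINK = 'newspaper', 'title', 'text', 'tags', 'link'
#     PUB_DATE, EXTRACT_DATE, HAS_BEEN_CLASSIFIED, IS_VIOLENT = (
#         'pub_date', 'extract_date', 'has_been_classified', 'is_violent')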

logging.basicConfig(filename=constants.LOG_FILENAME,
                    format='%(asctime)s-%(levelname)s-%(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')

# TODO: Assign timezones according to the region (e.g. Jalisco)
# TODO: Create a temp dict to use when the connection to the db is lost
# TODO: Create an index from the data to prevent duplicate records (db.profiles.create_index())
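# A minimal sketch of the index mentioned in the TODO above, assuming the
# db.test collection used below and constants.LINK as the unique key; it is
# not wired into the scraper yet:
#     db.test.create_index([(constants.LINK, 1)], unique=True)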


def try_to_get_utc(date, link):
    try:
        return datetime.utcfromtimestamp(mktime(date))
    except Exception:
        logging.warning(
            'Could not get UTC time from {}'.format(link), exc_info=True)
        return date


def log_invalid_text(link):
    logging.warning('Ignoring {} due to invalid body.'.format(link))


def start_classification():
    Classifier()


def scrape_news():
    # Loads the JSON file with news sites
    with open(constants.NEWSPAPERS_PATH) as newspapers_file:
        companies = json.load(newspapers_file)
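    # The newspapers JSON file is assumed to map a company name to its links,
    # roughly like this (names and URLs are placeholders):
    #     {
    #         "SomePaper": {"rss": "https://example.com/feed",
    #                       "link": "https://example.com"},
    #         "OtherPaper": {"link": "https://example.org"}
    #     }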

    # Initialize database connection
    client = MongoClient()
    # Assign database
    db = client.test

    for company, info in companies.items():
        # If an RSS link is provided in the JSON file, it is used first,
        # because RSS feeds usually give more consistent and correct data.
        # If you do not want to scrape from the RSS feed, just leave the
        # rss attribute out of the JSON file.
        if 'rss' in info:
            parse_rss(company, info, db)
        else:
            parse_link(company, info, db)

    # Close DB connection
    client.close()

    start_classification()


def parse_link(company, info, db):
    article_link = info['link']
    paper = build(article_link, language='es')
    none_type_count = 0
    article_count = 0
    for article in paper.articles:
        if article_count > constants.ARTICLES_TO_DOWNLOAD:
            break
        try:
            article.download()
            article.parse()
        except Exception:
            logging.warning('Could not download/parse {}'.format(article.url),
                            exc_info=True)
            continue
        # Again, for consistency, articles without a publish date are skipped.
        # After 10 downloaded articles without a publish date from the same
        # newspaper, the whole company is skipped.
        if article.publish_date is None:
            none_type_count += 1
            if none_type_count > 10:
                logging.warning(
                    'Skipping {} because of too many NoneType dates...'.format(
                        company))
                break
            article_count += 1
            continue
        article_text = article.text
        article_url = article.url
        if not article_text:
            log_invalid_text(article_url)
            continue
        db.test.insert_one({
            constants.NEWSPAPER: company,
            constants.TITLE: article.title,
            constants.TEXT: article_text,
            constants.TAGS: list(article.tags),
            constants.LINK: article_url,
            constants.PUB_DATE: article.publish_date,
            constants.EXTRACT_DATE: datetime.utcnow(),
            constants.HAS_BEEN_CLASSIFIED: False,
            constants.IS_VIOLENT: None
        })
        # Count stored articles toward the download limit.
        article_count += 1


def parse_rss(company, info, db):
    parsed_dict = fp.parse(info['rss'])
    article_count = 0
    for entry in parsed_dict.entries:
        # Check whether a publish date is provided; if not, the entry is
        # skipped. This keeps the data consistent and keeps the script
        # from crashing.
        if not hasattr(entry, 'published'):
            continue
        if article_count > constants.ARTICLES_TO_DOWNLOAD:
            break
        article_link = entry.link
        try:
            article = Article(article_link, fetch_images=False)
            article.download()
            article.parse()
        except Exception:
            # If the download fails for some reason (e.g. 404),
            # continue with the next article.
            logging.warning('Could not download/parse {}'.format(article_link),
                            exc_info=True)
            continue
        article_text = article.text
        if not article_text:
            log_invalid_text(article_link)
            continue
        db.test.insert_one({
            constants.NEWSPAPER: company,
            constants.TITLE: article.title,
            constants.TEXT: article_text,
            constants.TAGS: list(article.tags),
            constants.LINK: article_link,
            constants.PUB_DATE: try_to_get_utc(entry.published_parsed,
                                               article_link),
            constants.EXTRACT_DATE: datetime.utcnow(),
            constants.HAS_BEEN_CLASSIFIED: False,
            constants.IS_VIOLENT: None
        })
        # Count stored articles toward the download limit.
        article_count += 1
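

# The original script defines no entry point; a minimal sketch of one, assuming
# the module is meant to be run directly (as the shebang suggests):
if __name__ == '__main__':
    scrape_news()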