-
Notifications
You must be signed in to change notification settings - Fork 0
/
news_extractor.py
56 lines (47 loc) · 1.54 KB
/
news_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import newspaper
import csv
class newsExtractor(object):
    """Collect the title and text of every article of one news site.

    ``result`` is a list of ``[title, text]`` rows. It starts out holding
    only the ``["Title", "Article"]`` header row; ``extract_article`` appends
    one row per article and ``store_result`` writes all rows to a CSV file.
    """

    def __init__(self, url):
        """Build the newspaper source for *url* and start an empty result table.

        Args:
            url: Address of the news site to scrape.
        """
        self.url = url
        # memoize_articles=False: per the newspaper docs this turns off the
        # cache of already-seen article URLs, so a repeated run is not limited
        # to new articles -- confirm against the installed version.
        self.paper = newspaper.build(url, memoize_articles=False)
        self.result = [["Title", "Article"]]
        print("initialized successfully")

    def extract_article(self):
        """Download and parse each article, appending it to ``self.result``.

        One ``[title, text]`` row is appended per article, in the order the
        source lists them. Nothing is caught here: an error raised by
        ``download()`` or ``parse()`` propagates to the caller and ends the
        extraction at that article.
        """
        for article in self.paper.articles:
            article.download()
            article.parse()
            self.result.append([article.title, article.text])
        print("extracted successfully")

    def store_result(self, filename):
        """Write every row of ``self.result`` to *filename* as CSV.

        An existing file of the same name is overwritten.

        Args:
            filename: Path of the CSV file to create.
        """
        # newline="" hands line-ending control to the csv module; without it
        # Windows output gets "\r\r\n" row endings and newlines embedded in
        # article text are rewritten. UTF-8 is set explicitly so non-ASCII
        # article text does not depend on the platform's default encoding.
        with open(filename, "w", newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(self.result)
# Sites to scrape. newspaperNames holds the output-file prefix for each URL,
# in the same order as India_urls.
India_urls = [
    'http://www.hindustantimes.com/',
    'http://www.thehindu.com/',
    'http://timesofindia.indiatimes.com/',
    'http://www.tribuneindia.com/',
    'http://indianexpress.com/',
    'http://economictimes.indiatimes.com/',
    'http://www.deccanherald.com/',
]
newspaperNames = ['ht', 'hindu', 'toi', 'tribune', 'ie', 'et', 'dh']
# Text inserted between the name prefix and ".csv"; empty means no suffix.
date = ''
# Pair each URL with its own name so every paper is written to its own file.
# The previous index counter was never advanced, so every paper was written
# to "ht.csv" and overwrote the one before it.
for url, name in zip(India_urls, newspaperNames):
    paper = newsExtractor(url)
    paper.extract_article()
    paper.store_result(name + date + '.csv')
#urls = ['','','','','','','',]
#hindustan_times = newsExtractor("http://www.tribuneindia.com/")
#hindustan_times.extract_article()
#hindustan_times.store_result("tribune_7_Aug.csv")
#http://www.hindustantimes.com/
#http://www.thehindu.com/
#http://timesofindia.indiatimes.com/
#http://www.tribuneindia.com/
#http://indianexpress.com/
#http://economictimes.indiatimes.com/
#http://www.deccanherald.com/