scrape_news_from_to.js
const axios = require("axios");
const cheerio = require("cheerio");
const fs = require("fs");
const fastcsv = require("fast-csv");
const domain = "https://economictimes.indiatimes.com";
// You can get the start and end times by navigating to any day of the archives
// (https://economictimes.indiatimes.com/archive.cms) and copying the
// starttime parameter from the URL.
const start_time = 43101;
const end_time = 43465;
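// Hedged helper (an assumption inferred from the sample values, not confirmed
// by the site): starttime appears to be an Excel-style day serial, i.e. days
// since 1899-12-30; under that reading 43101 is 2018-01-01 and 43465 is
// 2018-12-31. If that holds, a UTC timestamp converts like this:
const date_to_starttime = (utc_ms) =>
  Math.round((utc_ms - Date.UTC(1899, 11, 30)) / 86400000);
// e.g. date_to_starttime(Date.UTC(2018, 0, 1)) === 43101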
// URL of a page that displays the list of news articles for a particular day.
const get_day_url = (time) => `https://economictimes.indiatimes.com/archivelist/starttime-${time}.cms`;
const filename = 'news_2018';
const main = async () => {
  try {
    const csvStream = fastcsv.format({ headers: true });
    const stream = fs.createWriteStream(__dirname + `/${filename}.csv`);
    csvStream.pipe(stream);
    let curr = start_time;
    while (curr <= end_time) {
      process.stdout.write("Processing day: " + curr + "\r");
      const day_url = get_day_url(curr);
      const day_articles = await scrape_day(day_url);
      for (const article of day_articles) {
        csvStream.write(article);
      }
      curr += 1;
    }
    process.stdout.write("\n");
    // Close the CSV stream so the last rows are flushed to disk.
    csvStream.end();
  } catch (err) {
    console.log('main function error');
    console.log(err.message);
  }
};
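// Design note: rows are streamed to the CSV as each day finishes rather than
// accumulated in one big in-memory array, so a whole year of articles never
// has to sit in RAM at once.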
// Scrape all the articles listed for a particular day.
const scrape_day = async (url) => {
  try {
    const articlesArray = [];
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    const content = $('.content a');
    let promiseArray = [];
    // Some links appear twice on the page, with the same title but different
    // URLs; this set of title slugs is used to detect the duplicates
    // (a worked example of the slug extraction follows this function).
    const title_set = new Set();
    // Set the batch size depending on processing power and internet speed:
    // the larger the batch, the more pages are fetched concurrently.
    const batch_size = 100;
    let i = 0;
    for (const elem of content) {
      const news_link = $(elem).attr('href');
      // Skip anchors that are missing an href or are not article pages.
      if (!news_link || !news_link.includes('/articleshow')) continue;
      const news_title_encoding = news_link
        .match(/[^/]*\/articleshow/)[0]
        .slice(0, -('/articleshow'.length));
      if (!title_set.has(news_title_encoding)) {
        if (i == batch_size) {
          articlesArray.push(...(await Promise.all(promiseArray)));
          promiseArray = [];
          i = 0;
        }
        title_set.add(news_title_encoding);
        promiseArray.push(scrape_page(domain + news_link));
        i++;
      }
    }
    // Await the final, possibly partial batch; without this, the last
    // (up to batch_size - 1) articles of each day would be silently dropped.
    articlesArray.push(...(await Promise.all(promiseArray)));
    return articlesArray;
  } catch (err) {
    console.log('scrape day error');
    console.log(err.message);
    // Return an empty list so one failed day doesn't abort the whole run.
    return [];
  }
};
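// Worked example of the dedup key above (hypothetical URL shape, inferred
// from the regex rather than copied from the site): for a link such as
//   /markets/stocks/news/some-headline-slug/articleshow/62345678.cms
// the regex matches "some-headline-slug/articleshow" and the slice trims the
// trailing "/articleshow", leaving "some-headline-slug" as the set key, so
// the same story listed under two section paths collapses to one request.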
// Scrape the news details from a particular news page.
// All locals are block-scoped (const); scrape_page runs up to batch_size
// calls concurrently, so accidental globals here would let parallel calls
// clobber each other's state.
const scrape_page = async (url) => {
  try {
    const object = {};
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    const title = $('h1.artTitle');
    object.title = title.text();
    const body = $('div.artText');
    const bodyText = body.text().replace(/(\r\n|\n|\r)/gm, ''); // remove line breaks
    if ($('#pricePlan').length) {
      // Economic Times premium article: only the teaser text is visible.
      const summary = $('.artSyn p');
      object.summary = summary.text();
      object.body = 'Partial body: ' + bodyText;
      const time = $('time');
      object.updatedAt = time.text();
    } else {
      const summary = $('h2.summary');
      object.summary = summary.text();
      object.body = bodyText;
      // Drop the leading 14-character label (presumably "Last Updated: ")
      // from the timestamp text.
      const time = $('time');
      object.updatedAt = time.text().substring(14);
    }
    object.url = url;
    return object;
  } catch (err) {
    console.log('scrape page error');
    console.log(err.message);
    // Return an empty record so one failed page doesn't poison its batch.
    return {};
  }
};
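// Each record written to the CSV has the shape assembled above:
//   { title, summary, body, updatedAt, url }
// fast-csv turns the first record's keys into the header row because the
// stream was created with { headers: true }.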
main();
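// Usage: node scrape_news_from_to.js
// Writes news_2018.csv next to this script, one row per scraped article.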