initialise_news.py

import csv
import json
import os
import re
import time
from datetime import date, datetime, timedelta

import requests

import config
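
# config.py is not shown in this snapshot; based on how it is used below, it is
# assumed to look roughly like this (hypothetical values):
#
#     START_DATE = (2021, 1, 4)    # (year, month, day)
#     END_DATE = (2021, 6, 27)     # (year, month, day)
#     API_KEYS = ["rapidapi-key-1", "rapidapi-key-2"]  # RapidAPI keys to rotate through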


def print_banner(text, length=70):
    print(">" * length)
    print(text)
    print("<" * length)


def print_line(text, length=70):
    # Bug fix: the padding was computed from a hard-coded 50 instead of the length parameter
    side_length = (length - len(text)) // 2
    print("=" * side_length + text + "=" * side_length)


def preprocess_text(text):
    """Remove the trailing source name and extra whitespace from a title"""
    text = " ".join(text.split("-")[:-1])  # Drop the source appended after the last hyphen
    text = re.sub(' +', ' ', text)  # Collapse repeated whitespace
    return text
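
# For example, assuming Google News titles of the form "<headline> - <source>":
#     preprocess_text("Apple unveils new chip - Reuters")  ->  "Apple unveils new chip "
# Note that hyphens inside the headline itself are also split on, since the join
# keeps everything before the final hyphen.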


def get_extract_end_date(extract_start_date, end_date):
    """Get extract end date based on standardisation rules"""
    # Query within the same ISO week only (ISO weeks run Monday to Sunday)
    if extract_start_date.weekday() == 6:  # 6 is the Sunday index
        # Already at the end of an ISO week: jump to the next Sunday so the window is non-empty
        extract_end_date = extract_start_date + timedelta(days=7)
    else:
        days_to_sunday = timedelta((6 - extract_start_date.weekday()) % 7)
        extract_end_date = extract_start_date + days_to_sunday  # End of week (Sunday)
    # Ensure extract end date does not exceed the overall end date
    if extract_end_date > end_date:
        extract_end_date = end_date
    return extract_end_date
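
# Worked example: date(2021, 6, 2) is a Wednesday (weekday() == 2), so
# days_to_sunday = (6 - 2) % 7 = 4 and the window closes on Sunday 2021-06-06.
# Starting from Sunday 2021-06-06 itself, the window instead closes on 2021-06-13.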


def extract_news(company):
    """Extract news from the Google News API (via RapidAPI)"""
    # Relies on the module-level start_date, end_date, api_keys and output_path
    url = "https://google-news.p.rapidapi.com/v1/search"
    api_key_index = 0  # Start with the first key

    extract_start_date = start_date  # Start with the first day
    while extract_start_date < end_date:
        # Build request headers with the current API key (keys are rotated on quota errors)
        headers = {
            'x-rapidapi-host': "google-news.p.rapidapi.com",
            'x-rapidapi-key': api_keys[api_key_index]
        }
        extract_end_date = get_extract_end_date(extract_start_date, end_date)
        # Make the API call
        query_string = {"country": "US", "lang": "en", "q": company,
                        "from": str(extract_start_date), "to": str(extract_end_date)}
        response = requests.request("GET", url, headers=headers,
                                    params=query_string)
        response_text = json.loads(response.text)
        # Check if the API limit has been reached
        if 'message' in response_text:
            # Rotate to the next API key, or sleep if all keys are exhausted
            if api_key_index + 1 < len(api_keys):
                new_api_key_index = api_key_index + 1
                print_line("API limit reached for API key index {}, switching to {}".format(api_key_index, new_api_key_index))
            else:
                new_api_key_index = 0
                print_line("API limit reached for all API keys, sleeping for 1 hour")
                time.sleep(3600)
            api_key_index = new_api_key_index
            continue
        print("Progress: Extracted {} for the time period {} to {}".format(company, extract_start_date, extract_end_date))
        # Save query results progressively
        with open(output_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for article in response_text['articles']:
                title = preprocess_text(article["title"])
                published = datetime.strptime(article["published"], '%a, %d %b %Y %H:%M:%S %Z')
                date_posted = published.strftime('%Y-%m-%d')
                year = published.isocalendar()[0]  # ISO year
                week = published.isocalendar()[1]  # ISO week number
                writer.writerow([company, title, date_posted, year, week])
        # Advance the start date to fetch the next week's data
        extract_start_date = extract_end_date
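
# The parsing above assumes the response shape this script observes from the
# RapidAPI Google News endpoint: either {"message": "..."} when the quota is
# exhausted, or {"articles": [{"title": "...", "published":
# "Tue, 01 Jun 2021 08:00:00 GMT", ...}, ...]} on success. This is inferred
# from the field accesses in the code, not from API documentation.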


if __name__ == "__main__":
    # Set date parameters (year, month, day)
    start_date = date(config.START_DATE[0], config.START_DATE[1], config.START_DATE[2])
    end_date = date(config.END_DATE[0], config.END_DATE[1], config.END_DATE[2])
    print("Start date: " + start_date.strftime('%Y-%m-%d'))
    print("End date: " + end_date.strftime('%Y-%m-%d'))
    today = date.today()

    # Obtain API keys
    api_keys = config.API_KEYS
    print("{} API keys received".format(len(api_keys)))

    # Set paths
    companies_path = 'companies.txt'
    output_path = 'data/initialise_news {}.csv'.format(today)

    # Header for the output file
    col_names = ['company_name', 'title', 'date_posted', 'year', 'week']  # 'score'
    # Create a new file with headers if one does not exist
    if not os.path.isfile(output_path):
        print("Creating new file for storage at {}".format(output_path))
        with open(output_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(col_names)
    else:
        print("Storing in existing file {}".format(output_path))

    # Load companies for extraction
    with open(companies_path) as f:
        companies = f.read().split()
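
    # Note: read().split() splits on any whitespace, so companies.txt is assumed
    # to hold one single-token name per entry, e.g. a file containing
    # "AAPL TSLA MSFT" (multi-word company names would be split into separate queries).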

    # Extract news for all companies
    for company in companies:
        print_banner("Extracting news: " + company)
        extract_news(company)