# get_corpus.py
# Author: Hugo Keenan
# Aim: Retrieve article metadata from the NYT Archive API for a user-specified range of years
#      and save each month's data to a separate file in the corpus/ directory. The concat_data
#      function then concatenates the per-month files into a single CSV file.
## Note: To run this script, you need an API key from the NYT Developer site, saved in a file
##       called "nyt_api_key.txt" in the working directory.
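## Example invocation (assumes "nyt_api_key.txt" is present in the working directory):
##   python get_corpus.py --start_year 2020 --end_year 2021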
import csv
import sys
import requests
import os
import time
import logging
import argparse
from tqdm import tqdm
# Constants
OUT_DIR = 'corpus'
API_KEY_FILE_PATH = 'nyt_api_key.txt'
BASE_URL = 'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={key}'
MAX_RETRY = 10
RETRY_DELAY = 1
RARE_DELIMITER = '\u241d'
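# Note: RARE_DELIMITER is a single character (U+241D, "SYMBOL FOR GROUP SEPARATOR") that is
# unlikely to appear in article text. It is needed because csv.DictWriter/DictReader only
# accept one-character delimiters; the files are rewritten with the multi-character
# '|||||' delimiter afterwards (see save_to_csv and load_from_csv below).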
def fetch_from_api(year, month, key, logger, retry_count=0):
    """
    Try to get data from the API. If the request fails with a 429 status code (Too Many Requests),
    retry after an exponentially increasing amount of time.
    Info is logged to the logger.
    """
    url = BASE_URL.format(year=year, month=month, key=key)
    logger.info('Fetching {}'.format(url))
    try:
        res = requests.get(url)
        res.raise_for_status()  # Raise an exception for any HTTP error status
        return res.json()
    except requests.HTTPError as e:
        if e.response.status_code == 429:
            if retry_count < MAX_RETRY:
                # Exponential backoff: wait for an exponentially increasing amount of time before retrying.
                delay = (2 ** retry_count) * RETRY_DELAY
                logger.info('Got HTTP 429: Too many requests. Retrying in {} seconds.'.format(delay))
                time.sleep(delay)
                return fetch_from_api(year, month, key, logger, retry_count + 1)
            else:
                logger.error('Max retry limit reached. Unable to fetch data.')
                raise
        else:
            logger.error('HTTP Error: {}'.format(e))
            raise
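# Retry schedule sketch: with RETRY_DELAY = 1 and MAX_RETRY = 10, successive 429 responses are
# retried after 1, 2, 4, ..., 512 seconds (2 ** retry_count); after the tenth retry the error is re-raised.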
# Function to read the API key from a file
def read_api_key_from_file(file_path, logger):
    try:
        with open(file_path, 'r') as file:
            api_key = file.read().strip()
        return api_key
    except FileNotFoundError:
        logger.error(f"API key file '{file_path}' not found.")
        raise
    except Exception as e:
        logger.error(f"Error reading API key file: {e}")
        raise
def extract_metadata(api_results):
    """
    Extract metadata from the API response.
    """
    all_metadata = []
    for doc in api_results['response']['docs']:
        metadata = {
            'title': doc['headline']['main'],
            'section_name': doc['section_name'],
            'snippet': doc['snippet'],
            'lead_paragraph': doc['lead_paragraph'],
            'year': doc['pub_date'][:4],
            'month': doc['pub_date'][5:7],
            'web_url': doc['web_url']
        }
        all_metadata.append(metadata)
    return all_metadata
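# Each returned record is a flat dict; the values below are illustrative only:
#   {'title': 'Example Headline', 'section_name': 'World', 'snippet': '...',
#    'lead_paragraph': '...', 'year': '2023', 'month': '01', 'web_url': 'https://...'}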
def save_to_csv(data, file_path, delimiter="|||||", missing_value="NA"):
    """
    Save a list of dictionaries to a CSV file using a multi-character delimiter.
    The csv module only accepts single-character delimiters, so the file is first
    written with RARE_DELIMITER, which is then replaced with `delimiter`.
    Missing values are written as `missing_value`.
    """
    # First pass: write the CSV using the single-character rare delimiter
    with open(file_path, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=data[0].keys(), delimiter=RARE_DELIMITER)
        writer.writeheader()
        for row in data:
            # Replace missing values with the missing_value placeholder
            row = {key: value if value else missing_value for key, value in row.items()}
            writer.writerow(row)
    # Second pass: re-open the file and replace the rare delimiter with the multi-character one
    with open(file_path, 'r') as file:
        text = file.read()
    text = text.replace(RARE_DELIMITER, delimiter)
    with open(file_path, 'w') as file:
        file.write(text)
def load_from_csv(file_path, delimiter="|||||"):
    """
    Load data from a CSV file written with a custom multi-character delimiter.
    """
    file_name, file_extension = os.path.splitext(file_path)
    with open(file_path, 'r') as file:
        text = file.read()
    # Swap the multi-character delimiter back to the single-character rare delimiter
    text = text.replace(delimiter, RARE_DELIMITER)
    # Note: file_extension already includes the leading dot
    temp_file_path = f"{file_name}_temp{file_extension}"
    with open(temp_file_path, 'w') as file:
        file.write(text)
    with open(temp_file_path, 'r', newline='') as file:
        reader = csv.DictReader(file, delimiter=RARE_DELIMITER)
        data = [row for row in reader]
    # Remove the temp file
    os.remove(temp_file_path)
    return data
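# Round-trip sketch (illustrative): load_from_csv(path) on a file written by save_to_csv(rows, path)
# should return the same rows as strings, with any originally empty field read back as 'NA'.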
def get_corpus(start_year, end_year, logger, delimiter='|||||', missing_value='NA'):
    if logger is None:
        logger = logging.getLogger('NYT downloader')
    # Get the API key
    try:
        NYT_KEY = read_api_key_from_file(API_KEY_FILE_PATH, logger)
    except Exception:
        logger.error("Unable to retrieve API key. Exiting.")
        sys.exit(1)
    # Make sure the output directory exists
    os.makedirs(OUT_DIR, exist_ok=True)
    # Fetch data from the API, one month at a time
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            file_path = os.path.join(OUT_DIR, f'nyt_data_{year}_{month}.csv')
            if os.path.exists(file_path):
                logger.info(f"Data for {year}-{month} already exists. Skipping.")
                continue
            logger.info(f"Fetching data for {year}-{month}...")
            try:
                api_results = fetch_from_api(year, month, NYT_KEY, logger)
                metadata = extract_metadata(api_results)
                logger.info(f"Successfully fetched {len(metadata)} articles.")
            except Exception as e:
                logger.error(f"Error fetching data for {year}-{month}: {e}")
                continue
            save_to_csv(metadata, file_path, delimiter=delimiter, missing_value=missing_value)
            logger.info(f"Data saved to {file_path}")
def concat_data(out_file='all_nyt_data.csv', dir_path=OUT_DIR, logger=None):
    if logger is None:
        logger = logging.getLogger('NYT downloader')
    # Check if the output file already exists
    if os.path.exists(out_file):
        logger.error(f"Output file {out_file} already exists. Exiting.")
        sys.exit(1)
    # Get all per-month file paths
    file_paths = [os.path.join(dir_path, file) for file in os.listdir(dir_path) if file.endswith('.csv')]
    # Sort the files chronologically (by year, then month)
    file_paths.sort(key=lambda x: int(x.split('_')[-2]) * 100 + int(x.split('_')[-1].split('.')[0]))
    logger.info(f"Found {len(file_paths)} files.")
    # Load data from each file and append it to the output file
    logger.info(f"Saving data to {out_file}, may take a while...")
    for i, file_path in enumerate(tqdm(file_paths)):
        with open(file_path, 'r') as file:
            file_content = file.read()
        # Keep the header row only from the first file
        if i != 0:
            file_content = file_content.split('\n', 1)[1]
        with open(out_file, 'a') as file:
            file.write(file_content)
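# Sort-key example: corpus/nyt_data_1930_2.csv maps to 1930 * 100 + 2 = 193002, so it sorts
# before corpus/nyt_data_1931_1.csv (193101), giving a chronological ordering of the per-month files.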
if __name__ == '__main__':
    # Set up logging
    logger = logging.getLogger('NYT downloader')
    logging.basicConfig(level=logging.INFO)
    # Get arguments from the command line
    parser = argparse.ArgumentParser(description='Download the New York Times corpus for a given range of years.')
    # Year range (defaults to 1930 through 2023)
    parser.add_argument('--start_year', default=1930, type=int, help='Year to download data from')
    parser.add_argument('--end_year', default=2023, type=int, help='Year to download data up to (inclusive)')
    args = parser.parse_args()
    # Call the main function
    get_corpus(args.start_year, args.end_year, logger)
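# Note: concat_data() is not called from the command line above; to merge the per-month files
# into a single CSV, call it separately, e.g. from a Python session:
#   from get_corpus import concat_data
#   concat_data(out_file='all_nyt_data.csv')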