# fetch_kcna.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import os
from datetime import datetime
from tqdm import tqdm
# Load the current headlines archive from S3
archive_url = 'https://stilesdata.com/north-korea-news/headlines.json'
current_archive = pd.read_json(archive_url, orient='records')
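# Note: read_json raises if the archive URL is unreachable. A minimal fallback
# sketch for a first run with no archive (assuming the archive shares the
# columns this script produces) could be:
#
#   try:
#       current_archive = pd.read_json(archive_url, orient='records')
#   except Exception:
#       current_archive = pd.DataFrame(columns=['topic', 'headline', 'link', 'date', 'story_text'])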
# Convert date column in current archive to datetime
current_archive['date'] = pd.to_datetime(current_archive['date'], errors='raise')
today = pd.Timestamp.today().strftime('%Y-%m-%d')
key_topics = [
    "WPK General Secretary Kim Jong Un's Revolutionary Activities", "Documents",
    'Latest News', 'Top News', 'Home News', 'World', "Revolutionary Anecdote",
    'Society-Life', 'External', 'News Commentary', 'Always in Memory of People',
    'Celebrations for New Year',
]
# Retrieve the proxy service key from environment variables (GitHub Actions secrets)
proxy_service_key = os.getenv('SCRAPE_PROXY_KEY')
# List of user-agents for rotation
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/78.0',
]
# Function to fetch the menu links
def fetch_menu_links(url):
    headers = {
        'User-Agent': random.choice(user_agents)
    }
    response = requests.get(
        'https://proxy.scrapeops.io/v1/',
        params={
            'api_key': proxy_service_key,
            'url': url,
            'premium': 'true'
        },
        headers=headers
    )
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        menu_block = soup.find('div', class_='col-md-12 menu-block')
        links_data = []
        if menu_block:
            links = menu_block.find_all('a')
            for link in links:
                topic = link.text.strip()
                link_url = link.get('href')
                full_url = f"http://www.kcna.kp{link_url}"  # Construct the full URL
                links_data.append({'topic': topic, 'link': full_url})
        return pd.DataFrame(links_data)
    else:
        print(f"Failed to retrieve the menu page. Status code: {response.status_code}")
        return pd.DataFrame()
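# Example: fetch_menu_links('http://www.kcna.kp/en') should return a DataFrame
# with 'topic' and 'link' columns, one row per entry in the site's menu block.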
# Earlier version that distinguished Juche-calendar dates from Gregorian ones;
# superseded by convert_to_gregorian() below but kept for reference.
# def convert_juche_to_gregorian(juche_date):
#     if juche_date is None or juche_date == 'Unknown':
#         return pd.NaT
#     try:
#         # Check if date starts with "Juche" to identify Juche format
#         if "Juche" in juche_date:
#             clean_date = juche_date.replace('[', '').replace(']', '')
#             parts = clean_date.split('.')
#             # Extract Juche year
#             juche_year = int(parts[0].replace('Juche', '').strip())
#             month = int(parts[1])
#             day = int(parts[2])
#             # Convert Juche year to Gregorian year (Juche 1 = 1912)
#             gregorian_year = juche_year + 1911
#         else:
#             # Treat as Gregorian date
#             parts = juche_date.split('.')
#             gregorian_year = int(parts[0])
#             month = int(parts[1])
#             day = int(parts[2])
#         # Print for debugging purposes
#         print(f"Converting date: {juche_date} => Year: {gregorian_year}, Month: {month}, Day: {day}")
#         # Validate date range
#         if not (1900 <= gregorian_year <= datetime.now().year):
#             print(f"Out of bounds date: {gregorian_year}-{month}-{day}")
#             return pd.NaT
#         return datetime(gregorian_year, month, day)
#     except Exception as e:
#         print(f"Error converting date: {juche_date} - {e}")
#         return pd.NaT
def convert_to_gregorian(date_str):
    if date_str is None or date_str == 'Unknown':
        return pd.NaT
    try:
        # Clean the date string by removing brackets
        clean_date = date_str.replace('[', '').replace(']', '')
        # Split the date into year, month, and day
        parts = clean_date.split('.')
        year = int(parts[0])
        month = int(parts[1])
        day = int(parts[2])
        # Debug print for verification
        print(f"Converting date: {date_str} => Year: {year}, Month: {month}, Day: {day}")
        # Ensure the date falls within a reasonable range
        if not (1900 <= year <= datetime.now().year):
            print(f"Out of bounds date: {year}-{month}-{day}")
            return pd.NaT
        return datetime(year, month, day)
    except Exception as e:
        print(f"Error converting date: {date_str} - {e}")
        return pd.NaT
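# Example conversions (given KCNA's bracketed date format):
#   convert_to_gregorian('[2024.5.20.]')      -> datetime(2024, 5, 20)
#   convert_to_gregorian('Unknown')           -> pd.NaT
#   convert_to_gregorian('[Juche 113.5.20.]') -> pd.NaT (int() raises; caught above)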
# Function to parse articles and media from topic pages
def parse_articles(page_url, topic):
    headers = {
        'User-Agent': random.choice(user_agents)
    }
    response = requests.get(
        'https://proxy.scrapeops.io/v1/',
        params={
            'api_key': proxy_service_key,
            'url': page_url,
            'premium': 'true'
        },
        headers=headers
    )
    articles = []
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract articles
        article_lists = soup.find_all('ul', class_='article-link')
        for article_list in article_lists:
            article_links = article_list.find_all('li')[:5]  # Limit to first 5 articles
            for article in article_links:
                a_tag = article.find('a')
                if a_tag:
                    headline = a_tag.text.strip().split('\r')[0]
                    link = a_tag.get('href')
                    date_tag = article.find('span', class_='publish-time')
                    date = date_tag.text.strip() if date_tag else 'Unknown'
                    full_link = f"http://www.kcna.kp{link}"
                    articles.append({
                        'topic': topic,
                        'headline': headline,
                        'link': full_link,
                        'date': convert_to_gregorian(date)
                    })
        # Extract photos and videos
        if topic in ['Photo', 'Video']:
            media_divs = soup.find_all('div', class_=['photo', 'video'])
            for media in media_divs[:5]:  # Limit to first 5 items
                title_span = media.find('span', class_='title')
                if title_span:
                    a_tag = title_span.find('a')
                    headline = a_tag.text.strip().split('\r')[0] if a_tag else 'Unknown'
                    link = a_tag.get('href') if a_tag else 'Unknown'
                    date_tag = title_span.find('span', class_='publish-time')
                    date = date_tag.text.strip() if date_tag else 'Unknown'
                    full_link = f"http://www.kcna.kp{link}"
                    articles.append({
                        'topic': topic,
                        'headline': headline,
                        'link': full_link,
                        'date': convert_to_gregorian(date)
                    })
    else:
        print(f"Failed to retrieve the topic page: {page_url}. Status code: {response.status_code}")
    return articles
# Function to extract story text from article links
def fetch_story_text(link):
    headers = {
        'User-Agent': random.choice(user_agents)
    }
    response = requests.get(
        'https://proxy.scrapeops.io/v1/',
        params={
            'api_key': proxy_service_key,
            'url': link,
            'premium': 'true'
        },
        headers=headers
    )
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        content_wrapper = soup.find('div', class_='content-wrapper')
        if content_wrapper:
            paragraphs = content_wrapper.find_all('p')
            story_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
            return story_text
    else:
        print(f"Failed to retrieve the story page: {link}. Status code: {response.status_code}")
    return ''
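# Note: proxy calls can be flaky. A minimal retry sketch (hypothetical; not
# wired into the functions above) with exponential backoff might look like:
#
#   import time
#   for attempt in range(3):
#       response = requests.get('https://proxy.scrapeops.io/v1/', params=params, headers=headers)
#       if response.status_code == 200:
#           break
#       time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s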
# Function to collect headlines and fetch story text
def collect_headlines_and_stories(links_df):
    all_articles = []
    for _, row in tqdm(links_df.iterrows(), total=links_df.shape[0], desc='Processing Pages'):
        topic = row['topic']
        link = row['link']
        # Parse the articles on the page
        articles = parse_articles(link, topic)
        # Fetch story text for key topics
        for article in articles:
            if article['topic'] in key_topics:
                story_text = fetch_story_text(article['link'])
                article['story_text'] = story_text
            else:
                article['story_text'] = ''
        all_articles.extend(articles)
    return pd.DataFrame(all_articles)
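# Note: each article under a key topic triggers one extra proxy request in
# fetch_story_text, so runtime scales with the number of key-topic articles.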
# Main workflow
menu_url = 'http://www.kcna.kp/en'  # KCNA English-language front page
links_df = fetch_menu_links(menu_url)
headlines_df = collect_headlines_and_stories(links_df)
print("Unique dates before conversion:", headlines_df['date'].unique())
# Convert date column in headlines_df to datetime
headlines_df['date'] = pd.to_datetime(headlines_df['date'], errors='raise')
# Combine with current archive, remove duplicates, and sort by date
all_headlines_df = (
    pd.concat([current_archive, headlines_df])
    .drop_duplicates(subset=['headline', 'link'])
    .sort_values(by='date')
    .reset_index(drop=True)
)
# Export links and headlines. Save a dated copy in an archive directory.
all_headlines_df['date_str'] = pd.to_datetime(all_headlines_df['date']).dt.strftime('%Y-%m-%d')
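# Ensure the output directories exist before writing (a defensive assumption;
# the repo may already contain them, in which case this is a no-op).
os.makedirs('data/archive', exist_ok=True)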
all_headlines_df.to_json('data/headlines.json', indent=4, orient='records')
all_headlines_df.to_json(f'data/archive/headlines_{today}.json', indent=4, orient='records')
links_df.to_json('data/links.json', indent=4, orient='records')
links_df.to_json(f'data/archive/links_{today}.json', indent=4, orient='records')