-
Notifications
You must be signed in to change notification settings - Fork 5
/
maersk.py
424 lines (337 loc) · 21.4 KB
/
maersk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import itertools
from datetime import date
from datetime import datetime
import os
# Some definitions used in explanation of the code:
# Connection: all possibilities from the origin port to the destination port
# Route: a specific possibility from origin to destination on a specific departure date
# Transfer: within a route, the container can switch from one vessel to the other and continue the journey

# Sets up the options of the chromedriver
opts = Options()
opts.add_argument("window-size=1280,720") # Locks the window size !!Don't change!! (the XPaths below depend on this layout)
opts.add_argument("user-agent=Chrome/106.0.5249.119") # Prevents sites from blocking traffic
headless = True
if headless: # If True, run Chrome in the background without opening a window
    opts.headless = True
# NOTE(review): `Options.headless` was removed in recent Selenium releases in favour of
# opts.add_argument("--headless=new") — confirm against the pinned selenium version.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
#!!! Instructions on port selection start
# The same ports and methods as for scraping_routscanner_v2 were used.
# origin = ["BR", "CO", "VE", "SR", "CW", "GY", "GF", "UY", "AR", "CL", "PE", "EC", "VN", "PY", "GY", "KH"]
# destination = ["NL", "BE"]
# The UN-LOCODES were picked from the following CSV: (contains country codes
# country_df = pd.read_csv("../utils/country-codes.csv")
# Furthermore the ports in South-America, Vietnam and Benelux were selected
# with open('../pickles/msc_country_port_codes.pickle', 'rb') as handle:
#     country_port_codes = pickle.load(handle)
# However unlike scraping_routescanner_v2, the site of Maersk doesn't accept port-codes such as NLRTM.
# Therefore, the correct port names that work on the Maersk site were selected by hand,
# using both the port name according to the list above and the latitude and longitude.
#!!! Instructions on port selection end

# Load the hand-made UN-LOCODE -> Maersk port name conversion table and build
# every (origin, destination) pair to scrape.
conversion = pd.read_csv(r'../utils/maersk_un_locodes_conversion.csv', sep=';')
role = conversion['origin/destination']
o_names = conversion.loc[role == 'origin'].Maersk_name
d_names = conversion.loc[role == 'destination'].Maersk_name
od_names = [(o, d) for o in o_names for d in d_names]
# Puerto seguro flavial has been moved to villeta. This place seemed more logical according to lat and long.
# Terport villeta paraguay had no latitude or longitude to check,
# but luckily there was only one port called Terport in Maersk.
today = date.today()
def open_routes(od, page):
    """Expand every 'show route details' button on the current results page,
    then soup the page and save its HTML to disk.

    od   -- the (origin, destination) tuple currently being scraped (used for the file name)
    page -- results page number (1 or 2), used for the file name

    The buttons live in div[3]..div[7] of the results container; the first one
    (div[3]) is clicked unconditionally because the caller already verified it
    exists before invoking this function. The remaining ones are clicked only
    if present (fewer than five routes may be shown).
    """
    button_xpath = "//*[@id='app']/div[2]/div[1]/div[{}]/div/div[4]/button/span"
    driver.find_element(By.XPATH, button_xpath.format(3)).click()
    for idx in range(4, 8):
        xpath = button_xpath.format(idx)
        if len(driver.find_elements(By.XPATH, xpath)) > 0:
            driver.find_element(By.XPATH, xpath).click()
    soups.append(soup_page())  # Soup the page with all route details expanded
    save_html_page(od, page)
def save_html_page(od, page):
    """Write the current page's HTML to ../data/maersk_daily/html_runs/<today>/.

    od   -- the (origin, destination) tuple, embedded in the file name
    page -- results page number, embedded in the file name
    """
    out_dir = f'../data/maersk_daily/html_runs/{today}'
    # exist_ok=True replaces the separate exists() check: no race between check and create
    os.makedirs(out_dir, exist_ok=True)
    with open(f'{out_dir}/{od}_{page}_{today}.html', "w", encoding="utf-8") as file:
        file.write(str(soup_page()))
def soup_page():
    """Return a BeautifulSoup of the webdriver's current page source.

    The parser is pinned to the stdlib "html.parser": without an explicit
    parser BeautifulSoup guesses (emitting GuessedAtParserWarning) and the
    chosen parser — and hence the parse tree — can differ per environment.
    """
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")
    return soup
### This part fills in all the origin destination locations and saves the soup which will be processed later on
# Accumulates one BeautifulSoup per scraped results page;
# appended to by open_routes(), consumed by initialize_processing().
soups = []
def open_webpages(od_names):
    """Scrape Maersk's point-to-point schedule page for every (origin, destination) pair.

    For each pair the search form is filled in, the results (and a possible
    second results page) are expanded and stored via open_routes(), which
    appends to the module-level `soups` list and saves the raw HTML to disk.
    The webdriver is closed once all pairs are done.

    od_names -- list of (origin_name, destination_name) tuples in the exact
                spelling the Maersk site's location fields accept.
    """
    print(f"Starting to scrape {len(od_names)} harbor combinations.")
    # Open Maersk point to point site
    driver.get("https://www.maersk.com/schedules/pointToPoint")
    time.sleep(3)
    # Click to allow cookies (only needed once, on the first page load)
    driver.find_element(By.XPATH,"//*[@id='coiPage-1']/div[2]/button[3]").click()
    for i in od_names:
        # Open the site again so every pair starts from a fresh search form
        driver.get("https://www.maersk.com/schedules/pointToPoint")
        time.sleep(3)
        # Fill in the origin location
        originloc = driver.find_element(By.ID,'originLocation')
        originloc.send_keys(i[0])
        # A dropdown menu has to be clicked in order to confirm the origin location. This clicks the correct port
        time.sleep(4) # Makes sure that the element is actually clickable
        action = ActionChains(driver)
        action.move_to_element_with_offset(originloc, 0, 50)
        action.click()
        action.perform()
        # Fills in the destination location automatically.
        destinationloc = driver.find_element(By.ID,'destinationLocation')
        destinationloc.send_keys(i[1])
        # A dropdown menu has to be clicked in order to confirm the destination location. This clicks the correct port
        time.sleep(3)
        action = ActionChains(driver)
        action.move_to_element_with_offset(destinationloc, 0, 50)
        action.click()
        action.perform()
        # Click the search button
        search_button = driver.find_element(By.XPATH,'//*[@id="app"]/div[2]/span/form/div[6]/button')
        search_button.click()
        # There are 2 known possibilities that result in not finding routes:
        # 1: There is no route
        # 2: Sometimes the Maersk site gives an error for either origin or destination
        #    even when the names are correctly filled in. The error seems to appear randomly.
        # The existence check below makes sure the code doesn't fail even if a route is not found:
        # if the first 'show route details' button cannot be found, no route has been found.
        time.sleep(5)
        if len(driver.find_elements(By.XPATH,"//*[@id='app']/div[2]/div[1]/div[3]/div/div[4]/button/span")) > 0:
            open_routes(od=i,page=1) # Expand all the show route details buttons
            time.sleep(5)
            if len(driver.find_elements(By.CLASS_NAME,"load-more__text")) > 0: # Check if even more routes have been found than just appearing on the first page
                driver.find_elements(By.CLASS_NAME,"load-more__text")[1].click() # Click to open second page with routes
                # 'Earlier sailings' and 'Later sailings' have the same class. We want to click 'Later sailings' (index 1)
                time.sleep(5) # Make sure that all buttons can open
                if len(driver.find_elements(By.XPATH,"//*[@id='app']/div[2]/div[1]/div[3]/div/div[4]/button/span")) > 0:
                    open_routes(od=i,page=2) # The if statement above is not strictly needed: it re-checks that at least 1 route exists.
                    # That should be the case because we are on the second page of routes. More of a failsafe.
            print(f"Done with {i}")
        else:
            print("No route found for:",i)
    # Closes the webdriver after all pairs have been scraped
    driver.stop_client()
    driver.quit()
# Run the scraper for every origin/destination combination (fills the `soups` list)
open_webpages(od_names)
def process_data_route(route,list_ports,route_data):
    """Extract origin, destination, dates and first-vessel info from one route soup.

    route      -- BeautifulSoup tag of one "ptp-results__transport-plan" div
    list_ports -- ordered list of port names on this route (origin first, destination last)
    route_data -- module-level list collecting one row per route; mutated in place and returned

    For direct routes (2 ports) the complete row is appended here; for routes
    with a transfer (>2 ports) a partial row is appended and
    process_data_transfer() completes it.
    """
    # The origin port is the first port in the list, destination the last
    origin = list_ports[0]
    destination = list_ports[-1]
    # The information about the destination, and therefore the arrival date, can be found in the last box
    info_destination = route.find(class_="ptp-results__transport-plan--item-final")
    arrival_date = info_destination.find(class_="transport-label font--small")
    arrival_date = arrival_date.find_all(class_="font--small")
    arrival_date = arrival_date[1].text
    arrival_date = datetime.strptime(arrival_date,"%d %b %Y %H:%M") # parses e.g. "04 Mar 2023 10:00" into a datetime object
    # Tip: find_all searches for all the elements.
    # find only searches for one element: it stops searching when it finds one.
    # This means that the line below finds only the departure from the origin
    # (the first departure in document order), not any later transfer departures.
    info_departure = route.find(class_="ptp-results__transport-plan--item")
    info_departure_and_ship = info_departure.find(class_="transport-label font--small")
    departure_date = info_departure_and_ship.find(class_="font--small").text
    departure_date = datetime.strptime(departure_date,"%d %b %Y %H:%M") # parses e.g. "04 Mar 2023 10:00" into a datetime object
    transittime = arrival_date - departure_date # timedelta: total origin-to-destination transit time
    departure_date = departure_date.strftime("%Y-%m-%d %H:%M:%S") # back to a string like "2023-03-04 10:00:00"
    arrival_date = arrival_date.strftime("%Y-%m-%d %H:%M:%S") # back to a string like "2023-03-04 10:00:00"
    # Make an empty list for all used vessels. If only 1 vessel is used only 1 item will be in this list
    vessels = []
    # The following code only works for the first vessel that is being stored.
    # Either of 2 things can occur: ' Departing on [shipname]' or ' Transport via barge '.
    # If a ship is used, the shipname will be stored;
    # if a barge is used, 'barge' will be stored (literally).
    vessel_name = info_departure_and_ship.find(class_="rich-text").text
    if vessel_name[:13] != ' Departing on': # If false: vessel_name probably starts with ' Transport via barge'
        vessel_name = vessel_name.removeprefix(' Transport via ')
        vessel_name = vessel_name.removesuffix(' ')
    else: # If a ship is used
        # The vessel name is initially given as e.g. "Departing on CAP SAN LORENZO / 249S".
        # The prefix strip plus the split below keep only the "CAP SAN LORENZO" part.
        vessel_name = vessel_name.removeprefix(' Departing on ')
    if vessel_name == '':
        vessel_name = 'unknown'
    if vessel_name != 'unknown':
        # Drop the trailing "/ <voyage number>" tokens, keeping only the ship name
        vessel_name = vessel_name.split()
        if len(vessel_name) >= 2 and "/" in vessel_name:
            vessel_name.remove("/")
            vessel_name.pop(-1)
        vessel_name = ' '.join(vessel_name)
    vessel_info = info_departure.find(class_="vessel")
    if vessel_info is not None:
        # Vessel detail box is present: read IMO number, service, flag, call sign and build year
        imo = vessel_info.find(class_="imo").text
        imo = imo.removeprefix('IMO Number')
        service = vessel_info.find(class_="service").text
        service = service.removeprefix('Service')
        flag = vessel_info.find(class_="flag").text
        flag = flag.removeprefix('Flag')
        callsign = vessel_info.find(class_="callsign").text
        callsign = callsign.removeprefix('Call Sign')
        built_year_ship = vessel_info.find(class_="built").text
        built_year_ship = built_year_ship.removeprefix('Built')
        # Store the information about the first used vessel as a dict.
        # If other vessels are also used, these will also be stored as dicts (in process_data_transfer).
        vessels.append({'vessel_name': vessel_name,'imo': imo,'flag': flag,'build_year_ship' : built_year_ship,'service': service,'callsign': callsign})
        # The site shows '-' for unknown detail fields; normalise those to empty strings
        for i in range(len(vessels)):
            for key, value in vessels[i].items():
                if vessels[i][key] == '-':
                    vessels[i][key] = ''
    else:
        # No vessel detail box: store the name with empty detail fields
        imo = ''
        flag = ''
        built_year_ship = ''
        service = ''
        callsign = ''
        vessels.append({'vessel_name': vessel_name,'imo': imo,'flag': flag,'build_year_ship' : built_year_ship,'service': service,'callsign': callsign})
    if len(list_ports)>2: # If there is a transfer, store data and also run process_data_transfer
        route_data.append([origin,destination,departure_date,arrival_date,transittime])
        process_data_transfer(route,list_ports,route_data,vessels)
    else:
        # Adding the information about the (single) leg in a dictionary.
        legs = {}
        legs['1'] = {'OriginName': origin, 'DestinationName': destination,'Vessel': vessels[0],
                     'EstimatedDepartureTime': departure_date, 'EstimatedArrivalTime': arrival_date}
        # Just store the route_data
        route_data.append([origin,destination,departure_date,arrival_date,transittime,[origin,destination],vessels,[departure_date,arrival_date],legs])
    return route_data
def process_data_transfer(route,list_ports,route_data,vessels):
    """Complete the last route_data row with transfer-port and extra-vessel data.

    Called by process_data_route() for routes with more than 2 ports. Walks the
    alternating port/ship boxes of the route, collects each transfer's arrival
    and departure times plus the vessel used on each subsequent leg, then
    appends the port list, vessel list, date list and per-leg dicts to the
    partial row that process_data_route() already stored in route_data[-1].

    route      -- BeautifulSoup tag of one "ptp-results__transport-plan" div
    list_ports -- ordered list of all port names on the route
    route_data -- module-level result list; its last row is extended in place
    vessels    -- list holding the first leg's vessel dict; extended here
    """
    transfer_arrival_departure =[]
    list_transfer_ports_and_ships = route.find_all(class_="ptp-results__transport-plan--item")
    for i in range(1,len(list_transfer_ports_and_ships)):
        # Item 1 is a port, 2 a ship, 3 a port and so on.
        # The following if statement makes sure that data of a port is actually read as a port.
        # Important note: the origin itself and its vessel are not read; they are both in the
        # same ptp-results__transport-plan--item (index 0). The rest of the vessels and ports
        # are in separate ptp-results__transport-plan--item elements.
        # The destination is also not read because it is in ptp-results__transport-plan--item-final
        # instead of ptp-results__transport-plan--item.
        if (i % 2) == 1:
            transfer_port = list_transfer_ports_and_ships[i]
            info_arrival = transfer_port.find(class_="transport-label font--small")
            arrival_date = info_arrival.find_all(class_="font--small")[1].text
            arrival_date = datetime.strptime(arrival_date,"%d %b %Y %H:%M") # parses e.g. "04 Mar 2023 10:00" into a datetime object
            arrival_date = arrival_date.strftime("%Y-%m-%d %H:%M:%S") # back to a string like "2023-03-04 10:00:00"
            transfer_arrival_departure.append(arrival_date)
            # The element right after a transfer port is the ship departing from it
            transfer_ship = list_transfer_ports_and_ships[i+1]
            info_departure = transfer_ship.find(class_="transport-label font--small")
            departure_date = info_departure.find(class_="font--small").text
            departure_date = datetime.strptime(departure_date,"%d %b %Y %H:%M") # parses e.g. "04 Mar 2023 10:00" into a datetime object
            departure_date = departure_date.strftime("%Y-%m-%d %H:%M:%S") # back to a string like "2023-03-04 10:00:00"
            transfer_arrival_departure.append(departure_date)
            # Similar as for 1 ship; read the description in process_data_route if unclear
            vessel_name = info_departure.find(class_="rich-text").text
            if vessel_name[:13] != ' Departing on':
                vessel_name = vessel_name.removeprefix(' Transport via ')
                vessel_name = vessel_name.removesuffix(' ')
            else:
                vessel_name = vessel_name.removeprefix(' Departing on ')
            if vessel_name == '':
                vessel_name = 'unknown'
            if vessel_name != 'unknown':
                # Drop the trailing "/ <voyage number>" tokens, keeping only the ship name
                vessel_name = vessel_name.split()
                if len(vessel_name) >= 2 and "/" in vessel_name:
                    vessel_name.remove("/")
                    vessel_name.pop(-1)
                vessel_name = ' '.join(vessel_name)
            vessel_info = transfer_ship.find(class_="vessel")
            if vessel_info is not None:
                imo = vessel_info.find(class_="imo").text
                imo = imo.removeprefix('IMO Number')
                service = vessel_info.find(class_="service").text
                service = service.removeprefix('Service')
                flag = vessel_info.find(class_="flag").text
                flag = flag.removeprefix('Flag')
                callsign = vessel_info.find(class_="callsign").text
                callsign = callsign.removeprefix('Call Sign')
                built_year_ship = vessel_info.find(class_="built").text
                built_year_ship = built_year_ship.removeprefix('Built')
                vessels.append({'vessel_name': vessel_name,'imo': imo,'flag': flag,'build_year_ship' : built_year_ship,'service': service,'callsign': callsign})
                # Normalise '-' placeholders to empty strings.
                # NOTE(review): this inner loop reuses `i` as its variable; harmless in Python
                # because the outer `for i in range(...)` rebinds `i` each iteration, but fragile.
                for i in range(len(vessels)):
                    for key, value in vessels[i].items():
                        if vessels[i][key] == '-':
                            vessels[i][key] = ''
            else:
                # No vessel detail box: store the name with empty detail fields
                imo = ''
                flag = ''
                built_year_ship = ''
                service = ''
                callsign = ''
                vessels.append({'vessel_name': vessel_name,'imo': imo,'flag': flag,'build_year_ship' : built_year_ship,'service': service,'callsign': callsign})
    # This part is quite complicated:
    # the data on the origin, destination and first vessel were already stored in route_data
    # in process_data_route. We alter that row by adding the information about the transfer
    # ports and vessels.
    # We first copy the route's departure date that was already stored in process_data_route
    arrival_departure = []
    arrival_departure.append(route_data[-1][2])
    # Then store all the transfer arrival and departure dates
    for i in transfer_arrival_departure:
        arrival_departure.append(i)
    # Last, store the arrival date for the whole route
    arrival_departure.append(route_data[-1][3])
    # Store the other transfer data in route_data
    route_data[-1].append(list_ports)
    route_data[-1].append(vessels)
    # Store the data on all departure and arrival dates (including transfers) in the route_data
    route_data[-1].append(arrival_departure)
    # Adding the information about each leg in a dictionary:
    # leg k runs from port k-1 to port k; arrival_departure holds [dep0, arr1, dep1, arr2, ...]
    legs = {}
    for leg in range(len(list_ports)-1):
        legs[f'{leg+1}'] = {'OriginName': list_ports[leg], 'DestinationName': list_ports[leg+1],'Vessel': vessels[leg],
                            'EstimatedDepartureTime': arrival_departure[leg*2], 'EstimatedArrivalTime': arrival_departure[leg*2+1]}
    route_data[-1].append(legs)
    return route_data
### Process_data_route, process_data_transfer and initialize_processing
### all process the soups into usable data.
### First initialize_processing selects a soup and prepares it for processing.
### Then process_data_route will process the information about the origin, destination, arrival date and departure date and the first vessel.
### Last, process_data_transfer will be used if a transfer takes place.
### A transfer means that the container is moved from one vessel to another and continues the journey.
# Make a list in which the data of all routes can be stored (one row-list per route)
route_data = []
def initialize_processing(soups):
    """Turn every stored page soup into rows of the module-level route_data.

    Each soup is scanned for "ptp-results__transport-plan" divs (one per
    route); for each route the visible ports are collected — the Maersk site
    only shows origin, destination and transfer ports, where a transfer is a
    port at which the container is moved to a different ship — and handed to
    process_data_route().
    """
    for soup in soups:
        # Route data is already grouped per "transport-plan" div
        for route in soup.find_all("div", class_="ptp-results__transport-plan"):
            stops = []
            for location in route.find_all("div", class_="location"):
                city = location.find("div", class_="font--default--bold").text
                terminal = location.find("div", class_="font--small").text
                stops.append(city + ' ' + terminal)
            process_data_route(route, stops, route_data)
# Process every stored soup into rows of route_data
initialize_processing(soups)
# This turns the processed data into a Pandas dataframe
columns = ["OriginName","DestinationName","EstimatedDepartureTime","EstimatedArrivalTime","EstimatedTotalTransitTimeDays","Ports","Vessels","Dates","Legs"]
connection_df = pd.DataFrame(route_data, columns=columns)
connection_df["Origin"] = connection_df["OriginName"]
connection_df["Destination"] = connection_df["DestinationName"]
# Reuse the date computed at startup so a run crossing midnight stays consistent with the output file names
connection_df["ScrapingDate"] = today
connection_df["ScrapingSite"] = "Maersk"
# Columns kept empty for schema compatibility with the other scrapers
connection_df["EstimatedTotalTransitTimeHours"] = ""
connection_df["TotalCO2EmissionsKg"] = ""
connection_df["TotalDistanceMeters"] = ""
connection_df["CutOffs"] = ""
connection_df["NumberOfLegs"] = ""
# Round the transit-time timedelta column to whole days. Timedelta series are rounded
# through the .dt accessor; plain Series.round() only accepts a number of decimals
# and fails on timedelta data.
connection_df["EstimatedTotalTransitTimeDays"] = connection_df.EstimatedTotalTransitTimeDays.dt.round('d')
# Changing the order of the Dataframe. Makes analysing the dataframe by hand easier. Has no effect on actual dataframe operations
v2_connection_df = connection_df[['ScrapingDate','ScrapingSite','Origin','Destination','OriginName','DestinationName','EstimatedDepartureTime','EstimatedArrivalTime','EstimatedTotalTransitTimeDays','EstimatedTotalTransitTimeHours','TotalCO2EmissionsKg','TotalDistanceMeters','CutOffs','NumberOfLegs','Legs']]
# Store as both pickle and CSV
v2_connection_df.to_pickle(f"../pickles/maersk_daily/pickles_before_merge/connections_{today}.pickle")
v2_connection_df.to_csv(f"../data/maersk_daily/csv_runs/connections_{today}.csv")