-
Notifications
You must be signed in to change notification settings - Fork 5
/
maersk.py
424 lines (337 loc) · 21.4 KB
/
maersk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import itertools
from datetime import date
from datetime import datetime
import os
# Some definitions used in explanation of the code:
# Connection: all possibilities from the origin port to the destination port
# Route: a specific possibility from origin to destination on a specific departure date
# Transfer: within a route, the container can switch from one vessel to the other and continue the journey

# Sets up the options of the chromedriver
opts = Options()
opts.add_argument("window-size=1280,720") # Locks the window size !!Don't change!! (the XPaths below depend on this layout)
opts.add_argument("user-agent=Chrome/106.0.5249.119") # Prevents sites from blocking traffic
headless = True
if headless: # If True, run Chrome in the background without opening a window
    opts.headless = True
# NOTE(review): `Options.headless` was removed in recent Selenium releases in favour of
# opts.add_argument("--headless=new") — confirm against the pinned selenium version.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
#!!! Instructions on port selection start
# The same ports and methods as for scraping_routscanner_v2 were used.
# origin = ["BR", "CO", "VE", "SR", "CW", "GY", "GF", "UY", "AR", "CL", "PE", "EC", "VN", "PY", "GY", "KH"]
# destination = ["NL", "BE"]
# The UN-LOCODES were picked from the following CSV: (contains country codes
# country_df = pd.read_csv("../utils/country-codes.csv")
# Furthermore the ports in South-America, Vietnam and Benelux were selected
# with open('../pickles/msc_country_port_codes.pickle', 'rb') as handle:
#     country_port_codes = pickle.load(handle)
# However unlike scraping_routescanner_v2, the site of Maersk doesn't accept port-codes such as NLRTM.
# Therefore, the correct port names that work on the Maersk site were selected by hand,
# using both the port name according to the list above and the latitude and longitude.
#!!! Instructions on port selection end

# Load the hand-made UN-LOCODE -> Maersk port name conversion table and build
# every (origin, destination) pair to scrape.
conversion = pd.read_csv(r'../utils/maersk_un_locodes_conversion.csv', sep=';')
role = conversion['origin/destination']
o_names = conversion.loc[role == 'origin'].Maersk_name
d_names = conversion.loc[role == 'destination'].Maersk_name
od_names = [(o, d) for o in o_names for d in d_names]
# Puerto seguro flavial has been moved to villeta. This place seemed more logical according to lat and long.
# Terport villeta paraguay had no latitude or longitude to check,
# but luckily there was only one port called Terport in Maersk.
today = date.today()
def open_routes(od, page):
    """Expand every 'show route details' button on the current results page,
    then soup the page and save its HTML to disk.

    od   -- the (origin, destination) tuple currently being scraped (used for the file name)
    page -- results page number (1 or 2), used for the file name

    The buttons live in div[3]..div[7] of the results container; the first one
    (div[3]) is clicked unconditionally because the caller already verified it
    exists before invoking this function. The remaining ones are clicked only
    if present (fewer than five routes may be shown).
    """
    button_xpath = "//*[@id='app']/div[2]/div[1]/div[{}]/div/div[4]/button/span"
    driver.find_element(By.XPATH, button_xpath.format(3)).click()
    for idx in range(4, 8):
        xpath = button_xpath.format(idx)
        if len(driver.find_elements(By.XPATH, xpath)) > 0:
            driver.find_element(By.XPATH, xpath).click()
    soups.append(soup_page())  # Soup the page with all route details expanded
    save_html_page(od, page)
def save_html_page(od, page):
    """Write the current page's HTML to ../data/maersk_daily/html_runs/<today>/.

    od   -- the (origin, destination) tuple, embedded in the file name
    page -- results page number, embedded in the file name
    """
    out_dir = f'../data/maersk_daily/html_runs/{today}'
    # exist_ok=True replaces the separate exists() check: no race between check and create
    os.makedirs(out_dir, exist_ok=True)
    with open(f'{out_dir}/{od}_{page}_{today}.html', "w", encoding="utf-8") as file:
        file.write(str(soup_page()))
def soup_page():
    """Return a BeautifulSoup of the webdriver's current page source.

    The parser is pinned to the stdlib "html.parser": without an explicit
    parser BeautifulSoup guesses (emitting GuessedAtParserWarning) and the
    chosen parser — and hence the parse tree — can differ per environment.
    """
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")
    return soup
### This part fills in all the origin destination locations and saves the soup which will be processed later on
# Accumulates one BeautifulSoup per scraped results page;
# appended to by open_routes(), consumed by initialize_processing().
soups = []
def open_webpages(od_names):
    """Scrape Maersk's point-to-point schedule page for every (origin, destination) pair.

    For each pair the search form is filled in, the results (and a possible
    second results page) are expanded and stored via open_routes(), which
    appends to the module-level `soups` list and saves the raw HTML to disk.
    The webdriver is closed once all pairs are done.

    od_names -- list of (origin_name, destination_name) tuples in the exact
                spelling the Maersk site's location fields accept.
    """
    print(f"Starting to scrape {len(od_names)} harbor combinations.")
    # Open Maersk point to point site
    driver.get("https://www.maersk.com/schedules/pointToPoint")
    time.sleep(3)
    # Click to allow cookies (only needed once, on the first page load)
    driver.find_element(By.XPATH,"//*[@id='coiPage-1']/div[2]/button[3]").click()
    for i in od_names:
        # Open the site again so every pair starts from a fresh search form
        driver.get("https://www.maersk.com/schedules/pointToPoint")
        time.sleep(3)
        # Fill in the origin location
        originloc = driver.find_element(By.ID,'originLocation')
        originloc.send_keys(i[0])
        # A dropdown menu has to be clicked in order to confirm the origin location. This clicks the correct port
        time.sleep(4) # Makes sure that the element is actually clickable
        action = ActionChains(driver)
        action.move_to_element_with_offset(originloc, 0, 50)
        action.click()
        action.perform()
        # Fills in the destination location automatically.
        destinationloc = driver.find_element(By.ID,'destinationLocation')
        destinationloc.send_keys(i[1])
        # A dropdown menu has to be clicked in order to confirm the destination location. This clicks the correct port
        time.sleep(3)
        action = ActionChains(driver)
        action.move_to_element_with_offset(destinationloc, 0, 50)
        action.click()
        action.perform()
        # Click the search button
        search_button = driver.find_element(By.XPATH,'//*[@id="app"]/div[2]/span/form/div[6]/button')
        search_button.click()
        # There are 2 known possibilities that result in not finding routes:
        # 1: There is no route
        # 2: Sometimes the Maersk site gives an error for either origin or destination
        #    even when the names are correctly filled in. The error seems to appear randomly.
        # The existence check below makes sure the code doesn't fail even if a route is not found:
        # if the first 'show route details' button cannot be found, no route has been found.
        time.sleep(5)
        if len(driver.find_elements(By.XPATH,"//*[@id='app']/div[2]/div[1]/div[3]/div/div[4]/button/span")) > 0:
            open_routes(od=i,page=1) # Expand all the show route details buttons
            time.sleep(5)
            if len(driver.find_elements(By.CLASS_NAME,"load-more__text")) > 0: # Check if even more routes have been found than just appearing on the first page
                driver.find_elements(By.CLASS_NAME,"load-more__text")[1].click() # Click to open second page with routes
                # 'Earlier sailings' and 'Later sailings' have the same class. We want to click 'Later sailings' (index 1)
                time.sleep(5) # Make sure that all buttons can open
                if len(driver.find_elements(By.XPATH,"//*[@id='app']/div[2]/div[1]/div[3]/div/div[4]/button/span")) > 0:
                    open_routes(od=i,page=2) # The if statement above is not strictly needed: it re-checks that at least 1 route exists.
                    # That should be the case because we are on the second page of routes. More of a failsafe.
            print(f"Done with {i}")
        else:
            print("No route found for:",i)
    # Closes the webdriver after all pairs have been scraped
    driver.stop_client()
    driver.quit()
# Run the scraper for every origin/destination combination (fills the `soups` list)
open_webpages(od_names)
def process_data_route(route,list_ports,route_data):
    """Extract origin, destination, dates and first-vessel info from one route soup.

    route      -- BeautifulSoup tag of one "ptp-results__transport-plan" div
    list_ports -- ordered list of port names on this route (origin first, destination last)
    route_data -- module-level list collecting one row per route; mutated in place and returned

    For direct routes (2 ports) the complete row is appended here; for routes
    with a transfer (>2 ports) a partial row is appended and
    process_data_transfer() completes it.
    """
    # The origin port is the first port in the list, destination the last
    origin = list_ports[0]
    destination = list_ports[-1]
    # The information about the destination, and therefore the arrival date, can be found in the last box
    info_destination = route.find(class_="ptp-results__transport-plan--item-final")
    arrival_date = info_destination.find(class_="transport-label font--small")
    arrival_date = arrival_date.find_all(class_="font--small")
    arrival_date = arrival_date[1].text
    arrival_date = datetime.strptime(arrival_date,"%d %b %Y %H:%M") # parses e.g. "04 Mar 2023 10:00" into a datetime object
    # Tip: find_all searches for all the elements.
    # find only searches for one element: it stops searching when it finds one.
    # This means that the line below finds only the departure from the origin
    # (the first departure in document order), not any later transfer departures.
    info_departure = route.find(class_="ptp-results__transport-plan--item")
    info_departure_and_ship = info_departure.find(class_="transport-label font--small")
    departure_date = info_departure_and_ship.find(class_="font--small").text
    departure_date = datetime.strptime(departure_date,"%d %b %Y %H:%M") # parses e.g. "04 Mar 2023 10:00" into a datetime object
    transittime = arrival_date - departure_date # timedelta: total origin-to-destination transit time
    departure_date = departure_date.strftime("%Y-%m-%d %H:%M:%S") # back to a string like "2023-03-04 10:00:00"
    arrival_date = arrival_date.strftime("%Y-%m-%d %H:%M:%S") # back to a string like "2023-03-04 10:00:00"
    # Make an empty list for all used vessels. If only 1 vessel is used only 1 item will be in this list
    vessels = []
    # The following code only works for the first vessel that is being stored.
    # Either of 2 things can occur: ' Departing on [shipname]' or ' Transport via barge '.
    # If a ship is used, the shipname will be stored;
    # if a barge is used, 'barge' will be stored (literally).
    vessel_name = info_departure_and_ship.find(class_="rich-text").text
    if vessel_name[:13] != ' Departing on': # If false: vessel_name probably starts with ' Transport via barge'
        vessel_name = vessel_name.removeprefix(' Transport via ')
        vessel_name = vessel_name.removesuffix(' ')
    else: # If a ship is used
        # The vessel name is initially given as e.g. "Departing on CAP SAN LORENZO / 249S".
        # The prefix strip plus the split below keep only the "CAP SAN LORENZO" part.
        vessel_name = vessel_name.removeprefix(' Departing on ')
    if vessel_name == '':
        vessel_name = 'unknown'
    if vessel_name != 'unknown':
        # Drop the trailing "/ <voyage number>" tokens, keeping only the ship name
        vessel_name = vessel_name.split()
        if len(vessel_name) >= 2 and "/" in vessel_name:
            vessel_name.remove("/")
            vessel_name.pop(-1)
        vessel_name = ' '.join(vessel_name)
    vessel_info = info_departure.find(class_="vessel")
    if vessel_info is not None:
        # Vessel detail box is present: read IMO number, service, flag, call sign and build year
        imo = vessel_info.find(class_="imo").text
        imo = imo.removeprefix('IMO Number')
        service = vessel_info.find(class_="service").text
        service = service.removeprefix('Service')
        flag = vessel_info.find(class_="flag").text
        flag = flag.removeprefix('Flag')
        callsign = vessel_info.find(class_="callsign").text
        callsign = callsign.removeprefix('Call Sign')
        built_year_ship = vessel_info.find(class_="built").text
        built_year_ship = built_year_ship.removeprefix('Built')
        # Store the information about the first used vessel as a dict.
        # If other vessels are also used, these will also be stored as dicts (in process_data_transfer).
        vessels.append({'vessel_name': vessel_name,'imo': imo,'flag': flag,'build_year_ship' : built_year_ship,'service': service,'callsign': callsign})
        # The site shows '-' for unknown detail fields; normalise those to empty strings
        for i in range(len(vessels)):
            for key, value in vessels[i].items():
                if vessels[i][key] == '-':
                    vessels[i][key] = ''
    else:
        # No vessel detail box: store the name with empty detail fields
        imo = ''
        flag = ''
        built_year_ship = ''
        service = ''
        callsign = ''
        vessels.append({'vessel_name': vessel_name,'imo': imo,'flag': flag,'build_year_ship' : built_year_ship,'service': service,'callsign': callsign})
    if len(list_ports)>2: # If there is a transfer, store data and also run process_data_transfer
        route_data.append([origin,destination,departure_date,arrival_date,transittime])
        process_data_transfer(route,list_ports,route_data,vessels)
    else:
        # Adding the information about the (single) leg in a dictionary.
        legs = {}
        legs['1'] = {'OriginName': origin, 'DestinationName': destination,'Vessel': vessels[0],
                     'EstimatedDepartureTime': departure_date, 'EstimatedArrivalTime': arrival_date}
        # Just store the route_data
        route_data.append([origin,destination,departure_date,arrival_date,transittime,[origin,destination],vessels,[departure_date,arrival_date],legs])
    return route_data
def process_data_transfer(route,list_ports,route_data,vessels):
    """Complete the last route_data row with transfer-port and extra-vessel data.

    Called by process_data_route() for routes with more than 2 ports. Walks the
    alternating port/ship boxes of the route, collects each transfer's arrival
    and departure times plus the vessel used on each subsequent leg, then
    appends the port list, vessel list, date list and per-leg dicts to the
    partial row that process_data_route() already stored in route_data[-1].

    route      -- BeautifulSoup tag of one "ptp-results__transport-plan" div
    list_ports -- ordered list of all port names on the route
    route_data -- module-level result list; its last row is extended in place
    vessels    -- list holding the first leg's vessel dict; extended here
    """
    transfer_arrival_departure =[]
    list_transfer_ports_and_ships = route.find_all(class_="ptp-results__transport-plan--item")
    for i in range(1,len(list_transfer_ports_and_ships)):
        # Item 1 is a port, 2 a ship, 3 a port and so on.
        # The following if statement makes sure that data of a port is actually read as a port.
        # Important note: the origin itself and its vessel are not read; they are both in the
        # same ptp-results__transport-plan--item (index 0). The rest of the vessels and ports
        # are in separate ptp-results__transport-plan--item elements.
        # The destination is also not read because it is in ptp-results__transport-plan--item-final
        # instead of ptp-results__transport-plan--item.
        if (i % 2) == 1:
            transfer_port = list_transfer_ports_and_ships[i]
            info_arrival = transfer_port.find(class_="transport-label font--small")
            arrival_date = info_arrival.find_all(class_="font--small")[1].text
            arrival_date = datetime.strptime(arrival_date,"%d %b %Y %H:%M") # parses e.g. "04 Mar 2023 10:00" into a datetime object
            arrival_date = arrival_date.strftime("%Y-%m-%d %H:%M:%S") # back to a string like "2023-03-04 10:00:00"
            transfer_arrival_departure.append(arrival_date)
            # The element right after a transfer port is the ship departing from it
            transfer_ship = list_transfer_ports_and_ships[i+1]
            info_departure = transfer_ship.find(class_="transport-label font--small")
            departure_date = info_departure.find(class_="font--small").text
            departure_date = datetime.strptime(departure_date,"%d %b %Y %H:%M") # parses e.g. "04 Mar 2023 10:00" into a datetime object
            departure_date = departure_date.strftime("%Y-%m-%d %H:%M:%S") # back to a string like "2023-03-04 10:00:00"
            transfer_arrival_departure.append(departure_date)
            # Similar as for 1 ship; read the description in process_data_route if unclear
            vessel_name = info_departure.find(class_="rich-text").text
            if vessel_name[:13] != ' Departing on':
                vessel_name = vessel_name.removeprefix(' Transport via ')
                vessel_name = vessel_name.removesuffix(' ')
            else:
                vessel_name = vessel_name.removeprefix(' Departing on ')
            if vessel_name == '':
                vessel_name = 'unknown'
            if vessel_name != 'unknown':
                # Drop the trailing "/ <voyage number>" tokens, keeping only the ship name
                vessel_name = vessel_name.split()
                if len(vessel_name) >= 2 and "/" in vessel_name:
                    vessel_name.remove("/")
                    vessel_name.pop(-1)
                vessel_name = ' '.join(vessel_name)
            vessel_info = transfer_ship.find(class_="vessel")
            if vessel_info is not None:
                imo = vessel_info.find(class_="imo").text
                imo = imo.removeprefix('IMO Number')
                service = vessel_info.find(class_="service").text
                service = service.removeprefix('Service')
                flag = vessel_info.find(class_="flag").text
                flag = flag.removeprefix('Flag')
                callsign = vessel_info.find(class_="callsign").text
                callsign = callsign.removeprefix('Call Sign')
                built_year_ship = vessel_info.find(class_="built").text
                built_year_ship = built_year_ship.removeprefix('Built')
                vessels.append({'vessel_name': vessel_name,'imo': imo,'flag': flag,'build_year_ship' : built_year_ship,'service': service,'callsign': callsign})
                # Normalise '-' placeholders to empty strings.
                # NOTE(review): this inner loop reuses `i` as its variable; harmless in Python
                # because the outer `for i in range(...)` rebinds `i` each iteration, but fragile.
                for i in range(len(vessels)):
                    for key, value in vessels[i].items():
                        if vessels[i][key] == '-':
                            vessels[i][key] = ''
            else:
                # No vessel detail box: store the name with empty detail fields
                imo = ''
                flag = ''
                built_year_ship = ''
                service = ''
                callsign = ''
                vessels.append({'vessel_name': vessel_name,'imo': imo,'flag': flag,'build_year_ship' : built_year_ship,'service': service,'callsign': callsign})
    # This part is quite complicated:
    # the data on the origin, destination and first vessel were already stored in route_data
    # in process_data_route. We alter that row by adding the information about the transfer
    # ports and vessels.
    # We first copy the route's departure date that was already stored in process_data_route
    arrival_departure = []
    arrival_departure.append(route_data[-1][2])
    # Then store all the transfer arrival and departure dates
    for i in transfer_arrival_departure:
        arrival_departure.append(i)
    # Last, store the arrival date for the whole route
    arrival_departure.append(route_data[-1][3])
    # Store the other transfer data in route_data
    route_data[-1].append(list_ports)
    route_data[-1].append(vessels)
    # Store the data on all departure and arrival dates (including transfers) in the route_data
    route_data[-1].append(arrival_departure)
    # Adding the information about each leg in a dictionary:
    # leg k runs from port k-1 to port k; arrival_departure holds [dep0, arr1, dep1, arr2, ...]
    legs = {}
    for leg in range(len(list_ports)-1):
        legs[f'{leg+1}'] = {'OriginName': list_ports[leg], 'DestinationName': list_ports[leg+1],'Vessel': vessels[leg],
                            'EstimatedDepartureTime': arrival_departure[leg*2], 'EstimatedArrivalTime': arrival_departure[leg*2+1]}
    route_data[-1].append(legs)
    return route_data
### Process_data_route, process_data_transfer and initialize_processing
### all process the soups into usable data.
### First initialize_processing selects a soup and prepares it for processing.
### Then process_data_route will process the information about the origin, destination, arrival date and departure date and the first vessel.
### Last, process_data_transfer will be used if a transfer takes place.
### A transfer means that the container is moved from one vessel to another and continues the journey.
# Make a list in which the data of all routes can be stored (one row-list per route)
route_data = []
def initialize_processing(soups):
    """Turn every stored page soup into rows of the module-level route_data.

    Each soup is scanned for "ptp-results__transport-plan" divs (one per
    route); for each route the visible ports are collected — the Maersk site
    only shows origin, destination and transfer ports, where a transfer is a
    port at which the container is moved to a different ship — and handed to
    process_data_route().
    """
    for soup in soups:
        # Route data is already grouped per "transport-plan" div
        for route in soup.find_all("div", class_="ptp-results__transport-plan"):
            stops = []
            for location in route.find_all("div", class_="location"):
                city = location.find("div", class_="font--default--bold").text
                terminal = location.find("div", class_="font--small").text
                stops.append(city + ' ' + terminal)
            process_data_route(route, stops, route_data)
# Process every stored soup into rows of route_data
initialize_processing(soups)
# This turns the processed data into a Pandas dataframe
columns = ["OriginName","DestinationName","EstimatedDepartureTime","EstimatedArrivalTime","EstimatedTotalTransitTimeDays","Ports","Vessels","Dates","Legs"]
connection_df = pd.DataFrame(route_data, columns=columns)
connection_df["Origin"] = connection_df["OriginName"]
connection_df["Destination"] = connection_df["DestinationName"]
# Reuse the date computed at startup so a run crossing midnight stays consistent with the output file names
connection_df["ScrapingDate"] = today
connection_df["ScrapingSite"] = "Maersk"
# Columns kept empty for schema compatibility with the other scrapers
connection_df["EstimatedTotalTransitTimeHours"] = ""
connection_df["TotalCO2EmissionsKg"] = ""
connection_df["TotalDistanceMeters"] = ""
connection_df["CutOffs"] = ""
connection_df["NumberOfLegs"] = ""
# Round the transit-time timedelta column to whole days. Timedelta series are rounded
# through the .dt accessor; plain Series.round() only accepts a number of decimals
# and fails on timedelta data.
connection_df["EstimatedTotalTransitTimeDays"] = connection_df.EstimatedTotalTransitTimeDays.dt.round('d')
# Changing the order of the Dataframe. Makes analysing the dataframe by hand easier. Has no effect on actual dataframe operations
v2_connection_df = connection_df[['ScrapingDate','ScrapingSite','Origin','Destination','OriginName','DestinationName','EstimatedDepartureTime','EstimatedArrivalTime','EstimatedTotalTransitTimeDays','EstimatedTotalTransitTimeHours','TotalCO2EmissionsKg','TotalDistanceMeters','CutOffs','NumberOfLegs','Legs']]
# Store as both pickle and CSV
v2_connection_df.to_pickle(f"../pickles/maersk_daily/pickles_before_merge/connections_{today}.pickle")
v2_connection_df.to_csv(f"../data/maersk_daily/csv_runs/connections_{today}.csv")