-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLawrence.py
59 lines (53 loc) · 2.34 KB
/
Lawrence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
def get_url_ug():
    """Print the program-page links found on the college's programme listing pages.

    Opens a Firefox WebDriver session, visits each of the five paginated
    listing URLs, and prints every anchor ``href`` that starts with the
    programme prefix but not with the "areas-of-study" prefix.

    Returns:
        None. The matching links are written to standard output, one per line.
    """
    listing_pages = [
        'https://www.stlawrencecollege.ca/programs?intl=true',
        'https://www.stlawrencecollege.ca/programs?p=2&intl=true',
        'https://www.stlawrencecollege.ca/programs?p=3&intl=true',
        'https://www.stlawrencecollege.ca/programs?p=4&intl=true',
        'https://www.stlawrencecollege.ca/programs?p=5&intl=true',
    ]
    # These prefixes do not change between pages, so define them once.
    keywords = "https://www.stlawrencecollege.ca/programs/"
    not_key = "https://www.stlawrencecollege.ca/programs/areas-of-study/"

    driver = webdriver.Firefox()
    try:
        for url in listing_pages:
            driver.get(url)
            # Fixed pause after navigation before reading the anchors;
            # presumably gives client-side rendering time to finish - TODO confirm.
            time.sleep(5)
            for anchor in driver.find_elements(By.TAG_NAME, 'a'):
                href = anchor.get_attribute('href')
                # href may be None for anchors without a link target.
                if href and href.startswith(keywords) and not href.startswith(not_key):
                    print(href)
    finally:
        # Always end the browser session, even if a page load or element
        # lookup raised; quit() shuts down the driver process as well as
        # the window, whereas close() only closes the current window.
        driver.quit()
def get_details():
    """Scrape programme details for every URL in ``url.txt`` into ``lawrence.xlsx``.

    Reads one URL per line from ``url.txt``, downloads each page, and takes
    the first ``<h1>`` as the programme name plus the first six
    ``<span class="value">`` elements as its attributes. Pages that lack an
    ``<h1>`` or have fewer than six value spans are reported and skipped, as
    are pages whose download or parsing raises. The collected rows are
    written to ``lawrence.xlsx`` and the resulting DataFrame is printed.

    Returns:
        None.

    Raises:
        OSError: if ``url.txt`` cannot be opened.
    """
    with open("url.txt", "r") as file:
        # Drop blank lines (e.g. a trailing newline at the end of the file)
        # so they do not turn into requests for an empty URL.
        urls = [line.strip() for line in file if line.strip()]

    data = []
    for url in urls:
        # Best-effort per URL: one failing page is reported and skipped so
        # the rows gathered so far still reach the spreadsheet.
        try:
            # Without a timeout a single unresponsive server would block
            # the whole run indefinitely.
            req = requests.get(url, timeout=30)
            req.raise_for_status()  # Raises an error for bad responses (4xx or 5xx)
            soup = BeautifulSoup(req.text, 'html.parser')
            name = soup.find_all('h1')
            details = soup.find_all("span", class_='value')
            # Check if the expected elements exist
            if name and len(details) > 5:
                # The index-to-field mapping below relies on the order of the
                # value spans on the page - assumed stable across programmes.
                data.append({
                    'URL': url,
                    'Name': name[0].get_text(),
                    'award': details[2].get_text(),
                    'code': details[0].get_text(),
                    'start': details[1].get_text(),
                    'location': details[3].get_text(),
                    'duration': details[4].get_text(),
                    'mode': details[5].get_text()
                })
            else:
                print(f"Missing data for URL: {url}")
        except Exception as e:
            print(f"Error processing {url}: {e}")

    df = pd.DataFrame(data)
    df.to_excel('lawrence.xlsx')
    print(df)
# Module-level call: the detail scrape runs whenever this file is executed
# or imported. get_url_ug() is defined above but is not invoked here.
get_details()