Youngstown.py
import pandas as pd
from bs4 import BeautifulSoup
import requests
from lxml import html
from concurrent.futures import ThreadPoolExecutor
def fetch(url):
    """Fetch a URL and return the response body, or None on failure."""
    try:
        # A timeout keeps one slow page from stalling the whole thread pool.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except Exception as e:
        print(f"Error fetching URL {url}: {e}")
        return None
def extract_information_and_export_to_excel():
    # Read the undergraduate programme URLs from the text file
    with open('youngstownExUG.txt', 'r') as file:
        urls = file.readlines()
    # Clean up URLs by removing leading/trailing whitespace and newlines
    urls = [url.strip() for url in urls]
    # Collect the extracted information
    data = []
    with ThreadPoolExecutor() as executor:
        # Submit tasks for fetching URLs
        futures = [executor.submit(fetch, url) for url in urls]
        # Process results
        for future, url in zip(futures, urls):
            page_content = future.result()
            if page_content:
                # Parse the page
                tree = html.fromstring(page_content)
                # Locate elements using CSS selectors
                names = tree.cssselect('#main-content > div > div.shards-top.shards-bottom > div.article-wrapper.row > article > div > div.field.field-name-title > h1')
                names = [name.text.strip() for name in names if name.text and name.text.strip()]  # Drop empty headings
                name = names[0] if names else ""  # If there are multiple matches, take the first one
                course_summary_elements = tree.cssselect('#main-content > div > div.shards-top.shards-bottom > div.article-wrapper.row > article > div > div.article-intro > div > p')
                course_summary = course_summary_elements[0].text_content().strip() if course_summary_elements else ""
                # Append the extracted information to the list
                data.append([url, name, course_summary])
    # Export the list to an Excel file
    df = pd.DataFrame(data, columns=['URL', 'Name', 'Course Summary'])
    print(df)
    df.to_excel('extracted_datayoungUG.xlsx', index=False)
    return df
def extract_information_and_export_to_excel_PG():
    # Read the postgraduate programme URLs from the text file
    with open('youngstownExPG.txt', 'r') as file:
        urls = file.readlines()
    # Clean up URLs by removing leading/trailing whitespace and newlines
    urls = [url.strip() for url in urls]
    # Collect the extracted information
    data = []
    with ThreadPoolExecutor() as executor:
        # Submit tasks for fetching URLs
        futures = [executor.submit(fetch, url) for url in urls]
        # Process results
        for future, url in zip(futures, urls):
            page_content = future.result()
            if page_content:
                # Parse the page
                tree = html.fromstring(page_content)
                # Locate elements using CSS selectors
                names = tree.cssselect('#main-content > div > div.shards-top.shards-bottom > div.article-wrapper.row > article > div > div.field.field-name-title > h1')
                names = [name.text.strip() for name in names if name.text and name.text.strip()]  # Drop empty headings
                name = names[0] if names else ""  # If there are multiple matches, take the first one
                course_summary_elements = tree.cssselect('#main-content > div > div.shards-top.shards-bottom > div.article-wrapper.row > article > div > div.article-intro > div')
                course_summary = course_summary_elements[0].text_content().strip() if course_summary_elements else ""
                # Append the extracted information to the list
                data.append([url, name, course_summary])
    # Export the list to an Excel file
    df = pd.DataFrame(data, columns=['URL', 'Name', 'Course Summary'])
    print(df)
    df.to_excel('extracted_datayoungPG.xlsx', index=False)
    return df
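
# The two extractors above differ only in the URL list they read, the CSS selector
# used for the course summary, and the Excel file they write. Below is a minimal
# consolidation sketch under that assumption; the function and parameter names are
# illustrative and not part of the original script, and nothing calls this by default.
def extract_programs(url_file, summary_selector, output_file):
    with open(url_file, 'r') as file:
        urls = [url.strip() for url in file.readlines()]
    data = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch, url) for url in urls]
        for future, url in zip(futures, urls):
            page_content = future.result()
            if page_content:
                tree = html.fromstring(page_content)
                names = tree.cssselect('#main-content > div > div.shards-top.shards-bottom > div.article-wrapper.row > article > div > div.field.field-name-title > h1')
                names = [n.text.strip() for n in names if n.text and n.text.strip()]
                name = names[0] if names else ""
                summary_elements = tree.cssselect(summary_selector)
                summary = summary_elements[0].text_content().strip() if summary_elements else ""
                data.append([url, name, summary])
    df = pd.DataFrame(data, columns=['URL', 'Name', 'Course Summary'])
    df.to_excel(output_file, index=False)
    return df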
def crawl_UG():
    url = 'https://ysu.edu/international-programs-office/undergraduate-programs-international-students'
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    urls = []
    filter_word = "minor"
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.startswith("https://ysu.edu/academics") and filter_word not in href:
            urls.append(href)
            print(href)
    with open('youngstownUG.txt', 'w') as file:
        # Print and save the filtered URLs
        for url in urls:
            print(url)
            file.write(url + '\n')
def crawl_PG():
    url = 'https://ysu.edu/international-programs-office/graduate-programs'
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    urls = []
    filter_word = "minor"
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.startswith("https://ysu.edu/academics") and filter_word not in href:
            urls.append(href)
            print(href)
    with open('youngstownPG.txt', 'w') as file:
        # Print and save the filtered URLs
        for url in urls:
            print(url)
            file.write(url + '\n')
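
# crawl_UG and crawl_PG share everything except the listing page and the output
# file. A consolidation sketch under that assumption; the function name below is
# illustrative, not part of the original script, and nothing calls it by default.
def crawl_programs(listing_url, output_file, filter_word="minor"):
    soup = BeautifulSoup(requests.get(listing_url, timeout=30).text, 'html.parser')
    urls = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.startswith("https://ysu.edu/academics") and filter_word not in href:
            urls.append(href)
    with open(output_file, 'w') as file:
        for url in urls:
            file.write(url + '\n')
    return urls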
def check_existing():
    existing_urls = []
    with open('youngstownExUG.txt', 'r') as file:
        urls = file.readlines()
    for url in urls:
        try:
            response = requests.get(url.strip(), allow_redirects=False)
            if response.status_code == 200:
                status = "Working Fine"
                redirected_to = ""
            elif 300 <= response.status_code < 400:
                status = f"Redirected ({response.status_code})"
                redirected_to = response.headers.get('Location', 'Unknown')
            elif response.status_code == 403:
                status = "Access Forbidden"
                redirected_to = ""
            else:
                status = f"Returned Status Code {response.status_code}"
                redirected_to = ""
            # Include the redirect target in the report when one was found
            suffix = f" -> {redirected_to}" if redirected_to else ""
            print(f"{url.strip()}: {status}{suffix}")
            existing_urls.append(url.strip())  # Append the stripped URL
        except Exception as e:
            status = f"Threw an Exception: {str(e)}"
            print(f"{url.strip()}: {status}")
            existing_urls.append(url.strip())  # Append the stripped URL
    return existing_urls
def cross_validate_with_new_file(existing_urls):
    # Read the new URLs from another text file
    with open('youngstownUG.txt', 'r') as file:
        new_urls = file.readlines()
    new_urls = [url.strip() for url in new_urls]
    # Keep the URLs that appear in the new file but not in the existing list
    matching_urls = set(new_urls) - set(existing_urls)
    return matching_urls
existing_urls = check_existing()
matching_urls = cross_validate_with_new_file(existing_urls)
print("URLs in the new file that are not in the existing list:")
print(matching_urls)
crawl_PG()
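
# A sketch of running the full undergraduate pipeline end to end, assuming the
# intended order is crawl -> link check -> cross-validation -> extraction (the
# script above only runs the link check, the cross-validation, and the
# postgraduate crawl at import time). Uncomment to use; not executed by default.
# crawl_UG()
# existing = check_existing()
# print(cross_validate_with_new_file(existing))
# extract_information_and_export_to_excel()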