scrapper.py
# pip install selenium webdriver-manager
# pip install bs4
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
url = "https://medium.com/@bijit211987/chunking-strategies-for-fine-tuning-llms-30d2988c3b7a"
# set the URL you want to scrape
def Scrap(url):
    # set up Chrome WebDriver using ChromeDriverManager
    driver = webdriver.Chrome(service=ChromeService(
        ChromeDriverManager().install()))
    # open the specified URL in the browser
    driver.get(url)
    # parse the rendered page into a BeautifulSoup tree
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # tags whose contents are not useful as page text
    # human_useful_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'div', 'span', 'li', 'ol', 'ul', 'head', 'body', 'footer', 'title'}
    not_to_use = {'script', 'noscript', 'applet', 'embed', 'object', 'param', 'link', 'style'}
    data = ""
    links = set()
    for tag in soup.find_all(True):  # True finds all tags
        tag_name = tag.name
        # skip tags that are not useful
        if tag_name in not_to_use:
            continue
        # extract text
        text = tag.get_text(strip=True)
        if text:  # only add if there's text
            data += f"{tag_name}: {text} "
        # collect links
        if tag_name == 'a' and tag.get('href'):
            href = tag["href"]
            if href.startswith('http') and not any(ext in href for ext in ['.jpg', '.png', '.gif', 'advertisement', 'ads']):
                links.add(href)
                data += "links to:" + href + " "
        # extract attributes from interactive elements
        if tag_name in {'input', 'button', 'select', 'textarea'}:
            attributes = []
            if tag_name == 'input':
                attributes.append(f"placeholder: {tag.get('placeholder', '')}")
                attributes.append(f"value: {tag.get('value', '')}")
                attributes.append(f"type: {tag.get('type', '')}")
            elif tag_name == 'button':
                attributes.append(f"type: {tag.get('type', '')}")
                attributes.append(f"value: {tag.get('value', '')}")
            elif tag_name == 'select':
                attributes.append(f"options: {', '.join([option.get_text(strip=True) for option in tag.find_all('option')])}")
            elif tag_name == 'textarea':
                attributes.append(f"placeholder: {tag.get('placeholder', '')}")
                attributes.append(f"value: {tag.get('value', '')}")
            if attributes:
                data += f"{tag_name} attributes: {', '.join(attributes)} "
    # fetch the title of each collected link
    relevant_links = []
    for link in links:
        try:
            # load the linked page in the browser (driver.get returns None, so no response object is kept)
            driver.get(link)
            # parse the content using BeautifulSoup
            link_soup = BeautifulSoup(driver.page_source, 'html.parser')
            title = link_soup.title.string if link_soup.title else 'No Title Found'
            # append the URL and title to the result list
            relevant_links.append({
                "url": link,
                "title": title
            })
        except Exception as e:
            print(f"Failed to fetch {link}: {str(e)}")
    # close the browser
    driver.quit()
    result = {
        'relevant_link': relevant_links,
        'data': data
    }
    # write the scraped data to a JSON file
    with open('data5.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)
    return result
result = Scrap(url)

## word_count
# word_count = len(result["data"].split())
# print("Word count:", word_count)
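
# Usage sketch (illustrative only; summarize() is not part of the original
# script): assuming the Scrap() call above has written data5.json with the
# 'data' and 'relevant_link' keys built in the result dict, reload the file
# and print a short summary of what was scraped.
def summarize(path='data5.json'):
    # reload the saved result from disk
    with open(path, 'r', encoding='utf-8') as f:
        saved = json.load(f)
    # report how much text was captured and which linked pages were found
    print("Scraped text length (words):", len(saved['data'].split()))
    for entry in saved['relevant_link']:
        print(f"{entry['title']} -> {entry['url']}")

summarize()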