-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
67 lines (45 loc) · 1.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import sys
import time
from os import link, system

from bs4 import BeautifulSoup
from playwright.sync_api import Playwright, sync_playwright, expect
# Target category page, taken from the first command-line argument.
# Example: https://www.cartuseria.ro/sisteme-ciss-alimentare-continua/
# NOTE: this must come from ``sys.argv``; ``os.system`` is a function and has
# no ``argv`` attribute.
if len(sys.argv) < 2:
    raise SystemExit("usage: python main.py <category-url>")
url = sys.argv[1]
def run(playwright: Playwright) -> None:
    """Load the page at the module-level ``url`` and inspect its content.

    Launches a visible (non-headless) Chromium window, navigates to ``url``
    and waits until the network is idle, then parses the markup inside
    ``#content-wrapper`` to work out:

    * whether a FAQ block (``div#c-faq-accord``) is present,
    * whether a category description (``div#category-description``) is
      present, and which ``<a>`` elements it contains,
    * the total product count read from the first ``div.col-md-4``.

    The values are only computed: nothing is returned, and the debug prints
    that would show them are disabled.

    Args:
        playwright: Active Playwright driver, as yielded by
            ``sync_playwright()``.
    """
    browser = playwright.chromium.launch(headless=False)
    try:
        context = browser.new_context()
        try:
            page = context.new_page()
            page.goto(url, wait_until="networkidle")
            html = page.inner_html("#content-wrapper")
        finally:
            # Release the context even if navigation or the selector fails.
            context.close()
    finally:
        browser.close()

    # Parsing works on the captured HTML string, so the browser is no longer
    # needed past this point.
    soup = BeautifulSoup(html, "html.parser")

    # FAQ block present?
    faq_exists = soup.find("div", {"id": "c-faq-accord"}) is not None
    # print(f'FAQ exists: {faq_exists}')

    # Category description present, and the anchors inside it.
    # NOTE(review): every <a> is collected; nothing here filters for
    # "internal" links, despite what the debug label below says.
    description_html = soup.find("div", {"id": "category-description"})
    category_exists = description_html is not None
    # Always bind ``links`` so it is defined when the description is missing.
    links = description_html.find_all("a") if category_exists else []
    # print(f'Category exists: {category_exists}')
    # print(f'Internal Links: {len(links)}')

    # Total number of products. The text is presumably shaped like
    # "... din <N> produs..." (verify against the live page); only <N> is kept.
    product_count = 0
    number_of_product_html = soup.find("div", {"class": "col-md-4"})
    if number_of_product_html is not None:
        count_text = number_of_product_html.get_text().strip()
        # partition() never raises, unlike split(...)[1] on a missing marker.
        _, marker, remainder = count_text.partition("din ")
        if marker:
            total = remainder.split(" produs")[0].strip()
            # Keep the count an int in every branch; unparsable text -> 0.
            if total.isdigit():
                product_count = int(total)
    # print(f'Number of products: {product_count}')
# Only launch the browser when executed as a script, not when imported.
if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)