-
Notifications
You must be signed in to change notification settings - Fork 2
/
rpa_selenium_scraping.py
90 lines (83 loc) · 3.15 KB
/
rpa_selenium_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import requests
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import json
import requests
from PIL import Image
from io import BytesIO
from vision_machine_optical import VisionOCR
def check_xpaht_dbd(driver):
    """Fill in and submit the captcha on the login form loaded in ``driver``.

    The captcha image element is screenshotted to ``config/screenshot.png``,
    the file is passed through ``VisionOCR``, the recognised text is typed
    into the ``captchaCode`` field and the ``signinBtn`` button is clicked.

    Args:
        driver: Selenium WebDriver already showing the page that contains
            the ``loginForm`` captcha.

    Raises:
        NoSuchElementException: Propagated from Selenium when the captcha
            image, the ``captchaCode`` input or the ``signinBtn`` button
            cannot be located.
    """
    # Function-scope import keeps this block self-contained.
    # ``find_element(By.…, …)`` is available on Selenium 3 and 4, whereas
    # the ``find_element_by_*`` helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    screenshot_path = 'config/screenshot.png'

    driver.implicitly_wait(5)
    captcha_image = driver.find_element(
        By.XPATH, '//*[@id="loginForm"]/div[1]/span/img'
    )
    captcha_image.screenshot(screenshot_path)
    # NOTE(review): presumably gives the screenshot time to land on disk
    # before the OCR step reads it - confirm whether the pause is needed.
    time.sleep(2)

    # NOTE(review): assumes document_pandas() returns a DataFrame-like object
    # with a 'description' column, so to_dict() yields {column: {index: value}}.
    # Entry 1 (not 0) is the one typed into the form - confirm against VisionOCR.
    recognised = VisionOCR(screenshot_path).document_pandas().to_dict()
    texts = recognised['description']

    captcha_input = driver.find_element(By.ID, 'captchaCode')
    captcha_input.send_keys(texts[1])
    driver.find_element(By.ID, 'signinBtn').click()
    time.sleep(1)
class WebScraping:
    """Selenium-driven page scrapers that return BeautifulSoup matches."""

    def __init__(self, webdriver):
        """Store the value later handed to ``webdriver.Edge``.

        Args:
            webdriver: Argument forwarded unchanged to ``webdriver.Edge(...)``
                by ``dynamic_scraping``. The parameter name shadows the
                ``selenium.webdriver`` module only inside this method.
        """
        self.webdriver = webdriver

    def dynamic_scraping(self, uri, html, key, val, delay):
        """Load ``uri`` in Edge and return the elements matching a tag/attribute.

        Args:
            uri: Address to open in the browser.
            html: Tag name passed to ``BeautifulSoup.find_all``.
            key: Attribute name used in the ``find_all`` filter.
            val: Attribute value used in the ``find_all`` filter.
            delay: Seconds to sleep after loading before reading the page
                source.

        Returns:
            The result of ``soup.find_all(html, {key: val})`` on the page
            source.
        """
        driver = webdriver.Edge(self.webdriver)
        try:
            driver.get(uri)
            driver.implicitly_wait(5)
            time.sleep(delay)
            soup = BeautifulSoup(driver.page_source, 'lxml')
            return soup.find_all(html, {key: val})
        finally:
            # quit() ends the whole browser session even when loading or
            # parsing raised, so no browser/driver process is left behind.
            driver.quit()

    def dbd_tax(self, tax_id, url, max_retries=None):
        """Look up ``tax_id`` on the page at ``url`` and return its result table.

        Each attempt loads ``url``, submits the captcha via
        ``check_xpaht_dbd``, enters the tax id and parses the resulting page.
        An attempt that raises ``NoSuchElementException`` is retried after a
        one-second pause.

        Args:
            tax_id: Identifier typed into the ``textStr`` field (converted
                with ``str``).
            url: Address of the login/search page.
            max_retries: Maximum number of retries after the first attempt.
                ``None`` (the default) retries without limit, as before.

        Returns:
            The result of ``soup.find_all('table', {'id': 'fixTable'})`` on
            the page source after submitting the search.

        Raises:
            NoSuchElementException: When ``max_retries`` is set and every
                attempt failed to find a required element.
        """
        driver = webdriver.Chrome('config/chromedriver')
        failures = 0
        try:
            while True:
                try:
                    return self._fetch_tax_table(driver, tax_id, url)
                except NoSuchElementException:
                    failures += 1
                    if max_retries is not None and failures > max_retries:
                        raise
                    # Brief pause before reloading the page for a new attempt.
                    time.sleep(1)
        finally:
            # Runs on success, on exhausted retries and on unexpected errors
            # alike, so the Chrome session is always released.
            driver.quit()

    @staticmethod
    def _fetch_tax_table(driver, tax_id, url):
        """Run one load / captcha / search / parse attempt using ``driver``."""
        # Function-scope import keeps the class self-contained.
        # ``find_element(By.…, …)`` works on Selenium 3 and 4.
        from selenium.webdriver.common.by import By

        driver.get(url)
        check_xpaht_dbd(driver)
        tax_field = driver.find_element(By.XPATH, '//*[@id="textStr"]')
        tax_field.send_keys(str(tax_id))
        time.sleep(1)
        submit_target = driver.find_element(By.XPATH, '//*[@id="Capa_1"]')
        submit_target.submit()
        time.sleep(1)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        return soup.find_all('table', {'id': 'fixTable'})