-
Notifications
You must be signed in to change notification settings - Fork 0
/
hummart2.py
111 lines (107 loc) · 4.39 KB
/
hummart2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import sched
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
# Label written into the 'Site' column of every scraped row.
site_name = 'Hum Mart'
# Category table driving the scrape; the loop further down reads positional
# column 2 as the category name and positional column 3 as the category URL.
data = pd.read_csv('hummart_categories.csv')
def getDriver():
    """Create the Chrome WebDriver used for scraping.

    The browser is configured to ignore certificate errors, to use an
    incognito profile and to run headless.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it when done.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless')
    # The options object has to be handed to the constructor; without it
    # Chrome starts with its defaults and none of the flags above apply.
    return webdriver.Chrome(options=options)
def scroll_down(driver, timeout=180, pause=3):
    """Scroll to the bottom of the page until it stops growing.

    Repeatedly scrolls the window to ``document.body.scrollHeight`` and then
    waits so that additional content can load. Scrolling stops as soon as the
    page height is unchanged after a scroll, or once the time budget has been
    used up (checked before each scroll).

    Args:
        driver: Object exposing ``execute_script(js)``, e.g. a Selenium
            WebDriver.
        timeout: Maximum number of seconds to keep scrolling. Defaults to the
            previously hard-coded 180 seconds.
        pause: Seconds to wait after each scroll. Defaults to the previously
            hard-coded 3 seconds.

    Returns:
        None.
    """
    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)
    # Monotonic clock: the budget must not be affected by wall-clock jumps.
    started = time.monotonic()
    while time.monotonic() - started <= timeout:
        # Scroll down to the bottom.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to load more content.
        time.sleep(pause)
        # An unchanged height means nothing new was loaded: we are done.
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height
def _parse_price(raw):
    """Convert a ``data-price-amount`` attribute value to a number.

    Whole amounts are returned as ``int``; fractional amounts are kept as
    ``float`` instead of being rejected.

    Raises:
        ValueError: if *raw* is ``None`` or is not a numeric string.
    """
    if raw is None:
        raise ValueError("price attribute is missing")
    amount = float(raw)
    return int(amount) if amount.is_integer() else amount


def _extract_product(item):
    """Read one product from its ``<li>`` element.

    Args:
        item: Selenium element for a single entry of the product list.

    Returns:
        dict with the keys 'Product Name', 'Product Page', 'Product Image',
        'Price' and 'Discount Price'.

    Raises:
        Exception: whatever Selenium raises when an expected child element is
            absent, or ``ValueError`` when the price cannot be parsed.
    """
    info = item.find_element(By.CSS_SELECTOR, ".product-item-info")
    photo = info.find_element(By.CSS_SELECTOR, ".product.photo.product-item-photo")
    href = photo.find_element(By.TAG_NAME, "a").get_attribute("href")
    image = photo.find_element(By.TAG_NAME, "img")

    details = item.find_element(By.CSS_SELECTOR, ".product.details.product-item-details")
    price_area = details.find_element(By.CSS_SELECTOR, ".express-mobile-hide")
    price_box = price_area.find_element(By.CSS_SELECTOR, ".price-box.price-final_price")
    price_span = price_box.find_element(By.TAG_NAME, "span")
    price_wrapper = price_span.find_element(By.CSS_SELECTOR, ".price-wrapper ")

    # Everything is read and parsed before the row is built, so a failure can
    # never leave a half-recorded product behind.
    return {
        'Product Name': image.get_attribute("alt"),
        'Product Page': href,
        'Product Image': image.get_attribute("src"),
        'Price': _parse_price(price_wrapper.get_attribute('data-price-amount')),
        # No discount is scraped; -1 is stored for every product.
        'Discount Price': -1,
    }


def _save_category(frame, filename):
    """Write *frame* to *filename*, appending to rows already stored there."""
    if os.path.exists(filename):
        # The first column is the index written by a previous run. Reading it
        # back as the index keeps it from turning into a data column.
        previous = pd.read_csv(filename, index_col=0)
        # Files written before this fix may carry accumulated 'Unnamed: N'
        # index columns; drop them.
        previous = previous.loc[:, ~previous.columns.str.startswith('Unnamed')]
        frame = pd.concat([previous, frame], ignore_index=True, sort=False)
    frame.to_csv(filename)


# Scrape each category page and store its products in one CSV per category.
driver = getDriver()
try:
    os.makedirs('./hummart', exist_ok=True)
    # At most 16 categories are processed (the original limit); the cap on
    # len(data) avoids an IndexError when the CSV has fewer rows.
    for i in range(min(16, len(data))):
        category_name = data.iloc[i, 2]
        print(category_name)
        url = data.iloc[i, 3]
        driver.get(url)
        time.sleep(4)
        scroll_down(driver)

        try:
            # Presence check only: the category page must contain the grid.
            driver.find_element(By.CSS_SELECTOR, ".products.wrapper.grid.columns4.products-grid")
            # The products are the <li> children of the first <ol> on the page.
            ols = driver.find_elements(By.TAG_NAME, 'ol')[0]
            products = ols.find_elements(By.TAG_NAME, "li")
        except Exception as exc:
            print('Skipping category', category_name, '-', exc)
            continue
        print(len(products))

        rows = []
        for p in products:
            try:
                row = _extract_product(p)
            except Exception as exc:
                # Best effort: one malformed product must not stop the run.
                print('Skipping product -', exc)
                continue
            print(row['Price'])
            row['Category'] = category_name
            row['Site'] = site_name
            rows.append(row)

        df = pd.DataFrame(rows, columns=['Product Name', 'Product Page', 'Product Image',
                                         'Price', 'Discount Price', 'Category', 'Site'])
        filename = f'./hummart/hummart_{category_name}.csv'
        _save_category(df, filename)
finally:
    # Always shut the browser down, even when a page load or a write fails.
    driver.quit()