kurokami.py
'''
Author: Andrew Higgins
https://github.com/speckly
The two parse modes differ only in the item div's 2nd a
Structure of Carousell HTML FORMAT 1 (parse_mode 1):
body > find main > 1st div > 1st div > divs of items
in divs of items > parents of each item
parent > 1st div > 1st a is seller, 2nd a is item page
in 1st a: 2nd div > p is seller name, > div > p is time posted
in 2nd a: 2nd div > p is the item name (truncated with ... if too long); directly under the 2nd a, the 1st p is price and the 2nd p is condition
parent > 2nd div > button > span is number of likes
a total of 24 or 25 results are loaded at once.
Structure of Carousell HTML FORMAT 2 (parse_mode 2, found in legacy):
body > find main > 1st div > 1st div > divs of items
in divs of items > parents of each item
parent > 1st div > 1st a is seller, 2nd a is item page
in 1st a: 2nd div > p is seller name, > div > p is time posted
in 2nd a: 1st p is the FULL NAME, 2nd p is price, 3rd p is description, 4th p is condition
parent > 2nd div > button > span is number of likes
a total of 24 or 25 results are loaded at once.
body > find main > div > button to view more
the view more button loads results on top of the existing ones, so it can probably be clicked repeatedly before gathering all items at once
MAY NOT BE THE FIRST DIV! Temporary workaround is to read the class name off the correct item divs
My way (modified 1 here):
.asm-browse-listings > div > div > div of item > div with testid > div of item stripped
'''
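# Example invocations (a sketch; assumes Chrome and a matching chromedriver are available,
# and that the flags below behave as defined in main()'s argparse setup):
#   python kurokami.py -i "shirakami fubuki" -p 2              # scrape 2 pages, write to the default <item>.csv
#   python kurokami.py -i "shirakami fubuki" -p 2 -c old.csv   # also report listings not present in old.csv (from a previous run)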
from typing import Union
import pickle
import traceback
import argparse
import os
import asyncio
import re
import urllib.parse
import sys
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
async def request_page(url, page_limit):
""" Returns BeautifulSoup4 Objects (soup)"""
opts = Options()
opts.add_argument("--log-level=3")
# opts.add_argument("--headless") # Requires human verification
opts.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
driver = webdriver.Chrome(options=opts)
driver.minimize_window()
driver.get(url)
page = 1
timeout = 10
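    # Keep clicking Carousell's "Show more results" button until page_limit pages
    # have been loaded or the button stops appearing within `timeout` seconds.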
while page < page_limit:
try:
next_page_btn = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, '//button[contains(text(), "Show more results")]'))) # wait max timeout sec for loading
driver.execute_script("arguments[0].click();", next_page_btn) # click the load more button through ads
page += 1
except TimeoutException:
print("Button not found, reached end of page or load more button not found.")
break
pg = driver.page_source
driver.quit()
return BeautifulSoup(pg, "html.parser")
def parse_info(item_div, home):
"""Author: Andrew Higgins
https://github.com/speckly
Parses the item_div and returns the list of items
"""
a = item_div.find_all('a', recursive=False)
seller_divs = a[0].find_all('div', recursive=False)[1]
item_p = a[1].find_all('p', recursive=False)
img = item_div.find('img')
item_url = home+a[1]['href']
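    # The listing UID is the trailing digit run in the /p/<slug>-<id> item URL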
return {'uid': re.search(r"\/p\/[^\/]+-(\d+)", item_url).group(1),
'seller_name': seller_divs.p.get_text(),
'price': re.findall(r"FREE|\$\d{0,3},?\d+\.?\d{,2}", a[1].get_text()),
'time_posted': seller_divs.div.p.get_text(), # Attempt to get absolute datetime?
'condition': item_p[1].get_text(),
'item_name': img['title'] if img else "Title not found as this is a video",
'item_url': item_url,
'item_img': img['src'] if img else None,
'seller_url': home+a[0]['href'],
} # 0 is discounted price, 1 is original price, if applicable
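# A parsed row looks roughly like this (hypothetical values, field layout per the structure notes above):
# {'uid': '1234567890', 'seller_name': 'some_seller', 'price': ['$12.50'],
#  'time_posted': '2 days ago', 'condition': 'Brand new', 'item_name': 'Shirakami Fubuki keychain',
#  'item_url': 'https://sg.carousell.com/p/...-1234567890', 'item_img': 'https://...',
#  'seller_url': 'https://sg.carousell.com/u/some_seller'}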
async def main(options: Union[dict, None] = None):
"""options keys: i (item), p (page), o (output), t (test), s (serialize), c (compare)"""
if options is None:
server_side = False
ps = argparse.ArgumentParser()
ps.add_argument('-i', '--item', type=str, help='Name of the item to scrape')
ps.add_argument('-p', '--page', type=int, help='Number of pages (approx 46 per page)')
ps.add_argument('-o', '--output', type=str,
help='CSV file to write out to, defaults to timestamped')
ps.add_argument('-t', '--test', action='store_true',
help=r'''For debugging of parsers which could break often due to the changing structure,
using a snapshot of a bs4 object while overriding these flags with the respective values: -i shirakami fubuki -p 1''')
ps.add_argument('-s', '--serialize', action='store_true',
help=r'''For debugging of parsers which could break often due to the changing structure,
the BS4 object is serialised for fast access, must not have -t''')
ps.add_argument('-c', '--compare', type=str,
help='Name of a .csv file output from this program to compare with')
args = ps.parse_args()
if args.test:
test = True
item = 'test'
page_limit = 1
if args.item or args.page:
print('Entered test mode, overriding some user provided arguments')
else:
test = False
if args.item:
item = args.item
else:
item = input('-i Item name: ')
if args.page:
page_limit = args.page
else:
while True:
                    inp = input('-p Number of pages (approx 46 per page): ')
if inp.isdigit():
page_limit = int(inp)
break
print("Invalid integer")
file_reg = r'^.?[a-zA-Z0-9_\\/\- ]+\.csv$'
output_file = args.output
if output_file:
if not re.match(file_reg, output_file):
print("Invalid CSV file name. Accepted chars: azAZ_-.csv")
sys.exit(1)
elif not os.path.exists(output_file):
print(f"{output_file} does not exist")
sys.exit(1)
else:
print("Using default csv file format")
output_file = item + ".csv"
serialize = args.serialize
compare_file = args.compare
if compare_file:
if not re.match(file_reg, args.compare):
print(f"Invalid CSV file name {compare_file}. Please provide a name consisting of letters, numbers, underscores, and dashes, ending with .csv")
sys.exit(1)
elif not os.path.exists(compare_file):
print(f"{compare_file} does not exist")
sys.exit(1)
else: # Praying that this does not result in a SSRF, used in bot.py with no user inputs yet. Validate user inputs
server_side = True
item = options.get("i")
output_file = options.get("o")
page_limit = options.get("p")
if options.get("t"):
test = True
item = 'shirakami fubuki'
page_limit = 1
else:
test = False
serialize = options.get("s")
compare_file = options.get("c")
if not server_side:
print("Author: Andrew Higgins")
print("https://github.com/speckly")
home = 'https://sg.carousell.com'
subdirs = f'/search/{urllib.parse.quote(item)}'
parameters = '?addRecent=false&canChangeKeyword=false&includeSuggestions=false&sort_by=3'
try:
if not server_side:
print(f'Retrieving search results on {item}...')
if not test:
if not server_side:
print("Creating webdriver")
search_results_soup = await request_page(home+subdirs+parameters, page_limit=page_limit)
if not server_side:
print(f'All results loaded. Total: {page_limit} pages.')
if serialize:
with open("./utils/soup.pkl", "wb") as f:
pickle.dump(search_results_soup, f)
print(f"Serialized: -i {item}")
else:
with open("./utils/soup.pkl", "rb") as f:
search_results_soup = pickle.load(f)
        # Strip down: locate the listings container, then detect the (obfuscated) class used by the item divs from the first listing
browse_listings_divs = search_results_soup.find(class_="asm-browse-listings")
item_divs_class = browse_listings_divs.select_one('.asm-browse-listings > div > div > div > div > div')['class']
if not server_side:
print(f'Detected item_divs class: {item_divs_class}')
item_divs = search_results_soup.find_all('div', class_=item_divs_class) # ads
if not server_side:
print(f'Found {len(item_divs)} listings. Parsing...')
except AttributeError: # no item_divs at all
print('The search has returned no result.')
sys.exit(1)
tries = 1
    while tries <= 5:  # retrying loop as the div class position is random
try:
items_list = []
for item_div in item_divs:
items_list.append(parse_info(item_div, home))
break
except IndexError:
print(traceback.format_exc())
print(f'Parsing attempt {tries} failed due to class name error.\n')
tries += 1
continue
else:
        print('Parsing failed: still hitting IndexError after 5 tries.')
sys.exit(1)
df = pd.DataFrame(items_list)
df.to_csv(output_file, index=False)
if not server_side:
print(f'Results saved to {output_file}')
if compare_file:
if not server_side:
print("Comparing resuls with given csv")
prev_df = pd.read_csv(compare_file)
# df_standardized = df.iloc[:len([prev_df])] # cases where there might be extra old results appended to new df, remove these
# new_rows = df_standardized[~df_standardized['uid'].isin(prev_df['uid'])]
        # right-exclusive join: keep rows whose uid appears only in the new df; the old (left) columns get the _x suffix, so drop them
cols = ["seller_name","price","time_posted","condition","item_name","item_url","item_img","seller_url"]
df['uid'] = df['uid'].astype(str)
prev_df['uid'] = prev_df['uid'].astype(str)
new_rows = pd.merge(prev_df, df, on='uid', how="outer", indicator='ind').query('ind == "right_only"')
        new_rows = new_rows.drop(columns=["ind"] + [col + "_x" for col in cols])  # drop() returns a new frame, so keep the result
return new_rows.values.tolist() # consider using dict?
return df.values.tolist()
if __name__ == "__main__":
compare_results = asyncio.run(main())
if compare_results:
print(f"The difference between the previous and this query is {compare_results}")
print(f"There are {len(compare_results)} new listings")