-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrape_cars.py
50 lines (37 loc) · 1.67 KB
/
scrape_cars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
from datetime import datetime
_CARS_BASE_URL = 'https://www.cars.com/shopping/results/'
_RESULT_COLUMNS = ['title', 'mileage', 'price', 'distance', 'date_accessed']
_NON_DIGITS = re.compile(r'\D+')
_LEADING_NUMBER = re.compile(r'\d[\d,]*')


def _card_text(card, tag, css_class):
    """Return the text of the first matching element in *card*, or None if absent."""
    node = card.find(tag, class_=css_class)
    return None if node is None else node.text


def _digits_to_int(text):
    """Return all digits in *text* joined into one int, or NaN if there are none."""
    digits = _NON_DIGITS.sub('', text or '')
    return int(digits) if digits else np.nan


def _leading_int(text):
    """Return the number *text* starts with (commas allowed), or NaN if none."""
    match = _LEADING_NUMBER.match((text or '').strip())
    if match is None:
        return np.nan
    # Drop thousands separators so "1,234 mi." parses as 1234 rather than 1.
    return int(match.group().replace(',', ''))


def scrape_cars(query_url, timeout=30):
    """Scrape one cars.com results page into a DataFrame of vehicle listings.

    Args:
        query_url: Text appended verbatim to
            'https://www.cars.com/shopping/results/' to form the request URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block forever.

    Returns:
        A ``pandas.DataFrame`` with columns ``title``, ``mileage``, ``price``,
        ``distance`` and ``date_accessed``, one row per ``div.vehicle-card``
        found on the page. Cards whose mileage parses to 0 are omitted.
        A field whose element is missing or holds no number is recorded as
        NaN (``None`` for ``title``) instead of aborting the whole scrape.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Get HTML and make soup
    model_url = _CARS_BASE_URL + query_url
    model_html = requests.get(model_url, timeout=timeout).text
    model_soup = BeautifulSoup(model_html, 'html.parser')

    # One timestamp (hour resolution) shared by every row of this scrape.
    date_accessed = datetime.now().strftime("%Y-%m-%d %H hrs")

    vehicles = []
    for vehicle_card in model_soup.find_all("div", class_="vehicle-card"):
        mileage = _digits_to_int(_card_text(vehicle_card, 'div', 'mileage'))
        # Ignore if mileage == 0
        if mileage == 0:
            continue

        title = _card_text(vehicle_card, 'h2', 'title')
        price = _digits_to_int(
            _card_text(vehicle_card, 'span', 'primary-price'))
        # Only the number at the start of the distance text is used; anything
        # after it (units, trailing text or digits) is ignored.
        distance = _leading_int(_card_text(vehicle_card, 'div', 'miles-from'))

        vehicles.append([title, mileage, price, distance, date_accessed])

    return pd.DataFrame(vehicles, columns=_RESULT_COLUMNS)