-
Notifications
You must be signed in to change notification settings - Fork 0
/
Scraper.py
125 lines (89 loc) · 3.65 KB
/
Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 11 14:51:02 2023
@author: Yousha
"""
import time
import warnings

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
warnings.simplefilter(action='ignore',category=FutureWarning)
warnings.simplefilter(action='ignore',category=DeprecationWarning)
# ---------------------------------------------------------------------------
# Scrape "Data Scientist" job listings from naukri.com and save them to CSV.
#
# For each results page: load it in a Selenium-driven Chrome browser, parse
# the rendered HTML with BeautifulSoup, extract one row per listing, and
# finally write the de-duplicated rows to data/naukri_scraped_data.csv.
# Rows are collected in a plain list and converted to a DataFrame once at
# the end: pandas>=2.0 removed DataFrame.append, and row-by-row appending
# was quadratic anyway.
# ---------------------------------------------------------------------------
pages = int(input('Enter number of pages to scrape: '))

# NOTE(review): hard-coded, machine-specific driver path — consider an
# environment variable, or let Selenium Manager locate the driver.
CHROMEDRIVER = "C:\\Users\\Shumail\\anaconda3\\Lib\\site-packages\\chromedriver_binary\\chromedriver.exe"
BASE = "https://www.naukri.com/data-scientist-jobs-"
QUERY = "?k=data%20scientist&nignbevent_src=jobsearchDeskGNB"
COLUMNS = ['title', 'company', 'ratings', 'reviews', 'experience',
           'salary', 'location', 'days_posted', 'tags', 'url']

rows = []   # one dict per successfully parsed job listing
page = 0    # pages completed, for progress reporting

# One browser session reused for every page.  The original code launched a
# fresh Chrome per page and only ever called close() (never quit()), leaking
# a driver process per page; quit() in the finally block releases everything.
driver = webdriver.Chrome(service=Service(CHROMEDRIVER))
try:
    for i in range(1, pages + 1):
        # Page 1 has no page number in its path; later pages embed str(i).
        url = BASE + QUERY if i == 1 else BASE + str(i) + QUERY
        driver.get(url)
        time.sleep(4)  # crude wait for the JS-rendered listing to appear
        soup = BeautifulSoup(driver.page_source, 'html5lib')

        results = soup.find(class_="list")
        if results is None:
            # Layout changed or nothing rendered — report and move on
            # instead of crashing the whole run with an AttributeError.
            page += 1
            print(f'Pages done {page}/{pages}')
            continue

        for job_elem in results.find_all('article', class_='jobTuple'):
            link = job_elem.find('a', class_='title ellipsis')
            company = job_elem.find('a', class_='subTitle ellipsis fleft')
            review_span = job_elem.find('a', class_='reviewsCount fleft')
            rating_span = job_elem.find('span', class_='starRating fleft')

            # <li> containers may themselves be missing; guard each level
            # before drilling down (the original dereferenced them blindly).
            exp_li = job_elem.find('li', class_='fleft br2 placeHolderLi experience')
            sal_li = job_elem.find('li', class_='fleft br2 placeHolderLi salary')
            loc_li = job_elem.find('li', class_='fleft br2 placeHolderLi location')
            exp_span = exp_li.find('span', class_='ellipsis fleft expwdth') if exp_li else None
            sal_span = sal_li.find('span', class_='ellipsis fleft') if sal_li else None
            loc_span = loc_li.find('span', class_='ellipsis fleft locWdth') if loc_li else None

            # "Days since posted" sits three levels deep in the tuple footer.
            posted = None
            footer = job_elem.find('div', class_='jobTupleFooter mt-8')
            if footer is not None:
                container = footer.find('div', class_='tupleTagsContainer')
                if container is not None:
                    posted = container.find('span', class_='fleft postedDate')

            # Same policy as before: skip any listing with a missing field —
            # but now covering the fields whose absence used to crash.
            required = (link, company, review_span, rating_span,
                        exp_span, sal_span, loc_span, posted)
            if any(field is None for field in required):
                continue

            # Tags are optional; a missing <ul> simply yields an empty string.
            tag_list = job_elem.find('ul', class_='tags has-description')
            tag_text = ("" if tag_list is None
                        else "".join(" " + tag.text for tag in tag_list))

            rows.append({'title': link.text,
                         'company': company.text,
                         'ratings': rating_span.text,
                         'reviews': review_span.text,
                         'experience': exp_span.text,
                         'salary': sal_span.text,
                         'location': loc_span.text,
                         'days_posted': posted.text,
                         'tags': tag_text,
                         'url': link.get('href')})
        page += 1
        print(f'Pages done {page}/{pages}')
finally:
    driver.quit()  # always release the browser, even if a page blows up

df = pd.DataFrame(rows, columns=COLUMNS)
# The original computed duplicated().sum() and threw the result away;
# surface it so the operator can see how much was trimmed.
print(f'Duplicate rows found: {df.duplicated().sum()}')
df2 = df.drop_duplicates()
df2.to_csv('data/naukri_scraped_data.csv', index=None)