Skip to content

Commit

Permalink
Change in Glassdoor Web Scrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
hvudeshi committed Nov 4, 2021
1 parent 6e4c6c8 commit a609e1f
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 23 deletions.
8 changes: 4 additions & 4 deletions Code/Scrapper/Scrapper_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,12 @@ def get_emailing_list(connection):
role = "Software Engineer"
no_of_jobs_to_retrieve = 5
match_threshold = 1
final_result_linkedIn = sl.get_job_description(connection,resume_skills,all_skills, match_threshold, role, location, no_of_jobs_to_retrieve, data)
# final_result_glassdoor = sg.get_job_description(connection,resume_skills,all_skills, match_threshold, role, location, no_of_jobs_to_retrieve, data)
# final_result_linkedIn = sl.get_job_description(connection,resume_skills,all_skills, match_threshold, role, location, no_of_jobs_to_retrieve, data)
final_result_glassdoor = sg.get_job_description(connection,resume_skills,all_skills, match_threshold, role, location, no_of_jobs_to_retrieve, data)
# final_result_indeed = si.get_job_description(connection,resume_skills,all_skills, match_threshold, role, location, no_of_jobs_to_retrieve, data)

# final_results = final_result_linkedIn + final_result_glassdoor + final_result_indeed
print(final_result_linkedIn)
ea.sendmail(final_result_linkedIn,email_id_list)
print(final_result_glassdoor)
ea.sendmail(final_result_glassdoor,email_id_list)


33 changes: 14 additions & 19 deletions Code/Scrapper/scrapper_glassdoor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,53 +5,48 @@
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import keyword_extraction_modules as ke
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from socket import gaierror
from webdriver_manager.chrome import ChromeDriverManager
import json
import urllib.parse
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests


def get_job_description(connection, resume_skills, all_skills, match_threshold, role, location, no_of_jobs_to_retrieve, data):
    """Scrape Glassdoor postings for *role* and match them against resume skills.

    Launches a headless Chrome driver, collects up to *no_of_jobs_to_retrieve*
    job-posting URLs from the Glassdoor search results for *role*, fetches each
    posting's full description text, and hands the {url: description} mapping to
    the keyword-extraction module for skill matching.

    Args:
        connection: open DB connection, passed through to the matcher.
        resume_skills: skills extracted from the user's resume.
        all_skills: master skill list used by the matcher.
        match_threshold: minimum skill-overlap required for a match.
        role: job title to search for (e.g. "Software Engineer").
        location: unused here; kept for signature parity with the other
            scrapers (LinkedIn/Indeed) called interchangeably by Scrapper_main.
        no_of_jobs_to_retrieve: cap on how many postings to collect.
        data: unused here; kept for signature parity with the other scrapers.

    Returns:
        The result of ke.get_user_id_to_list_of_job_ids — the user→job-id
        match mapping.
    """
    options = Options()
    # BUG FIX: original flag was "--window-size-1920,1200" (hyphen instead of
    # '='), which Chrome silently ignores; the correct form is below.
    options.add_argument("--window-size=1920,1200")
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options, executable_path=ChromeDriverManager().install())

    # URL-encode the role so multi-word titles ("Software Engineer") form a
    # valid query string instead of being pasted in raw. urllib.parse is
    # already imported at the top of this file.
    encoded_role = urllib.parse.quote_plus(role)
    url = ("https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false"
           "&clickSource=searchBtn&typedKeyword=" + encoded_role
           + "&sc.keyword=" + encoded_role + "&locT=&locId=&jobType=")
    driver.get(url)

    job_urls = []
    c = 0
    # Anchor elements for each job listing in the results page.
    # NOTE(review): this class string is a generated Glassdoor CSS class and
    # will break when the site redeploys — consider a more stable locator.
    job_buttons = driver.find_elements_by_xpath('.//a[@class = "jobLink job-search-key-1rd3saf eigr9kq1"]')
    print(len(job_buttons))
    for text in job_buttons:
        if text.get_attribute('href'):  # collect the posting URLs
            job_urls.append(text.get_attribute('href'))
            c = c + 1  # count only buttons that actually carried a URL
        if c >= no_of_jobs_to_retrieve:
            break

    final_dict = {}

    # ===== Iterate through each URL and pull the full job description =====
    for i in job_urls:
        time.sleep(5)  # crude rate limiting so Glassdoor doesn't block us
        driver.get(i)
        # "Show more" toggle that expands the truncated description text.
        button = driver.find_element_by_xpath('//*[@id="JobDescriptionContainer"]/div[2]')
        button.click()
        job_description = driver.find_element_by_xpath('//*[@id="JobDescriptionContainer"]/div[1]').text
        final_dict[i] = job_description  # removed dead per-iteration `jobs` list

    final_result = ke.get_user_id_to_list_of_job_ids(resume_skills, final_dict, connection, all_skills, match_threshold)

    return final_result

0 comments on commit a609e1f

Please sign in to comment.