-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathindeed_scrap.py
42 lines (35 loc) · 944 Bytes
/
indeed_scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from lxml import html
import requests
import re
import pdfquery
import urllib2
import time
import sys
def download_file(download_url, idx):
    """Fetch ``download_url`` and save the response body as a PDF file.

    The body is written to ``./resume_scrap/<sys.argv[1]>/<idx>.pdf``.
    The target directory is not created here, so it must already exist.

    download_url -- absolute URL passed to ``urllib2.urlopen``.
    idx          -- value converted with ``str()`` to form the file name.

    Errors raised by ``urllib2.urlopen``, by reading the response, or by
    opening/writing the output file propagate to the caller.
    """
    response = urllib2.urlopen(download_url)
    try:
        # Read the whole body before touching the disk so that a failed
        # transfer does not leave an empty .pdf behind.
        payload = response.read()
    finally:
        # Closed explicitly: urllib2 responses cannot be used in a
        # ``with`` statement.
        response.close()

    target = "./resume_scrap/" + sys.argv[1] + "/" + str(idx) + ".pdf"
    # PDF data is binary; text mode ('w') would translate newline bytes on
    # platforms that distinguish text from binary files and corrupt the PDF.
    with open(target, 'wb') as out:
        out.write(payload)
# Scrape driver.
#
# Usage: the first command-line argument (sys.argv[1]) is the search query
# and also the name of the output sub-directory used by download_file();
# the second (sys.argv[2]) is the result offset to start from.
#
# For each search-results page the script collects every "/r/..." link,
# opens that page, looks for a "/r/...pdf" link in it and hands it to
# download_file().  Pages without a PDF link, and downloads that fail, are
# skipped so that one bad entry does not stop the run.
indeed = "http://www.indeed.com"
url_prefix = "http://www.indeed.com/resumes?q=" + sys.argv[1] + "&co=US&start="
target_idx = 10000  # stop once the result offset reaches this value
idx = int(sys.argv[2])
file_no = idx  # output files are numbered starting at the start offset
while idx < target_idx:
    url = url_prefix + str(idx)
    page = requests.get(url).content
    # NOTE(review): "[^\?]*" also matches newlines, so a "/r/" that is not
    # followed by a "?" will swallow text up to the next "?" -- confirm
    # against the actual page markup.
    all_links = re.findall(r"/r/[^\?]*", page)
    for link in all_links:
        time.sleep(1)  # wait one second before each resume-page request
        url = indeed + link
        page = requests.get(url).content
        # Parenthesised single-argument form prints exactly what the
        # Python 2 print statement would.
        print(file_no)
        print(' files downloaded')

        # Test the match explicitly instead of relying on the
        # AttributeError raised by None.group().
        pdf_match = re.search(r"/r/[^\"]*pdf", page)
        if pdf_match is None:
            continue

        url = indeed + pdf_match.group(0)
        try:
            download_file(url, file_no)
        except Exception as exc:
            # Best effort: report and move on.  ``except Exception`` (not a
            # bare ``except``) lets KeyboardInterrupt/SystemExit through so
            # the run can still be stopped with Ctrl-C.
            sys.stderr.write("download failed for %s: %s\n" % (url, exc))
            continue
        # Only advance the file number after a successful download.
        file_no = file_no + 1
    # Advance to the next results page; 50 is presumably the page size.
    idx = idx + 50