-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #20 from NathanWorkman/enhancement/more-scrapers
Enhancement/more scrapers
- Loading branch information
Showing 18 changed files with 642 additions and 412 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
from django.contrib import admin | ||
|
||
from .models import Job, Board | ||
from .models import Job, Board, SearchTerms | ||
|
||
# Expose the job-board models in the Django admin using the default ModelAdmin.
# ``register`` accepts an iterable of models and registers them in order.
admin.site.register([Board, Job, SearchTerms])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Generated by Django 2.0.5 on 2018-05-14 19:54 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Create the ``SearchTerms`` model and set ``Job``'s ordering option."""

    # Must be applied on top of the app's initial schema.
    dependencies = [('job', '0001_initial')]

    operations = [
        # New table holding one search term (at most 55 characters) per row.
        migrations.CreateModel(
            name='SearchTerms',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('term', models.CharField(max_length=55)),
            ],
        ),
        # Job's Meta.ordering becomes ['scrape_date'].
        migrations.AlterModelOptions(
            name='job',
            options={'ordering': ['scrape_date']},
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import scrapy | ||
from scrapy.spiders import Spider | ||
from scrapy.selector import Selector | ||
from scraper.items import JobItem | ||
from scrapy.http import Request | ||
|
||
from django.utils import timezone | ||
|
||
|
||
class GreenHouseSpider(Spider):
    """Scrape Greenhouse job postings discovered through a Google search.

    The crawl starts from one Google results page restricted to
    ``site:greenhouse.io`` and the term ``django``, follows the URL shown in
    each result's ``<cite>`` element, and builds a ``JobItem`` from every
    posting page reached.
    """

    name = "greenhouse"
    allowed_domains = ["google.com"]

    def start_requests(self):
        """Return the list holding the single Google search request."""
        search_query = "q=site:greenhouse.io+django&tbs=qdr:m"
        base_url = "https://www.google.com/search?"
        return [Request(url=base_url + search_query)]

    def parse(self, response):
        """Extract job detail urls from response.

        Yields one ``Request`` per non-empty ``<cite>`` text node, with
        ``parse_detail_pages`` as its callback.
        """
        for raw_url in response.xpath('//cite/text()').extract():
            url = raw_url.strip()
            if not url:
                continue
            # Request() raises ValueError for a URL without a scheme, which
            # would abort this generator and drop every remaining result.
            # NOTE(review): https is assumed for scheme-less entries - confirm.
            if "://" not in url:
                url = "https://" + url
            self.logger.debug("Queueing job detail page: %s", url)
            # NOTE(review): dont_filter=True is kept from the original;
            # presumably it lets these non-google.com requests through the
            # offsite filter implied by allowed_domains - confirm.
            yield Request(url, callback=self.parse_detail_pages, dont_filter=True)

    def parse_detail_pages(self, response):
        """Build a ``JobItem`` for each ``app_body`` container on the page.

        Returns a list of items; it is empty when no container matches.
        """
        items = []
        for job in response.xpath('//div[contains(@id, "app_body")]'):
            # NOTE(review): the field XPaths below start with '//', so they
            # search the whole document rather than only ``job``. If several
            # containers match, every item carries the same values.
            item = JobItem()
            item["title"] = job.xpath('//h1[contains(@class, "app-title")]/text()').extract_first()
            item["company"] = 'n/a'
            item["body"] = job.xpath('//div[contains(@id, "content")]').extract()
            item["location"] = job.xpath('//div[contains(@class, "location")]').extract_first()
            item["url"] = response.request.url
            item["pub_date"] = 'n/a'
            item["email"] = 'n/a'
            item["salary"] = 'n/a'
            item["scrape_date"] = timezone.now()
            item["job_board"] = "Greenhouse"
            item["board_url"] = "www.greenhouse.io"
            items.append(item)
        return items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import scrapy | ||
from scrapy.spiders import Spider | ||
from scrapy.selector import Selector | ||
from scraper.items import JobItem | ||
from scrapy.http import Request | ||
|
||
from django.utils import timezone | ||
|
||
|
||
class LeverSpider(Spider):
    """Scrape Lever job postings discovered through a Google search.

    The crawl starts from one Google results page restricted to
    ``site:lever.co`` and the term ``django``, follows the URL shown in each
    result's ``<cite>`` element, and builds a ``JobItem`` from every posting
    page reached.
    """

    name = "lever"
    allowed_domains = ["google.com"]

    def start_requests(self):
        """Return the list holding the single Google search request."""
        search_query = "q=site:lever.co+django&tbs=qdr:m"
        base_url = "https://www.google.com/search?"
        return [Request(url=base_url + search_query)]

    def parse(self, response):
        """Extract job detail urls from response.

        Yields one ``Request`` per non-empty ``<cite>`` text node, with
        ``parse_detail_pages`` as its callback.
        """
        for raw_url in response.xpath('//cite/text()').extract():
            url = raw_url.strip()
            if not url:
                continue
            # Request() raises ValueError for a URL without a scheme, which
            # would abort this generator and drop every remaining result.
            # NOTE(review): https is assumed for scheme-less entries - confirm.
            if "://" not in url:
                url = "https://" + url
            self.logger.debug("Queueing job detail page: %s", url)
            # NOTE(review): dont_filter=True is kept from the original;
            # presumably it lets these non-google.com requests through the
            # offsite filter implied by allowed_domains - confirm.
            yield Request(url, callback=self.parse_detail_pages, dont_filter=True)

    def parse_detail_pages(self, response):
        """Build a ``JobItem`` for each ``content`` container on the page.

        Returns a list of items; it is empty when no container matches.
        """
        items = []
        for job in response.xpath('//div[contains(@class, "content")]'):
            # NOTE(review): the field XPaths below start with '//', so they
            # search the whole document rather than only ``job``. If several
            # containers match, every item carries the same values.
            item = JobItem()
            item["title"] = job.xpath('//div[contains(@class, "posting-headline")]/h2/text()').extract_first()
            item["company"] = 'n/a'
            item["body"] = job.xpath('//div[contains(@class, "section page-centered")]').extract()
            item["location"] = job.xpath('//div[contains(@class, "sort-by-time posting-category medium-category-label")]').extract_first()
            item["url"] = response.request.url
            item["pub_date"] = 'n/a'
            item["email"] = 'n/a'
            item["salary"] = 'n/a'
            item["scrape_date"] = timezone.now()
            item["job_board"] = "Lever"
            item["board_url"] = "lever.co"
            items.append(item)
        return items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import scrapy | ||
from scrapy.spiders import Spider | ||
from scrapy.selector import Selector | ||
from scraper.items import JobItem | ||
from scrapy.http import Request | ||
|
||
from django.utils import timezone | ||
|
||
|
||
class RecruiterBoxSpider(Spider):
    """Scrape Recruiterbox job postings discovered through a Google search.

    The crawl starts from one Google results page restricted to
    ``site:recruiterbox.com`` and the term ``django``, follows the URL shown
    in each result's ``<cite>`` element, and builds a ``JobItem`` from every
    posting page reached.
    """

    name = "recruiterbox"
    allowed_domains = ["google.com"]

    def start_requests(self):
        """Return the list holding the single Google search request."""
        search_query = "q=site:recruiterbox.com+django&tbs=qdr:m"
        base_url = "https://www.google.com/search?"
        return [Request(url=base_url + search_query)]

    def parse(self, response):
        """Extract job detail urls from response.

        Yields one ``Request`` per non-empty ``<cite>`` text node, with
        ``parse_detail_pages`` as its callback.
        """
        for raw_url in response.xpath('//cite/text()').extract():
            url = raw_url.strip()
            if not url:
                continue
            # Request() raises ValueError for a URL without a scheme, which
            # would abort this generator and drop every remaining result.
            # NOTE(review): https is assumed for scheme-less entries - confirm.
            if "://" not in url:
                url = "https://" + url
            self.logger.debug("Queueing job detail page: %s", url)
            # NOTE(review): dont_filter=True is kept from the original;
            # presumably it lets these non-google.com requests through the
            # offsite filter implied by allowed_domains - confirm.
            yield Request(url, callback=self.parse_detail_pages, dont_filter=True)

    def parse_detail_pages(self, response):
        """Build a ``JobItem`` for each ``content`` container on the page.

        Returns a list of items; it is empty when no container matches.
        """
        items = []
        for job in response.xpath('//div[contains(@id, "content")]'):
            # NOTE(review): the field XPaths below start with '//', so they
            # search the whole document rather than only ``job``. If several
            # containers match, every item carries the same values.
            item = JobItem()
            item["title"] = job.xpath('//h1[contains(@class, "jobtitle")]/text()').extract_first()
            item["company"] = 'n/a'
            # NOTE(review): "jobdesciption" looks like a misspelling of
            # "jobdescription" - verify against the live page markup.
            item["body"] = job.xpath('//div[contains(@class, "jobdesciption")]').extract()
            # NOTE(review): this stores a list (.extract()), whereas the
            # Greenhouse, Lever and Workable spiders store a single string
            # via .extract_first() - confirm which the pipeline expects.
            item["location"] = job.xpath('//span[contains(@class, "meta-job-location-city")]').extract()
            item["url"] = response.request.url
            item["pub_date"] = 'n/a'
            item["email"] = 'n/a'
            item["salary"] = 'n/a'
            item["scrape_date"] = timezone.now()
            item["job_board"] = "Recruiter Box"
            item["board_url"] = "www.recruiterbox.com"
            items.append(item)
        return items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import scrapy | ||
from scrapy.spiders import Spider | ||
from scrapy.selector import Selector | ||
from scraper.items import JobItem | ||
from scrapy.http import Request | ||
|
||
from django.utils import timezone | ||
|
||
|
||
class WorkableSpider(Spider):
    """Scrape Workable job postings discovered through a Google search.

    The crawl starts from one Google results page restricted to
    ``site:workable.com`` and the term ``django``, follows the URL shown in
    each result's ``<cite>`` element, and builds a ``JobItem`` from every
    posting page reached.
    """

    name = "workable"
    allowed_domains = ["google.com"]

    def start_requests(self):
        """Return the list holding the single Google search request."""
        search_query = "q=site:workable.com+django&tbs=qdr:m"
        base_url = "https://www.google.com/search?"
        return [Request(url=base_url + search_query)]

    def parse(self, response):
        """Extract job detail urls from response.

        Yields one ``Request`` per non-empty ``<cite>`` text node, with
        ``parse_detail_pages`` as its callback.
        """
        for raw_url in response.xpath('//cite/text()').extract():
            url = raw_url.strip()
            if not url:
                continue
            # Request() raises ValueError for a URL without a scheme, which
            # would abort this generator and drop every remaining result.
            # NOTE(review): https is assumed for scheme-less entries - confirm.
            if "://" not in url:
                url = "https://" + url
            self.logger.debug("Queueing job detail page: %s", url)
            # NOTE(review): dont_filter=True is kept from the original;
            # presumably it lets these non-google.com requests through the
            # offsite filter implied by allowed_domains - confirm.
            yield Request(url, callback=self.parse_detail_pages, dont_filter=True)

    def parse_detail_pages(self, response):
        """Build a ``JobItem`` for each ``main.stacked`` container on the page.

        Returns a list of items; it is empty when no container matches.
        """
        items = []
        for job in response.xpath('//main[contains(@class, "stacked")]'):
            # NOTE(review): the field XPaths below start with '//', so they
            # search the whole document rather than only ``job``. If several
            # containers match, every item carries the same values.
            item = JobItem()
            item["title"] = job.xpath('//h1/text()').extract_first()
            item["company"] = 'n/a'
            item["body"] = job.xpath('//main[contains(@class, "stacked")]').extract()
            item["location"] = job.xpath('//p[contains(@class, "meta")]').extract_first()
            item["url"] = response.request.url
            item["pub_date"] = 'n/a'
            item["email"] = 'n/a'
            item["salary"] = 'n/a'
            # TODO: "tags" is not populated; no selector for it has been
            # written for Workable pages yet.
            item["scrape_date"] = timezone.now()
            item["job_board"] = "Workable"
            item["board_url"] = "www.workable.com"
            items.append(item)
        return items
Oops, something went wrong.