From 32ce36e4c8fca822e337cb45391f2abc56a6275f Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Sun, 1 Apr 2018 19:32:53 -0400
Subject: [PATCH 1/7] Update README

---
 README.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index b2ad835..d5bd86f 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,17 @@
-# :sunglasses: seeker [WIP]
+# :sunglasses: Seeker [WIP]
 
-Another job board aggregator. This is very much a work in progress and documentation is still incomplete.
+## What is Seeker?
+Seeker aims not to be a job board for everyone, but a job board for you.
 
-Right now the search terms are hard coded to remote django and python positions.
+Inevitably the time will come when you are on the hunt for a job, be it circumstance, looking for new opportunity, or just keeping an eye out for the dream job you don't know you want yet. Let Seeker do the leg work for you of checking multiple job boards for positions you might be interested in and organize them all in one convenient location.
+
+Currently, the search terms are hard coded to remote django and remote python positions - you'll need to manually update these for now.
 
 To change the search terms edit the query inside each spider.
 
-Some quick setup instructions:
+## Setup
+
+Some quick setup instructions:
 
 I would recommend installing [virtualenv](https://virtualenv.readthedocs.io/).
 
@@ -25,7 +30,7 @@ python manage.py runserver
 ```
 ### To run the spiders
-From the root directory of the `seeker` project `cd` to seeker
+From the root directory of the `seeker` project `cd` to the seeker app directory.
 
 ```
 cd seeker
@@ -43,4 +48,39 @@ scrapy crawl stackoverflow
 ```
 
-Navigate to the django admin to view your results.
\ No newline at end of file
+Navigate to the django admin to view your results.
+
+
+## TODO
+
+#### Future features
+- [x] Simple UI
+- [ ] Enhanced UI
+- [ ] Breadcrumbs Navigation
+- [ ] Settings Panel
+- [ ] Move all environment variables to .env using PyEnv.
+- [ ] Save/Favorite Job Postings
+- [ ] Tag/Skill Views
+- [ ] Full-Time, Part-Time, Contract
+- [ ] Email Notifications - send daily, weekly, monthly email notifications of new job postings.
+- [ ] Celery Beat - run spiders on a schedule.
+
+#### Spiders
+Want a spider not listed here? Feel free to open a pull request and add it to the list, or implement the spider yourself.
+- [x] [Stack Overflow](https://www.stackoverflow.com/jobs)
+- [ ] [Indeed](https://www.indeed.com)
+- [ ] [Dice](http://dice.com)
+- [ ] [Angel.co](https://angel.co/)
+- [ ] [RemotePython](https://www.remotepython.com)
+- [ ] [DjangoJobs](https://djangojobs.net/jobs/)
+- [ ] [DjangoGigs](https://djangogigs.com)
+- [ ] [Jobspresso](http://jobspresso.co)
+- [ ] [Authentic Jobs](http://authenticjobs.com/)
+- [ ] [We Work Remotely](https://weworkremotely.com/)
+- [ ] [Remotive](https://remotive.io)
+- [ ] [Python.org](https://www.python.org/jobs/)

From 91e9318c24cfbf6e3bf7c53ff136c531105ea96e Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Sun, 1 Apr 2018 19:35:18 -0400
Subject: [PATCH 2/7] Makefile for common tasks

---
 Makefile | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 Makefile

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f1e94a5
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,112 @@
+PROJECT_NAME = seeker
+SHELL := /bin/sh
+help:
+	@echo "Please use 'make <target>' where <target> is one of"
+	@echo "virtualenv     Create virtual environment"
+	@echo "requirements   Install requirements.txt"
+	@echo "migrate        Run Django migrations"
+	@echo "user           Create user account"
+	@echo "test           Run tests"
+	@echo "clean          Remove all *.pyc, .DS_Store and temp files from the project"
+	@echo "shell          Open Django shell"
+	@echo "migrations     Create database migrations"
+	@echo "collectstatic  Collect static assets"
+	@echo "run            Run Django Server"
+
+.PHONY: requirements
+
+
+# Command variables
+MANAGE_CMD = python manage.py
+PIP_INSTALL_CMD = pip install
+PLAYBOOK = ansible-playbook
+VIRTUALENV_NAME = venv
+
+# Helper functions to display messages
+ECHO_BLUE = @echo "\033[33;34m $1\033[0m"
+ECHO_RED = @echo "\033[33;31m $1\033[0m"
+ECHO_GREEN = @echo "\033[33;32m $1\033[0m"
+
+# The default server host for local development
+HOST ?= localhost:8000
+
+
+virtualenv:
+# Create virtualenv
+	virtualenv -p python3 $(VIRTUALENV_NAME)
+
+requirements:
+# Install project requirements (use "." rather than "source"; SHELL is /bin/sh)
+	( \
+	. venv/bin/activate; \
+	$(PIP_INSTALL_CMD) -r requirements.txt; \
+	)
+
+migrate:
+# Run django migrations
+	( \
+	cd seeker; \
+	$(MANAGE_CMD) migrate; \
+	)
+
+user:
+# Create user account
+	( \
+	cd seeker; \
+	echo "from django.contrib.auth.models import User; User.objects.create_superuser('admin', 'admin@email.com', 'pass')" | ./manage.py shell; \
+	)
+
+test:
+# Run the test cases
+	( \
+	cd seeker; \
+	$(MANAGE_CMD) test; \
+	)
+
+clean:
+# Remove all *.pyc, .DS_Store and temp files from the project
+	$(call ECHO_BLUE,removing .pyc files...)
+	@find . -name '*.pyc' -exec rm -f {} \;
+	$(call ECHO_BLUE,removing static files...)
+	@rm -rf $(PROJECT_NAME)/_static/
+	$(call ECHO_BLUE,removing temp files...)
+	@rm -rf $(PROJECT_NAME)/_tmp/
+	$(call ECHO_BLUE,removing .DS_Store files...)
+	@find . -name '.DS_Store' -exec rm {} \;
+
+shell:
+# Run a local shell for debugging
+	( \
+	cd seeker; \
+	$(MANAGE_CMD) shell; \
+	)
+
+migrations:
+# Create database migrations
+	( \
+	cd seeker; \
+	$(MANAGE_CMD) makemigrations; \
+	)
+
+collectstatic:
+# Collect static assets
+	( \
+	cd seeker; \
+	$(MANAGE_CMD) collectstatic; \
+	)
+
+run:
+# Run django server
+	$(call ECHO_GREEN, Starting Django Server...)
+	( \
+	cd seeker; \
+	$(MANAGE_CMD) runserver; \
+	)

From 959ff6ed17b3a8eb372f838ed7612ff207847f4b Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Sun, 1 Apr 2018 20:00:09 -0400
Subject: [PATCH 3/7] Run spiders from Make tasks

---
 Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Makefile b/Makefile
index f1e94a5..c906647 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,7 @@ help:
 	@echo "migrations     Create database migrations"
 	@echo "collectstatic  Collect static assets"
 	@echo "run            Run Django Server"
+	@echo "crawl          Run Scrapy Spider"
 
 .PHONY: requirements
 
@@ -110,3 +111,10 @@ run:
 	$(MANAGE_CMD) runserver; \
 	)
 
+crawl:
+# Run scrapy spider
+	$(call ECHO_GREEN, Running $(spider) spider... )
+	(\
+	cd seeker; \
+	scrapy crawl $(spider); \
+	)

From 072e21e6caf291a00c1053b0cb0a4c84fa0c9248 Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Sun, 1 Apr 2018 20:39:09 -0400
Subject: [PATCH 4/7] Update xpaths

---
 Makefile | 10 +++++++++-
 seeker/jobs/models.py | 2 +-
 seeker/scraper/spiders/stackoverflow.py | 16 +++++----------
 seeker/seeker/templates/jobs/job_list.html | 1 +
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index c906647..661a2d8 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,7 @@ ECHO_GREEN = @echo "\033[33;32m $1\033[0m"
 # The default server host for local development
 HOST ?= localhost:8000
 
+reset: delete_sqlite migrate user run
 
 virtualenv:
 # Create virtualenv
@@ -114,7 +115,14 @@ run:
 crawl:
 # Run scrapy spider
 	$(call ECHO_GREEN, Running $(spider) spider... )
-	(\
+	( \
 	cd seeker; \
 	scrapy crawl $(spider); \
 	)
+
+delete_sqlite:
+# Delete project db
+	( \
+	cd seeker; \
+	rm -rf db.sqlite3; \
+	)

diff --git a/seeker/jobs/models.py b/seeker/jobs/models.py
index d747d0c..a629851 100644
--- a/seeker/jobs/models.py
+++ b/seeker/jobs/models.py
@@ -41,7 +41,7 @@ class Job(models.Model):
 
     class Meta:
-        """Order by date published."""
-        ordering = ["-pub_date"]
+        """Order by date scraped."""
+        ordering = ["-scrape_date"]
 
     def __str__(self):
         """Set title."""

diff --git a/seeker/scraper/spiders/stackoverflow.py b/seeker/scraper/spiders/stackoverflow.py
index ecbd6d5..43fef73 100644
--- a/seeker/scraper/spiders/stackoverflow.py
+++ b/seeker/scraper/spiders/stackoverflow.py
@@ -1,11 +1,7 @@
 import scrapy
-import re
-
 from scrapy.spiders import Spider
 from scrapy.selector import Selector
-# from scrapy.linkextractors import LinkExtractor
 from scraper.items import JobItem
-# from datetime import datetime
 from django.utils import timezone
 
 
@@ -22,13 +18,11 @@ def __init__(self, search_params='django python', *args, **kwargs):
         self.search_terms = search_params.split(",")
 
     def start_requests(self):
-        location = ""
-        distance = ""
-        search_query = "sort=p&q=%s&l=%s&d=%s&r=true"
+        search_query = "q=django&r=true&sort=p"
        base_url = "https://stackoverflow.com/jobs?"
         start_urls = []
-        start_urls.append(base_url + search_query % (self.search_terms, location, distance))
+        start_urls.append(base_url + search_query)
 
         return [scrapy.http.Request(url=start_url) for start_url in start_urls]
 
@@ -39,9 +33,9 @@ def parse(self, response):
         for job in jobs:
             item = JobItem()
             item["title"] = job.xpath('.//a[@class="job-link"]/text()').extract_first()
-            item["company"] = re.sub(r'\W+', '', job.xpath('.//div[@class="-name"]/text()').extract_first(default='n/a').strip())
-            item["body"] = job.xpath('.//div[@class="-name"]/text()').extract()[0].strip()
-            item["location"] = re.sub(r'\W+', '', job.xpath('.//div[@class="-location"]/text()').extract()[0].strip())
+            item["company"] = job.xpath('.//div[@class="-name"]/text()').extract()
+            item["body"] = job.xpath('.//div[@class="-name"]/text()').extract()
+            item["location"] = job.xpath('.//div[@class="-location"]/text()').extract()[0].strip()
             item["url"] = job.xpath('.//a[@class="job-link"]/@href').extract()[0]
             item["pub_date"] = job.xpath('.//p[contains(@class, "-posted-date")]/text()').extract()[0].strip()
             item["email"] = "N/A"

diff --git a/seeker/seeker/templates/jobs/job_list.html b/seeker/seeker/templates/jobs/job_list.html
index 4e1ac51..49fc7e1 100644
--- a/seeker/seeker/templates/jobs/job_list.html
+++ b/seeker/seeker/templates/jobs/job_list.html
@@ -22,6 +22,7 @@

{{ job.title }}
 {% endfor %}

From b725217049a8a50fe39a4f45e41685a94c5e14 Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Sun, 1 Apr 2018 23:18:36 -0400
Subject: [PATCH 5/7] Refactor stackoverflow spider to extract detail information

---
 seeker/scraper/pipelines.py | 2 +-
 seeker/scraper/spiders/stackoverflow.py | 33 ++++++++++---------
 .../includes/job_detail_sidebar.html | 9 ++++-
 seeker/seeker/templates/jobs/job_detail.html | 10 +-----
 seeker/seeker/templates/jobs/job_list.html | 2 +-
 5 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/seeker/scraper/pipelines.py b/seeker/scraper/pipelines.py
index 6f33ef0..e86d4d4 100644
--- a/seeker/scraper/pipelines.py
+++ b/seeker/scraper/pipelines.py
@@ -22,7 +22,7 @@ def process_item(self, item, spider):
         item['board'] = self.get_or_create_board(item['job_board'], item['board_url'])
         item['title'] = item['title']
         item['company'] = self.get_or_create_company(item['company'], item['email'])
-        item['body'] = item['body'][0]
+        item['body'] = "\n".join(item['body'])  # extract() returns a list; join it to restore the original HTML
         item['pub_date'] = item['pub_date']
         item['salary'] = item['salary']
         item['location'] = item['location']

diff --git a/seeker/scraper/spiders/stackoverflow.py b/seeker/scraper/spiders/stackoverflow.py
index 43fef73..c31cda5 100644
--- a/seeker/scraper/spiders/stackoverflow.py
+++ b/seeker/scraper/spiders/stackoverflow.py
@@ -2,6 +2,8 @@
 from scrapy.spiders import Spider
 from scrapy.selector import Selector
 from scraper.items import JobItem
+from scrapy.http import Request
+
 from django.utils import timezone
 
 
@@ -9,14 +11,6 @@ class StackOverflowSpider(Spider):
     name = "stackoverflow"
     allowed_domains = ["stackoverflow.com"]
 
-    def __init__(self, search_params='django python', *args, **kwargs):
-        super(StackOverflowSpider, self).__init__(*args, **kwargs)
-
-        if not search_params:
-            raise ValueError("No search terms given")
-
-        self.search_terms = search_params.split(",")
-
     def start_requests(self):
         search_query = "q=django&r=true&sort=p"
         base_url = "https://stackoverflow.com/jobs?"
@@ -27,19 +21,26 @@ def start_requests(self):
 
         return [scrapy.http.Request(url=start_url) for start_url in start_urls]
 
     def parse(self, response):
+        """Extract job detail urls from response."""
+        hxs = Selector(response)
+        urls = hxs.xpath('//a[@class="job-link"]/@href').extract()
+        for url in urls:
+            yield Request(response.urljoin(url), callback=self.parse_detail_pages, dont_filter=True)
+
+    def parse_detail_pages(self, response):
         hxs = Selector(response)
         jobs = hxs.xpath('//div[contains(@class, "-job-item")]')
         items = []
         for job in jobs:
             item = JobItem()
-            item["title"] = job.xpath('.//a[@class="job-link"]/text()').extract_first()
-            item["company"] = job.xpath('.//div[@class="-name"]/text()').extract()
-            item["body"] = job.xpath('.//div[@class="-name"]/text()').extract()
-            item["location"] = job.xpath('.//div[@class="-location"]/text()').extract()[0].strip()
-            item["url"] = job.xpath('.//a[@class="job-link"]/@href').extract()[0]
-            item["pub_date"] = job.xpath('.//p[contains(@class, "-posted-date")]/text()').extract()[0].strip()
-            item["email"] = "N/A"
-            item["salary"] = job.xpath('.//span[@class="-salary"]/text()').extract_first(default='n/a').strip()
+            item["title"] = job.xpath('//a[@class="title job-link"]/text()').extract_first()
+            item["company"] = job.xpath('//div[@class="-company g-row"]/div[@class="-name"]/a/text()').extract()
+            item["body"] = job.xpath('//section[@class="-job-description"]/node()').extract()
+            item["location"] = "\n".join(job.xpath('//div[@class="-company g-row"]/div[@class="-location"]/text()').extract())
+            item["url"] = response.request.url
+            item["pub_date"] = 'n/a'
+            item["email"] = 'n/a'
+            item["salary"] = job.xpath('//div[@class="-description"]/div[@class="-perks g-row"]/p[@class="-salary"]/text()').extract_first(default='n/a').strip()
             # item["tags"] = job.css('.-tags p a.post-tag::text').extract()
             item["scrape_date"] = timezone.now()
             item["job_board"] = "Stack Overflow"

diff --git a/seeker/seeker/templates/includes/job_detail_sidebar.html b/seeker/seeker/templates/includes/job_detail_sidebar.html
index 91277ea..9c54a50 100644
--- a/seeker/seeker/templates/includes/job_detail_sidebar.html
+++ b/seeker/seeker/templates/includes/job_detail_sidebar.html
@@ -12,6 +12,13 @@
 Search
+
+ Company
+ {{ job.company }}
+
 Location
 
 Contact
   • Website:
   • Phone:
-  • Email:
+  • Email: {{ job.company.email }}

diff --git a/seeker/seeker/templates/jobs/job_detail.html b/seeker/seeker/templates/jobs/job_detail.html
index bad97de..25af22b 100644
--- a/seeker/seeker/templates/jobs/job_detail.html
+++ b/seeker/seeker/templates/jobs/job_detail.html
@@ -7,17 +7,9 @@

 {{ job.title }}
 {{ job.pub_date|localtime }}
-{% if job.organization %}
- Organization
- {{ job.organization.title }}
- {% if job.organization.email %}
- {{ job.organization.email }}
- {% endif %}
-{% endif %}
- Source
- {{ job.board.title }}
- Posted {{ job.pub_date }} on {{ job.board }}
+ Posted {{ job.pub_date }} on {{ job.board }}

diff --git a/seeker/seeker/templates/jobs/job_list.html b/seeker/seeker/templates/jobs/job_list.html
index 49fc7e1..41049ce 100644
--- a/seeker/seeker/templates/jobs/job_list.html
+++ b/seeker/seeker/templates/jobs/job_list.html
@@ -16,7 +16,7 @@
 {{ job.title }}
-{{ job.body }}
+{{ job.body | safe }}
 View Details →
 {% endfor %}
+ {% include 'includes/pagination.html' %}
 {% endblock %}

diff --git a/seeker/seeker/templates/includes/pagination.html b/seeker/seeker/templates/includes/pagination.html
new file mode 100644
index 0000000..a4e4933
--- /dev/null
+++ b/seeker/seeker/templates/includes/pagination.html
@@ -0,0 +1,24 @@
+{% if is_paginated %}
+
+{% endif %}

diff --git a/seeker/seeker/templates/jobs/job_list.html b/seeker/seeker/templates/jobs/job_list.html
index 41049ce..c870575 100644
--- a/seeker/seeker/templates/jobs/job_list.html
+++ b/seeker/seeker/templates/jobs/job_list.html
@@ -16,7 +16,7 @@

 {{ job.title }}
-{{ job.body | safe }}
+
 View Details →
 {% endfor %}
+ {% include 'includes/pagination.html' %}
 {% include 'includes/job_sidebar.html' %}
 {% endblock %}

From d0489653453f48adc58c2ab7a5673dd2f254a30d Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Mon, 2 Apr 2018 12:00:25 -0400
Subject: [PATCH 7/7] Update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b27e316..40cc3d4 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 ## What is Seeker?
 Seeker aims not to be a job board for everyone, but a job board for you.
 
-Inevitably the time will come when you are on the hunt for a job, be it circumstance, looking for new opportunity, or just keeping an eye out for the dream job you don't know you want yet. Let Seeker do the leg work for you of checking multiple job boards for positions you might be interested in and organize them all in one convenient location.
+Inevitably the time will come when you are on the hunt for a new job. Let Seeker do the leg work for you: it checks multiple job boards for positions you might be interested in and organizes them all in one convenient location.
 
 Currently, the search terms are hard coded to remote django and remote python positions - you'll need to manually update these for now.