Merge pull request #8 from NathanWorkman/release/v0.2.0
Release/v0.2.0
NathanWorkman authored Apr 2, 2018
2 parents 45fa03b + d048965 commit fe9856d
Showing 13 changed files with 238 additions and 46 deletions.
128 changes: 128 additions & 0 deletions Makefile
@@ -0,0 +1,128 @@
PROJECT_NAME = seeker
SHELL := /bin/sh
help:
@echo "Please use 'make <target>' where <target> is one of"
@echo "virtualenv Create virtual enviroment"
@echo "requirements Install requirements.txt"
@echo "migrate Run Django migrations"
@echo "user Create user account"
@echo "test Run tests"
@echo "clean Remove all *.pyc, .DS_Store and temp files from the project"
@echo "shell Open Django shell"
@echo "migrations Create database migrations"
@echo "collectstatic Collect static assets"
@echo "run Run Django Server"
@echo "crawl <spidername> Run Scrapy Spider"

.PHONY: requirements


# Command variables
MANAGE_CMD = python manage.py
PIP_INSTALL_CMD = pip install
PLAYBOOK = ansible-playbook
VIRTUALENV_NAME = venv

# Helper functions to display messages
ECHO_BLUE = @echo "\033[33;34m $1\033[0m"
ECHO_RED = @echo "\033[33;31m $1\033[0m"
ECHO_GREEN = @echo "\033[33;32m $1\033[0m"

# The default server host for local development
HOST ?= localhost:8000

reset: delete_sqlite migrate user run

virtualenv:
# Create virtualenv
virtualenv -p python3 $(VIRTUALENV_NAME)

requirements:
# Install project requirements
( \
. $(VIRTUALENV_NAME)/bin/activate; \
$(PIP_INSTALL_CMD) -r requirements.txt; \
)

migrate:
# Run django migrations
( \
cd seeker; \
$(MANAGE_CMD) migrate; \
)

user:
# Create user account
( \
cd seeker; \
echo "from django.contrib.auth.models import User; User.objects.create_superuser('admin', 'admin@email.com', 'pass')" | ./manage.py shell; \
)

test:
# Run the test cases
( \
cd seeker; \
$(MANAGE_CMD) test; \
)

clean:
# Remove all *.pyc, .DS_Store and temp files from the project
$(call ECHO_BLUE,removing .pyc files...)
@find . -name '*.pyc' -exec rm -f {} \;
$(call ECHO_BLUE,removing static files...)
@rm -rf $(PROJECT_NAME)/_static/
$(call ECHO_BLUE,removing temp files...)
@rm -rf $(PROJECT_NAME)/_tmp/
$(call ECHO_BLUE,removing .DS_Store files...)
@find . -name '.DS_Store' -exec rm {} \;

shell:
# Run a local shell for debugging
( \
cd seeker; \
$(MANAGE_CMD) shell; \
)

migrations:
# Create database migrations
( \
cd seeker; \
$(MANAGE_CMD) makemigrations; \
)

collectstatic:
# Collect static assets
( \
cd seeker; \
$(MANAGE_CMD) collectstatic; \
)

run:
# Run Django server
$(call ECHO_GREEN, Starting Django Server...)
( \
cd seeker; \
$(MANAGE_CMD) runserver; \
)

crawl:
# Run Scrapy spider
$(call ECHO_GREEN, Running $(spider) spider... )
(\
cd seeker; \
scrapy crawl $(spider); \
)

delete_sqlite:
# Delete the project database
( \
cd seeker; \
rm -f db.sqlite3; \
)
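The `crawl` target reads the spider name from the `spider` make variable, so a run against the Stack Overflow spider shipped in this release would look like:

```
make crawl spider=stackoverflow
```

A variable assignment given on the `make` command line overrides any value set inside the Makefile, which is what makes this pattern work.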
53 changes: 47 additions & 6 deletions README.md
@@ -1,12 +1,17 @@
# :sunglasses: seeker [WIP]
# :sunglasses: Seeker [WIP]

Another job board aggregator. This is very much a work in progress and documentation is still incomplete.
## What is Seeker?
Seeker aims not to be a job board for everyone, but a job board for you.

Right now the search terms are hard coded to remote django and python positions.
Inevitably, the time will come when you are on the hunt for a new job. Let Seeker do the legwork for you: it checks multiple job boards for positions you might be interested in and organizes them all in one convenient location.

Currently, the search terms are hard-coded to remote Django and remote Python positions; you'll need to update them manually for now.

To change the search terms, edit the query inside each spider.
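For example, the Stack Overflow spider added in this release builds its URL from a hard-coded query string in `start_requests`; these are the lines to tweak (parameter meanings inferred from the README rather than documented):

```
# seeker/scraper/spiders/stackoverflow.py
search_query = "q=django&r=true&sort=p"  # q = search terms; r=true presumably restricts to remote roles
base_url = "https://stackoverflow.com/jobs?"
```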

Some quick setup instructions:
## Setup

Some quick setup instructions

I would recommend installing [virtualenv](https://virtualenv.readthedocs.io/).

@@ -25,7 +30,7 @@ python manage.py runserver
```

### To run the spiders
From the root directory of the `seeker` project `cd` to seeker
From the root directory of the `seeker` project `cd` to the seeker app directory.

```
cd seeker
Expand All @@ -43,4 +48,40 @@ scrapy crawl stackoverflow
```


Navigate to the Django admin to view your results.


## TODO

#### Future features
- [x] Simple UI
- [ ] Enhanced UI
- [x] Pagination
- [ ] Breadcrumbs Navigation
- [ ] Settings Panel
- [ ] Move all environment variables to .env using PyEnv.
- [ ] Save/Favorite Job Postings
- [ ] Tag/Skill Views
- [ ] Full-Time, Part-Time, Contract
- [ ] Email Notifications - send daily, weekly, monthly email notifications of new job postings.
- [ ] Celery Beat - run spiders on a schedule.

#### Spiders
Want a spider not listed here? Feel free to open a pull request and add it to the list, or implement the spider yourself; a minimal skeleton is sketched after this list.
- [x] [Stack Overflow](https://www.stackoverflow.com/jobs)
- [ ] [Indeed](https://www.indeed.com)
- [ ] [Dice](http://dice.com)
- [ ] [Angel.co](https://angel.co/)
- [ ] [RemotePython](https://www.remotepython.com)
- [ ] [DjangoJobs](https://djangojobs.net/jobs/)
- [ ] [DjangoGigs](https://djangogigs.com)
- [ ] [Jobspresso](http://jobspresso.co)
- [ ] [Authentic Jobs](http://authenticjobs.com/)
- [ ] [We Work Remotely](https://weworkremotely.com/)
- [ ] [Remotive](https://remotive.io)
- [ ] [Python.org](https://www.python.org/jobs/)
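A minimal skeleton for a new spider, modeled on the Stack Overflow spider in this diff. The board name, domain, URL, and XPath selectors below are placeholders to replace for the target board; the field assignments mirror what the pipeline's `process_item` expects:

```
# seeker/scraper/spiders/exampleboard.py -- hypothetical skeleton, not part of this release
import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request

from django.utils import timezone

from scraper.items import JobItem


class ExampleBoardSpider(Spider):
    name = "exampleboard"              # placeholder; run with `scrapy crawl exampleboard`
    allowed_domains = ["example.com"]  # placeholder domain

    def start_requests(self):
        # Placeholder search URL; this is where the hard-coded query would live.
        yield Request(url="https://example.com/jobs?q=django")

    def parse(self, response):
        """Follow each job link found on the listing page (placeholder selector)."""
        for url in response.xpath('//a[@class="job-link"]/@href').extract():
            yield Request(response.urljoin(url), callback=self.parse_detail_pages, dont_filter=True)

    def parse_detail_pages(self, response):
        item = JobItem()
        # Placeholder selectors; the keys mirror those consumed by the pipeline.
        item["title"] = response.xpath("//h1/text()").extract_first(default="n/a")
        item["company"] = response.xpath('//div[@class="company"]/text()').extract_first(default="n/a")
        item["body"] = response.xpath('//section[@class="description"]/node()').extract()
        item["location"] = "n/a"
        item["url"] = response.request.url
        item["pub_date"] = "n/a"
        item["email"] = "n/a"
        item["salary"] = "n/a"
        item["scrape_date"] = timezone.now()
        item["job_board"] = "Example Board"  # placeholder board name
        item["board_url"] = "example.com"    # placeholder board URL
        yield item
```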

3 changes: 2 additions & 1 deletion seeker/companies/views.py
@@ -11,4 +11,5 @@ class CompanyDetailView(DetailView):

class CompanyListView(ListView):
"""Companies List View."""
model = Company
paginate_by = 10
queryset = Company.objects.all()
2 changes: 1 addition & 1 deletion seeker/jobs/models.py
@@ -41,7 +41,7 @@ class Job(models.Model):

class Meta:
"""Order by date published."""
ordering = ["-pub_date"]
ordering = ["-scrape_date"]

def __str__(self):
"""Set title."""
3 changes: 2 additions & 1 deletion seeker/jobs/views.py
@@ -11,4 +11,5 @@ class JobDetailView(DetailView):

class JobListView(ListView):
"""Job List View."""
model = Job
paginate_by = 10
queryset = Job.objects.all()
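With `paginate_by` set, Django's `ListView` adds `paginator`, `page_obj`, and `is_paginated` to the template context, which the `pagination.html` include added later in this diff renders. A quick sanity check from `manage.py shell`, assuming the job list is routed at `/jobs/` (the URL configuration is not part of this diff):

```
# Hypothetical check; the /jobs/ route is an assumption, not shown in this diff.
from django.test import Client

response = Client().get("/jobs/")
print(response.context["is_paginated"])     # True once more than 10 jobs exist
print(response.context["page_obj"].number)  # current page, consumed by pagination.html
```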
2 changes: 1 addition & 1 deletion seeker/scraper/pipelines.py
@@ -22,7 +22,7 @@ def process_item(self, item, spider):
item['board'] = self.get_or_create_board(item['job_board'], item['board_url'])
item['title'] = item['title']
item['company'] = self.get_or_create_company(item['company'], item['email'])
item['body'] = item['body'][0]
item['body'] = "\n".join(item['body']) # extract() will return a list, which you need to concatenate to restore the original html
item['pub_date'] = item['pub_date']
item['salary'] = item['salary']
item['location'] = item['location']
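The `get_or_create_board` and `get_or_create_company` helpers used above are outside this diff. Assuming they wrap Django's `get_or_create` on the `Board` and `Company` models (and given that the templates read `company.title` and `company.email`), each is presumably something like:

```
# Assumed shape of the helper used above; the real implementation is not shown in this diff.
def get_or_create_company(self, title, email):
    company, _ = Company.objects.get_or_create(title=title, defaults={"email": email})
    return company
```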
43 changes: 19 additions & 24 deletions seeker/scraper/spiders/stackoverflow.py
@@ -1,51 +1,46 @@
import scrapy
import re

from scrapy.spiders import Spider
from scrapy.selector import Selector
# from scrapy.linkextractors import LinkExtractor
from scraper.items import JobItem
# from datetime import datetime
from scrapy.http import Request

from django.utils import timezone


class StackOverflowSpider(Spider):
name = "stackoverflow"
allowed_domains = ["stackoverflow.com"]

def __init__(self, search_params='django python', *args, **kwargs):
super(StackOverflowSpider, self).__init__(*args, **kwargs)

if not search_params:
raise ValueError("No search terms given")

self.search_terms = search_params.split(",")

def start_requests(self):
location = ""
distance = ""
search_query = "sort=p&q=%s&l=%s&d=%s&r=true"
search_query = "q=django&r=true&sort=p"
base_url = "https://stackoverflow.com/jobs?"
start_urls = []

start_urls.append(base_url + search_query % (self.search_terms, location, distance))
start_urls.append(base_url + search_query)

return [scrapy.http.Request(url=start_url) for start_url in start_urls]

def parse(self, response):
"""Extract job detail urls from response."""
hxs = Selector(response)
urls = hxs.xpath('//a[@class="job-link"]/@href').extract()
for url in urls:
yield Request('https://www.stackoverflow.com/' + url, callback=self.parse_detail_pages, dont_filter=True)

def parse_detail_pages(self, response):
hxs = Selector(response)
jobs = hxs.xpath('//div[contains(@class, "-job-item")]')
items = []
for job in jobs:
item = JobItem()
item["title"] = job.xpath('.//a[@class="job-link"]/text()').extract_first()
item["company"] = re.sub(r'\W+', '', job.xpath('.//div[@class="-name"]/text()').extract_first(default='n/a').strip())
item["body"] = job.xpath('.//div[@class="-name"]/text()').extract()[0].strip()
item["location"] = re.sub(r'\W+', '', job.xpath('.//div[@class="-location"]/text()').extract()[0].strip())
item["url"] = job.xpath('.//a[@class="job-link"]/@href').extract()[0]
item["pub_date"] = job.xpath('.//p[contains(@class, "-posted-date")]/text()').extract()[0].strip()
item["email"] = "N/A"
item["salary"] = job.xpath('.//span[@class="-salary"]/text()').extract_first(default='n/a').strip()
item["title"] = job.xpath('//a[@class="title job-link"]/text()').extract_first()
item["company"] = job.xpath('//div[@class="-company g-row"]/div[@class="-name"]/a/text()').extract()
item["body"] = job.xpath('//section[@class="-job-description"]/node()').extract()
item["location"] = "\n".join(job.xpath('//div[@class="-company g-row"]/div[@class="-location"]/text()').extract())
item["url"] = response.request.url
item["pub_date"] = str('n/a')
item["email"] = str('n/a')
item["salary"] = job.xpath('//div[@class="-description"]/div[@class="-perks g-row"]/p[@class="-salary"]/text()').extract_first(default='n/a').strip()
# item["tags"] = job.css('.-tags p a.post-tag::text').extract()
item["scrape_date"] = timezone.now()
item["job_board"] = "Stack Overflow"
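`scraper/items.py` is not part of this diff, but the fields assigned by the spider and consumed by the pipeline pin down its rough shape; a reconstruction under that assumption:

```
# seeker/scraper/items.py -- assumed definition, reconstructed from the fields used above
import scrapy


class JobItem(scrapy.Item):
    title = scrapy.Field()
    company = scrapy.Field()
    body = scrapy.Field()
    location = scrapy.Field()
    url = scrapy.Field()
    pub_date = scrapy.Field()
    email = scrapy.Field()
    salary = scrapy.Field()
    scrape_date = scrapy.Field()
    job_board = scrapy.Field()
    board = scrapy.Field()
    board_url = scrapy.Field()
```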
2 changes: 1 addition & 1 deletion seeker/seeker/templates/base.html
@@ -1,5 +1,5 @@
<!DOCTYPE html> {% load i18n staticfiles %}
<html lang="{% get_current_language as language %}{{ language }}">
<html lang="en">

<head>
<meta charset="utf-8">
1 change: 1 addition & 0 deletions seeker/seeker/templates/companies/company_list.html
@@ -21,5 +21,6 @@ <h2 class="card-title"><a href="{{ company.get_absolute_url }}">{{ company.title
</div>
</div>
{% endfor %}
{% include 'includes/pagination.html' %}
</div>
{% endblock %}
9 changes: 8 additions & 1 deletion seeker/seeker/templates/includes/job_detail_sidebar.html
@@ -12,6 +12,13 @@ <h5 class="card-header">Search</h5>
</div>
</div>
</div>
<!-- Contact Widget -->
<div class="card my-4">
<h5 class="card-header">Company</h5>
<div class="card-body">
{{ job.company }}
</div>
</div>
<!-- Location Widget -->
<div class="card my-4">
<h5 class="card-header">Location</h5>
@@ -40,7 +47,7 @@ <h5 class="card-header">Contact</h5>
<ul class="list-unstyled">
<li>Website:</li>
<li>Phone:</li>
<li>Email:</li>
<li>Email: {{ job.company.email }}</li>
</ul>
</div>
</div>
24 changes: 24 additions & 0 deletions seeker/seeker/templates/includes/pagination.html
@@ -0,0 +1,24 @@
{% if is_paginated %}
<nav class="m-4">
<ul class="pagination justify-content-center">
{% if page_obj.has_previous %}
<li class="page-item"><a href="?page={{ page_obj.previous_page_number }}" class="page-link">Previous</a></li>
{% else %}
<li class="page-item disabled"><a href="" class="page-link disabled">Previous</a></li>
{% endif %}
{% for i in paginator.page_range %}
{% if page_obj.number == i %}
<li class="page-item active"><span class="page-link">{{ i }} <span class="sr-only">(current)</span></span>
</li>
{% else %}
<li class="page-item"><a href="?page={{ i }}" class="page-link">{{ i }}</a></li>
{% endif %}
{% endfor %}
{% if page_obj.has_next %}
<li class="page-item"><a href="?page={{ page_obj.next_page_number }}" class="page-link">Next</a></li>
{% else %}
<li class="page-item disabled"><span class="page-link">Next</span></li>
{% endif %}
</ul>
</nav>
{% endif %}
10 changes: 1 addition & 9 deletions seeker/seeker/templates/jobs/job_detail.html
@@ -7,17 +7,9 @@
<div class="col-md-8">
<h2>{{ job.title }}</h2>
<p>{{ job.pub_date|localtime }}</p>
{% if job.organization %}
<h2>Organization</h2>
<p><a href="{{ job.organization.get_absolute_url }}">{{ job.organization.title }}</a></p>
{% if job.organization.email %}
<p><a href="mailto:{{ job.organization.email }}">{{ job.organization.email }}</a></p>
{% endif %} {% endif %}
<h2>Source</h2>
<p><a href="{{ job.url }}" rel="external">{{ job.board.title }}</a></p>
<hr>
<!-- Date/Time -->
<p> Posted {{ job.pub_date }} on <a href="https://{{ job.board.url }}{{ job.url }}" target="_blank">{{ job.board }}</a></p>
<p> Posted {{ job.pub_date }} on <a href="{{ job.url }}" target="_blank">{{ job.board }}</a></p>
<hr>
<!-- Preview Image -->
<img class="img-fluid rounded" src="http://placehold.it/900x300" alt="">