-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
75 lines (57 loc) · 2.82 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import urllib2
import json
from bs4 import BeautifulSoup
from parser import Parser
from inserter import Inserter
# Takes about 2 minutes to crunch data of all the departments' offerings
class Department:
    """Forward-only cursor over a fixed list of department codes.

    The crawler passes each code as the COURSE_CODE value when requesting a
    department's offerings page.

    Attributes are plain and public:
      departments -- the list of code strings, in crawl order
      count       -- number of times next() has been called so far
      length      -- len(departments), captured at construction time
    """

    def __init__(self):
        self.departments = [
            "ACC", "ADA", "AMER", "ARCH", "BF", "BIM", "BTE", "CAA", "CAD",
            "CHEM", "CI", "CINT", "CITE", "COMD", "CS", "CTE", "CTIS", "CTP",
            "DIR", "ECON", "EDEB", "EE", "EEE", "EEPS", "ELIT", "ELS", "EM",
            "EMBA", "ENG", "ETE", "ETS", "FA", "FRE", "FRL", "FRP", "GE",
            "GER", "GIA", "GRA", "HART", "HCIV", "HIST", "HISTR", "HUM",
            "IAED", "IE", "IR", "ITA", "JAP", "LAUD", "LAW", "LNG", "MAN",
            "MATH", "MBA", "MBG", "ME", "MIAPP", "MSC", "MSG", "MSN", "MTE",
            "MUS", "MUSS", "NSC", "PE", "PHIL", "PHYS", "PNT", "POLS", "PREP",
            "PSYC", "RUS", "SFL", "SOC", "SPA", "TE", "TEFL", "THEA", "THM",
            "THR", "THS", "TOEFL", "TRIN", "TRK", "TTP", "TURK",
        ]
        self.count = 0
        self.length = len(self.departments)

    def next(self):
        """Return the next department code, or '' once the list is used up.

        Exhaustion is signalled with the empty string rather than an
        exception; callers loop until they see ''. The call counter is
        advanced on every call, including calls made after exhaustion.
        """
        self.count += 1
        exhausted = self.count > self.length
        return '' if exhausted else self.departments[self.count - 1]
def get_pretty_source(departmentCode, semester):
    """Download and parse the plain offerings page for one department.

    Args:
        departmentCode: String sent as the COURSE_CODE query value.
        semester: String sent as the SEMESTER query value. Both arguments
            are concatenated into the URL as-is (no escaping), so they must
            already be URL-safe strings.

    Returns:
        The BeautifulSoup tree built from the response body with the
        'html.parser' backend.

    Raises:
        Errors from urllib2.urlopen / read (e.g. urllib2.URLError) are not
        caught here and propagate to the caller.
    """
    url = 'https://stars.bilkent.edu.tr/homepage/print/plainOfferings.php?COURSE_CODE='
    url2 = '&SEMESTER='
    response = urllib2.urlopen(url + departmentCode + url2 + semester)
    try:
        page = response.read()
    finally:
        # This function is called once per department; close each response
        # explicitly instead of leaving the handle to garbage collection.
        # (try/finally because Python 2 responses are not context managers.)
        response.close()
    # prettify() only returns a formatted string and does not modify the
    # tree, so the former call whose result was thrown away is omitted.
    return BeautifulSoup(page, 'html.parser')
def get_courses(departmentCode, parser):
    """Collect the course records found on one department's offerings page.

    Args:
        departmentCode: Department code forwarded to get_pretty_source.
        parser: Object exposing set_td_list(td_list) and get_course(). It is
            handed the <td> cells of one table row at a time and asked for
            the resulting course.

    Returns:
        A list holding one parser.get_course() result for every <tr> on the
        page after the first two, in page order. Empty when the page has
        two rows or fewer.

    Note:
        The semester is not a parameter: it is read from the module-level
        `semester` variable, which must be assigned before the first call.
    """
    pageSource = get_pretty_source(departmentCode, semester)
    # The first two <tr> elements are skipped -- presumably header rows;
    # confirm against the page markup before changing the offset.
    rows = pageSource.find_all('tr')[2:]
    currentCourses = []
    for row in rows:
        parser.set_td_list(row.find_all("td"))
        currentCourses.append(parser.get_course())
    return currentCourses
# --- Driver: crawl every department and optionally store the courses ---

# TODO: Add logic to set the semester to spring if the month is
# January/February, fall if month is July/August/September/October, and
# May and June for summer.
# Until then the semester comes from the 'semester' key of config.json.
# get_courses() reads this module-level name, so it must be set before the
# crawl loop starts.
with open('config.json') as config:
    data = json.load(config)
semester = data['semester']

# One Parser instance is shared by all departments.
parser = Parser()

writeToMongo = True  # Change to False not to write to mongo database.
if writeToMongo:
    # Check for options for initialization of Inserter, options like where
    # to write, db. The collection is named after the semester.
    inserter = Inserter(collectionName=semester)
    collection = inserter.getCollection()

departments = Department()
while True:
    departmentCode = departments.next()
    if departmentCode == '':
        # Department.next() returns '' once every code has been handed out.
        break

    currentCourses = get_courses(departmentCode, parser)
    departmentData = {'currentCourses': currentCourses}
    # Serialized form of this department's courses; not consumed below,
    # left available for other uses.
    jsonData = json.dumps(departmentData)  # All yours

    if not writeToMongo:
        continue
    for course in currentCourses:
        # Courses are stored one document at a time.
        sectionID = collection.insert_one(course).inserted_id