MIT OCW Scraper.py
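"""
MIT OCW Scraper

Scrapes MIT OpenCourseWare (https://ocw.mit.edu) by topic: for a chosen
topic it downloads each subject's course-material ZIP archive into
courses/<topic>/ and appends the subject's details to scraper.csv.
"""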
import requests
import csv
import json
import os
from time import sleep
from bs4 import BeautifulSoup
# import urllib  # unused: see the note at the download step below
def add_data(topic, subTopic, speciality, courseName, courseId, link, resourceType, name, path):
    data = [topic, subTopic, speciality, courseName, courseId, link, resourceType, name, path]
    # newline='' stops the csv module from writing blank rows on Windows
    with open("scraper.csv", "a", newline="") as fp:
        wr = csv.writer(fp, dialect='excel')
        wr.writerow(data)
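# Fetch the top-level topic index, retrying on network errors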
topics = []
mainURL = "https://ocw.mit.edu/courses/find-by-topic/topics.json"
print("Listing the topics...")
loop1 = True
while loop1:
    try:
        r = requests.get(mainURL)
        loop1 = False
    # requests exceptions subclass IOError, so ConnectionError must be caught first
    except requests.exceptions.ConnectionError:
        print("Proxy error. Sleeping for 2 seconds")
        sleep(2)
        continue
    except IOError:
        print("Socket error. Sleeping for 2 seconds")
        sleep(2)
        continue
jsonTexts = json.loads(r.content.decode())
for jsonText in jsonTexts:
    topics.append([jsonText["name"], jsonText["file"]])
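# Show a numbered topic menu and prompt for a choice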
print("\nThe following topics are available to scrape:\n")
topicsCount = len(topics)
for i in range(topicsCount):
    print(str(i + 1) + ") " + topics[i][0])
topicNumber = 0
while topicNumber == 0:
    # Re-prompt until the number is within range (also rejects zero and negatives)
    topicNumber = int(input("\nPlease enter the number of the topic you want to scrape: \n"))
    if topicNumber > topicsCount or topicNumber < 1:
        topicNumber = 0
selectedTopicNumber = topicNumber - 1
selectedTopicName = topics[selectedTopicNumber][0]
selectedTopicUrl = "https://ocw.mit.edu/courses/find-by-topic/" + topics[selectedTopicNumber][1]
print("Scraping the topic " + selectedTopicName + ". (Depending on the number of subjects in the topic, scraping might take some time.)\n")
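# Fetch the JSON listing of subjects under the selected topic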
loop2 = True
while loop2:
    try:
        r = requests.get(selectedTopicUrl)
        loop2 = False
    except requests.exceptions.ConnectionError:
        print("Proxy error. Sleeping for 2 seconds")
        sleep(2)
        continue
    except IOError:
        print("Socket error. Sleeping for 2 seconds")
        sleep(2)
        continue
json_texts = json.loads(r.content.decode())
subjectsCount = len(json_texts)
print("Total subjects in topic " + selectedTopicName + ": " + str(subjectsCount) + ".\n")
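# Ask how many of the listed subjects to scrape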
subjectsToScrape = 0
while subjectsToScrape == 0:
    subjectsToScrape = int(input("How many subjects do you want to scrape? \n"))
    if subjectsToScrape > subjectsCount or subjectsToScrape < 1:
        subjectsToScrape = 0
print("Starting to scrape " + str(subjectsToScrape) + " subjects.")
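# For each selected subject: download its study materials (if any) and log its details to scraper.csv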
for i in range(subjectsToScrape):
    courseName = json_texts[i]['title']
    courseLink = json_texts[i]['href']
    courseId = json_texts[i]['id']
    print(str(i + 1) + ") Scraping " + courseName + ".")
    if not json_texts[i]["textbooks"]:
        # Courses and standalone resources use different download URLs
        if courseLink.startswith("courses"):
            finalUrl = "https://ocw.mit.edu/" + courseLink + "/download-course-materials/"
        elif courseLink.startswith("resources"):
            finalUrl = "https://ocw.mit.edu/" + courseLink + "/download-resource-materials/"
        else:
            continue
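        # Fetch the subject's download page, retrying on network errors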
        loop3 = True
        while loop3:
            try:
                r = requests.get(finalUrl)
                loop3 = False
            except requests.exceptions.ConnectionError:
                print("Proxy error. Sleeping for 2 seconds")
                sleep(2)
                continue
            except IOError:
                print("Socket error. Sleeping for 2 seconds")
                sleep(2)
                continue
        soup1 = BeautifulSoup(r.text, "html.parser")
        if len(soup1.select("a.downloadNowButton")) >= 1:
            print("Downloading the study material for this subject.")
            downloadUrl = soup1.select("a.downloadNowButton")[0]['href']
            downloadUrl = "https://ocw.mit.edu" + downloadUrl
            # Make sure the per-topic output directory exists before writing
            os.makedirs(os.path.join('courses', selectedTopicName), exist_ok=True)
            fullfilename = os.path.join('courses', selectedTopicName, courseId + ".zip")
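            # Download the ZIP archive with requests, retrying until it succeeds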
            loop4 = True
            while loop4:
                try:
                    # Because of proxy problems, urllib could not be used here:
                    # urllib.request.urlretrieve(downloadUrl, fullfilename)
                    r = requests.get(downloadUrl)
                    with open(fullfilename, "wb") as zipFile:
                        zipFile.write(r.content)
                    loop4 = False
                except requests.exceptions.ConnectionError:
                    print("Proxy error. Sleeping for 5 seconds")
                    sleep(5)
                    continue
                except IOError as e:
                    print(e)
                    print("Socket error. Sleeping for 5 seconds")
                    sleep(5)
                    continue
            print("Download finished, adding details to CSV.\n")
            # 'details' (not 'topics') avoids shadowing the topic list above
            for details in json_texts[i]['topics']:
                subTopic = details['subCat']
                speciality = details['speciality']
                add_data(selectedTopicName, subTopic, speciality, courseName, courseId, courseLink, "Study Material", ' ', fullfilename)
        else:
            for details in json_texts[i]['topics']:
                subTopic = details['subCat']
                speciality = details['speciality']
                add_data(selectedTopicName, subTopic, speciality, courseName, courseId, courseLink, " ", ' ', ' ')
            print("No course material available for this subject. Adding subject details to CSV.\n")
    else:
        for details in json_texts[i]['topics']:
            subTopic = details['subCat']
            speciality = details['speciality']
            add_data(selectedTopicName, subTopic, speciality, courseName, courseId, courseLink, " ", ' ', ' ')
        print("No course material available for this subject. Adding subject details to CSV.\n")
print("Scraping all " + str(subjectsToScrape) + " subjects done (y).\n")
print("Thank you for using this script. For any issues, please mail sunil@suniltatipelly.in :)\n")