-
Notifications
You must be signed in to change notification settings - Fork 0
/
Project 4E 1b.py
143 lines (99 loc) · 4.47 KB
/
Project 4E 1b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from bs4 import BeautifulSoup
import requests
from time import strftime
import os
##Site-specific setting for nytimes.com: the class string used
##to select the <p> tags that hold the article body.
##
##TODO: build a dictionary of such settings keyed by news site,
##so the user can type in a site name and the program adapts
##itself to that site.
##
##For now, only nytimes.com is supported.
CONTENT_KEYWORD = "story-body-text story-content"
def urlOpenAndWrite(url):
    """Download *url* and save the raw response text to a timestamped file.

    Args:
        url: Address of the page to fetch with ``requests.get``.

    Returns:
        The name of the ``.txt`` file (created in the current working
        directory) that holds the raw HTML, or ``None`` if the download
        or the write failed. A message is printed on failure.
    """
    try:
        ##Pulls in website content using the 'requests' library. The
        ##timeout keeps an unresponsive server from hanging the script.
        res = requests.get(url, timeout=30)
        ##Creates a timestamp-based name for the raw HTML file. Hyphens
        ##separate the time fields because ':' is not a legal character
        ##in Windows file names.
        fileToParse = strftime("%a, %d %b %Y %H-%M-%S") + ".txt"
        ##'with' closes the file even if the write raises
        with open(fileToParse, "w") as target:
            target.write(res.text)
        ##Gives the raw HTML file's name back
        return fileToParse
    except Exception:
        ##Best-effort by design: report and signal failure with None,
        ##but let KeyboardInterrupt/SystemExit propagate.
        print("Something happened with urlOpenAndWrite. Possible bad link? Bad raw file name?")
        return None
def sniffAndRetrieveContent(fileToParse):
    """Parse a saved HTML file and pull out the article title and text.

    Args:
        fileToParse: Path of a file containing raw HTML.

    Returns:
        A tuple ``(listOfWholeLines, savedStoryName)``. The first item is
        the list of text fragments found inside every ``<p>`` tag whose
        class matches ``CONTENT_KEYWORD``, in document order. The second
        is the page title with ``".txt"`` appended. Returns ``None``
        (after printing a message) if anything fails, for example when
        the file cannot be opened or the page has no usable title.
    """
    try:
        ##Converts raw HTML into a tree object to step through. The
        ##parser consumes the whole file in the constructor, so the
        ##handle can be closed right away.
        with open(fileToParse) as html_doc:
            soup = BeautifulSoup(html_doc, 'html.parser')
        ##Create the final file title from the article
        savedStoryName = soup.title.string + ".txt"
        ##Steps through the tree scraping only article content
        elements = soup.find_all("p", class_=CONTENT_KEYWORD)
        listOfWholeLines = []
        for paragraph in elements:
            for node in paragraph:
                text = node.string
                if text is None:
                    ##.string is None for a tag that has several
                    ##children (or none); fall back to its combined
                    ##text so no None ends up in the list.
                    text = node.get_text()
                listOfWholeLines.append(text)
        ##Returns the story as a list of text fragments together
        ##with the final file name
        return listOfWholeLines, savedStoryName
    except Exception:
        ##Best-effort by design: report and signal failure with None,
        ##but let KeyboardInterrupt/SystemExit propagate.
        print("Something happened with sniffAndRetrieveContent. Possible issue with content keyword?")
        return None
def compileContent(listOfWholeLines, savedStoryName):
    """Concatenate the scraped text fragments into one string.

    A single string is easy to post-process if the user later wants to
    split the article and index its terms.

    Known limitation: the fragments are joined with no separator, so
    whatever spacing is missing between them (for example after a
    period that ends a fragment) is also missing in the result.

    Args:
        listOfWholeLines: Iterable of strings making up the story.
        savedStoryName: Unused here; kept so existing callers that pass
            the story name keep working.

    Returns:
        The whole story as one string, or ``None`` (after printing a
        message) if the input is not an iterable of strings.
    """
    try:
        ##str.join builds the result in one pass instead of creating a
        ##new, longer string for every fragment.
        return "".join(listOfWholeLines)
    except TypeError:
        ##Raised for a None/non-iterable argument or a non-string item
        print("Something happened with compileContent, perhaps something's wrong with the compiled list?")
        return None
def commitContent(wholeStory, savedStoryName):
    """Write the story to a text file named after the article.

    Args:
        wholeStory: The story as a single string.
        savedStoryName: Name (path) of the ``.txt`` file to create or
            overwrite.

    Returns:
        ``None``. On failure a message is printed and no exception is
        raised to the caller.
    """
    try:
        ##'with' closes the file even when the write raises, e.g. when
        ##wholeStory is None because an earlier step failed.
        with open(savedStoryName, "w") as target:
            target.write(wholeStory)
    except Exception:
        ##Best-effort by design: report the problem, but let
        ##KeyboardInterrupt/SystemExit propagate.
        print("Something happened with commitContent, perhaps something's off with wholeStory? or savedStoryName?")
def newsScrape(url):
    """Scrape one article and save its text to a file named after it.

    Runs the full pipeline: download the page to a raw file, extract the
    title and article text, join the text into one string, write it to
    ``<title>.txt``, then delete the raw file.

    Args:
        url: URL of a single article.

    Returns:
        ``None``. On any failure a message is printed and no exception
        is raised to the caller.
    """
    try:
        rawFile = urlOpenAndWrite(url)
        try:
            ##A failed step returns None, which makes this unpacking
            ##raise and lands in the handler below.
            storyLines, name = sniffAndRetrieveContent(rawFile)
            wholeStory = compileContent(storyLines, name)
            commitContent(wholeStory, name)
        finally:
            ##The raw file is not needed once the story has been
            ##rendered; remove it even when a later step failed so
            ##no stray download is left behind.
            if rawFile is not None:
                os.remove(rawFile)
    except Exception:
        ##Best-effort by design: report the problem, but let
        ##KeyboardInterrupt/SystemExit propagate.
        print("Something happened with newsScrape, did you put in the URL for a single article?")
##Mike, these sample links from nytimes.com are for you to plug
##in and try. To scrape a different story, navigate to the site,
##open a single article, and pass its URL to newsScrape.
test1 = "http://www.nytimes.com/2016/12/12/world/europe/rex-tillersons-company-exxon-has-billions-at-stake-over-russia-sanctions.html?partner=rss&emc=rss"
test2 = "http://www.nytimes.com/2016/12/13/us/politics/donald-trump-transition.html?partner=rss&emc=rss"
test3 = "http://www.nytimes.com/2016/12/13/us/politics/rex-tillerson-secretary-state-trump.html?partner=rss&emc=rss"
test4 = "http://www.nytimes.com/2016/12/13/us/politics/rick-perry-energy-secretary-trump.html?partner=rss&emc=rss"
##NOTE(review): test5 is the same URL as test1 -- confirm whether a
##different article was intended.
test5 = "http://www.nytimes.com/2016/12/12/world/europe/rex-tillersons-company-exxon-has-billions-at-stake-over-russia-sanctions.html?partner=rss&emc=rss"
##Runs the scraper on the first sample link whenever this file is
##executed (there is no __main__ guard).
newsScrape(test1)