-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.rb
73 lines (60 loc) · 1.53 KB
/
scrape.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Scrape Doug Hennig's Technical Articles.
require 'rubygems'
require 'fileutils'
require 'json'
require 'nokogiri'
require 'open-uri'
# require 'rubyzip'
# Plain record with read/write accessors for an article's title, folder,
# description, PDF URL and code URL.
# NOTE(review): nothing in the visible part of this script instantiates it;
# the scraping loop builds plain hashes instead.
class Article
  attr_accessor :title, :folder, :description, :pdf_url, :code_url
end
# Prefix for every generated article folder. Folder names are appended to it
# by plain string concatenation, so it has to end with a path separator.
OUTPUT_RELATIVE = "../"

# Download the article index once and cache it as papers.html; later runs
# reuse the cached copy instead of fetching the page again.
unless File.exist?('papers.html')
  # URI.open is needed here: Kernel#open stopped handling URLs in Ruby 3.0.
  # The page is fetched BEFORE the cache file is created, so a failed download
  # cannot leave an empty papers.html behind that the next run would trust.
  html = URI.open('http://doughennig.com/papers/', &:read)
  File.open('papers.html', 'wb') { |file| file << html }
end

# Parse the cached page. Nokogiri is handed the open file (not a String), and
# the block form closes the handle as soon as parsing is finished.
page = File.open('papers.html') { |file| Nokogiri::HTML(file) }
# Build one hash per article from the <h3> headings of the parsed index page.
# Layout relied on here: the element right after a heading supplies the
# description text, and the element after that one holds the links (the first
# link is stored as :pdf_url, the second as :samples_url).
articles = []
page.css("h3").each do |heading|
  title = heading.text
  # The heading with exactly this text is skipped
  # (presumably a site link rather than an article -- confirm against the page).
  next if title == "My personal Web site"

  article = {
    title: title,
    # Drop ? ! ' : , and the sequence ".0", then replace every whitespace
    # character with "_" to get the folder name.
    folder: title.gsub(/\?|\!|\'|\:|\,|\.0/, "").gsub(/\s/, "_")
  }

  # next_element returns nil when there is no following sibling element, so
  # both lookups are guarded instead of raising NoMethodError on nil.
  description = heading.next_element
  article[:description] = description ? description.text : nil

  link_holder = description ? description.next_element : nil
  links = link_holder ? link_holder.css("a") : []

  # The URL keys are only added when the corresponding link exists.
  article[:pdf_url] = links[0]["href"] if links.length > 0
  article[:samples_url] = links[1]["href"] if links.length > 1

  articles << article
end
# Serialize the collected articles and store the result in articles.json,
# replacing any previous contents of that file.
File.write("articles.json", articles.to_json)
# Create a folder per article under OUTPUT_RELATIVE and write a readme.md into
# it: title heading, byline, description, a "----" line, then the contents of
# bio.md. Parts are joined with "\n" and separated by single-space lines.
bio = nil
articles.each do |article|
  outfolder = OUTPUT_RELATIVE + article[:folder]

  # Dir.exists? was removed in Ruby 3.2. mkdir_p needs no existence check:
  # it creates the folder when missing and does nothing when it is present.
  FileUtils.mkdir_p(outfolder)

  # bio.md is identical for every article, so it is read only once, on first
  # use (an empty article list still never touches the file).
  bio ||= File.read("bio.md")

  readme_lines = [
    "# " + article[:title],
    " ",
    "*by Doug Hennig*",
    " ",
    article[:description],
    " ",
    "----",
    " ",
    bio
  ]

  File.open(File.join(outfolder, "readme.md"), 'w') do |file|
    file.write(readme_lines.join("\n"))
  end
end