scraper.go
package main
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
)
// MaxQueueSize is the maximum size of the channel, per user, that
// feeds files to downloaders. After that, scraping slows down due
// to the channel being blocked and the scraper having to wait for
// files to free up.
// TODO(Liru): Implement infinite channels or something similar.
const MaxQueueSize = 10000
var (
inlineSearch = regexp.MustCompile(`(http:\/\/\d{2}\.media\.tumblr\.com\/\w{32}\/tumblr_inline_\w+\.\w+)`) // FIXME: Possibly buggy/unoptimized.
	videoSearch    = regexp.MustCompile(`"hdUrl":".*(tumblr_\w+)"`) // Crude, but enough to capture the video ID.
altVideoSearch = regexp.MustCompile(`source src=".*(tumblr_\w+)(?:\/\d+)?" type`)
gfycatSearch = regexp.MustCompile(`href="https?:\/\/(?:www\.)?gfycat\.com\/(\w+)`)
)
// PostParseMap maps tumblr post types to functions that search those
// posts for content.
var PostParseMap = map[string]func(Post) []File{
"photo": parsePhotoPost,
"answer": parseAnswerPost,
"regular": parseRegularPost,
"video": parseVideoPost,
}
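// Post types without an entry in PostParseMap (e.g. "quote" or "link") are
// skipped: parseDataForFiles below finds no parser for them and produces no files.
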
// TrimJS trims the javascript response received from Tumblr.
// The response starts with "var tumblr_api_read = " and ends with ";".
// We need to remove these to parse the response as JSON.
func TrimJS(c []byte) []byte {
	// The prefix `var tumblr_api_read = ` is 22 bytes long; the final two
	// bytes are the trailing ";" and newline.
	return c[22 : len(c)-2]
}
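// Illustrative example (not from the original source): a raw body of
//
//	var tumblr_api_read = {"posts":[]};
//
// followed by a newline trims down to `{"posts":[]}`, which json.Unmarshal
// can then handle.
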
func parsePhotoPost(post Post) (files []File) {
var id string
if !cfg.IgnorePhotos {
if len(post.Photos) == 0 {
f := newFile(post.PhotoURL)
files = append(files, f)
id = f.Filename
} else {
for _, photo := range post.Photos {
f := newFile(photo.PhotoURL)
files = append(files, f)
id = f.Filename
}
}
}
if !cfg.IgnoreVideos {
var slug string
if len(id) > 26 {
slug = id[:26]
}
files = append(files, getGfycatFiles(post.PhotoCaption, slug)...)
}
return
}
func parseAnswerPost(post Post) (files []File) {
if !cfg.IgnorePhotos {
for _, f := range inlineSearch.FindAllString(post.Answer, -1) {
files = append(files, newFile(f))
}
}
return
}
func parseRegularPost(post Post) (files []File) {
if !cfg.IgnorePhotos {
for _, f := range inlineSearch.FindAllString(post.RegularBody, -1) {
files = append(files, newFile(f))
}
}
return
}
func parseVideoPost(post Post) (files []File) {
if !cfg.IgnoreVideos {
post.Video = bytes.Replace(post.Video, []byte("\\"), []byte(""), -1)
regextest := videoSearch.FindStringSubmatch(string(post.Video))
if regextest == nil { // hdUrl is false. We have to get the other URL.
regextest = altVideoSearch.FindStringSubmatch(string(post.Video))
}
		// If it's still nil, the post embeds a third-party video
		// (e.g. YouTube, Vine, or Pornhub). Those aren't handled here; skip the post.
if regextest == nil {
return
}
videoURL := strings.Replace(regextest[1], `\`, ``, -1)
videoURL = "https://vtt.tumblr.com/" + videoURL;
// If there are problems with downloading video, the below part may be the cause.
// videoURL = strings.Replace(videoURL, `/480`, ``, -1)
videoURL += ".mp4"
f := newFile(videoURL)
files = append(files, f)
		// Keep the first 23 characters of the filename; that prefix is the ID
		// portion of a tumblr video filename.
		slug := f.Filename[:23]
files = append(files, getGfycatFiles(post.VideoCaption, slug)...)
}
return
}
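// For reference, the video URL assembled above ends up in the form
// https://vtt.tumblr.com/tumblr_<id>.mp4, where <id> comes from the capture
// group of videoSearch or altVideoSearch.
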
func parseDataForFiles(post Post) (files []File) {
fn, ok := PostParseMap[post.Type]
if ok {
files = fn(post)
}
return
}
func makeTumblrURL(u *User, i int) *url.URL {
base := fmt.Sprintf("https://%s.tumblr.com/api/read/json", u.name)
tumblrURL, err := url.Parse(base)
checkFatalError(err, "tumblrURL: ")
vals := url.Values{}
vals.Set("num", "50")
vals.Add("start", strconv.Itoa((i-1)*50))
// vals.Add("type", "photo")
if u.tag != "" {
vals.Add("tagged", u.tag)
}
tumblrURL.RawQuery = vals.Encode()
return tumblrURL
}
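// Illustrative example (blog name made up): for a user named "example" with no
// tag filter, page i == 2 yields
//
//	https://example.tumblr.com/api/read/json?num=50&start=50
//
// with &tagged=<tag> appended when a tag filter is set.
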
func shouldFinishScraping(lim <-chan time.Time, done <-chan struct{}) bool {
select {
case <-done:
return true
default:
select {
case <-done:
return true
case <-lim:
// We get a value from limiter, and proceed to scrape a page.
return false
}
}
}
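// The nested select above is intentional: the outer select polls done on its
// own first, so a closed done channel takes priority even when a limiter tick
// is already waiting; only after that does the function block on either case.
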
func scrape(u *User, limiter <-chan time.Time) <-chan File {
var once sync.Once
u.fileChannel = make(chan File, MaxQueueSize)
go func() {
done := make(chan struct{})
closeDone := func() { close(done) }
var i, numPosts int
		// Wrap the call in a closure: Go evaluates a deferred call's arguments
		// at the time of the defer statement, not when the call runs, and we
		// want the final value of i. It also beats writing `defer` multiple times.
defer func() {
u.finishScraping(i)
}()
for i = 1; ; i++ {
if shouldFinishScraping(limiter, done) {
return
}
tumblrURL := makeTumblrURL(u, i)
showProgress(u.name, "is on page", i, "/", (numPosts/50)+1)
var resp *http.Response
var err error
var contents []byte
for {
resp, err = http.Get(tumblrURL.String())
				// XXX: Crude retry loop. This could probably be done better.
if err != nil {
log.Println("http.Get:", u, err)
continue
}
contents, err = ioutil.ReadAll(resp.Body)
				if err != nil {
					log.Println("ReadAll:", u, err,
						"(", len(contents), "/", resp.ContentLength, ")")
					resp.Body.Close() // avoid leaking the connection before retrying
					continue
				}
err = resp.Body.Close()
checkError(err)
break
}
atomic.AddUint64(&gStats.bytesOverhead, uint64(len(contents)))
// This is returned as pure javascript. We need to filter out the variable and the ending semicolon.
contents = TrimJS(contents)
var blog TumbleLog
err = json.Unmarshal(contents, &blog)
if err != nil {
				// Tumblr's API is not consistent: the same field can come back
				// as a string in one post and a boolean in another, which makes
				// unmarshalling fail. Dump the payload for inspection.
ioutil.WriteFile("json_error.txt", contents, 0644)
log.Println("Unmarshal:", err)
}
numPosts = blog.TotalPosts
u.scrapeWg.Add(1)
defer u.scrapeWg.Done()
for _, post := range blog.Posts {
id, err := post.ID.Int64()
if err != nil {
log.Println(err)
}
u.updateHighestPost(id)
if !cfg.ForceCheck && id <= u.lastPostID {
once.Do(closeDone)
return
}
u.Queue(post)
} // Done searching all posts on a page
if len(blog.Posts) < 50 {
break
}
} // loop that searches blog, page by page
}() // Function that asynchronously adds all downloadables from a blog to a queue
return u.fileChannel
}
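// A minimal usage sketch (illustrative only; the real call sites live elsewhere
// in this package). scrape is driven by a shared rate limiter and its channel
// is drained by downloader goroutines, roughly like:
//
//	limiter := time.Tick(250 * time.Millisecond) // assumed rate, for illustration
//	files := scrape(user, limiter)
//	for f := range files {
//		go download(f) // `download` is a hypothetical consumer
//	}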