-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathrod.go
125 lines (104 loc) · 3.63 KB
/
rod.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package main
import (
"time"
"github.com/go-rod/rod"
"github.com/go-rod/rod/lib/launcher"
"github.com/go-rod/rod/lib/proto"
"github.com/ssoroka/slice"
)
// launchInLambda returns a Chromium launcher configured for the lambda
// environment: it points at the bundled binary and applies the flag set
// recommended for running in serverless environments.
//
// see https://github.com/alixaxel/chrome-aws-lambda/blob/master/source/index.ts
func launchInLambda() *launcher.Launcher {
	// where lambda runtime stores chromium
	l := launcher.New().Bin("/opt/chromium")

	// One flag per statement, applied in a fixed order. Flags given several
	// values are passed as separate arguments to Set.
	l = l.Set("allow-running-insecure-content")
	l = l.Set("autoplay-policy", "user-gesture-required")
	l = l.Set("disable-component-update")
	l = l.Set("disable-domain-reliability")
	l = l.Set("disable-features", "AudioServiceOutOfProcess", "IsolateOrigins", "site-per-process")
	l = l.Set("disable-print-preview")
	l = l.Set("disable-setuid-sandbox")
	l = l.Set("disable-site-isolation-trials")
	l = l.Set("disable-speech-api")
	l = l.Set("disable-web-security")
	l = l.Set("disk-cache-size", "33554432")
	l = l.Set("enable-features", "SharedArrayBuffer")
	l = l.Set("hide-scrollbars")
	l = l.Set("ignore-gpu-blocklist")
	l = l.Set("in-process-gpu")
	l = l.Set("mute-audio")
	l = l.Set("no-default-browser-check")
	l = l.Set("no-pings")
	l = l.Set("no-sandbox")
	l = l.Set("no-zygote")
	l = l.Set("single-process")
	l = l.Set("use-gl", "swiftshader")
	l = l.Set("window-size", "1920", "1080")

	return l
}
func getPageHTML(url string) (html string, err error) {
// If Rod fails, it needs to correctly timeout before the timeout we set as the lambda fn's timeout
// this ensures that the browser instance is properly killed and cleaned up
//
// these timeouts should collectively be less than the timeout we set for the lambda
const (
navigateTimeout = 5 * time.Second
navigationTimeout = 5 * time.Second
requestIdleTimeout = 10 * time.Second
htmlTimeout = 5 * time.Second
)
err = rod.Try(func() {
// instantiate the chromium launcher
launcher := launchInLambda()
// lambda warm starts reuse environments:
//
// we must delete data generated by the browser,
// otherwise repeated calls to this fn will eat up storage
// and the lambda will fail
defer launcher.Cleanup()
//
// likewise, browser.close() will leave a zombie process
// so we must kill the chromium processes completely
// otherwise memory consumption will be much higher
defer launcher.Kill()
u := launcher.MustLaunch()
// create a browser instance
browser := rod.New().ControlURL(u).MustConnect()
// open a page
page := browser.MustPage()
// Block loading any resources we don't need in headless
// https://go-rod.github.io/#/network?id=blocking-certain-resources-from-loading
router := page.HijackRequests()
resources := []proto.NetworkResourceType{
proto.NetworkResourceTypeFont,
proto.NetworkResourceTypeImage,
proto.NetworkResourceTypeMedia,
proto.NetworkResourceTypeStylesheet,
proto.NetworkResourceTypeWebSocket, // we don't need websockets to fetch html
}
router.MustAdd("*", func(ctx *rod.Hijack) {
if slice.Contains(resources, ctx.Request.Type()) {
ctx.Response.Fail(proto.NetworkErrorReasonBlockedByClient)
return
}
ctx.ContinueRequest(&proto.FetchContinueRequest{})
})
go router.Run()
// go to the url
page.Timeout(navigateTimeout).MustNavigate(url)
// follow any redirects
// https://github.com/go-rod/rod/issues/640#issuecomment-1171941374
waitNavigation := page.Timeout(navigationTimeout).MustWaitNavigation()
waitNavigation()
// wait until requests stop firing so we can get
// any html rendered by js scripts or iframes
waitRequestIdle := page.Timeout(requestIdleTimeout).MustWaitRequestIdle()
waitRequestIdle()
// return the html
html = page.Timeout(htmlTimeout).MustElement("html").MustHTML()
})
if err != nil {
return "", err
}
return html, nil
}