diff --git a/README.md b/README.md
index 353680f..61f9e38 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # colly-responsible
-Responsible crawling with Colly
+Responsible crawling with Colly. For a better Internet.
+
+Based on lessons learned while writing [Idun](https://github.com/tb0hdan/idun) and subsequently getting banned by half of the website operators out there...
 
 ## Limits
 
diff --git a/main.go b/crawler.go
similarity index 51%
rename from main.go
rename to crawler.go
index d376c63..dbc4784 100644
--- a/main.go
+++ b/crawler.go
@@ -1,4 +1,4 @@
-package main
+package collyresponsible
 
 import (
 	"context"
@@ -6,109 +6,19 @@ import (
 	"net/http"
 	"net/url"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/gocolly/colly/v2"
-	"github.com/temoto/robotstxt"
 )
 
-type RequestLimiter struct {
-	SleepDelay int
-	sleepMin   int
-	lock       sync.RWMutex
-}
-
-func (r *RequestLimiter) Increase() {
-	r.lock.Lock()
-	defer r.lock.Unlock()
-	r.SleepDelay++
-}
-
-func (r *RequestLimiter) Decrease() {
-	r.lock.Lock()
-	defer r.lock.Unlock()
-	if r.SleepDelay > r.sleepMin {
-		r.SleepDelay--
-	}
-}
-
-func (r *RequestLimiter) Sleep() {
-	r.lock.RLock()
-	defer r.lock.RUnlock()
-	time.Sleep(time.Duration(r.SleepDelay) * time.Second)
-}
-
-func NewLimiter(sleepDelay int) *RequestLimiter {
-	return &RequestLimiter{
-		SleepDelay: sleepDelay,
-		sleepMin:   sleepDelay,
-		lock:       sync.RWMutex{},
-	}
-}
-
-type VisitMap struct {
-	visited map[string]bool
-	lock    sync.RWMutex
-}
-
-func (v *VisitMap) Add(url string) {
-	v.lock.Lock()
-	defer v.lock.Unlock()
-	v.visited[url] = true
-}
-
-func (v *VisitMap) IsVisited(url string) bool {
-	v.lock.RLock()
-	defer v.lock.RUnlock()
-	return v.visited[url]
-}
-
-func NewVisitMap() *VisitMap {
-	return &VisitMap{
-		visited: make(map[string]bool),
-		lock:    sync.RWMutex{},
-	}
-}
-
-func GetRobots(ctx context.Context, website, userAgent string) (*robotstxt.RobotsData, error) {
-	if strings.HasSuffix(website, "/") {
-		website = website[:len(website)-1]
-	}
-	head, err := http.NewRequestWithContext(ctx, http.MethodHead, website+"/robots.txt", nil)
-	if err != nil {
-		return nil, err
-	}
-	head.Header.Add("User-Agent", userAgent)
-	//
-	resp, err := http.DefaultClient.Do(head)
-	if err != nil || resp.StatusCode != http.StatusOK {
-		return nil, err
-	}
-	get, err := http.NewRequestWithContext(ctx, http.MethodGet, website+"/robots.txt", nil)
-	if err != nil {
-		return nil, err
-	}
-	get.Header.Add("User-Agent", userAgent)
-	//
-	getResp, err := http.DefaultClient.Do(get)
-	if err != nil || resp.StatusCode != http.StatusOK {
-		return nil, err
-	}
-	robots, err := robotstxt.FromResponse(getResp)
-	if err != nil {
-		return nil, err
-	}
-	return robots, nil
-}
-
-func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.CollectorOption) (err error) {
+func Crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.CollectorOption) (err error) {
 	parsed, err := url.Parse(webSite)
 	if err != nil {
 		return err
 	}
 	// Get robots.txt
-	robots, err := GetRobots(ctx, webSite, userAgent)
+	limiter := NewLimiter(2)
+	robots, err := GetRobots(ctx, webSite, userAgent, limiter)
 	if err != nil {
 		return err
 	}
@@ -116,12 +26,13 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co
 	if !robots.TestAgent(webSite, userAgent) {
 		return fmt.Errorf("User agent is not allowed to visit the website")
 	}
-
+	// Sleep after getting robots.txt
+	limiter.Sleep()
+	//
 	visitMap := NewVisitMap()
-	limiter := NewLimiter(2)
 
 	collectorOptions := []colly.CollectorOption{
-		colly.Async(),
+		// colly.Async(),
 		colly.UserAgent(userAgent),
 	}
 
@@ -132,10 +43,8 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co
 	// Instantiate default collector
 	c := colly.NewCollector(collectorOptions...)
 
-	// Limit the number of threads started by colly to two
-	c.Limit(&colly.LimitRule{
-		Parallelism: 2,
-	})
+	// Use empty limit rule for collector
+	c.Limit(&colly.LimitRule{})
 
 	// Pass down URL from request to response context
 	c.OnRequest(func(r *colly.Request) {
@@ -159,7 +68,6 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co
 		absoluteLink := e.Request.AbsoluteURL(link)
 		// Print link
 		// Visit link found on page on a new thread
-		limiter.Sleep()
 
 		currentHost, err := url.Parse(absoluteLink)
 		if err != nil {
@@ -182,6 +90,7 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co
 			return
 		}
 
+		limiter.Sleep()
 		fmt.Println("Visiting", absoluteLink)
 		c.Visit(absoluteLink)
 		visitMap.Add(absoluteLink)
@@ -214,12 +123,3 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co
 
 	return nil
 }
-
-func main() {
-	httpCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
-	defer cancel()
-	err := crawl(httpCtx, "https://en.wikipedia.org", "Mozilla/5.0 (compatible; Colly Responsible; +https://github.com/tb0hdan/colly-responsible)")
-	if err != nil {
-		panic(err)
-	}
-}
diff --git a/examples/main.go b/examples/main.go
new file mode 100644
index 0000000..95fffc1
--- /dev/null
+++ b/examples/main.go
@@ -0,0 +1,17 @@
+package main
+
+import (
+	"context"
+	"time"
+
+	collyresponsible "github.com/tb0hdan/colly-responsible"
+)
+
+func main() {
+	httpCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	err := collyresponsible.Crawl(httpCtx, "https://en.wikipedia.org", "Mozilla/5.0 (compatible; Colly Responsible; +https://github.com/tb0hdan/colly-responsible)")
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/limiter.go b/limiter.go
new file mode 100644
index 0000000..5838529
--- /dev/null
+++ b/limiter.go
@@ -0,0 +1,40 @@
+package collyresponsible
+
+import (
+	"sync"
+	"time"
+)
+
+type RequestLimiter struct {
+	SleepDelay int
+	sleepMin   int
+	lock       *sync.RWMutex
+}
+
+func (r *RequestLimiter) Increase() {
+	r.lock.Lock()
+	defer r.lock.Unlock()
+	r.SleepDelay++
+}
+
+func (r *RequestLimiter) Decrease() {
+	r.lock.Lock()
+	defer r.lock.Unlock()
+	if r.SleepDelay > r.sleepMin {
+		r.SleepDelay--
+	}
+}
+
+func (r *RequestLimiter) Sleep() {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+	time.Sleep(time.Duration(r.SleepDelay) * time.Second)
+}
+
+func NewLimiter(sleepDelay int) *RequestLimiter {
+	return &RequestLimiter{
+		SleepDelay: sleepDelay,
+		sleepMin:   sleepDelay,
+		lock:       &sync.RWMutex{},
+	}
+}
diff --git a/robots.go b/robots.go
new file mode 100644
index 0000000..7a2d4bf
--- /dev/null
+++ b/robots.go
@@ -0,0 +1,43 @@
+package collyresponsible
+
+import (
+	"context"
+	"net/http"
+	"strings"
+
+	"github.com/temoto/robotstxt"
+)
+
+func GetRobots(ctx context.Context, website, userAgent string, limiter *RequestLimiter) (*robotstxt.RobotsData, error) {
+	if strings.HasSuffix(website, "/") {
+		website = website[:len(website)-1]
+	}
+	head, err := http.NewRequestWithContext(ctx, http.MethodHead, website+"/robots.txt", nil)
+	if err != nil {
+		return nil, err
+	}
+	head.Header.Add("User-Agent", userAgent)
+	//
+	resp, err := http.DefaultClient.Do(head)
+	if err != nil || resp.StatusCode != http.StatusOK {
+		return nil, err
+	}
+	//
+	limiter.Sleep()
+	//
+	get, err := http.NewRequestWithContext(ctx, http.MethodGet, website+"/robots.txt", nil)
+	if err != nil {
+		return nil, err
+	}
+	get.Header.Add("User-Agent", userAgent)
+	//
+	getResp, err := http.DefaultClient.Do(get)
+	if err != nil || getResp.StatusCode != http.StatusOK {
+		return nil, err
+	}
+	robots, err := robotstxt.FromResponse(getResp)
+	if err != nil {
+		return nil, err
+	}
+	return robots, nil
+}
diff --git a/visitmap.go b/visitmap.go
new file mode 100644
index 0000000..3137320
--- /dev/null
+++ b/visitmap.go
@@ -0,0 +1,27 @@
+package collyresponsible
+
+import "sync"
+
+type VisitMap struct {
+	visited map[string]bool
+	lock    *sync.RWMutex
+}
+
+func (v *VisitMap) Add(url string) {
+	v.lock.Lock()
+	defer v.lock.Unlock()
+	v.visited[url] = true
+}
+
+func (v *VisitMap) IsVisited(url string) bool {
+	v.lock.RLock()
+	defer v.lock.RUnlock()
+	return v.visited[url]
+}
+
+func NewVisitMap() *VisitMap {
+	return &VisitMap{
+		visited: make(map[string]bool),
+		lock:    &sync.RWMutex{},
+	}
+}
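
Usage note: the exported Crawl function introduced above accepts zero or more []colly.CollectorOption slices and appends them to its defaults (the user agent, with colly.Async() commented out). Below is a minimal sketch of a caller passing extra options; it assumes the import path shown in examples/main.go, and colly.MaxDepth / colly.AllowedDomains are standard colly v2 options chosen here for illustration, not something this change adds.

```go
package main

import (
	"context"
	"time"

	"github.com/gocolly/colly/v2"
	collyresponsible "github.com/tb0hdan/colly-responsible"
)

func main() {
	// Give the crawl an overall deadline, as in examples/main.go.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Extra collector options appended to Crawl's defaults. These are
	// standard colly options; the exact values are illustrative only.
	extra := []colly.CollectorOption{
		colly.MaxDepth(2),
		colly.AllowedDomains("en.wikipedia.org"),
	}

	err := collyresponsible.Crawl(ctx, "https://en.wikipedia.org",
		"Mozilla/5.0 (compatible; Colly Responsible; +https://github.com/tb0hdan/colly-responsible)",
		extra)
	if err != nil {
		panic(err)
	}
}
```

With colly.Async() commented out and the limiter sleeping for at least its 2-second base delay before each visit (and once after fetching robots.txt), requests go out sequentially and politely; the RequestLimiter's Increase/Decrease methods appear intended for backing off further or easing up, though nothing in this change calls them yet.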