
Commit

Restructure

tb0hdan committed Nov 3, 2023
1 parent 81f481c commit 667dc10
Showing 6 changed files with 141 additions and 112 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -1,5 +1,7 @@
# colly-responsible
Responsible crawling with Colly
Responsible crawling with Colly. For the better Internet.

Based on lessons learned while writing [Idun](https://github.com/tb0hdan/idun) and subsequently getting banned by half of the website operators out there...


## Limits
122 changes: 11 additions & 111 deletions main.go → crawler.go
@@ -1,127 +1,38 @@
package main
package collyresponsible

import (
"context"
"fmt"
"net/http"
"net/url"
"strings"
"sync"
"time"

"github.com/gocolly/colly/v2"
"github.com/temoto/robotstxt"
)

type RequestLimiter struct {
SleepDelay int
sleepMin int
lock sync.RWMutex
}

func (r *RequestLimiter) Increase() {
r.lock.Lock()
defer r.lock.Unlock()
r.SleepDelay++
}

func (r *RequestLimiter) Decrease() {
r.lock.Lock()
defer r.lock.Unlock()
if r.SleepDelay > r.sleepMin {
r.SleepDelay--
}
}

func (r *RequestLimiter) Sleep() {
r.lock.RLock()
defer r.lock.RUnlock()
time.Sleep(time.Duration(r.SleepDelay) * time.Second)
}

func NewLimiter(sleepDelay int) *RequestLimiter {
return &RequestLimiter{
SleepDelay: sleepDelay,
sleepMin: sleepDelay,
lock: sync.RWMutex{},
}
}

type VisitMap struct {
visited map[string]bool
lock sync.RWMutex
}

func (v *VisitMap) Add(url string) {
v.lock.Lock()
defer v.lock.Unlock()
v.visited[url] = true
}

func (v *VisitMap) IsVisited(url string) bool {
v.lock.RLock()
defer v.lock.RUnlock()
return v.visited[url]
}

func NewVisitMap() *VisitMap {
return &VisitMap{
visited: make(map[string]bool),
lock: sync.RWMutex{},
}
}

func GetRobots(ctx context.Context, website, userAgent string) (*robotstxt.RobotsData, error) {
if strings.HasSuffix(website, "/") {
website = website[:len(website)-1]
}
head, err := http.NewRequestWithContext(ctx, http.MethodHead, website+"/robots.txt", nil)
if err != nil {
return nil, err
}
head.Header.Add("User-Agent", userAgent)
//
resp, err := http.DefaultClient.Do(head)
if err != nil || resp.StatusCode != http.StatusOK {
return nil, err
}
get, err := http.NewRequestWithContext(ctx, http.MethodGet, website+"/robots.txt", nil)
if err != nil {
return nil, err
}
get.Header.Add("User-Agent", userAgent)
//
getResp, err := http.DefaultClient.Do(get)
if err != nil || resp.StatusCode != http.StatusOK {
return nil, err
}
robots, err := robotstxt.FromResponse(getResp)
if err != nil {
return nil, err
}
return robots, nil
}

func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.CollectorOption) (err error) {
func Crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.CollectorOption) (err error) {
parsed, err := url.Parse(webSite)
if err != nil {
return err
}
// Get robots.txt
robots, err := GetRobots(ctx, webSite, userAgent)
limiter := NewLimiter(2)
robots, err := GetRobots(ctx, webSite, userAgent, limiter)
if err != nil {
return err
}
// Check if the user agent is allowed to visit the website
if !robots.TestAgent(webSite, userAgent) {
return fmt.Errorf("User agent is not allowed to visit the website")
}

// Sleep after getting robots.txt
limiter.Sleep()
//
visitMap := NewVisitMap()
limiter := NewLimiter(2)

collectorOptions := []colly.CollectorOption{
colly.Async(),
// colly.Async(),
colly.UserAgent(userAgent),
}

@@ -132,10 +43,8 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co
// Instantiate default collector
c := colly.NewCollector(collectorOptions...)

// Limit the number of threads started by colly to two
c.Limit(&colly.LimitRule{
Parallelism: 2,
})
// Use empty limit rule for collector
c.Limit(&colly.LimitRule{})

// Pass down URL from request to response context
c.OnRequest(func(r *colly.Request) {
@@ -159,7 +68,6 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co
absoluteLink := e.Request.AbsoluteURL(link)
// Print link
// Visit link found on page on a new thread
limiter.Sleep()

currentHost, err := url.Parse(absoluteLink)
if err != nil {
@@ -182,6 +90,7 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co
return
}

limiter.Sleep()
fmt.Println("Visiting", absoluteLink)
c.Visit(absoluteLink)
visitMap.Add(absoluteLink)
@@ -214,12 +123,3 @@ func crawl(ctx context.Context, webSite, userAgent string, options ...[]colly.Co

return nil
}

func main() {
httpCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
err := crawl(httpCtx, "https://en.wikipedia.org", "Mozilla/5.0 (compatible; Colly Responsible; +https://github.com/tb0hdan/colly-responsible)")
if err != nil {
panic(err)
}
}
17 changes: 17 additions & 0 deletions examples/main.go
@@ -0,0 +1,17 @@
package main

import (
"context"
"time"

collyresponsible "github.com/tb0hdan/colly-responsible"
)

func main() {
httpCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
err := collyresponsible.Crawl(httpCtx, "https://en.wikipedia.org", "Mozilla/5.0 (compatible; Colly Responsible; +https://github.com/tb0hdan/colly-responsible)")
if err != nil {
panic(err)
}
}
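
Crawl also accepts trailing []colly.CollectorOption slices, as its signature in the crawler.go hunk shows. The code that merges them into the collector's defaults sits in a truncated hunk, so the sketch below assumes each provided slice is appended to the default options; the site URL, agent string, and MaxDepth choice are purely illustrative.

package main

import (
	"context"
	"time"

	"github.com/gocolly/colly/v2"
	collyresponsible "github.com/tb0hdan/colly-responsible"
)

func main() {
	httpCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	// Assumption: Crawl merges each provided slice into its default
	// collector options (the merging code is in a truncated hunk above).
	extra := []colly.CollectorOption{
		colly.MaxDepth(2), // stop following links after two hops
	}
	if err := collyresponsible.Crawl(httpCtx, "https://example.com", "my-bot/1.0 (+https://example.com/bot)", extra); err != nil {
		panic(err)
	}
}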
40 changes: 40 additions & 0 deletions limiter.go
@@ -0,0 +1,40 @@
package collyresponsible

import (
"sync"
"time"
)

// RequestLimiter throttles crawling by sleeping between requests. The delay
// can be raised and lowered at runtime, but never drops below the value it
// started with.
type RequestLimiter struct {
SleepDelay int
sleepMin int
lock *sync.RWMutex
}

// Increase backs off by adding one second to the delay between requests.
func (r *RequestLimiter) Increase() {
r.lock.Lock()
defer r.lock.Unlock()
r.SleepDelay++
}

// Decrease shortens the delay by one second, but never below the initial value.
func (r *RequestLimiter) Decrease() {
r.lock.Lock()
defer r.lock.Unlock()
if r.SleepDelay > r.sleepMin {
r.SleepDelay--
}
}

// Sleep blocks for the current delay.
func (r *RequestLimiter) Sleep() {
r.lock.RLock()
defer r.lock.RUnlock()
time.Sleep(time.Duration(r.SleepDelay) * time.Second)
}

// NewLimiter builds a RequestLimiter whose delay starts, and bottoms out, at
// sleepDelay seconds.
func NewLimiter(sleepDelay int) *RequestLimiter {
return &RequestLimiter{
SleepDelay: sleepDelay,
sleepMin: sleepDelay,
lock: &sync.RWMutex{},
}
}
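
Increase and Decrease are exported, but no call sites appear in the hunks shown in this commit, so wiring them up is presumably left to the crawl loop or to callers. Below is one plausible wiring, a sketch that assumes you want to back off on errors and speed back up on healthy responses; the wireLimiter helper is hypothetical, not part of the package.

package main

import (
	"github.com/gocolly/colly/v2"
	collyresponsible "github.com/tb0hdan/colly-responsible"
)

// wireLimiter (hypothetical) adapts the delay to how the server responds.
func wireLimiter(c *colly.Collector, limiter *collyresponsible.RequestLimiter) {
	c.OnResponse(func(r *colly.Response) {
		limiter.Decrease() // healthy response: shrink the delay toward its floor
	})
	c.OnError(func(r *colly.Response, err error) {
		limiter.Increase() // error or throttling: add another second between requests
	})
}

func main() {
	c := colly.NewCollector()
	limiter := collyresponsible.NewLimiter(2)
	wireLimiter(c, limiter)
	_ = c.Visit("https://example.com")
}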
43 changes: 43 additions & 0 deletions robots.go
@@ -0,0 +1,43 @@
package collyresponsible

import (
	"context"
	"fmt"
	"net/http"
	"strings"

	"github.com/temoto/robotstxt"
)

// GetRobots fetches and parses website/robots.txt. It probes the file with a
// HEAD request first, sleeps through the limiter, then downloads it with GET.
func GetRobots(ctx context.Context, website, userAgent string, limiter *RequestLimiter) (*robotstxt.RobotsData, error) {
if strings.HasSuffix(website, "/") {
website = website[:len(website)-1]
}
head, err := http.NewRequestWithContext(ctx, http.MethodHead, website+"/robots.txt", nil)
if err != nil {
return nil, err
}
head.Header.Add("User-Agent", userAgent)
resp, err := http.DefaultClient.Do(head)
	if err != nil {
		return nil, err
	}
	resp.Body.Close()
	// Report a non-200 probe explicitly so callers never get a nil result
	// alongside a nil error.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("robots.txt HEAD request returned HTTP %d", resp.StatusCode)
	}
// Be polite: wait between the HEAD probe and the actual GET.
	limiter.Sleep()
get, err := http.NewRequestWithContext(ctx, http.MethodGet, website+"/robots.txt", nil)
if err != nil {
return nil, err
}
get.Header.Add("User-Agent", userAgent)
getResp, err := http.DefaultClient.Do(get)
	if err != nil {
		return nil, err
	}
	defer getResp.Body.Close()
	if getResp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("robots.txt GET request returned HTTP %d", getResp.StatusCode)
	}
robots, err := robotstxt.FromResponse(getResp)
if err != nil {
return nil, err
}
return robots, nil
}
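
GetRobots returns a *robotstxt.RobotsData, so callers can test individual paths before queueing them. A minimal sketch follows; note that robotstxt's TestAgent expects a URL path, and the /wiki/... path and agent name below are made up for illustration.

package main

import (
	"context"
	"fmt"
	"time"

	collyresponsible "github.com/tb0hdan/colly-responsible"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	limiter := collyresponsible.NewLimiter(2)
	robots, err := collyresponsible.GetRobots(ctx, "https://en.wikipedia.org", "my-bot/1.0", limiter)
	if err != nil {
		panic(err)
	}
	// TestAgent takes a path plus an agent name and reports whether
	// fetching that path is allowed by the site's robots.txt.
	fmt.Println(robots.TestAgent("/wiki/Special:Random", "my-bot"))
}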
27 changes: 27 additions & 0 deletions visitmap.go
@@ -0,0 +1,27 @@
package collyresponsible

import "sync"

// VisitMap is a concurrency-safe set of URLs that have already been visited.
type VisitMap struct {
visited map[string]bool
lock *sync.RWMutex
}

func (v *VisitMap) Add(url string) {
v.lock.Lock()
defer v.lock.Unlock()
v.visited[url] = true
}

func (v *VisitMap) IsVisited(url string) bool {
v.lock.RLock()
defer v.lock.RUnlock()
return v.visited[url]
}

func NewVisitMap() *VisitMap {
return &VisitMap{
visited: make(map[string]bool),
lock: &sync.RWMutex{},
}
}
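
The check and the insert take the lock separately, so two goroutines can both pass IsVisited before either calls Add; with the collector's async mode commented out in crawler.go that is harmless, but concurrent callers should expect occasional duplicate visits. A sketch of the intended check-then-add pattern, with made-up URLs:

package main

import (
	"fmt"

	collyresponsible "github.com/tb0hdan/colly-responsible"
)

func main() {
	vm := collyresponsible.NewVisitMap()
	for _, link := range []string{"https://example.com/a", "https://example.com/a", "https://example.com/b"} {
		if vm.IsVisited(link) {
			continue // already fetched, skip the duplicate
		}
		vm.Add(link)
		fmt.Println("would visit", link)
	}
}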
