-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.go
122 lines (109 loc) · 2.76 KB
/
scraper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
package main
import (
	"context"
	"database/sql"
	"encoding/xml"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
	"sync"
	"time"

	"github.com/google/uuid"
)
// startScraping runs forever, waking on a fixed interval to ask the database
// for the next batch of feeds and scraping each batch concurrently. The first
// batch is processed immediately; subsequent rounds wait on the ticker.
func startScraping(db *database.Queries, concurrency int, timeBetweenRequest time.Duration) {
	log.Printf("Collecting feeds every %s on %v goroutines...", timeBetweenRequest, concurrency)
	ticker := time.NewTicker(timeBetweenRequest)
	for {
		feeds, err := db.GetNextFeedsToFetch(context.Background(), int32(concurrency))
		if err != nil {
			log.Println("Couldn't get next feeds to fetch", err)
		} else {
			log.Printf("Found %v feeds to fetch!", len(feeds))
			// Fan out one goroutine per feed, then wait for the whole
			// batch before sleeping until the next tick.
			var batch sync.WaitGroup
			for _, f := range feeds {
				batch.Add(1)
				go scrapeFeed(db, &batch, f)
			}
			batch.Wait()
		}
		<-ticker.C
	}
}
// scrapeFeed marks one feed as fetched, downloads it, and stores every item
// as a post. Duplicate posts (unique-constraint violations from re-scraping
// the same feed) are skipped silently; any other insert error is logged and
// the remaining items are still processed.
func scrapeFeed(db *database.Queries, wg *sync.WaitGroup, feed database.Feed) {
	defer wg.Done()

	_, err := db.MarkFeedFetched(context.Background(), feed.ID)
	if err != nil {
		log.Printf("Couldn't mark feed %s fetched: %v", feed.Name, err)
		return
	}

	feedData, err := fetchFeed(feed.Url)
	if err != nil {
		log.Printf("Couldn't collect feed %s: %v", feed.Name, err)
		return
	}

	// One timestamp for the whole batch: cheaper than calling time.Now()
	// twice per item, and gives every post from this scrape the same
	// created/updated time.
	now := time.Now().UTC()
	for _, item := range feedData.Channel.Item {
		_, err = db.CreatePost(context.Background(), database.CreatePostParams{
			ID:        uuid.New(),
			CreatedAt: now,
			UpdatedAt: now,
			FeedID:    feed.ID,
			Title:     item.Title,
			Description: sql.NullString{
				String: item.Description,
				Valid:  true,
			},
			Url:         item.Link,
			PublishedAt: parsePubDate(item.PubDate),
		})
		if err != nil {
			// NOTE(review): duplicate detection by substring match on the
			// Postgres error text — brittle but works without a driver-
			// specific error type in scope.
			if strings.Contains(err.Error(), "duplicate key value violates unique constraint") {
				continue
			}
			log.Printf("Couldn't create post: %v", err)
			continue
		}
	}
	log.Printf("Feed %s collected, %v posts found", feed.Name, len(feedData.Channel.Item))
}

// parsePubDate parses an RSS <pubDate> value, trying the timestamp layouts
// feeds commonly emit (RFC1123Z first, matching the original behavior).
// It returns an invalid (SQL NULL) time when no layout matches.
func parsePubDate(s string) sql.NullTime {
	layouts := []string{
		time.RFC1123Z, // "Mon, 02 Jan 2006 15:04:05 -0700" — the RSS 2.0 norm
		time.RFC1123,  // same but with a zone name ("... MST")
		time.RFC822Z,
		time.RFC822,
		time.RFC3339, // some Atom-ish feeds use ISO 8601
	}
	for _, layout := range layouts {
		if t, err := time.Parse(layout, s); err == nil {
			return sql.NullTime{Time: t, Valid: true}
		}
	}
	return sql.NullTime{}
}
type RSSFeed struct {
Channel struct {
Title string `xml:"title"`
Link string `xml:"link"`
Description string `xml:"description"`
Language string `xml:"language"`
Item []RSSItem `xml:"item"`
} `xml:"channel"`
}
type RSSItem struct {
Title string `xml:"title"`
Link string `xml:"link"`
Description string `xml:"description"`
PubDate string `xml:"pubDate"`
}
func fetchFeed(feedURL string) (*RSSFeed, error) {
httpClient := http.Client{
Timeout: 10 * time.Second,
}
resp, err := httpClient.Get(feedURL)
if err != nil {
return nil, err
}
defer resp.Body.Close()
dat, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}
var rssFeed RSSFeed
err = xml.Unmarshal(dat, &rssFeed)
if err != nil {
return nil, err
}
return &rssFeed, nil
}