Skip to content

Commit

Permalink
add utils
Browse files Browse the repository at this point in the history
  • Loading branch information
dominikus1993 committed Dec 31, 2023
1 parent 5b71c7d commit e2f2173
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 71 deletions.
53 changes: 26 additions & 27 deletions internal/parser/dotnetomaniak/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/url"
"time"

"github.com/dominikus1993/dev-news-bot/internal/parser/utils"
"github.com/dominikus1993/dev-news-bot/pkg/model"
"github.com/gocolly/colly/v2"
log "github.com/sirupsen/logrus"
Expand Down Expand Up @@ -38,32 +39,30 @@ func geDotnetomaniakLink(link string) string {
}

func (p *dotnetoManiakParser) Parse(ctx context.Context) model.ArticlesStream {
result := make(chan model.Article)
go func() {
defer close(result)
c := colly.NewCollector(colly.Async(true), colly.UserAgent(userAgent))
c.OnHTML(".article", func(e *colly.HTMLElement) {
title := e.ChildText(".title .taggedlink span")
link := geDotnetomaniakLink(e.ChildAttr(".title .taggedlink", "href"))
content := e.ChildText(".description p span")
result <- model.NewArticleWithContent(title, link, content, source)
})
c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
log.WithField("url", req.URL.String()).Debugln("Redirecting to another site")
return nil
})
c.OnError(func(r *colly.Response, err error) {
log.WithError(err).Errorln("can't parse dotnetomaniak")
})
url := fmt.Sprintf("%s://%s/", dotnetomaniakNewsScheme, dotnetomaniakNewsURL)
c.SetRequestTimeout(time.Second * 30)
c.UserAgent = "devnews-bot"
err := c.Visit(url)
if err != nil {
log.WithError(err).Errorln("error while parsing dotnetomaniak")
}
c.Wait()
return utils.Parse(ctx, p.parseArticles)
}

}()
return result
// parseArticles crawls the dotnetomaniak front page and pushes every
// scraped article into result. It blocks until the async crawl finishes
// (c.Wait), so the caller knows the stream is exhausted when it returns.
// NOTE(review): ctx is accepted for interface symmetry with utils.Parse
// but is not threaded into colly here — requests are bounded only by the
// 30s request timeout; confirm whether cancellation should abort the crawl.
func (p *dotnetoManiakParser) parseArticles(ctx context.Context, result chan<- model.Article) {
	c := colly.NewCollector(colly.Async(true), colly.UserAgent(userAgent))
	c.OnHTML(".article", func(e *colly.HTMLElement) {
		title := e.ChildText(".title .taggedlink span")
		link := geDotnetomaniakLink(e.ChildAttr(".title .taggedlink", "href"))
		content := e.ChildText(".description p span")
		result <- model.NewArticleWithContent(title, link, content, source)
	})
	c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
		log.WithField("url", req.URL.String()).Debugln("Redirecting to another site")
		return nil
	})
	c.OnError(func(r *colly.Response, err error) {
		log.WithError(err).Errorln("can't parse dotnetomaniak")
	})
	url := fmt.Sprintf("%s://%s/", dotnetomaniakNewsScheme, dotnetomaniakNewsURL)
	c.SetRequestTimeout(time.Second * 30)
	// Use the shared userAgent constant, consistent with the collector
	// constructor above and with the echojs parser. The previous
	// hard-coded "devnews-bot" override silently discarded it.
	c.UserAgent = userAgent
	err := c.Visit(url)
	if err != nil {
		log.WithError(err).Errorln("error while parsing dotnetomaniak")
	}
	c.Wait()
}
50 changes: 25 additions & 25 deletions internal/parser/echojs/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"time"

"github.com/dominikus1993/dev-news-bot/internal/parser/utils"
"github.com/dominikus1993/dev-news-bot/pkg/model"
"github.com/gocolly/colly/v2"
log "github.com/sirupsen/logrus"
Expand All @@ -21,30 +22,29 @@ func NewEechoJsParser() *echojsParser {
}

func (parser *echojsParser) Parse(ctx context.Context) model.ArticlesStream {
result := make(chan model.Article)
go func() {
defer close(result)
c := colly.NewCollector(colly.Async(true), colly.UserAgent(userAgent))
c.OnHTML("article h2 a", func(e *colly.HTMLElement) {
title := e.Text
link := e.Attr("href")
article := model.NewArticle(title, link, source)
if article.IsValid() {
result <- article
} else {
log.WithField("link", article.GetLink()).Warnln("echojs article is not valid")
}
})
c.OnError(func(r *colly.Response, err error) {
log.WithError(err).Errorln("can't parse echojs")
})
c.SetRequestTimeout(time.Second * 30)
c.UserAgent = userAgent
err := c.Visit(url)
if err != nil {
log.WithError(err).Errorln("error while parsing echojs")
return utils.Parse(ctx, parser.parseArticles)
}

func (p *echojsParser) parseArticles(ctx context.Context, result chan<- model.Article) {
c := colly.NewCollector(colly.Async(true), colly.UserAgent(userAgent))
c.OnHTML("article h2 a", func(e *colly.HTMLElement) {
title := e.Text
link := e.Attr("href")
article := model.NewArticle(title, link, source)
if article.IsValid() {
result <- article
} else {
log.WithField("link", article.GetLink()).Warnln("echojs article is not valid")
}
c.Wait()
}()
return result
})
c.OnError(func(r *colly.Response, err error) {
log.WithError(err).Errorln("can't parse echojs")
})
c.SetRequestTimeout(time.Second * 30)
c.UserAgent = userAgent
err := c.Visit(url)
if err != nil {
log.WithError(err).Errorln("error while parsing echojs")
}
c.Wait()
}
38 changes: 19 additions & 19 deletions internal/parser/hackernews/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"net/http"
"time"

"github.com/dominikus1993/dev-news-bot/internal/parser/utils"
"github.com/dominikus1993/dev-news-bot/pkg/model"
"github.com/dominikus1993/go-toolkit/random"
jsoniter "github.com/json-iterator/go"
Expand Down Expand Up @@ -90,26 +91,25 @@ func getArticle(id int, client *http.Client) (*hackernewsArticle, error) {
}

func (p *hackerNewsArticleParser) Parse(ctx context.Context) model.ArticlesStream {
result := make(chan model.Article, 20)
go func() {
defer close(result)
ids, err := getTopArticlesIds(p.client)
return utils.Parse(ctx, p.parseArticles)
}

func (p *hackerNewsArticleParser) parseArticles(ctx context.Context, result chan<- model.Article) {
ids, err := getTopArticlesIds(p.client)
if err != nil {
log.WithContext(ctx).WithError(err).Errorln("Error while parsing hackernews top articles")
return
}
ids = takeRandomArticesIds(ids, p.maxArticlesQuantity)
for _, id := range ids {
hackerNewsArticle, err := getArticle(id, p.client)
if err != nil {
log.WithContext(ctx).WithError(err).Errorln("Error while parsing hackernews top articles")
return
log.WithField("id", id).WithError(err).Errorln("error while parsing article by id")
continue
}
ids = takeRandomArticesIds(ids, p.maxArticlesQuantity)
for _, id := range ids {
hackerNewsArticle, err := getArticle(id, p.client)
if err != nil {
log.WithField("id", id).WithError(err).Errorln("error while parsing article by id")
continue
}
article := model.NewArticle(hackerNewsArticle.Title, hackerNewsArticle.URL, source)
if article.IsValid() {
result <- article
}
article := model.NewArticle(hackerNewsArticle.Title, hackerNewsArticle.URL, source)
if article.IsValid() {
result <- article
}
}()
return result
}
}
16 changes: 16 additions & 0 deletions internal/parser/utils/parse.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package utils

import (
"context"

"github.com/dominikus1993/dev-news-bot/pkg/model"
)

func Parse(ctx context.Context, action func(ctx context.Context, stream chan<- model.Article)) model.ArticlesStream {
result := make(chan model.Article, 20)
go func() {
defer close(result)
action(ctx, result)
}()
return result
}

0 comments on commit e2f2173

Please sign in to comment.