Skip to content

Commit

Permalink
Resolve canonical, sitemap, and feed URLs to absolute URLs based on the current URL
Browse files Browse the repository at this point in the history
  • Loading branch information
FluxCapacitor2 committed Oct 21, 2024
1 parent a05bb34 commit d395d64
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions app/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
description, _ := element.DOM.Find("meta[name=description]").Attr("content")

if metaCanonicalTag, exists := element.DOM.Find("link[rel=canonical]").Attr("href"); exists {
- page.canonical = metaCanonicalTag
+ page.canonical = element.Request.AbsoluteURL(metaCanonicalTag)
}

// Find alternate links for RSS feeds, other languages, etc.
Expand Down Expand Up @@ -156,11 +156,11 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
// Attempt to parse this response as a sitemap or sitemap index
reader := bytes.NewReader(resp.Body)
sitemap.Parse(reader, func(entry sitemap.Entry) error {
- return add(entry.GetLocation())
+ return add(resp.Request.AbsoluteURL(entry.GetLocation()))
})
reader.Reset(resp.Body)
sitemap.ParseIndex(reader, func(entry sitemap.IndexEntry) error {
- return add(entry.GetLocation())
+ return add(resp.Request.AbsoluteURL(entry.GetLocation()))
})
} else if strings.HasPrefix(ct, "application/rss+xml") || strings.HasPrefix(ct, "application/feed+json") || strings.HasPrefix(ct, "application/atom+xml") {
// Parse RSS, Atom, and JSON feeds using `gofeed`
Expand All @@ -172,7 +172,7 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
}
for _, item := range res.Items {
for _, link := range item.Links {
- add(link)
+ add(resp.Request.AbsoluteURL(link))
}
}
}
Expand Down

0 comments on commit d395d64

Please sign in to comment.