Skip to content

Commit

Permalink
Record visits for all non-HTML pages, not just sitemaps and feeds
Browse files (browse the repository at this point in the history)
  • Loading branch information
FluxCapacitor2 committed Oct 18, 2024
1 parent 551e178 commit a1ce478
Showing 1 changed file with 2 additions and 6 deletions.
8 changes: 2 additions & 6 deletions app/crawler/crawler.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -156,14 +156,10 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
add(link)
}
}
} else {
return
}

if len(urls) > 0 { // <- This will be true if URLs were found *before* an HTML document was parsed, which only happens for sitemaps/feeds.
// This page is a sitemap. Insert an "unindexable" document, which records that this document has been crawled, but has no text content of its own.
db.AddDocument(source.ID, currentDepth, referrer, canonical, database.Unindexable, "", "", "", "")
}
// This page is not an HTML document. Insert an "unindexable" document, which records that this document has been crawled, but has no text content of its own.
db.AddDocument(source.ID, currentDepth, referrer, canonical, database.Unindexable, "", "", "", "")
})

collector.OnHTML("a[href]", func(element *colly.HTMLElement) {
Expand Down

0 comments on commit a1ce478

Please sign in to comment.