From a1ce4780cb678838ad1ecf0dd0388af33aff38ad Mon Sep 17 00:00:00 2001
From: FluxCapacitor2 <31071265+FluxCapacitor2@users.noreply.github.com>
Date: Fri, 18 Oct 2024 01:51:28 -0400
Subject: [PATCH] Record visits for all non-HTML pages, not just sitemaps and feeds

---
 app/crawler/crawler.go | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/app/crawler/crawler.go b/app/crawler/crawler.go
index 850a4f6..7812077 100644
--- a/app/crawler/crawler.go
+++ b/app/crawler/crawler.go
@@ -156,14 +156,10 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
 					add(link)
 				}
 			}
-		} else {
-			return
 		}
 
-		if len(urls) > 0 { // <- This will be true if URLs were found *before* an HTML document was parsed, which only happens for sitemaps/feeds.
-			// This page is a sitemap. Insert an "unindexable" document, which records that this document has been crawled, but has no text content of its own.
-			db.AddDocument(source.ID, currentDepth, referrer, canonical, database.Unindexable, "", "", "", "")
-		}
+		// This page is not an HTML document. Insert an "unindexable" document, which records that this document has been crawled, but has no text content of its own.
+		db.AddDocument(source.ID, currentDepth, referrer, canonical, database.Unindexable, "", "", "", "")
 	})
 
 	collector.OnHTML("a[href]", func(element *colly.HTMLElement) {