Skip to content

Commit

Permalink
Add tests to the crawler module
Browse files Browse the repository at this point in the history
  • Loading branch information
FluxCapacitor2 committed Sep 24, 2024
1 parent eef6267 commit 0a35a4e
Showing 1 changed file with 130 additions and 0 deletions.
130 changes: 130 additions & 0 deletions app/crawler/crawler_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package crawler

import (
"path"
"testing"

"github.com/fluxcapacitor2/easysearch/app/config"
"github.com/fluxcapacitor2/easysearch/app/database"
)

// createDB opens a SQLite database stored in a file named "temp.db" inside
// the test's temporary directory and runs its Setup step. Any failure aborts
// the calling test immediately.
func createDB(t *testing.T) database.Database {
	// Mark this function as a helper so that failures are attributed to the
	// line in the calling test rather than to this function.
	t.Helper()

	db, err := database.SQLiteFromFile(path.Join(t.TempDir(), "temp.db"))
	if err != nil {
		t.Fatalf("database creation failed: %v", err)
	}

	if err := db.Setup(); err != nil {
		t.Fatalf("database setup failed: %v", err)
	}

	return db
}

func TestCrawl(t *testing.T) {
db := createDB(t)
source := config.Source{
ID: "example",
AllowedDomains: []string{"www.example.com"},
}

url := "https://www.example.com"
res, err := Crawl(source, 1, url, db, url)

if err != nil {
t.Fatalf("error crawling URL %v: %v\n", url, err)
}

if res.Canonical != url {
t.Fatalf("unexpected canonical: %v != %v\n", res.Canonical, url)
}

if len(res.URLs) != 1 || res.URLs[0] != "https://www.iana.org/domains/example" {
t.Fatalf("unexpected URLs found: %v\n", res.URLs)
}
}

func TestCrawlWithRedirect(t *testing.T) {
db := createDB(t)
source := config.Source{
ID: "example",
AllowedDomains: []string{"bswanson.dev", "www.bswanson.dev"},
}

url := "https://bswanson.dev"
expectedCanonical := "https://www.bswanson.dev/"
res, err := Crawl(source, 1, url, db, url)

if err != nil {
t.Fatalf("error crawling URL %v: %v\n", url, err)
}

if res.Canonical != expectedCanonical {
t.Fatalf("unexpected canonical: %v != %v\n", res.Canonical, expectedCanonical)
}
}

func TestCrawlWithForbiddenDomain(t *testing.T) {
db := createDB(t)
source := config.Source{
ID: "example",
AllowedDomains: []string{"www.example.com"},
}

url := "https://bswanson.dev/portfolio"
_, err := Crawl(source, 1, url, db, url)

if err == nil {
t.Fatalf("expected error due to forbidden domain; none was received")
}
}

// TestCrawlWithServerError crawls https://httpstat.us/500 and verifies that
// Crawl returns an error whose message is "Internal Server Error".
// NOTE(review): this test talks to the live site, so it needs network access.
func TestCrawlWithServerError(t *testing.T) {
	db := createDB(t)
	source := config.Source{
		ID:             "example",
		AllowedDomains: []string{"httpstat.us"},
	}

	url := "https://httpstat.us/500"
	_, err := Crawl(source, 1, url, db, url)

	// Guard against a nil error first: calling err.Error() on a nil error
	// would panic instead of reporting a clean test failure.
	if err == nil {
		t.Fatalf("expected error due to 500 status; got %v\n", err)
	}

	if err.Error() != "Internal Server Error" {
		t.Fatalf("expected error due to 500 status; got %v\n", err)
	}
}

// TestCrawlWithPageNotFound crawls https://httpstat.us/404 and verifies that
// Crawl returns an error whose message is "Not Found".
// NOTE(review): this test talks to the live site, so it needs network access.
func TestCrawlWithPageNotFound(t *testing.T) {
	db := createDB(t)
	source := config.Source{
		ID:             "example",
		AllowedDomains: []string{"httpstat.us"},
	}

	url := "https://httpstat.us/404"
	_, err := Crawl(source, 1, url, db, url)

	// Guard against a nil error first: calling err.Error() on a nil error
	// would panic instead of reporting a clean test failure.
	if err == nil {
		t.Fatalf("expected error due to 404 status; got %v\n", err)
	}

	if err.Error() != "Not Found" {
		t.Fatalf("expected error due to 404 status; got %v\n", err)
	}
}

// TestSitemap crawls https://www.google.com/sitemap.xml and verifies that at
// least 20 URLs are reported in the result.
// NOTE(review): this test talks to the live site, so it needs network access.
func TestSitemap(t *testing.T) {
	db := createDB(t)
	source := config.Source{
		ID:             "example",
		AllowedDomains: []string{"www.google.com"},
	}

	url := "https://www.google.com/sitemap.xml"
	res, err := Crawl(source, 1, url, db, url)

	// Stop here on failure: the result is not meaningful once Crawl reports
	// an error, and reading res.URLs could panic if res is a nil pointer.
	if err != nil {
		t.Fatalf("error crawling Google sitemap: %v\n", err)
	}

	if len(res.URLs) < 20 {
		t.Errorf("sitemap URLs were not discovered - expected >=20 URLs, got %+v\n", res)
	}
}

0 comments on commit 0a35a4e

Please sign in to comment.