Skip to content

Commit

Permalink
Fix GetCanonical and remove a page's cached canonicals when it is del…
Browse files Browse the repository at this point in the history
…eted
  • Loading branch information
FluxCapacitor2 committed Oct 6, 2024
1 parent d5acc4c commit 734633d
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 2 deletions.
6 changes: 5 additions & 1 deletion app/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,11 @@ func getText(node *html.Node) string {
func Canonicalize(src string, db database.Database, url *url.URL) (*url.URL, error) {

// Check if we already have a canonical URL recorded
canonical, _ := db.GetCanonical(src, url.String())
canonical, err := db.GetCanonical(src, url.String())

if err != nil {
return nil, err
}

if canonical != nil {
return url.Parse(canonical.Canonical)
Expand Down
2 changes: 2 additions & 0 deletions app/database/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ type Database interface {
HasDocument(source string, url string) (*bool, error)
// Fetch the document by URL (or the URL's canonical)
GetDocument(source string, url string) (*Page, error)
// Delete a document by its URL and remove all canonicals pointing to it
RemoveDocument(source string, url string) error

// Run a fulltext search with the given query
Search(sources []string, query string, page uint32, pageSize uint32) ([]Result, *uint32, error)
Expand Down
10 changes: 9 additions & 1 deletion app/database/db_sqlite.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ func (db *SQLiteDatabase) AddDocument(source string, depth int32, referrer strin
return err
}

func (db *SQLiteDatabase) RemoveDocument(source string, url string) error {
_, err := db.conn.Exec("DELETE FROM pages WHERE source = ? AND url = ?;", source, url)
return err
}

func (db *SQLiteDatabase) HasDocument(source string, url string) (*bool, error) {
cursor := db.conn.QueryRow("SELECT 1 FROM pages WHERE source = ? AND (url = ? OR url IN (SELECT canonical FROM canonicals WHERE url = ?));", source, url, url)

Expand Down Expand Up @@ -320,12 +325,15 @@ func (db *SQLiteDatabase) QueuePagesOlderThan(source string, daysAgo int32) erro
}

func (db *SQLiteDatabase) GetCanonical(source string, url string) (*Canonical, error) {
cursor := db.conn.QueryRow("SELECT original, canonical, crawledAt FROM canonicals WHERE source = ? AND url = ?", source, url)
cursor := db.conn.QueryRow("SELECT url, canonical, crawledAt FROM canonicals WHERE source = ? AND url = ?", source, url)

canonical := &Canonical{}
err := cursor.Scan(&canonical.Original, &canonical.Canonical, &canonical.CrawledAt)

if err != nil {
if err == sql.ErrNoRows {
return nil, nil
}
return nil, err
}
return canonical, nil
Expand Down
5 changes: 5 additions & 0 deletions app/database/db_sqlite_setup.sql
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
content=pages
);

-- When a page is deleted, delete its canonicals too
CREATE TRIGGER IF NOT EXISTS delete_page_canonicals_on_page_delete AFTER DELETE ON pages BEGIN
DELETE FROM canonicals WHERE source = old.source AND canonical = old.url;
END;

-- Use triggers to automatically sync the FTS table with the content table
-- https://sqlite.org/fts5.html#external_content_tables
CREATE TRIGGER IF NOT EXISTS pages_auto_insert AFTER INSERT ON pages BEGIN
Expand Down
26 changes: 26 additions & 0 deletions app/database/db_sqlite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,32 @@ func TestGetDocument(t *testing.T) {
}
}

func TestDeleteCanonicalsOnDeletePage(t *testing.T) {
db := createDB(t)

err := db.AddDocument("source1", 0, "", "https://example.com/", Finished, "Title", "Description", "Content", "")
if err != nil {
t.Fatalf("failed to add page: %v", err)
}
err = db.SetCanonical("source1", "https://www.example.com/", "https://example.com/")
if err != nil {
t.Fatalf("failed to set canonical: %v", err)
}
err = db.RemoveDocument("source1", "https://example.com/")
if err != nil {
t.Fatalf("failed to delete document: %v", err)
}

canonical, err := db.GetCanonical("source1", "https://www.example.com/")
if err != nil {
t.Fatalf("failed to get canonical: %v", err)
}

if canonical != nil {
t.Fatalf("canonical was not deleted: expected nil, got %+v", canonical)
}
}

func TestSearchQuery(t *testing.T) {
db := createDB(t)

Expand Down

0 comments on commit 734633d

Please sign in to comment.