From 734633df84945de92d31c9fd4eafc7212bfee96e Mon Sep 17 00:00:00 2001 From: FluxCapacitor2 <31071265+FluxCapacitor2@users.noreply.github.com> Date: Sun, 6 Oct 2024 15:27:53 -0400 Subject: [PATCH] Fix GetCanonical and remove a page's cached canonicals when it is deleted --- app/crawler/crawler.go | 6 +++++- app/database/db.go | 2 ++ app/database/db_sqlite.go | 10 +++++++++- app/database/db_sqlite_setup.sql | 5 +++++ app/database/db_sqlite_test.go | 26 ++++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 2 deletions(-) diff --git a/app/crawler/crawler.go b/app/crawler/crawler.go index 5947db2..850a4f6 100644 --- a/app/crawler/crawler.go +++ b/app/crawler/crawler.go @@ -215,7 +215,11 @@ func getText(node *html.Node) string { func Canonicalize(src string, db database.Database, url *url.URL) (*url.URL, error) { // Check if we already have a canonical URL recorded - canonical, _ := db.GetCanonical(src, url.String()) + canonical, err := db.GetCanonical(src, url.String()) + + if err != nil { + return nil, err + } if canonical != nil { return url.Parse(canonical.Canonical) diff --git a/app/database/db.go b/app/database/db.go index 967ec85..c036bdc 100644 --- a/app/database/db.go +++ b/app/database/db.go @@ -10,6 +10,8 @@ type Database interface { HasDocument(source string, url string) (*bool, error) // Fetch the document by URL (or the URL's canonical) GetDocument(source string, url string) (*Page, error) + // Delete a document by its URL and remove all canonicals pointing to it + RemoveDocument(source string, url string) error // Run a fulltext search with the given query Search(sources []string, query string, page uint32, pageSize uint32) ([]Result, *uint32, error) diff --git a/app/database/db_sqlite.go b/app/database/db_sqlite.go index 39e05f5..56eb408 100644 --- a/app/database/db_sqlite.go +++ b/app/database/db_sqlite.go @@ -32,6 +32,11 @@ func (db *SQLiteDatabase) AddDocument(source string, depth int32, referrer strin return err } +func (db *SQLiteDatabase) RemoveDocument(source string, url string) error { + _, err := db.conn.Exec("DELETE FROM pages WHERE source = ? AND url = ?;", source, url) + return err +} + func (db *SQLiteDatabase) HasDocument(source string, url string) (*bool, error) { cursor := db.conn.QueryRow("SELECT 1 FROM pages WHERE source = ? AND (url = ? OR url IN (SELECT canonical FROM canonicals WHERE url = ?));", source, url, url) @@ -320,12 +325,15 @@ func (db *SQLiteDatabase) QueuePagesOlderThan(source string, daysAgo int32) erro } func (db *SQLiteDatabase) GetCanonical(source string, url string) (*Canonical, error) { - cursor := db.conn.QueryRow("SELECT original, canonical, crawledAt FROM canonicals WHERE source = ? AND url = ?", source, url) + cursor := db.conn.QueryRow("SELECT url, canonical, crawledAt FROM canonicals WHERE source = ? AND url = ?", source, url) canonical := &Canonical{} err := cursor.Scan(&canonical.Original, &canonical.Canonical, &canonical.CrawledAt) if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } return nil, err } return canonical, nil diff --git a/app/database/db_sqlite_setup.sql b/app/database/db_sqlite_setup.sql index 8e94b76..8cb8893 100644 --- a/app/database/db_sqlite_setup.sql +++ b/app/database/db_sqlite_setup.sql @@ -55,6 +55,11 @@ CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5( content=pages ); +-- When a page is deleted, delete its canonicals too +CREATE TRIGGER IF NOT EXISTS delete_page_canonicals_on_page_delete AFTER DELETE ON pages BEGIN + DELETE FROM canonicals WHERE source = old.source AND canonical = old.url; +END; + -- Use triggers to automatically sync the FTS table with the content table -- https://sqlite.org/fts5.html#external_content_tables CREATE TRIGGER IF NOT EXISTS pages_auto_insert AFTER INSERT ON pages BEGIN diff --git a/app/database/db_sqlite_test.go b/app/database/db_sqlite_test.go index 6d5b172..ce1efc9 100644 --- a/app/database/db_sqlite_test.go +++ b/app/database/db_sqlite_test.go @@ -178,6 +178,32 @@ func TestGetDocument(t *testing.T) { } } +func TestDeleteCanonicalsOnDeletePage(t *testing.T) { + db := createDB(t) + + err := db.AddDocument("source1", 0, "", "https://example.com/", Finished, "Title", "Description", "Content", "") + if err != nil { + t.Fatalf("failed to add page: %v", err) + } + err = db.SetCanonical("source1", "https://www.example.com/", "https://example.com/") + if err != nil { + t.Fatalf("failed to set canonical: %v", err) + } + err = db.RemoveDocument("source1", "https://example.com/") + if err != nil { + t.Fatalf("failed to delete document: %v", err) + } + + canonical, err := db.GetCanonical("source1", "https://www.example.com/") + if err != nil { + t.Fatalf("failed to get canonical: %v", err) + } + + if canonical != nil { + t.Fatalf("canonical was not deleted: expected nil, got %+v", canonical) + } +} + func TestSearchQuery(t *testing.T) { db := createDB(t)