Skip to content

Commit

Permalink
Move DB setup logic to its own file, simplify AddDocument, and use triggers to update FTS virtual table
Browse files Browse the repository at this point in the history
  • Loading branch information
FluxCapacitor2 committed Aug 8, 2024
1 parent 3eca33b commit 65938e5
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 143 deletions.
6 changes: 2 additions & 4 deletions app/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,12 @@ func Crawl(source config.Source, currentDepth int32, db database.Database, pageU
for _, item := range element.DOM.Nodes {
content += getText(item)
}
_, err = db.AddDocument(source.ID, currentDepth, canonical, database.Finished, title, description, content)
err = db.AddDocument(source.ID, currentDepth, canonical, database.Finished, title, description, content)
} else {

if len(title) == 0 {
title = article.Title
}

_, err = db.AddDocument(source.ID, currentDepth, canonical, database.Finished, title, description, article.TextContent)
err = db.AddDocument(source.ID, currentDepth, canonical, database.Finished, title, description, article.TextContent)
}

if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion app/database/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ type Database interface {
Setup() error

// Add a page to the search index.
AddDocument(source string, depth int32, url string, status QueueItemStatus, title string, description string, content string) (*Page, error)
AddDocument(source string, depth int32, url string, status QueueItemStatus, title string, description string, content string) error
// Returns whether the given URL (or the URL's canonical) is indexed
HasDocument(source string, url string) (*bool, error)

Expand Down
150 changes: 14 additions & 136 deletions app/database/db_sqlite.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import (
"regexp"
"strings"

_ "embed"

"github.com/google/uuid"
_ "github.com/mattn/go-sqlite3"
)
Expand All @@ -17,146 +19,22 @@ type SQLiteDatabase struct {
conn *sql.DB
}

func (db *SQLiteDatabase) Setup() error {
{
// Enable write-ahead logging for improved write performance (https://www.sqlite.org/wal.html)
_, err := db.conn.Exec("PRAGMA journal_mode=WAL;")

if err != nil {
return err
}
}

// TODO: use a composite index on the `source` AND `url` column instead of making `url` globally unique

{
_, err := db.conn.Exec(`
CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5 (
source UNINDEXED,
url,
title,
description,
content,
status UNINDEXED
)
`)

if err != nil {
return err
}
}

{
_, err := db.conn.Exec(`
CREATE TABLE IF NOT EXISTS pages (
source TEXT NOT NULL,
url TEXT NOT NULL UNIQUE,
crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP,
depth INTEGER NOT NULL,
status INTEGER NOT NULL
)
`)

if err != nil {
//go:embed db_sqlite_setup.sql
var setupCommands string

return err
}
}

{
_, err := db.conn.Exec(`
CREATE TABLE IF NOT EXISTS crawl_queue (
source TEXT NOT NULL,
url TEXT NOT NULL UNIQUE,
status INTEGER DEFAULT 0,
depth INTEGER,
addedAt DATETIME DEFAULT CURRENT_TIMESTAMP,
updatedAt DATETIME DEFAULT CURRENT_TIMESTAMP
)
`)

if err != nil {
return err
}
}

{
_, err := db.conn.Exec(`
CREATE TABLE IF NOT EXISTS canonicals (
source TEXT NOT NULL,
url TEXT NOT NULL UNIQUE,
canonical TEXT NOT NULL,
crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP
)
`)

if err != nil {
return err
}
}

return nil
// Setup initializes the SQLite database by running the embedded schema
// script (db_sqlite_setup.sql), which sets pragmas and creates the
// tables, indexes, and FTS sync triggers if they do not already exist.
func (db *SQLiteDatabase) Setup() error {
	if _, err := db.conn.Exec(setupCommands); err != nil {
		return err
	}
	return nil
}

func (db *SQLiteDatabase) AddDocument(source string, depth int32, url string, status QueueItemStatus, title string, description string, content string) (*Page, error) {
tx, err := db.conn.Begin()

if err != nil {
fmt.Printf("Error starting transaction: %v\n", err)
return nil, err
}

// Remove old entries
{
_, err := tx.Exec(`
DELETE FROM crawl_queue WHERE source = ? AND url = ?;
DELETE FROM pages WHERE source = ? AND url = ?;
DELETE FROM pages_fts WHERE source = ? AND url = ?;
`, source, url, source, url, source, url)

if err != nil {
fmt.Printf("Error removing old entries: %v\n", err)
return nil, err
}
}

// Insert new records for the page (to prevent duplicates and record crawl time as a DATETIME) and the FTS entry (for user search queries)
{
_, err := tx.Exec("INSERT INTO pages (source, url, depth, status) VALUES (?, ?, ?, ?)", source, url, depth, status)
if err != nil {
fmt.Printf("Error inserting new page: %v\n", err)
return nil, err
}
}

result := tx.QueryRow("INSERT INTO pages_fts (source, url, title, description, content, status) VALUES (?, ?, ?, ?, ?, ?) RETURNING *", source, url, title, description, content, status)

// Return the newly-inserted row
row := &Page{}

{
err := result.Scan(&row.Source, &row.URL, &row.Title, &row.Description, &row.Content, &row.Status)

if err != nil {
fmt.Printf("Error scanning inserted row: %v\n", err)
return nil, err
}
}

{
err := tx.Commit()
if err != nil {
fmt.Printf("Error inserting new FTS entry: %v\n", err)
return nil, err
}
}

return row, nil
// AddDocument records a crawled page in the `pages` table. The
// pages_auto_insert / pages_auto_update triggers keep the pages_fts
// full-text index in sync and remove any matching crawl_queue entry,
// so no explicit transaction or FTS write is needed here.
//
// Uses an upsert keyed on the (source, url) unique index so that
// re-crawling an already-indexed URL updates the existing row instead
// of failing with a UNIQUE constraint violation (the previous
// implementation deleted old rows before inserting).
func (db *SQLiteDatabase) AddDocument(source string, depth int32, url string, status QueueItemStatus, title string, description string, content string) error {
	_, err := db.conn.Exec(`
		INSERT INTO pages (source, depth, status, url, title, description, content)
		VALUES (?, ?, ?, ?, ?, ?, ?)
		ON CONFLICT (source, url) DO UPDATE SET
			depth = excluded.depth,
			status = excluded.status,
			title = excluded.title,
			description = excluded.description,
			content = excluded.content,
			crawledAt = CURRENT_TIMESTAMP;
	`, source, depth, status, url, title, description, content)
	return err
}

func (db *SQLiteDatabase) HasDocument(source string, url string) (*bool, error) {
// TODO: SELECTing the URL is unnecessary. we can just use a "SELECT 1" and see if any rows were returned.
cursor := db.conn.QueryRow("SELECT url FROM pages WHERE source = ? AND (url = ? OR url IN (SELECT canonical FROM canonicals WHERE url = ?))", source, url, url)
cursor := db.conn.QueryRow("SELECT url FROM pages WHERE source = ? AND (url = ? OR url IN (SELECT canonical FROM canonicals WHERE url = ?));", source, url, url)

page := &Page{}
err := cursor.Scan(&page.URL)
Expand Down Expand Up @@ -205,9 +83,9 @@ func (db *SQLiteDatabase) Search(sources []string, search string, page uint32, p
query := fmt.Sprintf(`SELECT
rank,
url,
highlight(pages_fts, 2, ?, ?) title,
snippet(pages_fts, 3, ?, ?, '…', 8) description,
snippet(pages_fts, 4, ?, ?, '…', 24) content
highlight(pages_fts, 3, ?, ?) title,
snippet(pages_fts, 4, ?, ?, '…', 8) description,
snippet(pages_fts, 5, ?, ?, '…', 24) content
FROM pages_fts WHERE source IN (%s) AND status = ? AND pages_fts MATCH ? ORDER BY rank LIMIT ? OFFSET ?;
`, strings.Repeat("?, ", len(sources)-1)+"?")

Expand Down
69 changes: 69 additions & 0 deletions app/database/db_sqlite_setup.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
-- Enable write-ahead logging for improved write performance (https://www.sqlite.org/wal.html)
PRAGMA journal_mode = wal;

CREATE TABLE IF NOT EXISTS crawl_queue(
    source TEXT NOT NULL,
    url TEXT NOT NULL UNIQUE,
    status INTEGER DEFAULT 0, -- Pending
    depth INTEGER,
    addedAt DATETIME DEFAULT CURRENT_TIMESTAMP,
    updatedAt DATETIME DEFAULT CURRENT_TIMESTAMP
);

-- When a canonical URL is discovered, it is cached in this table to prevent excessively querying the target
CREATE TABLE IF NOT EXISTS canonicals(
    source TEXT NOT NULL,
    url TEXT NOT NULL UNIQUE,
    canonical TEXT NOT NULL,
    crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP
);

-- After a page is crawled, it is added to this table
CREATE TABLE IF NOT EXISTS pages(
    source TEXT NOT NULL,

    crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP,
    depth INTEGER NOT NULL,
    status INTEGER NOT NULL,

    url TEXT NOT NULL,
    title TEXT,
    description TEXT,
    content TEXT
);

-- Ensure a page can only be added once per source
CREATE UNIQUE INDEX IF NOT EXISTS page_source_url ON pages(source, url);

-- Create a full-text search table
CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
    source UNINDEXED,
    status UNINDEXED,

    url,
    title,
    description,
    content,

    -- Specify that this FTS table is contentless and gets its content from the `pages` table
    content=pages
);

-- Use triggers to automatically sync the FTS table with the content table
-- https://sqlite.org/fts5.html#external_content_tables
CREATE TRIGGER IF NOT EXISTS pages_auto_insert AFTER INSERT ON pages BEGIN
    INSERT INTO pages_fts(rowid, source, status, url, title, description, content) VALUES (new.rowid, new.source, new.status, new.url, new.title, new.description, new.content);
    -- Remove crawl queue entry if it exists
    DELETE FROM crawl_queue WHERE source = new.source AND url = new.url;
END;

CREATE TRIGGER IF NOT EXISTS pages_auto_delete AFTER DELETE ON pages BEGIN
    -- FIX: the value list must line up with the column list. The previous
    -- version listed `rowid` fourth but passed `old.rowid` first, so every
    -- value after 'delete' landed in the wrong column and the FTS index
    -- would be corrupted on delete.
    INSERT INTO pages_fts(pages_fts, rowid, source, status, url, title, description, content) VALUES ('delete', old.rowid, old.source, old.status, old.url, old.title, old.description, old.content);
END;

CREATE TRIGGER IF NOT EXISTS pages_auto_update AFTER UPDATE ON pages BEGIN
    INSERT INTO pages_fts(pages_fts, rowid, source, status, url, title, description, content) VALUES ('delete', old.rowid, old.source, old.status, old.url, old.title, old.description, old.content);
    -- FIX: the previous re-insert listed 5 columns but supplied only 4
    -- values (starting at new.url, with no new.rowid), which is a syntax
    -- error at trigger-fire time and also dropped source/status.
    INSERT INTO pages_fts(rowid, source, status, url, title, description, content) VALUES (new.rowid, new.source, new.status, new.url, new.title, new.description, new.content);
    -- Remove crawl queue entry if it exists
    DELETE FROM crawl_queue WHERE source = new.source AND url = new.url;
END;
5 changes: 3 additions & 2 deletions app/easysearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
)

// TODO: look into dependency injection instead of passing the DB and config into every function call
// TODO: add a command-line option to rebuild the search index (https://sqlite.org/fts5.html#the_rebuild_command)

func main() {

Expand Down Expand Up @@ -74,7 +75,7 @@ func startCrawl(db database.Database, config *config.Config) {
exists, err := db.HasDocument(src.ID, src.URL)

if err != nil {
fmt.Printf("Failed to look up document %v in pages table\n", err)
fmt.Printf("Failed to look up document '%v'/'%v' in pages table: %v\n", src.ID, src.URL, err)
} else {
if !*exists {
// If the document wasn't found, it should be added to the queue
Expand Down Expand Up @@ -136,7 +137,7 @@ func consumeQueue(db database.Database, config *config.Config) {

// Add an entry to the pages table to prevent immediately recrawling the same URL when referred from other sources
if result != nil {
_, err := db.AddDocument(src.ID, item.Depth, result.Canonical, database.Error, "", "", "")
err := db.AddDocument(src.ID, item.Depth, result.Canonical, database.Error, "", "", "")
if err != nil {
fmt.Printf("Failed to add page in 'error' state: %v\n", err)
}
Expand Down

0 comments on commit 65938e5

Please sign in to comment.