From 2a7eca01df307888052f9241eb83a7c6c706a142 Mon Sep 17 00:00:00 2001
From: FluxCapacitor2 <31071265+FluxCapacitor2@users.noreply.github.com>
Date: Sat, 23 Nov 2024 03:28:14 -0500
Subject: [PATCH] Add a config option to limit the maximum text length indexed
 per page

---
 app/config/config.go        |  2 ++
 app/crawler/crawler.go      | 23 ++++++++++++++++++++++-
 app/crawler/crawler_test.go | 25 +++++++++++++++++++++++++
 app/embedding/chunk_test.go |  3 ---
 config-sample.yml           |  2 ++
 5 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/app/config/config.go b/app/config/config.go
index 0214abb..c5030ad 100644
--- a/app/config/config.go
+++ b/app/config/config.go
@@ -34,6 +34,8 @@ type Source struct {
 	URL string `yaml:"url"`
 	// The maximum amount of requests per minute that can be made to this source.
 	Speed int32
+	// The maximum amount of text content to index per page, in bytes
+	SizeLimit int `yaml:"sizeLimit"`
 
 	AllowedDomains []string `yaml:"allowedDomains"`
 
diff --git a/app/crawler/crawler.go b/app/crawler/crawler.go
index ae07563..e8a98c4 100644
--- a/app/crawler/crawler.go
+++ b/app/crawler/crawler.go
@@ -211,7 +211,8 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
 	}
 
 	if !cancelled {
-		id, addDocErr := db.AddDocument(source.ID, currentDepth, referrer, page.Canonical, page.Status, page.Title, page.Description, page.Content, page.ErrorInfo)
+		text := Truncate(source.SizeLimit, page.Title, page.Description, page.Content)
+		id, addDocErr := db.AddDocument(source.ID, currentDepth, referrer, page.Canonical, page.Status, text[0], text[1], text[2], page.ErrorInfo)
 		result.PageID = id
 		if addDocErr != nil {
 			err = addDocErr
@@ -221,6 +222,26 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
 	return result, err
 }
 
+func Truncate(max int, items ...string) []string {
+	ret := make([]string, len(items))
+	remaining := max
+
+	for i, item := range items {
+		if len(item) <= remaining {
+			ret[i] = item
+			remaining -= len(item)
+		} else if remaining > 0 {
+			added := item[:remaining]
+			ret[i] = added
+			remaining -= len(added)
+		} else {
+			ret[i] = ""
+		}
+	}
+
+	return ret
+}
+
 // A list of elements that will never contain useful text and should always be filtered out when collecting text content.
 var nonTextElements = []string{"head", "meta", "script", "style", "noscript", "object", "svg"}
 
diff --git a/app/crawler/crawler_test.go b/app/crawler/crawler_test.go
index 2694f79..5300744 100644
--- a/app/crawler/crawler_test.go
+++ b/app/crawler/crawler_test.go
@@ -2,6 +2,7 @@ package crawler
 
 import (
 	"path"
+	"reflect"
 	"testing"
 
 	"github.com/fluxcapacitor2/easysearch/app/config"
@@ -132,3 +133,27 @@ func TestSitemap(t *testing.T) {
 		t.Errorf("sitemap URLs were not discovered - expected >=20 URLs, got %+v\n", res)
 	}
 }
+
+func TestTruncate(t *testing.T) {
+
+	tests := []struct {
+		max      int
+		strings  []string
+		expected []string
+	}{
+		{5, []string{"123", "45", "6"}, []string{"123", "45", ""}},
+		{6, []string{"123", "45", "6"}, []string{"123", "45", "6"}},
+		{10, []string{"123", "45", "6"}, []string{"123", "45", "6"}},
+		{2, []string{"123", "45", "6"}, []string{"12", "", ""}},
+		{5, []string{"lorem ipsum"}, []string{"lorem"}},
+		{5, []string{"lorem", "", "", "", "", "", "ipsum"}, []string{"lorem", "", "", "", "", "", ""}},
+		{10, []string{"lorem", "", "", "", "", "", "ipsum"}, []string{"lorem", "", "", "", "", "", "ipsum"}},
+	}
+
+	for _, test := range tests {
+		result := Truncate(test.max, test.strings...)
+		if !reflect.DeepEqual(result, test.expected) {
+			t.Fatalf("incorrect Truncate result - expected %#v, got %#v\n", test.expected, result)
+		}
+	}
+}
diff --git a/app/embedding/chunk_test.go b/app/embedding/chunk_test.go
index dccaeab..ee510da 100644
--- a/app/embedding/chunk_test.go
+++ b/app/embedding/chunk_test.go
@@ -1,7 +1,6 @@
 package embedding
 
 import (
-	"fmt"
 	"testing"
 )
 
@@ -10,8 +9,6 @@ func TestChunkText(t *testing.T) {
 
 	for chunkSize := 5; chunkSize < 50; chunkSize += 5 {
 		for overlap := 0; overlap <= chunkSize/2; overlap++ {
-			fmt.Printf("testing chunking with size = %v and overlap = %v\n", chunkSize, overlap)
-
 			results, err := ChunkText(text, chunkSize, overlap)
 
 			if err != nil {
diff --git a/config-sample.yml b/config-sample.yml
index 1570eb3..503db7c 100644
--- a/config-sample.yml
+++ b/config-sample.yml
@@ -39,6 +39,8 @@ sources:
     # The minimum amount of time between refreshes, **in days**.
     # In this example, pages are recrawled weekly.
    minAge: 7
+    # The maximum amount of text content to index per page, in bytes
+    sizeLimit: 200000 # Content will be truncated after 200,000 bytes
    embeddings:
      enabled: true
      # The maximum number of requests per minute to the embeddings API.
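
Below is a minimal usage sketch of the new helper, not part of the patch itself. It assumes the
github.com/fluxcapacitor2/easysearch/app/crawler import path implied by the repository layout in
the diff, with Truncate exported as added above. The byte budget is consumed left to right, so the
title is kept in preference to the description, and the description in preference to the page
content, mirroring the argument order used in Crawl:

package main

import (
	"fmt"

	"github.com/fluxcapacitor2/easysearch/app/crawler"
)

func main() {
	// Share a 10-byte budget across title, description, and content, in that
	// order. Once the budget runs out, the current item is cut short and any
	// remaining items become empty strings.
	fields := crawler.Truncate(10, "Title", "Description", "Content")
	fmt.Printf("%#v\n", fields) // prints []string{"Title", "Descr", ""}
}

Since a budget of 0 truncates every field to the empty string, the example assumes sizeLimit is set
to a positive value, as in the sample config.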