Skip to content

Commit

Permalink
Add a config option to limit the maximum text length indexed per page
Browse files Browse the repository at this point in the history
  • Loading branch information
FluxCapacitor2 committed Nov 23, 2024
1 parent c2d3bc6 commit 2a7eca0
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 4 deletions.
2 changes: 2 additions & 0 deletions app/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ type Source struct {
URL string `yaml:"url"`
// The maximum amount of requests per minute that can be made to this source.
Speed int32
// The maximum amount of text content to index per page, in bytes
SizeLimit int `yaml:"sizeLimit"`

AllowedDomains []string `yaml:"allowedDomains"`

Expand Down
23 changes: 22 additions & 1 deletion app/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
}

if !cancelled {
id, addDocErr := db.AddDocument(source.ID, currentDepth, referrer, page.Canonical, page.Status, page.Title, page.Description, page.Content, page.ErrorInfo)
text := Truncate(source.SizeLimit, page.Title, page.Description, page.Content)
id, addDocErr := db.AddDocument(source.ID, currentDepth, referrer, page.Canonical, page.Status, text[0], text[1], text[2], page.ErrorInfo)
result.PageID = id
if addDocErr != nil {
err = addDocErr
Expand All @@ -221,6 +222,26 @@ func Crawl(source config.Source, currentDepth int32, referrer string, db databas
return result, err
}

// Truncate limits the combined length of the given strings to at most `max` bytes.
// Strings are kept in order: each item is included whole while budget remains,
// the first item that exceeds the remaining budget is cut short, and every
// subsequent item becomes the empty string. The returned slice always has the
// same length as `items`.
//
// When an item must be cut mid-string, the cut is moved back to the nearest
// UTF-8 rune boundary so the result never contains an invalid, partial rune
// (important because these values are persisted to the search index).
func Truncate(max int, items ...string) []string {
	ret := make([]string, len(items))
	remaining := max

	for i, item := range items {
		if len(item) <= remaining {
			// Fits entirely within the remaining budget.
			ret[i] = item
			remaining -= len(item)
			continue
		}
		if remaining <= 0 {
			// Budget exhausted; all further items are dropped.
			ret[i] = ""
			continue
		}
		// Partial fit: back off to a rune boundary so we never emit
		// invalid UTF-8 (0b10xxxxxx bytes are UTF-8 continuation bytes).
		cut := remaining
		for cut > 0 && item[cut]&0xC0 == 0x80 {
			cut--
		}
		ret[i] = item[:cut]
		remaining = 0
	}

	return ret
}

// A list of elements that will never contain useful text and should always be filtered out when collecting text content.
// NOTE(review): "object" and "svg" are included because their text children are data/markup rather than prose — confirm against the extraction code that consumes this list.
var nonTextElements = []string{"head", "meta", "script", "style", "noscript", "object", "svg"}

Expand Down
25 changes: 25 additions & 0 deletions app/crawler/crawler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package crawler

import (
"path"
"reflect"
"testing"

"github.com/fluxcapacitor2/easysearch/app/config"
Expand Down Expand Up @@ -132,3 +133,27 @@ func TestSitemap(t *testing.T) {
t.Errorf("sitemap URLs were not discovered - expected >=20 URLs, got %+v\n", res)
}
}

// TestTruncate checks that Truncate enforces the byte budget across a list of
// strings: whole items fit first, the overflowing item is cut short, and all
// later items are emptied.
func TestTruncate(t *testing.T) {

	cases := []struct {
		limit int
		input []string
		want  []string
	}{
		{5, []string{"123", "45", "6"}, []string{"123", "45", ""}},
		{6, []string{"123", "45", "6"}, []string{"123", "45", "6"}},
		{10, []string{"123", "45", "6"}, []string{"123", "45", "6"}},
		{2, []string{"123", "45", "6"}, []string{"12", "", ""}},
		{5, []string{"lorem ipsum"}, []string{"lorem"}},
		{5, []string{"lorem", "", "", "", "", "", "ipsum"}, []string{"lorem", "", "", "", "", "", ""}},
		{10, []string{"lorem", "", "", "", "", "", "ipsum"}, []string{"lorem", "", "", "", "", "", "ipsum"}},
	}

	for _, c := range cases {
		got := Truncate(c.limit, c.input...)
		if !reflect.DeepEqual(got, c.want) {
			t.Fatalf("incorrect Truncate result - expected %#v, got %#v\n", c.want, got)
		}
	}
}
3 changes: 0 additions & 3 deletions app/embedding/chunk_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package embedding

import (
"fmt"
"testing"
)

Expand All @@ -10,8 +9,6 @@ func TestChunkText(t *testing.T) {

for chunkSize := 5; chunkSize < 50; chunkSize += 5 {
for overlap := 0; overlap <= chunkSize/2; overlap++ {
fmt.Printf("testing chunking with size = %v and overlap = %v\n", chunkSize, overlap)

results, err := ChunkText(text, chunkSize, overlap)

if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions config-sample.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ sources:
# The minimum amount of time between refreshes, **in days**.
# In this example, pages are recrawled weekly.
minAge: 7
# The maximum amount of text content to index per page, in bytes
sizeLimit: 200000 # Content will be truncated after 200,000 bytes
embeddings:
enabled: true
# The maximum number of requests per minute to the embeddings API.
Expand Down

0 comments on commit 2a7eca0

Please sign in to comment.