Skip to content

Commit

Permalink
update keyword extraction and reranking
Browse files Browse the repository at this point in the history
  • Loading branch information
yujonglee committed Aug 20, 2024
1 parent ad4fb57 commit fcb484e
Show file tree
Hide file tree
Showing 24 changed files with 65 additions and 52 deletions.
2 changes: 1 addition & 1 deletion core/config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ config :canary, :root, File.cwd!()

config :canary, Oban,
engine: Oban.Engines.Basic,
queues: [fetcher: 1, email: 2, updater: 5, stripe: 50, keyword: 20],
queues: [fetcher: 1, email: 2, updater: 5, stripe: 50, summary: 20],
repo: Canary.Repo,
plugins: [
{Oban.Plugins.Pruner, max_age: 60 * 60 * 24 * 7},
Expand Down
5 changes: 1 addition & 4 deletions core/lib/canary/query/understander.ex
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,12 @@ defmodule Canary.Query.Understander.LLM do
Your job is to analyze the user's query and return a structured response like below:
<analysis>
<query>QUERY</query>
<keywords>KEYWORD_1,KEYWORD_2,KEYWORD_3</keywords>
</analysis>
IMPORTANT NOTES:
- <keywords></keywords> should contain comma separated list of keywords. MAX 3 keywords are allowed.
- Each "keyword" should be one or two words. It will be used to run keyword based search. User '#{@keywords_section}' section for inspiration.
- The "query" is typo-corrected version of the user's raw query for better search results, with minimal formatting applied.
- Do not omit prepositions or details that are not mentioned in the user's query.
- Each "keyword" must be a single word. It will be used to run keyword based search. User '#{@keywords_section}' section for inspiration.
Do not include any other text, just respond with the XML-like format that I provided.
If user's query is totally nonsense, just return <analysis></analysis>.
Expand Down
11 changes: 7 additions & 4 deletions core/lib/canary/reranker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ defmodule Canary.Reranker do
@callback run(
query :: String.t(),
docs :: list(any()),
renderer :: function()
opts :: keyword()
) :: {:ok, list(any())} | {:error, any()}

def run(_, _, _ \\ fn doc -> doc end)
def run(_, _, _ \\ [])
def run(_, [], _), do: {:ok, []}
def run(query, docs, renderer), do: impl().run(query, docs, renderer)
def run(query, docs, opts), do: impl().run(query, docs, opts)

defp impl(), do: Application.get_env(:canary, :reranker, Canary.Reranker.Noop)
end
Expand All @@ -19,7 +19,10 @@ defmodule Canary.Reranker.Cohere do

use Retry

def run(query, docs, renderer, threshold \\ 0.5) do
def run(query, docs, opts) do
renderer = opts[:renderer] || fn doc -> doc end
threshold = opts[:threshold] || 0

result =
retry with: exponential_backoff() |> randomize |> cap(1_000) |> expiry(4_000) do
request(query, docs, renderer)
Expand Down
9 changes: 5 additions & 4 deletions core/lib/canary/searcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ defmodule Canary.Searcher.Default do

keywords =
source.documents
|> Enum.map(& &1.keywords)
|> Enum.map(fn doc -> if doc.summary, do: doc.summary.keywords, else: [] end)
|> Enum.flat_map(& &1)
|> Enum.frequencies()
|> Enum.map(fn {k, v} -> if v > 0.5 * docs_size, do: k, else: nil end)
Expand All @@ -48,14 +48,15 @@ defmodule Canary.Searcher.Default do
{:ok, docs} <- Canary.Index.batch_search_documents(source.id, analysis.keywords),
{:ok, reranked} <-
Canary.Reranker.run(
analysis.query,
query,
Enum.dedup_by(docs, & &1.id),
fn doc -> doc.content end
renderer: fn doc -> doc.content end,
threshold: 0.05
) do
{:ok,
%{
search: reranked,
suggestion: %{questions: [analysis.query]}
suggestion: %{questions: [query]}
}}
end
end
Expand Down
12 changes: 6 additions & 6 deletions core/lib/canary/sources/document.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ defmodule Canary.Sources.Document do
attribute :url, :string, allow_nil?: true
attribute :content, :binary, allow_nil?: false
attribute :chunks, {:array, Canary.Sources.Chunk}, default: []
attribute :keywords, {:array, :string}, default: []
attribute :summary, Canary.Sources.DocumentSummary, allow_nil?: true
end

identities do
Expand Down Expand Up @@ -64,14 +64,14 @@ defmodule Canary.Sources.Document do
change Canary.Sources.Document.Changes.DestroyChunks
end

update :update_kewords do
argument :keywords, {:array, :string}, allow_nil?: false
change set_attribute(:keywords, arg(:keywords))
update :update_summary do
argument :summary, Canary.Sources.DocumentSummary, allow_nil?: false
change set_attribute(:summary, arg(:summary))
end
end

code_interface do
define :update_kewords, args: [:keywords], action: :update_kewords
define :update_summary, args: [:summary], action: :update_summary
define :find_by_chunk_index_id, args: [:id], action: :find_by_chunk_index_id
end

Expand Down Expand Up @@ -161,7 +161,7 @@ defmodule Canary.Sources.Document.Changes.CreateSummary do
changeset
|> Ash.Changeset.after_action(fn _changeset, record ->
%{"document_id" => record.id}
|> Canary.Workers.Keywords.new()
|> Canary.Workers.Summary.new()
|> Oban.insert()

{:ok, record}
Expand Down
9 changes: 9 additions & 0 deletions core/lib/canary/sources/document_summary.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
defmodule Canary.Sources.DocumentSummary do
  @moduledoc """
  Embedded resource stored in the `summary` attribute of `Canary.Sources.Document`.

  It currently carries a single field, `keywords`: the list of keywords
  extracted for the document.
  """

  use Ash.Resource,
    domain: Canary.Sources,
    data_layer: :embedded

  attributes do
    # Keywords are plain strings (the summary worker splits, trims and downcases
    # text before building this struct), so the element type is `:string`.
    # `:struct` is intended for struct/map values, not binaries.
    attribute :keywords, {:array, :string}, allow_nil?: false, default: []
  end
end
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
defmodule Canary.Workers.Keywords do
use Oban.Worker, queue: :keyword, max_attempts: 3
defmodule Canary.Workers.Summary do
use Oban.Worker, queue: :summary, max_attempts: 3

@impl true
def perform(%Oban.Job{args: %{"document_id" => id}}) do
Expand All @@ -18,7 +18,7 @@ defmodule Canary.Workers.Keywords do
]

with {:ok, completion} <- Canary.AI.chat(%{model: chat_model, messages: messages}),
{:ok, _} <- Canary.Sources.Document.update_kewords(doc, transform(completion)) do
{:ok, _} <- Canary.Sources.Document.update_summary(doc, to_summary(completion)) do
:ok
end
end
Expand All @@ -45,15 +45,18 @@ defmodule Canary.Workers.Keywords do
}
end

defp transform(completion) do
~r/<keywords>(.*?)<\/keywords>/s
|> Regex.scan(completion, capture: :all_but_first)
|> Enum.flat_map(fn [keywords] ->
keywords
|> String.split(",")
|> Enum.map(&String.trim/1)
|> Enum.map(&String.downcase/1)
end)
|> Enum.uniq()
defp to_summary(completion) do
keywords =
~r/<keywords>(.*?)<\/keywords>/s
|> Regex.scan(completion, capture: :all_but_first)
|> Enum.flat_map(fn [keywords] ->
keywords
|> String.split(",")
|> Enum.map(&String.trim/1)
|> Enum.map(&String.downcase/1)
end)
|> Enum.uniq()

%Canary.Sources.DocumentSummary{keywords: keywords}
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ defmodule Canary.Repo.Migrations.InitResources do
add :url, :text
add :content, :binary, null: false
add :chunks, {:array, :map}, default: []
add :keywords, {:array, :text}, default: []
add :summary, :map

add :source_id,
references(:sources,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "98879BAC60B4ADD10E3B06B53567B604570352EE9C6327B6BF99833344A47811",
"hash": "383D341378EDC3629A559A480BD4847F2655F2DC4BFA5A068CE012FD1AFB7DE3",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "AC6DF984994063A10E9B27B57B224D4DD8ED58DA0A6C769CAA300ED8C86E94BC",
"hash": "DD6CEC340B8F6643E4DD1BFA0B4A487E5AE4B9FFAA12E828AC4AA722285A1652",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "B35DD5FE904796F7D3D7C40719CF58869119425EF836447CB60CFDE2D54F310F",
"hash": "316149EB3CAEC59B7CFEA63AB0FF5D338AFD0337C427241F6EC7D0EB1CA121BB",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "A51E691C47FC92D0C1C60EF69B74497BFF914204249F0A776972EC6E70C48E9C",
"hash": "09AABB4F17080666362E79F9BFF51BBA47BDB1A0A1FA59BEC53142B29A30D717",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "5C051176E120A13AF02BCD4D2A1DCD854AB200C2F758FCB81830FA568897FD8A",
"hash": "EFDAC643A9F3A4935E330675ED67E5B570662B56B241333379604D9877B14B4C",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "DCCD9248AFCC71CCB74AF01AC1FBC59D92DFA703BEC9C5FF1343200658033FCB",
"hash": "35DAC4B4059AAF23C1590A8C79553316ABBA5ED4096B5E374271D12D661A3F1D",
"identities": [
{
"all_tenants?": false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@
},
{
"allow_nil?": true,
"default": "[]",
"default": "nil",
"generated?": false,
"primary_key?": false,
"references": null,
"size": null,
"source": "keywords",
"type": ["array", "text"]
"source": "summary",
"type": "map"
},
{
"allow_nil?": true,
Expand Down Expand Up @@ -105,7 +105,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "413666EB4B9889C919C94F09DC4ED831ECD16F86E61C56A238AABEA0C557CBA8",
"hash": "D80535A8B54308E72279E976190AD4A2A3EE9E99CA936B31F751848CA4BFE9C5",
"identities": [
{
"all_tenants?": false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": false,
"hash": "713C62CB8FAC47ACAEEA948F98A97FD49476C8DD7B5DCDFAD3CB8C5AD32D6919",
"hash": "6B54196233F50E22ACF8F95D69DC4968D0DEDB58D6061CB3B3B2F0E85A6B9B64",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "5BAA7E570E0A1162D19F01D6E8A99198A0138B460740DD9B10890CA855E21572",
"hash": "2E341957FC52089503F12B7B6489F131C4BBEDDAAB24BDAC75AE0B17EA6516B8",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "EE6332081152F2B571D07826708C5A55DA38AAC8CF086DD81D9DF3056459DE47",
"hash": "FEA2BF4D2BEA6C2FE2268550DC11C24094F408AE5D6BDFFA70C56A1735DB36AA",
"identities": [
{
"all_tenants?": false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "BCD1EFEF0CCA7B40A32546E39C79C72201CF910AE8812DA99B35701A00B5EF9E",
"hash": "475E0A6A723B37EAA2F064A7F9BF9EB303A11A8E38031405371870393B30245A",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "0456C71B7218EC634EA5DCE52A9479E43236ECD5421B066DB8FA2F16E7F1DD38",
"hash": "64CCDBC6E40C48594E0D5E9AD9FDE69D56FD49D616171034F1F1F4E14922F3B6",
"identities": [
{
"all_tenants?": false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "40D24862C7D33396BA0B110B7F0F4BDD0D69404E9B3EC0B8F771BD31AC98E762",
"hash": "9529EFFA4B5AB7FFD6809D4E6C2318229D658F9C131A24BAF9CE6C6AC3F1866B",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "EF5B1D84FF86890DE7641D25F20FA44786D7D4A28AC895113979A220BB8ECAB3",
"hash": "2DD9595A4B30CE001E665CA41FAE7FFA70DB5EE5B7E14AE364C0D7652417B4A7",
"identities": [],
"multitenancy": {
"attribute": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "499697AC6010E054BC094EFE77BFE29D0F3F778D28C88D73159AD3A4505BF517",
"hash": "0B7BEC1405443921328364D5EB9EDB39808DEC6C70397759108C963AD7E909AB",
"identities": [
{
"all_tenants?": false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"custom_indexes": [],
"custom_statements": [],
"has_create_action": true,
"hash": "4D7CECE9BCF9CDF6F17FE3D28FDB1F96A1A82AAA40B47519E2DACD24BA5091A2",
"hash": "F86810D4B686DD4D54A9993CE8F8F902FBF85DC16FE643E2093CA29F91FDE6D6",
"identities": [
{
"all_tenants?": false,
Expand Down

0 comments on commit fcb484e

Please sign in to comment.