From 2fb2e09b81f258a952ca68f6580782a4489f989d Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Fri, 22 Nov 2024 16:42:35 -0600 Subject: [PATCH 1/4] Index documents one at a time after batch Solr update returns an error --- lib/dpul_collections/solr.ex | 30 +++++++++++++++--- .../figgy/indexing_integration_test.exs | 10 +++--- test/dpul_collections/solr_test.exs | 31 +++++++++++++++++++ 3 files changed, 62 insertions(+), 9 deletions(-) diff --git a/lib/dpul_collections/solr.ex b/lib/dpul_collections/solr.ex index b1f1bc69..6ad1695c 100644 --- a/lib/dpul_collections/solr.ex +++ b/lib/dpul_collections/solr.ex @@ -1,4 +1,6 @@ defmodule DpulCollections.Solr do + require Logger + @spec document_count(String.t()) :: integer() def document_count(collection \\ read_collection()) do {:ok, response} = @@ -96,11 +98,29 @@ defmodule DpulCollections.Solr do end @spec add(list(map()), String.t()) :: {:ok, Req.Response.t()} | {:error, Exception.t()} - def add(docs, collection \\ read_collection()) do - Req.post( - update_url(collection), - json: docs - ) + def add(docs, collection \\ read_collection()) when length(docs) > 1 do + response = + Req.post!( + update_url(collection), + json: docs + ) + + if response.status != 200 do + Enum.each(docs, fn doc -> add([doc]) end) + end + end + + def add(docs, collection) when length(docs) when length(docs) == 1 do + response = + Req.post!( + update_url(collection), + json: docs + ) + + if response.status != 200 do + doc = docs |> Enum.at(0) + Logger.warning("error indexing solr document with id: #{doc["id"]} #{response.body}") + end end @spec commit(String.t()) :: {:ok, Req.Response.t()} | {:error, Exception.t()} diff --git a/test/dpul_collections/indexing_pipeline/integration/figgy/indexing_integration_test.exs b/test/dpul_collections/indexing_pipeline/integration/figgy/indexing_integration_test.exs index 1875cd3b..ac47edc5 100644 --- a/test/dpul_collections/indexing_pipeline/integration/figgy/indexing_integration_test.exs +++ b/test/dpul_collections/indexing_pipeline/integration/figgy/indexing_integration_test.exs @@ -71,10 +71,12 @@ defmodule DpulCollections.IndexingPipeline.Figgy.IndexingIntegrationTest do {marker1, _marker2, _marker3} = FiggyTestFixtures.transformation_cache_markers() Solr.add( - %{ - "id" => marker1.id, - "title" => ["old title"] - }, + [ + %{ + "id" => marker1.id, + "title_txtm" => ["old title"] + } + ], active_collection() ) diff --git a/test/dpul_collections/solr_test.exs b/test/dpul_collections/solr_test.exs index 71d33d0e..9b9c91f8 100644 --- a/test/dpul_collections/solr_test.exs +++ b/test/dpul_collections/solr_test.exs @@ -2,6 +2,7 @@ defmodule DpulCollections.SolrTest do use DpulCollections.DataCase alias DpulCollections.Solr import SolrTestSupport + import ExUnit.CaptureLog setup do Solr.delete_all(active_collection()) @@ -201,4 +202,34 @@ defmodule DpulCollections.SolrTest do assert Solr.find_by_id("3cb7627b-defc-401b-9959-42ebc4488f74")["slug_s"] == "él-no-responde-mis-mensajes" end + + test "an exception is logged when indexing a document raises a solr error" do + doc = %{ + # No title + "id" => "3cb7627b-defc-401b-9959-42ebc4488f74" + } + + # Solr.commit(active_collection()) + + assert capture_log(fn -> Solr.add([doc], active_collection()) end) =~ + "error indexing solr document" + end + + test "a valid solr document is indexed when in the same batch as an invalid solr document" do + valid_doc = %{ + "id" => "e0602353-4429-4405-b080-064952f9b267", + "title_txtm" => ["test title 1"] + } + + invalid_doc = %{ + # No title + "id" => "3cb7627b-defc-401b-9959-42ebc4488f74" + } + + assert capture_log(fn -> Solr.add([valid_doc, invalid_doc], active_collection()) end) =~ + "error indexing solr document" + + Solr.commit(active_collection()) + assert Solr.find_by_id(valid_doc["id"])["id"] == valid_doc["id"] + end end From 72411097f03f9813ba0ed199b38416480bc16004 Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Tue, 26 Nov 2024 15:24:00 -0600 Subject: [PATCH 2/4] merge --- lib/dpul_collections/solr.ex | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/dpul_collections/solr.ex b/lib/dpul_collections/solr.ex index 6ad1695c..026ad66c 100644 --- a/lib/dpul_collections/solr.ex +++ b/lib/dpul_collections/solr.ex @@ -97,8 +97,11 @@ defmodule DpulCollections.Solr do end end - @spec add(list(map()), String.t()) :: {:ok, Req.Response.t()} | {:error, Exception.t()} - def add(docs, collection \\ read_collection()) when length(docs) > 1 do + @spec add(list(map()) | String.t(), String.t()) :: + {:ok, Req.Response.t()} | {:error, Exception.t()} + def add(docs, collection \\ read_collection()) + + def add(docs, collection) when is_list(docs) do response = Req.post!( update_url(collection), @@ -106,19 +109,18 @@ defmodule DpulCollections.Solr do ) if response.status != 200 do - Enum.each(docs, fn doc -> add([doc]) end) + docs |> Enum.each(&add/1) end end - def add(docs, collection) when length(docs) when length(docs) == 1 do + def add(doc, collection) when not is_list(doc) do response = Req.post!( update_url(collection), - json: docs + json: [doc] ) if response.status != 200 do - doc = docs |> Enum.at(0) Logger.warning("error indexing solr document with id: #{doc["id"]} #{response.body}") end end From f590fae0c6b188d7042e9e4304322a801f876c84 Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Mon, 25 Nov 2024 12:34:05 -0600 Subject: [PATCH 3/4] Assign default when title is missing --- .../figgy/hydration_cache_entry.ex | 10 ++++++++-- lib/dpul_collections/solr.ex | 4 ++++ .../figgy/hydration_cache_entry_test.exs | 19 +++++++++++++++++++ test/dpul_collections/solr_test.exs | 2 -- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/lib/dpul_collections/indexing_pipeline/figgy/hydration_cache_entry.ex b/lib/dpul_collections/indexing_pipeline/figgy/hydration_cache_entry.ex index 8628c88e..6b211663 100644 --- a/lib/dpul_collections/indexing_pipeline/figgy/hydration_cache_entry.ex +++ b/lib/dpul_collections/indexing_pipeline/figgy/hydration_cache_entry.ex @@ -27,7 +27,7 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntry do %{ id: id, - title_txtm: get_in(metadata, ["title"]), + title_txtm: extract_title(metadata), description_txtm: get_in(metadata, ["description"]), years_is: extract_years(data), display_date_s: format_date(metadata), @@ -49,6 +49,8 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntry do extract_service_url(member_data[id]) end + defp extract_service_url(_id, _), do: nil + # Find the derivative FileMetadata defp extract_service_url(%{ "internal_resource" => "FileSet", @@ -82,7 +84,11 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntry do defp extract_service_url(nil), do: nil - defp extract_service_url(_id, _), do: nil + def extract_title(%{"title" => []}) do + ["[Missing Title]"] + end + + def extract_title(%{"title" => title}), do: title defp is_derivative(%{ "mime_type" => ["image/tiff"], diff --git a/lib/dpul_collections/solr.ex b/lib/dpul_collections/solr.ex index 026ad66c..3a49e0f4 100644 --- a/lib/dpul_collections/solr.ex +++ b/lib/dpul_collections/solr.ex @@ -111,6 +111,8 @@ defmodule DpulCollections.Solr do if response.status != 200 do docs |> Enum.each(&add/1) end + + response end def add(doc, collection) when not is_list(doc) do @@ -123,6 +125,8 @@ defmodule DpulCollections.Solr do if response.status != 200 do Logger.warning("error indexing solr document with id: #{doc["id"]} #{response.body}") end + + response end @spec commit(String.t()) :: {:ok, Req.Response.t()} | {:error, Exception.t()} diff --git a/test/dpul_collections/indexing_pipeline/figgy/hydration_cache_entry_test.exs b/test/dpul_collections/indexing_pipeline/figgy/hydration_cache_entry_test.exs index 0d5c0d32..9f5d333f 100644 --- a/test/dpul_collections/indexing_pipeline/figgy/hydration_cache_entry_test.exs +++ b/test/dpul_collections/indexing_pipeline/figgy/hydration_cache_entry_test.exs @@ -282,5 +282,24 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntryTest do assert capture_log(fn -> HydrationCacheEntry.to_solr_document(entry) end) =~ "couldn't parse date" end + + test "an empty solr document is returned with a empty title field" do + {:ok, entry} = + IndexingPipeline.write_hydration_cache_entry(%{ + cache_version: 0, + record_id: "f134f41f-63c5-4fdf-b801-0774e3bc3b2d", + source_cache_order: ~U[2018-03-09 20:19:36.465203Z], + data: %{ + "id" => "f134f41f-63c5-4fdf-b801-0774e3bc3b2d", + "internal_resource" => "EphemeraFolder", + "metadata" => %{ + "title" => [], + "date_created" => ["2022"] + } + } + }) + + assert %{title_txtm: ["[Missing Title]"]} = HydrationCacheEntry.to_solr_document(entry) + end end end diff --git a/test/dpul_collections/solr_test.exs b/test/dpul_collections/solr_test.exs index 9b9c91f8..912371ef 100644 --- a/test/dpul_collections/solr_test.exs +++ b/test/dpul_collections/solr_test.exs @@ -209,8 +209,6 @@ defmodule DpulCollections.SolrTest do "id" => "3cb7627b-defc-401b-9959-42ebc4488f74" } - # Solr.commit(active_collection()) - assert capture_log(fn -> Solr.add([doc], active_collection()) end) =~ "error indexing solr document" end From 8a0d5cffa2f5142754451e1207ce6732d2853461 Mon Sep 17 00:00:00 2001 From: Eliot Jordan Date: Tue, 26 Nov 2024 12:52:25 -0600 Subject: [PATCH 4/4] Resolve test warnings --- test/dpul_collections/indexing_pipeline/coherence_test.exs | 2 +- test/dpul_collections/indexing_pipeline_test.exs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/dpul_collections/indexing_pipeline/coherence_test.exs b/test/dpul_collections/indexing_pipeline/coherence_test.exs index ac4467bf..15827bc7 100644 --- a/test/dpul_collections/indexing_pipeline/coherence_test.exs +++ b/test/dpul_collections/indexing_pipeline/coherence_test.exs @@ -63,7 +63,7 @@ defmodule DpulCollections.IndexingPipeline.CoherenceTest do end test "index_parity?/0 is true when the new index and the old index have equal freshness" do - {marker1, marker2, _marker3} = FiggyTestFixtures.hydration_cache_markers(1) + {marker1, _marker2, _marker3} = FiggyTestFixtures.hydration_cache_markers(1) FiggyTestFixtures.hydration_cache_markers(2) IndexingPipeline.write_processor_marker(%{ diff --git a/test/dpul_collections/indexing_pipeline_test.exs b/test/dpul_collections/indexing_pipeline_test.exs index d7dd8e0b..10947c3a 100644 --- a/test/dpul_collections/indexing_pipeline_test.exs +++ b/test/dpul_collections/indexing_pipeline_test.exs @@ -164,7 +164,7 @@ defmodule DpulCollections.IndexingPipelineTest do test "delete_cache_version/1 deletes entries for that cache version from each table" do for cache_version <- [0, 1] do - {marker1, _marker2, _marker3} = FiggyTestFixtures.hydration_cache_markers(cache_version) + FiggyTestFixtures.hydration_cache_markers(cache_version) {marker1, _marker2, _marker3} = FiggyTestFixtures.transformation_cache_markers(cache_version)