Skip to content

Commit

Permalink
Pull year values from different types of date strings
Browse files Browse the repository at this point in the history
  • Loading branch information
eliotjordan committed Nov 20, 2024
1 parent b7ba283 commit 69ff0dc
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,13 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntry do
# If somehow we get more than 1 value, just take the first
# It goes into a multi-valued index field, so keep it looking that way
defp extract_years(%{"metadata" => %{"date_created" => [date | _tail]}, "id" => id}) do
result = Integer.parse(date)

case result do
:error ->
case parse_date(date) do
nil ->
Logger.warning("couldn't parse date \"#{date}\" for record #{id}")
nil

{extracted_date, _rest} ->
[extracted_date]
x ->
[String.to_integer(x)]
end
end

Expand All @@ -142,7 +140,7 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntry do
end

defp format_date(%{"date_created" => [date | _tail]}) do
date
parse_date(date)
end

defp format_date(%{"date_created" => []}) do
Expand All @@ -153,4 +151,16 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntry do
# there's no date_created value
nil
end

# Pulls a year out of a free-form date string.
#
# Returns the year as a string of digits, or nil when no pattern matches.
# Patterns are tried in order and the first hit wins, so a bracketed value
# takes precedence over a bare four-digit year elsewhere in the string.
defp parse_date(date_string) do
  [
    # "29 Raḥab al-Marjab 1321- [July 1923]" or "[1923]":
    # the digits that sit directly before a closing bracket
    ~r/(\d+)(?=\])/,
    # "November 1952" or "1943"
    ~r/(\d{4})/
  ]
  |> Enum.find_value(fn regex ->
    # Capture only the digit group. Capturing the whole match would let a
    # leading "[" leak into the result (e.g. "[1923" for "[1923]"), and
    # callers pass this value straight to String.to_integer/1.
    case Regex.run(regex, date_string, capture: :all_but_first) do
      [year] -> year
      _ -> nil
    end
  end)
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -130,25 +130,82 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntryTest do
}
})

# Add one to exercise date_created with format `.*yyyy`
{:ok, entry6} =
IndexingPipeline.write_hydration_cache_entry(%{
cache_version: 0,
record_id: "f134f41f-63c5-4fdf-b801-0774e3bc3b2d",
source_cache_order: ~U[2018-03-09 20:19:36.465203Z],
data: %{
"id" => "f134f41f-63c5-4fdf-b801-0774e3bc3b2d",
"internal_resource" => "EphemeraFolder",
"metadata" => %{
"title" => ["test title 6"],
"date_created" => ["January 26, 1952"]
}
}
})

# Add one to exercise date_created with format `.*[.*yyyy]`
{:ok, entry7} =
IndexingPipeline.write_hydration_cache_entry(%{
cache_version: 0,
record_id: "f134f41f-63c5-4fdf-b801-0774e3bc3b2d",
source_cache_order: ~U[2018-03-09 20:19:36.465203Z],
data: %{
"id" => "f134f41f-63c5-4fdf-b801-0774e3bc3b2d",
"internal_resource" => "EphemeraFolder",
"metadata" => %{
"title" => ["test title 7"],
"date_created" => ["29 Raḥab al-Marjab 1342- رحب المرجب 1342 - [July 1923]"]
}
}
})

# Add another to exercise date_created with format `.*yyyy` (month and year, no day)
{:ok, entry8} =
IndexingPipeline.write_hydration_cache_entry(%{
cache_version: 0,
record_id: "f134f41f-63c5-4fdf-b801-0774e3bc3b2d",
source_cache_order: ~U[2018-03-09 20:19:36.465203Z],
data: %{
"id" => "f134f41f-63c5-4fdf-b801-0774e3bc3b2d",
"internal_resource" => "EphemeraFolder",
"metadata" => %{
"title" => ["test title 8"],
"date_created" => ["September 1931"]
}
}
})

doc4 = HydrationCacheEntry.to_solr_document(entry4)
doc5 = HydrationCacheEntry.to_solr_document(entry5)
doc6 = HydrationCacheEntry.to_solr_document(entry6)
doc7 = HydrationCacheEntry.to_solr_document(entry7)
doc8 = HydrationCacheEntry.to_solr_document(entry8)

assert doc1[:years_is] == [2022]
assert doc2[:years_is] == [1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005]
assert doc3[:years_is] == nil
assert doc4[:years_is] == [2011, 2012, 2013]
assert doc5[:years_is] == nil
assert doc6[:years_is] == [1952]
assert doc7[:years_is] == [1923]
assert doc8[:years_is] == [1931]

assert doc1[:display_date_s] == "2022"
assert doc2[:display_date_s] == "1995 - 2005"
assert doc3[:display_date_s] == nil
assert doc4[:display_date_s] == "2011 - 2013 (approximate)"
assert doc5[:display_date_s] == nil
assert doc6[:display_date_s] == "1952"
assert doc7[:display_date_s] == "1923"
assert doc8[:display_date_s] == "1931"
end

test "logs dates it can't parse" do
# date created has a bad date
{:ok, entry6} =
{:ok, entry} =
IndexingPipeline.write_hydration_cache_entry(%{
cache_version: 0,
record_id: "f134f41f-63c5-4fdf-b801-0774e3bc3b2d",
Expand All @@ -163,8 +220,7 @@ defmodule DpulCollections.IndexingPipeline.Figgy.HydrationCacheEntryTest do
}
})

# doc6 = HydrationCacheEntry.to_solr_document(entry6)
assert capture_log(fn -> HydrationCacheEntry.to_solr_document(entry6) end) =~
assert capture_log(fn -> HydrationCacheEntry.to_solr_document(entry) end) =~
"couldn't parse date"
end
end
Expand Down

0 comments on commit 69ff0dc

Please sign in to comment.