From 7e657059a9f541c567d47de268be17b9f27e23ea Mon Sep 17 00:00:00 2001 From: "Michael B. Klein" Date: Tue, 8 Oct 2024 16:37:22 -0500 Subject: [PATCH] Add support for Level 2 qualifications and years with significant digits --- README.md | 6 ++ lib/edtf.ex | 3 +- lib/edtf/date.ex | 148 ++++++++++++++++++++----------------- lib/edtf/humanize/date.ex | 2 +- lib/edtf/infinity.ex | 6 +- lib/edtf/interval.ex | 4 +- lib/edtf/level.ex | 43 +++++++++++ lib/edtf/season.ex | 3 +- lib/edtf/year.ex | 20 +++-- mix.exs | 2 +- test/edtf/date_test.exs | 150 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 304 insertions(+), 83 deletions(-) create mode 100644 lib/edtf/level.ex create mode 100644 test/edtf/date_test.exs diff --git a/README.md b/README.md index c25ed0b..f6df2f6 100644 --- a/README.md +++ b/README.md @@ -28,3 +28,9 @@ end ## Usage See `EDTF.parse/1`, `EDTF.validate/1`, and `EDTF.humanize/1`. + +## Notes + +- Some human-readable dates containing Level 2 qualifications and years with significant digits + may produce less specific results than desired. +- Level 2 years without the leading `Y` character (e.g., `2024S03`) are not supported at this time. 
diff --git a/lib/edtf.ex b/lib/edtf.ex index e731d42..9c673ff 100644 --- a/lib/edtf.ex +++ b/lib/edtf.ex @@ -3,7 +3,7 @@ defmodule EDTF do Parse, validate, and humanize EDTF date strings """ - alias EDTF.{Aggregate, Date, Interval} + alias EDTF.{Aggregate, Date, Interval, Level} @doc """ Parse an EDTF date string @@ -22,6 +22,7 @@ defmodule EDTF do nil -> error() mod -> mod.parse(edtf) end + |> Level.add_level() end @doc """ diff --git a/lib/edtf/date.ex b/lib/edtf/date.ex index ad66553..cf469de 100644 --- a/lib/edtf/date.ex +++ b/lib/edtf/date.ex @@ -5,7 +5,7 @@ defmodule EDTF.Date do alias EDTF.{Season, Year} - @matcher ~r/^Y?-?[\dX]+(?:E\d+)?(?:-[\dX]{2})?(?:-[\dX]{2})?[~%?]?$/ + @matcher ~r/^Y?[~%?]?-?[\dX]+(?:E\d+)?(?:S\d+)?(?:-[~%?]?[\dX]{2})?(?:-[~%?]?[\dX]{2})?[~%?]?$/ @subtypes [Year, Season] defstruct type: :date, @@ -16,8 +16,9 @@ defmodule EDTF.Date do @type edtf_type :: :date | :century | :decade | :year @type edtf_attribute :: {:unspecified, integer()} - | {:uncertain, boolean()} - | {:approximate, boolean()} + | {:uncertain, integer() | boolean()} + | {:approximate, integer() | boolean()} + | {:significant, integer()} | {:earlier, boolean()} | {:later, boolean()} @@ -43,50 +44,48 @@ defmodule EDTF.Date do {edtf, attributes} = get_attributes(edtf) case edtf do - <<"-", val::binary-size(2)>> -> {:century, [0 - String.to_integer(val)], 0} - <> -> {:century, [String.to_integer(val)], 0} - <<"-", val::binary-size(3)>> -> {:decade, [0 - String.to_integer(val)], 2} - <> -> {:decade, [String.to_integer(val)], 2} + <<"-", val::binary-size(2)>> -> {:century, [0 - String.to_integer(val)]} + <> -> {:century, [String.to_integer(val)]} + <<"-", val::binary-size(3)>> -> {:decade, [0 - String.to_integer(val)]} + <> -> {:decade, [String.to_integer(val)]} other -> other end |> case do - {type, values, level} -> - {:ok, %__MODULE__{type: type, values: values, level: level, attributes: attributes}} + {type, values} -> + {:ok, %__MODULE__{type: type, values: values, 
attributes: attributes}} other -> parse_iso8601(other, attributes) end - |> finalize(edtf) + |> case do + :error -> EDTF.error() + result -> result + end end - defp finalize(:error, _), do: EDTF.error() - defp finalize({:ok, result}, edtf), do: {:ok, %__MODULE__{result | level: level(edtf)}} - - defp parse_iso8601(<<"-", year::binary-size(4)>>, attributes), - do: parse_iso8601("-" <> year <> "-01-01", attributes, :year) + defp parse_iso8601(edtf, attributes) do + {edtf, masks} = + bitmask(edtf) - defp parse_iso8601(<>, attributes), - do: parse_iso8601(year <> "-01-01", attributes, :year) + [_, sign, edtf] = Regex.run(~r/^(-?)(.+)$/, edtf) - defp parse_iso8601(<<"-", year::binary-size(4), "-", month::binary-size(2)>>, attributes), - do: parse_iso8601("-" <> year <> "-" <> month <> "-01", attributes, :month) - - defp parse_iso8601(<>, attributes), - do: parse_iso8601(year <> "-" <> month <> "-01", attributes, :month) - - defp parse_iso8601(edtf, attributes, specificity \\ :day) do - {edtf, mask} = unspecified(edtf) + {edtf, specificity} = + case String.length(edtf) do + 4 -> {"#{edtf}-01-01", :year} + 7 -> {"#{edtf}-01", :month} + _ -> {edtf, :day} + end - case Elixir.Date.from_iso8601(edtf) do + case Elixir.Date.from_iso8601(sign <> edtf) do {:ok, %Date{year: year, month: month, day: day}} -> - [year, month - 1, day] |> process_result(specificity, mask, attributes) + [year, month - 1, day] |> process_result(specificity, masks, attributes) {:error, _} -> :error end end - defp process_result(values, specificity, mask, attributes) do + defp process_result(values, specificity, masks, attributes) do values = case specificity do :day -> values @@ -94,7 +93,7 @@ defmodule EDTF.Date do :year -> Enum.take(values, 1) end - attributes = if mask > 0, do: [{:unspecified, mask} | attributes], else: attributes + attributes = Keyword.merge(attributes, masks) {:ok, %__MODULE__{ @@ -103,57 +102,70 @@ defmodule EDTF.Date do }} end - defp unspecified(<<"-", edtf::binary>>) do - {edtf, 
mask} = unspecified(edtf) - {"-#{edtf}", mask} - end - - defp unspecified(edtf) do - new_x = fn - {"X", 5} -> {"1", 2 ** 5} - {"X", 7} -> {"1", 2 ** 7} - {"X", p} -> {"0", 2 ** p} - {c, _} -> {c, 0} - end - - {str, mask} = + defp bitmask(edtf) do + {str, _, attrs} = edtf |> String.graphemes() - |> Enum.reject(&(&1 == "-")) - |> Enum.with_index() - |> Enum.map(new_x) - |> Enum.reduce({"", 0}, fn {char, bits}, {str, mask} -> - {str <> char, mask + bits} - end) + |> Enum.reduce( + {"", 1, [unspecified: 0, approximate: 0, uncertain: 0]}, + fn char, {str, bits, attrs} -> + case char do + "X" -> + {str <> "0", bits * 2, add_bits(attrs, :unspecified, bits)} + + "~" -> + {str, bits, add_bits(attrs, :approximate, bits)} + + "?" -> + {str, bits, add_bits(attrs, :uncertain, bits)} + + "%" -> + {str, bits, add_bits(attrs, :approximate, bits) |> add_bits(:uncertain, bits)} + + "-" -> + {str <> "-", bits, attrs} + + d -> + {str <> d, bits * 2, attrs} + end + end + ) {str - |> reassemble() - |> nonzero_month_and_day(), mask} + |> nonzero_month_and_day(), Keyword.reject(attrs, fn {_, v} -> v == 0 end)} end - defp level(edtf) do - cond do - Regex.match?(~r/^\d{2}X{2}$/, edtf) -> 1 - Regex.match?(~r/^\d{3}X$/, edtf) -> 1 - Regex.match?(~r/^\d{4}-XX$/, edtf) -> 1 - Regex.match?(~r/^\d{4}-\d{2}-XX$/, edtf) -> 1 - Regex.match?(~r/^\d{4}-XX-XX$/, edtf) -> 1 - Regex.match?(~r/X/, edtf) -> 2 - true -> 0 - end - end + defp add_bits(attrs, attr, bits) do + bits = + cond do + # unspecified can exist in any place + attr == :unspecified -> bits + # approximate or uncertain year (XXXX-mm-dd) + bits < 15 -> 15 + # approximate or uncertain month (yyyy-XX-dd) + bits < 48 -> 48 + # approximate or uncertain day (yyyy-mm-XX) + bits < 192 -> 192 + end - defp reassemble(<>), - do: [year, month, day] |> Enum.join("-") + Keyword.update!(attrs, attr, fn v -> v + bits end) + end defp nonzero_month_and_day(str), do: String.replace(str, "-00", "-01") defp get_attributes(edtf) do case 
Regex.named_captures(~r/^(?.+?)(?[~%?])?$/, edtf) do - %{"edtf" => result, "attr" => ""} -> {result, []} - %{"edtf" => result, "attr" => "~"} -> {result, [{:approximate, true}]} - %{"edtf" => result, "attr" => "%"} -> {result, [{:approximate, true}, {:uncertain, true}]} - %{"edtf" => result, "attr" => "?"} -> {result, [{:uncertain, true}]} + %{"edtf" => result, "attr" => ""} -> + {result, []} + + %{"edtf" => result, "attr" => "~"} -> + {result, [{:approximate, true}]} + + %{"edtf" => result, "attr" => "%"} -> + {result, [{:approximate, true}, {:uncertain, true}]} + + %{"edtf" => result, "attr" => "?"} -> + {result, [{:uncertain, true}]} end end end diff --git a/lib/edtf/humanize/date.ex b/lib/edtf/humanize/date.ex index 2579259..6ee869a 100644 --- a/lib/edtf/humanize/date.ex +++ b/lib/edtf/humanize/date.ex @@ -49,7 +49,7 @@ defmodule EDTF.Humanize.Date do defp humanize(:date, _, %{unspecified: _}), do: :original - defp humanize(:date, values, %{uncertain: true} = attributes), + defp humanize(:date, values, %{uncertain: _v} = attributes), do: humanize(:date, values, Map.delete(attributes, :uncertain)) <> "?" 
defp humanize(:date, values, _) do diff --git a/lib/edtf/infinity.ex b/lib/edtf/infinity.ex index 9f8642a..db70c1c 100644 --- a/lib/edtf/infinity.ex +++ b/lib/edtf/infinity.ex @@ -3,11 +3,11 @@ defmodule EDTF.Infinity do EDTF Infinity struct """ - defstruct [] - @type t :: %__MODULE__{} + defstruct level: 1 + @type t :: %__MODULE__{level: integer()} def match?(".."), do: true def match?(_), do: false - def parse(".."), do: {:ok, %__MODULE__{}} + def parse(".."), do: {:ok, %__MODULE__{level: 1}} def parse(_), do: EDTF.error() end diff --git a/lib/edtf/interval.ex b/lib/edtf/interval.ex index d72a8c6..585bc18 100644 --- a/lib/edtf/interval.ex +++ b/lib/edtf/interval.ex @@ -8,7 +8,7 @@ defmodule EDTF.Interval do defstruct start: nil, end: nil, - level: 1 + level: 2 @type t :: %__MODULE__{ start: EDTF.Date.t() | nil, @@ -42,6 +42,6 @@ defmodule EDTF.Interval do end end - defp module([start | [stop]]), do: %__MODULE__{start: start, end: stop, level: 1} + defp module([start | [stop]]), do: %__MODULE__{start: start, end: stop, level: 2} defp module([v]), do: module([v, nil]) end diff --git a/lib/edtf/level.ex b/lib/edtf/level.ex new file mode 100644 index 0000000..03492e0 --- /dev/null +++ b/lib/edtf/level.ex @@ -0,0 +1,43 @@ +defmodule EDTF.Level do + def add_level({:error, _} = error), do: error + + def add_level(%EDTF.Aggregate{} = aggregate), + do: Map.update!(aggregate, :values, &add_level/1) + + def add_level({:ok, value}), do: {:ok, add_level(value)} + + def add_level([]), do: [] + def add_level([value | values]), do: [add_level(value) | add_level(values)] + def add_level(%{level: level} = result) when level > 0, do: result + def add_level(result), do: Map.put(result, :level, determine_level(result)) + + defp determine_level(%EDTF.Date{type: :century}), do: 1 + defp determine_level(%EDTF.Date{type: :decade}), do: 1 + + defp determine_level(%EDTF.Date{type: :season, values: [_, s]}) do + if s > 24, do: 2, else: 1 + end + + defp determine_level(%EDTF.Date{attributes: 
attrs, level: level, values: values}) do + if Enum.empty?(attrs), + do: level, + else: attrs |> Enum.into(%{}) |> calculate_level(values) + end + + defp calculate_level(%{unspecified: bits}, values) when length(values) == 1 do + if Enum.member?([15, 14, 12, 8], bits), do: 1, else: 2 + end + + defp calculate_level(%{unspecified: bits}, values) when length(values) == 2 do + if Enum.member?([63, 62, 60, 56, 48, 32], bits), do: 1, else: 2 + end + + defp calculate_level(%{unspecified: bits}, values) when length(values) == 3 do + if Enum.member?([255, 254, 252, 248, 240, 224, 192, 128], bits), do: 1, else: 2 + end + + defp calculate_level(%{approximate: v}, _) when is_boolean(v), do: 1 + defp calculate_level(%{approximate: _v}, _), do: 2 + defp calculate_level(%{uncertain: v}, _) when is_boolean(v), do: 1 + defp calculate_level(%{uncertain: _v}, _), do: 2 +end diff --git a/lib/edtf/season.ex b/lib/edtf/season.ex index ab4c62d..fc7fdb6 100644 --- a/lib/edtf/season.ex +++ b/lib/edtf/season.ex @@ -22,8 +22,7 @@ defmodule EDTF.Season do {:ok, %EDTF.Date{ type: :season, - values: [String.to_integer(year), String.to_integer(season)], - level: 2 + values: [String.to_integer(year), String.to_integer(season)] }} end end diff --git a/lib/edtf/year.ex b/lib/edtf/year.ex index 42ec96f..9bae1f4 100644 --- a/lib/edtf/year.ex +++ b/lib/edtf/year.ex @@ -3,7 +3,7 @@ defmodule EDTF.Year do Parser for EDTF Level 1 Years """ - @matcher ~r/^Y(?-?\d+)(?:E(?\d+))?$/ + @matcher ~r/^Y(?-?\d+)(?:E(?\d+))?(?:S(?\d+))?$/ def match?(edtf), do: Regex.match?(@matcher, edtf) @@ -16,17 +16,27 @@ defmodule EDTF.Year do end end - defp calculate(%{"year" => year, "exponent" => ""}), - do: {:ok, %EDTF.Date{type: :year, values: [String.to_integer(year)], level: 1}} + defp calculate(%{"year" => year, "exponent" => "", "significant" => significant}), + do: + {:ok, + %EDTF.Date{type: :year, values: [String.to_integer(year)], level: 1} + |> add_significance(significant)} - defp calculate(%{"year" => year, 
"exponent" => exponent}) do + defp calculate(%{"year" => year, "exponent" => exponent, "significant" => significant}) do {:ok, %EDTF.Date{ type: :year, values: [String.to_integer(year) * 10 ** String.to_integer(exponent)], level: 2 - }} + } + |> add_significance(significant)} end defp calculate(_), do: :error + + defp add_significance(result, ""), do: result + + defp add_significance(result, v) do + %EDTF.Date{result | level: 2, attributes: [{:significant, String.to_integer(v)}]} + end end diff --git a/mix.exs b/mix.exs index aef745e..d4a2d6c 100644 --- a/mix.exs +++ b/mix.exs @@ -1,7 +1,7 @@ defmodule EDTF.MixProject do use Mix.Project - @version "1.0.0" + @version "1.1.0" @url "https://github.com/nulib/authoritex" def project do diff --git a/test/edtf/date_test.exs b/test/edtf/date_test.exs new file mode 100644 index 0000000..8788cde --- /dev/null +++ b/test/edtf/date_test.exs @@ -0,0 +1,150 @@ +defmodule EDTF.DateTest do + use ExUnit.Case + + describe "qualification" do + setup %{edtf: edtf} do + {:ok, subject} = EDTF.parse(edtf) + {:ok, %{subject: subject}} + end + + @tag edtf: "2024~" + test "approximate (whole)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024] + assert subject.level == 1 + assert subject.attributes[:approximate] + refute subject.attributes[:uncertain] + refute subject.attributes[:unspecified] + end + + @tag edtf: "2024?" 
+ test "uncertain (whole)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024] + assert subject.level == 1 + refute subject.attributes[:approximate] + assert subject.attributes[:uncertain] + refute subject.attributes[:unspecified] + end + + @tag edtf: "2024%" + test "approximate and uncertain (whole)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024] + assert subject.level == 1 + assert subject.attributes[:approximate] + assert subject.attributes[:uncertain] + refute subject.attributes[:unspecified] + end + + @tag edtf: "2024-~10" + test "approximate (month)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024, 9] + assert subject.level == 2 + assert subject.attributes[:approximate] == 48 + refute subject.attributes[:uncertain] + refute subject.attributes[:unspecified] + end + + @tag edtf: "2024-?10" + test "uncertain (month)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024, 9] + assert subject.level == 2 + refute subject.attributes[:approximate] + assert subject.attributes[:uncertain] == 48 + refute subject.attributes[:unspecified] + end + + @tag edtf: "2024-%10" + test "approximate and uncertain (month)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024, 9] + assert subject.level == 2 + assert subject.attributes[:approximate] == 48 + assert subject.attributes[:uncertain] == 48 + refute subject.attributes[:unspecified] + end + + @tag edtf: "2024-10-~08" + test "approximate (day)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024, 9, 8] + assert subject.level == 2 + assert subject.attributes[:approximate] == 192 + refute subject.attributes[:uncertain] + refute subject.attributes[:unspecified] + end + + @tag edtf: "2024-10-?08" + test "uncertain (day)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024, 9, 
8] + assert subject.level == 2 + refute subject.attributes[:approximate] + assert subject.attributes[:uncertain] == 192 + refute subject.attributes[:unspecified] + end + + @tag edtf: "2024-10-%08" + test "approximate and uncertain (day)", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2024, 9, 8] + assert subject.level == 2 + assert subject.attributes[:approximate] == 192 + assert subject.attributes[:uncertain] == 192 + refute subject.attributes[:unspecified] + end + end + + describe "unspecified" do + setup %{edtf: edtf} do + {:ok, subject} = EDTF.parse(edtf) + {:ok, %{subject: subject}} + end + + @tag edtf: "202X" + test "simple", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2020] + assert subject.level == 1 + refute subject.attributes[:approximate] + refute subject.attributes[:uncertain] + assert subject.attributes[:unspecified] == 8 + end + + @tag edtf: "X0X0-0X-1X" + test "complex", %{subject: subject} do + assert subject.type == :date + assert subject.values == [0, 0, 10] + assert subject.level == 2 + refute subject.attributes[:approximate] + refute subject.attributes[:uncertain] + assert subject.attributes[:unspecified] == 165 + end + end + + describe "significant digits" do + setup %{edtf: edtf} do + {:ok, subject} = EDTF.parse(edtf) + {:ok, %{subject: subject}} + end + + @tag edtf: "Y20200S02" + test "significant digits", %{subject: subject} do + assert subject.type == :date + assert subject.values == [20200] + assert subject.level == 2 + assert subject.attributes[:significant] == 2 + end + + @tag edtf: "Y20200E3S02" + test "significant digits with exponent", %{subject: subject} do + assert subject.type == :date + assert subject.values == [2_020_000] + assert subject.level == 2 + assert subject.attributes[:significant] == 2 + end + end +end