Skip to content

Commit

Permalink
Switch from Regex-based parsing to NimbleParsec grammar-based parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
mbklein committed Oct 10, 2024
1 parent 921c60e commit 77dfde1
Show file tree
Hide file tree
Showing 17 changed files with 405 additions and 361 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,3 @@ See `EDTF.parse/1`, `EDTF.validate/1`, and `EDTF.humanize/1`.

- Some human-readable dates containing Level 2 qualifications and years with significant digits
may produce less specific results than desired.
- Level 2 years without the leading `Y` character (e.g., `2024S03`) are not supported at this time.
41 changes: 12 additions & 29 deletions lib/edtf.ex
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,21 @@ defmodule EDTF do
{:error, :invalid_format}
```
"""
def parse(edtf, include \\ [Interval, Aggregate, Date]) do
case Enum.find(include, & &1.match?(edtf)) do
nil -> error()
mod -> mod.parse(edtf)
def parse(edtf) do
case EDTF.Parser.parse(edtf) do
{:ok, [result], _, _, _, _} -> {:ok, assemble(result) |> Level.add_level()}
{:error, _, _, _, _, _} -> {:error, :invalid_format}
end
|> Level.add_level()
end

# Dispatches a tagged parser result to the module that builds its struct.
# A tag outside the ones listed here matches no clause and raises
# FunctionClauseError.
defp assemble({tag, _} = result) when tag in [:date, :year, :decade, :century],
  do: Date.assemble(result)

defp assemble({:interval, _} = result), do: Interval.assemble(result)

defp assemble({tag, _} = result) when tag in [:set, :list],
  do: Aggregate.assemble(result)

@doc """
Validate an EDTF date string
Expand Down Expand Up @@ -62,28 +69,4 @@ defmodule EDTF do
other -> other
end
end

@doc """
Generate an error response
"""
def error(error \\ :invalid_format), do: {:error, error}

@doc """
Identify the open-ended continuation markers on an EDTF date string
"""
def open_ended(edtf) do
case Regex.named_captures(~r/^(?<earlier>\.\.)?(?<edtf>.+?)(?<later>\.\.)?$/, edtf) do
%{"earlier" => "..", "edtf" => result, "later" => ".."} ->
{result, [{:earlier, true}, {:later, true}]}

%{"earlier" => "..", "edtf" => result} ->
{result, [{:earlier, true}, {:later, false}]}

%{"edtf" => result, "later" => ".."} ->
{result, [{:earlier, false}, {:later, true}]}

%{"edtf" => result} ->
{result, [{:earlier, false}, {:later, false}]}
end
end
end
53 changes: 16 additions & 37 deletions lib/edtf/aggregate.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,6 @@ defmodule EDTF.Aggregate do
Parser for EDTF Lists and Sets
"""

@matchers list: ~r/^\{(.+)\}$/, set: ~r/^\[(.+)\]$/

@valid [EDTF.Date, EDTF.Range]

defstruct type: nil, values: [], level: 2, earlier: false, later: false

@type t :: %__MODULE__{
Expand All @@ -17,38 +13,21 @@ defmodule EDTF.Aggregate do
later: boolean()
}

def match?(edtf), do: Enum.any?(@matchers, fn {_, re} -> Regex.match?(re, edtf) end)

def parse(edtf) do
case Enum.find(@matchers, fn {_, re} -> Regex.match?(re, edtf) end) do
nil ->
EDTF.error()

{type, re} ->
[_, dates] = Regex.run(re, edtf)
{dates, attributes} = EDTF.open_ended(dates)

Regex.split(~r/\s*,\s*/, dates)
|> Enum.reduce_while([], &reducer/2)
|> finalize(type, attributes)
end
end

defp reducer(date, acc) do
case EDTF.parse(date, @valid) do
{:ok, parsed} -> {:cont, [parsed | acc]}
{:error, _error} -> {:halt, :error}
end
end

defp finalize(:error, _, _), do: EDTF.error()

defp finalize(values, type, attributes),
do: %__MODULE__{
type: type,
values: Enum.reverse(values),
earlier: attributes[:earlier],
later: attributes[:later],
level: 2
@doc """
Builds an aggregate struct from parser output.

Accepts either a tagged `{:list, value}` / `{:set, value}` tuple, in which
case `type` is set from the tag, or the bare keyword list `value` itself,
in which case `type` keeps its struct default.

The keyword list is read for `:dates` (default `[]`), `:earlier` and
`:later` (both default `false`).
"""
def assemble({type, value}) when type in [:list, :set],
  do: %__MODULE__{assemble(value) | type: type}

def assemble(value) do
  %__MODULE__{
    values: value |> Keyword.get(:dates, []) |> Enum.map(&assemble_member/1),
    earlier: Keyword.get(value, :earlier, false),
    later: Keyword.get(value, :later, false)
  }
end

# A member that is a single-element `[interval: _]` list is handed to the
# interval assembler; anything else is treated as the payload of a date node.
defp assemble_member([{:interval, _}] = member), do: EDTF.Interval.assemble(member)
defp assemble_member(member), do: EDTF.Date.assemble({:date, member})
end
166 changes: 33 additions & 133 deletions lib/edtf/date.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,6 @@ defmodule EDTF.Date do
Parser for basic EDTF dates, including year, and decade
"""

alias EDTF.{Season, Year}

@matcher ~r/^Y?[~%?]?-?[\dX]+(?:E\d+)?(?:S\d+)?(?:-[~%?]?[\dX]{2})?(?:-[~%?]?[\dX]{2})?[~%?]?$/
@subtypes [Year, Season]

defstruct type: :date,
values: [],
level: 0,
Expand All @@ -31,146 +26,51 @@ defmodule EDTF.Date do
}
| nil

def match?(edtf), do: Regex.match?(@matcher, edtf)

def parse(edtf) do
case Enum.find(@subtypes, & &1.match?(edtf)) do
nil -> parse_date(edtf)
mod -> mod.parse(edtf)
end
end

defp parse_date(edtf) do
{edtf, attributes} = get_attributes(edtf)
# A node whose payload is nil assembles to nil, regardless of its tag.
def assemble({_, nil}), do: nil

parse_date(edtf, attributes)
|> case do
:error -> EDTF.error()
result -> result
end
end
# Century node: the `:value` entry of the keyword payload becomes the sole
# element of `values`; every other struct field keeps its default.
def assemble({:century, value}),
do: %__MODULE__{type: :century, values: [Keyword.get(value, :value)]}

defp parse_date(<<"-", val::binary-size(2)>>, attributes) do
{:ok,
%__MODULE__{type: :century, values: [0 - String.to_integer(val)], attributes: attributes}}
end
# Decade node: the `:value` entry of the keyword payload becomes the sole
# element of `values`; every other struct field keeps its default.
def assemble({:decade, value}),
do: %__MODULE__{type: :decade, values: [Keyword.get(value, :value)]}

defp parse_date(<<val::binary-size(2)>>, attributes) do
{:ok, %__MODULE__{type: :century, values: [String.to_integer(val)], attributes: attributes}}
end
def assemble({:year, value}) do
attributes = Keyword.get(value, :attributes, [])
multiplier = 10 ** Keyword.get(attributes, :exponent, 0)
significant = Keyword.get(attributes, :significant)
level = if significant, do: 2, else: 1

defp parse_date(<<"-", val::binary-size(3)>>, attributes) do
{:ok,
%__MODULE__{type: :decade, values: [0 - String.to_integer(val)], attributes: attributes}}
end
value = Keyword.get(value, :value) * multiplier

defp parse_date(<<val::binary-size(3)>>, attributes) do
{:ok, %__MODULE__{type: :decade, values: [String.to_integer(val)], attributes: attributes}}
%__MODULE__{
type: :year,
values: [value],
attributes: [significant: significant],
level: level
}
end

defp parse_date(edtf, attributes) do
{edtf, masks} =
bitmask(edtf)

[_, sign, edtf] = Regex.run(~r/^(-?)(.+)$/, edtf)
# A date payload of exactly `[:infinity]` yields an `EDTF.Infinity` struct
# instead of a date struct.
def assemble({:date, [:infinity]}), do: %EDTF.Infinity{}

{edtf, specificity} =
case String.length(edtf) do
4 -> {"#{edtf}-01-01", :year}
7 -> {"#{edtf}-01", :month}
_ -> {edtf, :day}
end
def assemble({:date, value}) do
values = Keyword.get(value, :values)

case Elixir.Date.from_iso8601(sign <> edtf) do
{:ok, %Date{year: year, month: month, day: day}} ->
[year, month - 1, day] |> process_result(specificity, masks, attributes)
{type, values} =
case values do
[year, month, day] ->
{:date, [year, month - 1, day]}

{:error, _} ->
:error
end
end
[year, month] ->
if month > 12, do: {:season, [year, month]}, else: {:date, [year, month - 1]}

defp process_result(values, specificity, masks, attributes) do
values =
case specificity do
:day -> values
:month -> Enum.take(values, 2)
:year -> Enum.take(values, 1)
[year] ->
{:date, [year]}
end

attributes = Keyword.merge(attributes, masks)

{:ok,
%__MODULE__{
values: values,
attributes: attributes
}}
end

defp bitmask(edtf) do
{str, _, attrs} =
edtf
|> String.graphemes()
|> Enum.reduce(
{"", 1, [unspecified: 0, approximate: 0, uncertain: 0]},
fn char, {str, bits, attrs} ->
case char do
"X" ->
{str <> "0", bits * 2, add_bits(attrs, :unspecified, bits)}

"~" ->
{str, bits, add_bits(attrs, :approximate, bits)}

"?" ->
{str, bits, add_bits(attrs, :uncertain, bits)}

"%" ->
{str, bits, add_bits(attrs, :approximate, bits) |> add_bits(:uncertain, bits)}

"-" ->
{str <> "-", bits, attrs}

d ->
{str <> d, bits * 2, attrs}
end
end
)

{str
|> nonzero_month_and_day(), Keyword.reject(attrs, fn {_, v} -> v == 0 end)}
end

defp add_bits(attrs, attr, bits) do
bits =
cond do
# unspecified can exist in any place
attr == :unspecified -> bits
# approximate or uncertain year (XXXX-mm-dd)
bits < 15 -> 15
# approximate or uncertain month (yyyy-XX-dd)
bits < 48 -> 48
# approximate or uncertain day (yyyy-mm-XX)
bits < 192 -> 192
end

Keyword.update!(attrs, attr, fn v -> v + bits end)
end

defp nonzero_month_and_day(str), do: String.replace(str, "-00", "-01")

defp get_attributes(edtf) do
case Regex.named_captures(~r/^(?<edtf>.+?)(?<attr>[~%?])?$/, edtf) do
%{"edtf" => result, "attr" => ""} ->
{result, []}

%{"edtf" => result, "attr" => "~"} ->
{result, [{:approximate, true}]}

%{"edtf" => result, "attr" => "%"} ->
{result, [{:approximate, true}, {:uncertain, true}]}

%{"edtf" => result, "attr" => "?"} ->
{result, [{:uncertain, true}]}
end
%__MODULE__{
type: type,
values: values,
attributes: Keyword.get(value, :attributes)
}
end
end
7 changes: 2 additions & 5 deletions lib/edtf/humanize.ex
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,10 @@ defmodule EDTF.Humanize do

def humanize(nil), do: "Unknown"

def humanize([start_date | [end_date]]),
do: humanize(%EDTF.Interval{start: start_date, end: end_date})

def humanize(%EDTF.Interval{start: start_date, end: end_date}) do
case [start_date, end_date] do
[value | [%EDTF.Infinity{}]] -> "from #{humanize(value)}"
[%EDTF.Infinity{} | [value]] -> "before #{humanize(value)}"
[value, %EDTF.Infinity{}] -> "from #{humanize(value)}"
[%EDTF.Infinity{}, value] -> "before #{humanize(value)}"
values -> values |> Enum.map_join(" to ", &humanize/1)
end
end
Expand Down
5 changes: 0 additions & 5 deletions lib/edtf/infinity.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,4 @@ defmodule EDTF.Infinity do

defstruct level: 1
@type t :: %__MODULE__{level: integer()}

def match?(".."), do: true
def match?(_), do: false
def parse(".."), do: {:ok, %__MODULE__{level: 1}}
def parse(_), do: EDTF.error()
end
42 changes: 9 additions & 33 deletions lib/edtf/interval.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,21 @@ defmodule EDTF.Interval do
Parser for EDTF Intervals
"""

@matcher ~r"^([^/]+)?/([^/]+)?$"
@valid [EDTF.Date, EDTF.Infinity]

defstruct start: nil,
end: nil,
defstruct start: :unknown,
end: :unknown,
level: 2

@type t :: %__MODULE__{
start: EDTF.Date.t() | nil,
end: EDTF.Date.t() | nil,
start: EDTF.Date.t() | :unknown,
end: EDTF.Date.t() | :unknown,
level: integer()
}

def match?(edtf), do: Regex.match?(@matcher, edtf)

def parse(edtf) do
case Regex.run(@matcher, edtf) do
[_ | values] ->
values
|> Enum.reduce_while([], &reducer/2)
|> case do
:error -> EDTF.error()
values -> {:ok, Enum.reverse(values) |> module()}
end

_ ->
EDTF.error()
end
end

defp reducer("", acc), do: {:cont, [nil | acc]}
# Accepts an interval node wrapped in a single-element list and unwraps it
# before delegating to the tuple clause.
def assemble([{:interval, value}]), do: assemble({:interval, value})

defp reducer(date, acc) do
case EDTF.parse(date, @valid) do
{:ok, parsed} -> {:cont, [parsed | acc]}
{:error, _error} -> {:halt, :error}
end
# Builds an interval struct from an `{:interval, keyword}` node. The `:start`
# and `:end` entries are each wrapped as a `:date` node and assembled by
# `EDTF.Date.assemble/1`, start first.
# NOTE(review): an absent `:start`/`:end` key reaches EDTF.Date.assemble/1 as
# `{:date, nil}`; confirm the result agrees with this struct's `:unknown`
# defaults.
def assemble({:interval, value}) do
  [start_date, end_date] =
    Enum.map([:start, :end], fn key ->
      EDTF.Date.assemble({:date, Keyword.get(value, key)})
    end)

  %__MODULE__{start: start_date, end: end_date}
end

defp module([start | [stop]]), do: %__MODULE__{start: start, end: stop, level: 2}
defp module([v]), do: module([v, nil])
end
Loading

0 comments on commit 77dfde1

Please sign in to comment.