Skip to content

Commit

Permalink
WIP towards LCSH detector
Browse files Browse the repository at this point in the history
The regex is detecting `--` characters surrounded by spaces, but more work
is needed. More tests are needed.

Update annotations
  • Loading branch information
matt-bernhardt committed Oct 2, 2024
1 parent 862a979 commit c533e30
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 6 deletions.
6 changes: 4 additions & 2 deletions app/graphql/types/standard_identifiers_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

module Types
class StandardIdentifiersType < Types::BaseObject
description 'A detector for standard identifiers in search terms. Currently supported: ISBN, ISSN, PMID, DOI'
description 'A detector for standard identifiers in search terms. Currently supported: DOI, ISBN, ISSN, LCSH, PMID'

field :details, DetailsType, description: 'Additional information about the detected identifier(s)'
field :kind, String, null: false, description: 'The type of identifier detected (one of ISBN, ISSN, PMID, DOI)'
field :kind, String, null: false, description: 'The type of identifier detected (one of DOI, ISBN, ISSN, LCSH, PMID)'
field :value, String, null: false, description: 'The identifier detected in the search term'

# details does external lookups and should only be run if the fields
Expand All @@ -18,6 +18,8 @@ def details
LookupIsbn.new.info(@object[:value])
when :issn
LookupIssn.new.info(@object[:value])
when :lcsh
@object[:value]
when :pmid
LookupPmid.new.info(@object[:value].split.last)
end
Expand Down
4 changes: 3 additions & 1 deletion app/models/detector/standard_identifiers.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# frozen_string_literal: true

class Detector
# Detector::StandardIdentifiers detects the identifiers DOI, ISBN, ISSN, PMID.
# Detector::StandardIdentifiers detects the identifiers DOI, ISBN, ISSN, LCSH, PMID.
# See /docs/reference/pattern_detection_and_enhancement.md for details.
class StandardIdentifiers
attr_reader :identifiers
Expand Down Expand Up @@ -55,10 +55,12 @@ def match(pattern, term)
end

# term_patterns are regex patterns to be applied to the basic search box input
# The LCSH regex is looking for the separator used in the Bento UI, so this is an area for later improvement.
def term_patterns
{
isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,
lcsh: /\s--\s/,
pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/,
doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b}
}
Expand Down
5 changes: 3 additions & 2 deletions app/models/metrics/algorithms.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# updated_at :datetime not null
# journal_exact :integer
# suggested_resource_exact :integer
# lcsh :integer
#
module Metrics
# Algorithms aggregates statistics for matches for all SearchEvents
Expand Down Expand Up @@ -48,7 +49,7 @@ def generate(month = nil)
count_matches(SearchEvent.includes(:term))
end
Metrics::Algorithms.create(month:, doi: matches[:doi], issn: matches[:issn], isbn: matches[:isbn],
pmid: matches[:pmid], journal_exact: matches[:journal_exact],
lcsh: matches[:lcsh], pmid: matches[:pmid], journal_exact: matches[:journal_exact],
suggested_resource_exact: matches[:suggested_resource_exact],
unmatched: matches[:unmatched])
end
Expand Down Expand Up @@ -89,7 +90,7 @@ def event_matches(event, matches)
# @param matches [Hash] a Hash that keeps track of how many of each algorithm we match
# @return [Array] an array of matched StandardIdentifiers
def match_standard_identifiers(event, matches)
known_ids = %i[unmatched pmid isbn issn doi]
known_ids = %i[unmatched doi isbn issn lcsh pmid]
ids = Detector::StandardIdentifiers.new(event.term.phrase)

known_ids.each do |id|
Expand Down
5 changes: 5 additions & 0 deletions db/migrate/20241001205152_add_lcsh_to_metrics_algorithm.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Adds an integer `lcsh` column to the `metrics_algorithms` table.
#
# The column sits alongside the other per-detector integer columns on that table
# (e.g. journal_exact, suggested_resource_exact).
# NOTE(review): presumably it stores the number of LCSH matches written by
# Metrics::Algorithms#generate — confirm against that model.
class AddLcshToMetricsAlgorithm < ActiveRecord::Migration[7.1]
# Defined as `change` (rather than separate up/down methods); `add_column` is a
# command Rails can reverse automatically, so rolling back drops the column.
def change
# No default or null constraint is given, so existing rows are left without a value.
add_column :metrics_algorithms, :lcsh, :integer
end
end
3 changes: 2 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions test/models/detector/standard_identifiers_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,34 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
end
end

# LCSH
test 'lcsh detected' do
  # Phrases that contain the " -- " separator (dashes with whitespace on both
  # sides) are expected to be reported as LCSH.
  [
    'Geology -- Massachusetts',
    'Space vehicles -- Materials -- Congresses'
  ].each do |phrase|
    detected = Detector::StandardIdentifiers.new(phrase).identifiers

    assert_includes(detected, :lcsh)
  end

  # Phrases with no dashes, a single dash, or dashes without surrounding
  # whitespace must not be reported as LCSH.
  [
    'Geology of Massachusetts',
    'Geology-Massachusetts',
    'Geology--Massachusetts'
  ].each do |phrase|
    detected = Detector::StandardIdentifiers.new(phrase).identifiers

    refute_includes(detected, :lcsh)
  end
end

# DOI tests
test 'doi detected in string' do
actual = Detector::StandardIdentifiers.new('"Quantum tomography: Measured measurement", Markus Aspelmeyer, nature physics "\
"January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').identifiers
Expand Down
1 change: 1 addition & 0 deletions test/models/metrics/algorithms_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# updated_at :datetime not null
# journal_exact :integer
# suggested_resource_exact :integer
# lcsh :integer
#
require 'test_helper'

Expand Down

0 comments on commit c533e30

Please sign in to comment.