Skip to content

Commit

Permalink
Refactor LCSH to be a separate detector
Browse files Browse the repository at this point in the history
Squash refactor

Squash refactor

Update metrics tests and fixtures
  • Loading branch information
matt-bernhardt committed Oct 2, 2024
1 parent c533e30 commit 6592996
Show file tree
Hide file tree
Showing 14 changed files with 152 additions and 45 deletions.
19 changes: 13 additions & 6 deletions app/graphql/types/detectors_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,28 @@ class DetectorsType < Types::BaseObject
description 'Provides all available search term detectors'

# One field per detector. Each field shares its name with an instance method defined in this class, which builds
# the value from @object.
field :journals, [Types::JournalsType], description: 'Information about journals detected in the search term'
field :lcsh, [String], description: 'Library of Congress Subject Heading information'
field :standard_identifiers, [Types::StandardIdentifiersType], description: 'Currently supported: ISBN, ISSN, PMID, DOI'
field :suggested_resources, [Types::SuggestedResourcesType], description: 'Suggested resources detected in the search term'

def standard_identifiers
Detector::StandardIdentifiers.new(@object).identifiers.map do |identifier|
{ kind: identifier.first, value: identifier.last }
end
end

# Builds the value of the `journals` field: one hash per journal returned by the full-term match on @object,
# carrying the journal's name (as :title) and its additional_info.
def journals
  matched = Detector::Journal.full_term_match(@object)
  matched.map { |entry| { title: entry.name, additional_info: entry.additional_info } }
end

# Builds the value of the `lcsh` field.
#
# Detector::Lcsh#identifiers is a Hash of pattern type => matched text; only the matched text is exposed, so the
# values are returned directly instead of mapping over each key/value pair and taking its last element.
#
# @return [Array<String>] the matched text for every LCSH pattern detected in @object (empty when none match)
def lcsh
  Detector::Lcsh.new(@object).identifiers.values
end

# Builds the value of the `standard_identifiers` field: each detected identifier becomes a hash holding its
# type under :kind and the matched text under :value.
def standard_identifiers
  detected = Detector::StandardIdentifiers.new(@object).identifiers
  detected.map { |kind, value| { kind: kind, value: value } }
end

def suggested_resources
Detector::SuggestedResource.full_term_match(@object).map do |suggested_resource|
{ title: suggested_resource.title, url: suggested_resource.url }
Expand Down
6 changes: 2 additions & 4 deletions app/graphql/types/standard_identifiers_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

module Types
class StandardIdentifiersType < Types::BaseObject
description 'A detector for standard identifiers in search terms. Currently supported: DOI, ISBN, ISSN, LCSH, PMID'
description 'A detector for standard identifiers in search terms. Currently supported: ISBN, ISSN, PMID, DOI'

field :details, DetailsType, description: 'Additional information about the detected identifier(s)'
field :kind, String, null: false, description: 'The type of identifier detected (one of DOI, ISBN, ISSN, LCSH, PMID)'
field :kind, String, null: false, description: 'The type of identifier detected (one of ISBN, ISSN, PMID, DOI)'
field :value, String, null: false, description: 'The identifier detected in the search term'

# details does external lookups and should only be run if the fields
Expand All @@ -18,8 +18,6 @@ def details
LookupIsbn.new.info(@object[:value])
when :issn
LookupIssn.new.info(@object[:value])
when :lcsh
@object[:value]
when :pmid
LookupPmid.new.info(@object[:value].split.last)
end
Expand Down
43 changes: 43 additions & 0 deletions app/models/detector/lcsh.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# frozen_string_literal: true

class Detector
  # Detector::Lcsh is a very rudimentary detector for the separator between levels of a Library of Congress Subject
  # Heading (LCSH). These subject headings follow this pattern: "Social security beneficiaries -- United States"
  class Lcsh
    # Regex patterns applied to the search term, keyed by the identifier type they detect. Built once and frozen
    # rather than re-allocated for every term that is checked.
    SUBJECT_PATTERNS = {
      separator: /(.*)\s--\s(.*)/
    }.freeze

    # @return [Hash{Symbol => String}] detected pattern type => matched text with surrounding whitespace stripped
    attr_reader :identifiers

    # Runs every subject pattern against the term and stores the hits in #identifiers.
    #
    # @param term [String, nil] the search phrase to inspect; nil simply yields no identifiers
    def initialize(term)
      @identifiers = {}
      term_pattern_checker(term)
    end

    # Looks for LCSH patterns in the term's phrase and, when any are found, makes sure a Detection exists that
    # links the term to the 'LCSH' detector.
    #
    # @param term [#phrase] the term record whose phrase is checked
    # @return [Hash{Symbol => String}] the identifiers detected in the phrase
    def self.record(term)
      detected = new(term.phrase)

      # The Detection does not depend on which pattern matched, so the key itself is not needed here.
      detected.identifiers.each_key do
        Detection.find_or_create_by(
          term:,
          detector: Detector.find_by(name: 'LCSH')
        )
      end
    end

    private

    # Stores the matched text for each pattern that hits; patterns that miss are left out of the hash.
    def term_pattern_checker(term)
      SUBJECT_PATTERNS.each_pair do |type, pattern|
        matched = match(pattern, term)
        # match always returns a stripped String, so an empty check is sufficient here.
        @identifiers[type] = matched unless matched.empty?
      end
    end

    # @return [String] the stripped text matched by pattern, or '' when there is no match
    def match(pattern, term)
      pattern.match(term).to_s.strip
    end
  end
end
4 changes: 1 addition & 3 deletions app/models/detector/standard_identifiers.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# frozen_string_literal: true

class Detector
# Detector::StandardIdentifiers detects the identifiers DOI, ISBN, ISSN, LCSH, PMID.
# Detector::StandardIdentifiers detects the identifiers DOI, ISBN, ISSN, PMID.
# See /docs/reference/pattern_detection_and_enhancement.md for details.
class StandardIdentifiers
attr_reader :identifiers
Expand Down Expand Up @@ -55,12 +55,10 @@ def match(pattern, term)
end

# term_patterns are regex patterns to be applied to the basic search box input
# The LCSH regex is looking for the separator used in the Bento UI, so this is an area for later improvement.
def term_patterns
{
isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,
lcsh: /\s--\s/,
pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/,
doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b}
}
Expand Down
17 changes: 14 additions & 3 deletions app/models/metrics/algorithms.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
# doi :integer
# issn :integer
# isbn :integer
# lcsh :integer
# pmid :integer
# unmatched :integer
# created_at :datetime not null
# updated_at :datetime not null
# journal_exact :integer
# suggested_resource_exact :integer
# lcsh :integer
#
module Metrics
# Algorithms aggregates statistics for matches for all SearchEvents
Expand Down Expand Up @@ -80,8 +80,19 @@ def event_matches(event, matches)
ids = match_standard_identifiers(event, matches)
journal_exact = process_journals(event, matches)
suggested_resource_exact = process_suggested_resources(event, matches)
lcshs = match_lcsh(event, matches)

matches[:unmatched] += 1 if ids.identifiers.blank? && lcshs.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero?
end

matches[:unmatched] += 1 if ids.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero?
# Checks for LCSH matches
#
# @param event [#term] the search event whose term phrase is checked
# @param matches [Hash] a Hash that keeps track of how many of each algorithm we match
# @return [Detector::Lcsh] the detector built for the event, so callers can inspect its identifiers
def match_lcsh(event, matches)
  detector = Detector::Lcsh.new(event.term.phrase)

  %i[separator].each do |kind|
    next unless detector.identifiers[kind].present?

    matches[:lcsh] += 1
  end

  detector
end

# Checks for StandardIdentifier matches
Expand All @@ -90,7 +101,7 @@ def event_matches(event, matches)
# @param matches [Hash] a Hash that keeps track of how many of each algorithm we match
# @return [Array] an array of matched StandardIdentifiers
def match_standard_identifiers(event, matches)
known_ids = %i[unmatched doi isbn issn lcsh pmid]
known_ids = %i[unmatched doi isbn issn pmid]
ids = Detector::StandardIdentifiers.new(event.term.phrase)

known_ids.each do |id|
Expand Down
1 change: 1 addition & 0 deletions app/models/term.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Term < ApplicationRecord
def record_detections
Detector::StandardIdentifiers.record(self)
Detector::Journal.record(self)
Detector::Lcsh.record(self)
Detector::SuggestedResource.record(self)

nil
Expand Down
6 changes: 6 additions & 0 deletions db/seeds.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
# One Detector row per supported detector. find_or_create_by leaves an existing row untouched, so re-running the
# seeds does not create duplicates.
Detector.find_or_create_by(name: 'DOI')
Detector.find_or_create_by(name: 'ISBN')
Detector.find_or_create_by(name: 'ISSN')
Detector.find_or_create_by(name: 'LCSH')
Detector.find_or_create_by(name: 'PMID')
Detector.find_or_create_by(name: 'Journal')
Detector.find_or_create_by(name: 'SuggestedResource')
Expand All @@ -48,6 +49,11 @@
category: Category.find_by(name: 'Transactional'),
confidence: 0.6
)
# Links the LCSH detector to the Informational category with a confidence of 0.7.
# Both lookups use find_by, so the 'LCSH' Detector and 'Informational' Category rows must already exist.
DetectorCategory.find_or_create_by(
detector: Detector.find_by(name: 'LCSH'),
category: Category.find_by(name: 'Informational'),
confidence: 0.7
)
DetectorCategory.find_or_create_by(
detector: Detector.find_by(name: 'PMID'),
category: Category.find_by(name: 'Transactional'),
Expand Down
5 changes: 5 additions & 0 deletions test/fixtures/detector_categories.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,8 @@ five:
detector: journal
category: transactional
confidence: 0.5

six:
detector: lcsh
category: informational
confidence: 0.7
3 changes: 3 additions & 0 deletions test/fixtures/detectors.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ isbn:
issn:
name: 'ISSN'

lcsh:
name: 'LCSH'

pmid:
name: 'PMID'

Expand Down
7 changes: 7 additions & 0 deletions test/fixtures/search_events.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ current_month_doi:
current_month_isbn:
term: isbn_9781319145446
source: test
current_month_lcsh:
term: lcsh
source: test
old_month_lcsh:
term: lcsh
source: test
created_at: <%= 1.year.ago %>
current_month_nature_medicine:
term: journal_nature_medicine
source: test
Expand Down
3 changes: 3 additions & 0 deletions test/fixtures/terms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ hi:
pmid_38908367:
phrase: 'TERT activation targets DNA methylation and multiple aging hallmarks. Shim HS, et al. Cell. 2024. PMID: 38908367'

lcsh:
phrase: 'Geology -- Massachusetts'

issn_1075_8623:
phrase: 1075-8623

Expand Down
35 changes: 35 additions & 0 deletions test/models/detector/lcsh_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# frozen_string_literal: true

require 'test_helper'

class Detector
  # Exercises Detector::Lcsh against phrases with and without the " -- " subject heading separator.
  class LcshTest < ActiveSupport::TestCase
    test 'lcsh detector activates when a separator is found' do
      # Each phrase contains at least one whitespace-delimited double dash.
      [
        'Geology -- Massachusetts',
        'Space vehicles -- Materials -- Congresses'
      ].each do |phrase|
        detected = Detector::Lcsh.new(phrase).identifiers

        assert_includes(detected, :separator)
      end
    end

    test 'lcsh detector does nothing in most cases' do
      # Single dashes, hyphenated words and double dashes without surrounding whitespace must not trigger.
      [
        'orange cats like popcorn',
        'hyphenated names like Lin-Manuel Miranda do nothing',
        'dashes used as an aside - like this one - do nothing',
        'This one should--also not work'
      ].each do |phrase|
        detected = Detector::Lcsh.new(phrase).identifiers

        refute_includes(detected, :separator)
      end
    end
  end
end
28 changes: 0 additions & 28 deletions test/models/detector/standard_identifiers_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -137,34 +137,6 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
end
end

# LCSH
test 'lcsh detected' do
true_samples = [
'Geology -- Massachusetts',
'Space vehicles -- Materials -- Congresses'
]

true_samples.each do |term|
actual = Detector::StandardIdentifiers.new(term).identifiers

assert_includes(actual, :lcsh)
end


false_samples = [
'Geology of Massachusetts',
'Geology-Massachusetts',
'Geology--Massachusetts'
]

false_samples.each do |term|
actual = Detector::StandardIdentifiers.new(term).identifiers

refute_includes(actual, :lcsh)
end
end

# DOI tests
test 'doi detected in string' do
actual = Detector::StandardIdentifiers.new('"Quantum tomography: Measured measurement", Markus Aspelmeyer, nature physics "\
"January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').identifiers
Expand Down
20 changes: 19 additions & 1 deletion test/models/metrics/algorithms_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
# doi :integer
# issn :integer
# isbn :integer
# lcsh :integer
# pmid :integer
# unmatched :integer
# created_at :datetime not null
# updated_at :datetime not null
# journal_exact :integer
# suggested_resource_exact :integer
# lcsh :integer
#
require 'test_helper'

Expand All @@ -39,6 +39,12 @@ class Algorithms < ActiveSupport::TestCase
assert_equal 1, aggregate.isbn
end

# The search_events fixtures hold two LCSH events: current_month_lcsh (no explicit created_at) and
# old_month_lcsh (created_at one year ago). Only the first is expected in the aggregate for the current month.
test 'lcsh counts are included in monthly aggregation' do
aggregate = Metrics::Algorithms.new.generate(DateTime.now)

assert_equal 1, aggregate.lcsh
end

test 'pmids counts are included in monthly aggregation' do
aggregate = Metrics::Algorithms.new.generate(DateTime.now)

Expand Down Expand Up @@ -94,6 +100,11 @@ class Algorithms < ActiveSupport::TestCase
SearchEvent.create(term: terms(:isbn_9781319145446), source: 'test')
end

lcsh_expected_count = rand(1...100)
lcsh_expected_count.times do
SearchEvent.create(term: terms(:lcsh), source: 'test')
end

pmid_expected_count = rand(1...100)
pmid_expected_count.times do
SearchEvent.create(term: terms(:pmid_38908367), source: 'test')
Expand All @@ -109,6 +120,7 @@ class Algorithms < ActiveSupport::TestCase
assert_equal doi_expected_count, aggregate.doi
assert_equal issn_expected_count, aggregate.issn
assert_equal isbn_expected_count, aggregate.isbn
assert_equal lcsh_expected_count, aggregate.lcsh
assert_equal pmid_expected_count, aggregate.pmid
assert_equal unmatched_expected_count, aggregate.unmatched
end
Expand All @@ -132,6 +144,12 @@ class Algorithms < ActiveSupport::TestCase
assert_equal 1, aggregate.isbn
end

# generate is called without a month here, so both LCSH fixtures in search_events (current_month_lcsh and
# old_month_lcsh) are expected to be counted.
test 'lcsh counts are included in total aggregation' do
aggregate = Metrics::Algorithms.new.generate

assert_equal 2, aggregate.lcsh
end

test 'pmids counts are included in total aggregation' do
aggregate = Metrics::Algorithms.new.generate

Expand Down

0 comments on commit 6592996

Please sign in to comment.