From 8693617e5d2ba81540ca25b22db4534466f44772 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt Date: Wed, 2 Oct 2024 14:19:48 -0400 Subject: [PATCH] Refactor LCSH to be a separate detector Squash refactor Squash refactor Update metrics tests and fixtures More tests --- app/graphql/types/detectors_type.rb | 19 ++++--- .../types/standard_identifiers_type.rb | 6 +-- app/models/detector/lcsh.rb | 43 +++++++++++++++ app/models/detector/standard_identifiers.rb | 4 +- app/models/metrics/algorithms.rb | 17 ++++-- app/models/term.rb | 1 + db/seeds.rb | 6 +++ test/controllers/graphql_controller_test.rb | 29 ++++++++++ test/fixtures/detector_categories.yml | 5 ++ test/fixtures/detectors.yml | 3 ++ test/fixtures/search_events.yml | 7 +++ test/fixtures/terms.yml | 3 ++ test/models/detector/lcsh_test.rb | 53 +++++++++++++++++++ .../detector/standard_identifiers_test.rb | 28 ---------- test/models/metrics/algorithms_test.rb | 20 ++++++- 15 files changed, 199 insertions(+), 45 deletions(-) create mode 100644 app/models/detector/lcsh.rb create mode 100644 test/models/detector/lcsh_test.rb diff --git a/app/graphql/types/detectors_type.rb b/app/graphql/types/detectors_type.rb index 0626fb8..9f7729b 100644 --- a/app/graphql/types/detectors_type.rb +++ b/app/graphql/types/detectors_type.rb @@ -5,21 +5,28 @@ class DetectorsType < Types::BaseObject description 'Provides all available search term detectors' field :journals, [Types::JournalsType], description: 'Information about journals detected in the search term' + field :lcsh, [String], description: 'Library of Congress Subject Heading information' field :standard_identifiers, [Types::StandardIdentifiersType], description: 'Currently supported: ISBN, ISSN, PMID, DOI' field :suggested_resources, [Types::SuggestedResourcesType], description: 'Suggested resources detected in the search term' - def standard_identifiers - Detector::StandardIdentifiers.new(@object).identifiers.map do |identifier| - { kind: identifier.first, value: identifier.last } - end - end - def journals Detector::Journal.full_term_match(@object).map do |journal| { title: journal.name, additional_info: journal.additional_info } end end + def lcsh + Detector::Lcsh.new(@object).identifiers.map do |identifier| + identifier.last + end + end + + def standard_identifiers + Detector::StandardIdentifiers.new(@object).identifiers.map do |identifier| + { kind: identifier.first, value: identifier.last } + end + end + def suggested_resources Detector::SuggestedResource.full_term_match(@object).map do |suggested_resource| { title: suggested_resource.title, url: suggested_resource.url } diff --git a/app/graphql/types/standard_identifiers_type.rb b/app/graphql/types/standard_identifiers_type.rb index 4ec3d32..df329f2 100644 --- a/app/graphql/types/standard_identifiers_type.rb +++ b/app/graphql/types/standard_identifiers_type.rb @@ -2,10 +2,10 @@ module Types class StandardIdentifiersType < Types::BaseObject - description 'A detector for standard identifiers in search terms. Currently supported: DOI, ISBN, ISSN, LCSH, PMID' + description 'A detector for standard identifiers in search terms. Currently supported: ISBN, ISSN, PMID, DOI' field :details, DetailsType, description: 'Additional information about the detected identifier(s)' - field :kind, String, null: false, description: 'The type of identifier detected (one of DOI, ISBN, ISSN, LCSH, PMID)' + field :kind, String, null: false, description: 'The type of identifier detected (one of ISBN, ISSN, PMID, DOI)' field :value, String, null: false, description: 'The identifier detected in the search term' # details does external lookups and should only be run if the fields @@ -18,8 +18,6 @@ def details LookupIsbn.new.info(@object[:value]) when :issn LookupIssn.new.info(@object[:value]) - when :lcsh - @object[:value] when :pmid LookupPmid.new.info(@object[:value].split.last) end diff --git a/app/models/detector/lcsh.rb b/app/models/detector/lcsh.rb new file mode 100644 index 0000000..56d7aeb --- /dev/null +++ b/app/models/detector/lcsh.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +class Detector + # Detector::LCSH is a very rudimentary detector for the separator between levels of a Library of Congress Subject + # Heading (LCSH). These subject headings follow this pattern: "Social security beneficiaries -- United States" + class Lcsh + attr_reader :identifiers + + def initialize(term) + @identifiers = {} + term_pattern_checker(term) + end + + def self.record(term) + results = Detector::Lcsh.new(term.phrase) + + results.identifiers.each_key do |k| + Detection.find_or_create_by( + term:, + detector: Detector.where(name: 'LCSH').first + ) + end + end + + private + + def term_pattern_checker(term) + subject_patterns.each_pair do |type, pattern| + @identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present? + end + end + + def match(pattern, term) + pattern.match(term).to_s.strip + end + + def subject_patterns + { + separator: /(.*)\s--\s(.*)/ + } + end + end +end diff --git a/app/models/detector/standard_identifiers.rb b/app/models/detector/standard_identifiers.rb index 89ccab9..bb21211 100644 --- a/app/models/detector/standard_identifiers.rb +++ b/app/models/detector/standard_identifiers.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true class Detector - # Detector::StandardIdentifiers detects the identifiers DOI, ISBN, ISSN, LCSH, PMID. + # Detector::StandardIdentifiers detects the identifiers DOI, ISBN, ISSN, PMID. # See /docs/reference/pattern_detection_and_enhancement.md for details. class StandardIdentifiers attr_reader :identifiers @@ -55,12 +55,10 @@ def match(pattern, term) end # term_patterns are regex patterns to be applied to the basic search box input - # The LCSH regex is looking for the separator used in the Bento UI, so this is an area for later improvement. def term_patterns { isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/, issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/, - lcsh: /\s--\s/, pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/, doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b} } diff --git a/app/models/metrics/algorithms.rb b/app/models/metrics/algorithms.rb index 3b1fef4..e333f15 100644 --- a/app/models/metrics/algorithms.rb +++ b/app/models/metrics/algorithms.rb @@ -9,13 +9,13 @@ # doi :integer # issn :integer # isbn :integer +# lcsh :integer # pmid :integer # unmatched :integer # created_at :datetime not null # updated_at :datetime not null # journal_exact :integer # suggested_resource_exact :integer -# lcsh :integer # module Metrics # Algorithms aggregates statistics for matches for all SearchEvents @@ -80,8 +80,19 @@ def event_matches(event, matches) ids = match_standard_identifiers(event, matches) journal_exact = process_journals(event, matches) suggested_resource_exact = process_suggested_resources(event, matches) + lcshs = match_lcsh(event, matches) + + matches[:unmatched] += 1 if ids.identifiers.blank? && lcshs.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero? + end - matches[:unmatched] += 1 if ids.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero? + def match_lcsh(event, matches) + known_ids = %i[separator] + ids = Detector::Lcsh.new(event.term.phrase) + + known_ids.each do |id| + matches[:lcsh] += 1 if ids.identifiers[id].present? + end + ids end # Checks for StandardIdentifer matches @@ -90,7 +101,7 @@ def event_matches(event, matches) # @param matches [Hash] a Hash that keeps track of how many of each algorithm we match # @return [Array] an array of matched StandardIdentifiers def match_standard_identifiers(event, matches) - known_ids = %i[unmatched doi isbn issn lcsh pmid] + known_ids = %i[unmatched doi isbn issn pmid] ids = Detector::StandardIdentifiers.new(event.term.phrase) known_ids.each do |id| diff --git a/app/models/term.rb b/app/models/term.rb index e9e4381..2b33dc6 100644 --- a/app/models/term.rb +++ b/app/models/term.rb @@ -24,6 +24,7 @@ class Term < ApplicationRecord def record_detections Detector::StandardIdentifiers.record(self) Detector::Journal.record(self) + Detector::Lcsh.record(self) Detector::SuggestedResource.record(self) nil diff --git a/db/seeds.rb b/db/seeds.rb index b144734..acc5b8f 100644 --- a/db/seeds.rb +++ b/db/seeds.rb @@ -28,6 +28,7 @@ Detector.find_or_create_by(name: 'DOI') Detector.find_or_create_by(name: 'ISBN') Detector.find_or_create_by(name: 'ISSN') +Detector.find_or_create_by(name: 'LCSH') Detector.find_or_create_by(name: 'PMID') Detector.find_or_create_by(name: 'Journal') Detector.find_or_create_by(name: 'SuggestedResource') @@ -48,6 +49,11 @@ category: Category.find_by(name: 'Transactional'), confidence: 0.6 ) +DetectorCategory.find_or_create_by( + detector: Detector.find_by(name: 'LCSH'), + category: Category.find_by(name: 'Informational'), + confidence: 0.7 +) DetectorCategory.find_or_create_by( detector: Detector.find_by(name: 'PMID'), category: Category.find_by(name: 'Transactional'), diff --git a/test/controllers/graphql_controller_test.rb b/test/controllers/graphql_controller_test.rb index 34c92e5..418d503 100644 --- a/test/controllers/graphql_controller_test.rb +++ b/test/controllers/graphql_controller_test.rb @@ -111,6 +111,20 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest json['data']['logSearchEvent']['detectors']['suggestedResources'].first['url'] end + test 'search event query can return detected library of congress subject headings' do + post '/graphql', params: { query: '{ + logSearchEvent(sourceSystem: "timdex", searchTerm: "Maryland -- Geography") { + detectors { + lcsh + } + } + }' } + json = response.parsed_body + + assert_equal 'Maryland -- Geography', + json['data']['logSearchEvent']['detectors']['lcsh'].first + end + test 'search event query can return phrase from logged term' do post '/graphql', params: { query: '{ logSearchEvent(sourceSystem: "timdex", searchTerm: "10.1038/nphys1170") { @@ -170,6 +184,21 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest assert_in_delta 0.95, json['data']['logSearchEvent']['categories'].first['confidence'] end + test 'term lookup query can return detected library of congress subject headings' do + post '/graphql', params: { query: '{ + lookupTerm(searchTerm: "Geology -- Massachusetts") { + detectors { + lcsh + } + } + }' } + + json = response.parsed_body + + assert_equal('Geology -- Massachusetts', + json['data']['lookupTerm']['detectors']['lcsh'].first) + end + test 'term lookup query can return categorization details for searches that trip a detector' do post '/graphql', params: { query: '{ lookupTerm(searchTerm: "10.1016/j.physio.2010.12.004") { diff --git a/test/fixtures/detector_categories.yml b/test/fixtures/detector_categories.yml index 113fc20..9ddee4c 100644 --- a/test/fixtures/detector_categories.yml +++ b/test/fixtures/detector_categories.yml @@ -33,3 +33,8 @@ five: detector: journal category: transactional confidence: 0.5 + +six: + detector: lcsh + category: informational + confidence: 0.7 diff --git a/test/fixtures/detectors.yml b/test/fixtures/detectors.yml index 81271f2..d9c837c 100644 --- a/test/fixtures/detectors.yml +++ b/test/fixtures/detectors.yml @@ -16,6 +16,9 @@ isbn: issn: name: 'ISSN' +lcsh: + name: 'LCSH' + pmid: name: 'PMID' diff --git a/test/fixtures/search_events.yml b/test/fixtures/search_events.yml index 168250f..2060684 100644 --- a/test/fixtures/search_events.yml +++ b/test/fixtures/search_events.yml @@ -31,6 +31,13 @@ current_month_doi: current_month_isbn: term: isbn_9781319145446 source: test +current_month_lcsh: + term: lcsh + source: test +old_month_lcsh: + term: lcsh + source: test + created_at: <%= 1.year.ago %> current_month_nature_medicine: term: journal_nature_medicine source: test diff --git a/test/fixtures/terms.yml b/test/fixtures/terms.yml index cb88d9c..20b81ee 100644 --- a/test/fixtures/terms.yml +++ b/test/fixtures/terms.yml @@ -17,6 +17,9 @@ hi: pmid_38908367: phrase: 'TERT activation targets DNA methylation and multiple aging hallmarks. Shim HS, et al. Cell. 2024. PMID: 38908367' +lcsh: + phrase: 'Geology -- Massachusetts' + issn_1075_8623: phrase: 1075-8623 diff --git a/test/models/detector/lcsh_test.rb b/test/models/detector/lcsh_test.rb new file mode 100644 index 0000000..2c811e3 --- /dev/null +++ b/test/models/detector/lcsh_test.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +require 'test_helper' + +class Detector + class LcshTest < ActiveSupport::TestCase + test 'lcsh detector activates when a separator is found' do + true_samples = [ + 'Geology -- Massachusetts', + 'Space vehicles -- Materials -- Congresses' + ] + + true_samples.each do |term| + actual = Detector::Lcsh.new(term).identifiers + + assert_includes(actual, :separator) + end + end + + test 'lcsh detector does nothing in most cases' do + false_samples = [ + 'orange cats like popcorn', + 'hyphenated names like Lin-Manuel Miranda do nothing', + 'dashes used as an aside - like this one - do nothing', + 'This one should--also not work' + ] + + false_samples.each do |term| + actual = Detector::Lcsh.new(term).identifiers + + refute_includes(actual, :separator) + end + end + + test 'record method does relevant work' do + detection_count = Detection.count + t = terms('lcsh') + + Detector::Lcsh.record(t) + + assert_equal(detection_count + 1, Detection.count) + end + + test 'record does nothing when not needed' do + detection_count = Detection.count + t = terms('isbn_9781319145446') + + Detector::Lcsh.record(t) + + assert_equal(detection_count, Detection.count) + end + end +end diff --git a/test/models/detector/standard_identifiers_test.rb b/test/models/detector/standard_identifiers_test.rb index f70842e..8d66301 100644 --- a/test/models/detector/standard_identifiers_test.rb +++ b/test/models/detector/standard_identifiers_test.rb @@ -137,34 +137,6 @@ class StandardIdentifiersTest < ActiveSupport::TestCase end end - # LCSH - test 'lcsh detected' do - true_samples = [ - 'Geology -- Massachusetts', - 'Space vehicles -- Materials -- Congresses' - ] - - true_samples.each do |term| - actual = Detector::StandardIdentifiers.new(term).identifiers - - assert_includes(actual, :lcsh) - end - - - false_samples = [ - 'Geology of Massachusetts', - 'Geology-Massachusetts', - 'Geology--Massachusetts' - ] - - false_samples.each do |term| - actual = Detector::StandardIdentifiers.new(term).identifiers - - refute_includes(actual, :lcsh) - end - end - - # DOI tests test 'doi detected in string' do actual = Detector::StandardIdentifiers.new('"Quantum tomography: Measured measurement", Markus Aspelmeyer, nature physics "\ "January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').identifiers diff --git a/test/models/metrics/algorithms_test.rb b/test/models/metrics/algorithms_test.rb index 7b5ba6f..7428096 100644 --- a/test/models/metrics/algorithms_test.rb +++ b/test/models/metrics/algorithms_test.rb @@ -9,13 +9,13 @@ # doi :integer # issn :integer # isbn :integer +# lcsh :integer # pmid :integer # unmatched :integer # created_at :datetime not null # updated_at :datetime not null # journal_exact :integer # suggested_resource_exact :integer -# lcsh :integer # require 'test_helper' @@ -39,6 +39,12 @@ class Algorithms < ActiveSupport::TestCase assert_equal 1, aggregate.isbn end + test 'lcsh counts are included in monthly aggregation' do + aggregate = Metrics::Algorithms.new.generate(DateTime.now) + + assert_equal 1, aggregate.lcsh + end + test 'pmids counts are included in monthly aggregation' do aggregate = Metrics::Algorithms.new.generate(DateTime.now) @@ -94,6 +100,11 @@ class Algorithms < ActiveSupport::TestCase SearchEvent.create(term: terms(:isbn_9781319145446), source: 'test') end + lcsh_expected_count = rand(1...100) + lcsh_expected_count.times do + SearchEvent.create(term: terms(:lcsh), source: 'test') + end + pmid_expected_count = rand(1...100) pmid_expected_count.times do SearchEvent.create(term: terms(:pmid_38908367), source: 'test') @@ -109,6 +120,7 @@ class Algorithms < ActiveSupport::TestCase assert_equal doi_expected_count, aggregate.doi assert_equal issn_expected_count, aggregate.issn assert_equal isbn_expected_count, aggregate.isbn + assert_equal lcsh_expected_count, aggregate.lcsh assert_equal pmid_expected_count, aggregate.pmid assert_equal unmatched_expected_count, aggregate.unmatched end @@ -132,6 +144,12 @@ class Algorithms < ActiveSupport::TestCase assert_equal 1, aggregate.isbn end + test 'lcsh counts are included in total aggregation' do + aggregate = Metrics::Algorithms.new.generate + + assert_equal 2, aggregate.lcsh + end + test 'pmids counts are included in total aggregation' do aggregate = Metrics::Algorithms.new.generate