Merge pull request #13 from MITLibraries/engx242-identifierpatterns

Engx242 identifierpatterns
MITLibraries · Nov 17, 2023 · 0503811 · 0503811
2 parents 0b587a2 + f14464e
commit 0503811
Show file tree

Hide file tree

Showing 8 changed files with 382 additions and 0 deletions.
diff --git a/app/graphql/types/query_type.rb b/app/graphql/types/query_type.rb
@@ -32,5 +32,14 @@ def log_search_event(search_term:, source_system:)
       term = Term.create_or_find_by!(phrase: search_term)
       term.search_events.create!(source: source_system)
     end
+
+    field :lookup_term, TermType, null: true,
+                                  description: 'Lookup a term to return information about it (bypasses logging)' do
+      argument :search_term, String, required: true
+    end
+
+    def lookup_term(search_term:)
+      term = Term.find_by(phrase: search_term)
+    end
   end
 end
diff --git a/app/graphql/types/search_event_type.rb b/app/graphql/types/search_event_type.rb
@@ -7,5 +7,19 @@ class SearchEventType < Types::BaseObject
     field :source, String
     field :created_at, GraphQL::Types::ISO8601DateTime, null: false
     field :updated_at, GraphQL::Types::ISO8601DateTime, null: false
+    field :phrase, String
+    field :standard_identifiers, [StandardIdentifiersType]
+
+    def phrase
+      @object.term.phrase
+    end
+
+    def standard_identifiers
+      ids = []
+      StandardIdentifiers.new(@object.term.phrase).identifiers.each do |identifier|
+        ids << { kind: identifier.first, value: identifier.last }
+      end
+      ids
+    end
   end
 end
diff --git a/app/graphql/types/standard_identifiers_type.rb b/app/graphql/types/standard_identifiers_type.rb
@@ -0,0 +1,8 @@
+# frozen_string_literal: true
+
+module Types
+  # GraphQL object describing one detected standard identifier.
+  # Resolvers in this diff supply these as hashes with `kind` and `value` keys.
+  class StandardIdentifiersType < Types::BaseObject
+    # Which pattern matched (the resolvers pass the pattern key, e.g. doi).
+    field :kind, String, null: false
+    # The text that matched the pattern.
+    field :value, String, null: false
+  end
+end
diff --git a/app/graphql/types/term_type.rb b/app/graphql/types/term_type.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+module Types
+  class TermType < Types::BaseObject
+    field :id, ID, null: false
+    field :created_at, GraphQL::Types::ISO8601DateTime, null: false
+    field :updated_at, GraphQL::Types::ISO8601DateTime, null: false
+    field :phrase, String, null: false
+    field :occurence_count, Integer
+    field :search_events, [SearchEventType], null: false
+    field :standard_identifiers, [StandardIdentifiersType]
+
+    def occurence_count
+      @object.search_events.count
+    end
+
+    def standard_identifiers
+      ids = []
+      StandardIdentifiers.new(@object.phrase).identifiers.each do |identifier|
+        ids << { kind: identifier.first, value: identifier.last }
+      end
+      ids
+    end
+  end
+end
diff --git a/app/models/standard_identifiers.rb b/app/models/standard_identifiers.rb
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+
+# StandardIdentifiers is a PatternDetector implementation that detects the identifiers DOI, ISBN, ISSN, PMID.
+# See /docs/reference/pattern_detection_and_enhancement.md for details.
+class StandardIdentifiers
+  attr_reader :identifiers
+
+  def initialize(term)
+    @identifiers = {}
+    term_pattern_checker(term)
+    strip_invalid_issns
+  end
+
+  private
+
+  def term_pattern_checker(term)
+    term_patterns.each_pair do |type, pattern|
+      @identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present?
+    end
+  end
+
+  # Note on the limitations of this implementation
+  # We only detect the first match of each pattern, so a search of "1234-5678 5678-1234" will not return two ISSNs as
+  # might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect
+  # all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing
+  # that change did result in unintended results so it was backed out for now.
+  def match(pattern, term)
+    pattern.match(term).to_s.strip
+  end
+
+  # term_patterns are regex patterns to be applied to the basic search box input
+  def term_patterns
+    {
+      isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
+      issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,
+      pmid: /\b((pmid|PMID): (\d{7,8}))\b/,
+      doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b}
+    }
+  end
+
+  def strip_invalid_issns
+    return unless @identifiers[:issn]
+
+    @identifiers[:issn] = nil unless validate_issn(@identifiers[:issn])
+  end
+
+  # validate_issn is only called when the regex for an ISSN has indicated an ISSN
+  # of sufficient format is present - but the regex does not attempt to
+  # validate that the check digit in the ISSN spec is correct. This method
+  # does that calculation, so we do not returned falsely detected ISSNs,
+  # like "2015-2019".
+  #
+  # The algorithm is defined at
+  # https://datatracker.ietf.org/doc/html/rfc3044#section-2.2
+  # An example calculation is shared at
+  # https://en.wikipedia.org/wiki/International_Standard_Serial_Number#Code_format
+  def validate_issn(candidate)
+    digits = candidate.gsub('-', '').chars[..6]
+    check_digit = candidate.last.downcase
+    sum = 0
+
+    digits.each_with_index do |digit, idx|
+      sum += digit.to_i * (8 - idx.to_i)
+    end
+
+    actual_digit = 11 - sum.modulo(11)
+    actual_digit = 'x' if actual_digit == 10
+
+    return true if actual_digit.to_s == check_digit.to_s
+
+    false
+  end
+end
diff --git a/docs/reference/pattern_detection_and_enhancement.md b/docs/reference/pattern_detection_and_enhancement.md
@@ -0,0 +1,60 @@
+## Pattern detection and metadata enhancement
+
+A Pattern Detector is responsible for identifying specific patterns within the input, such as using regular expressions to detect ISSN, ISBN, DOI, and PMID (implemented in our StandardIdentifiers Class). Techniques other than regular expressions may also serve as Pattern Detectors, such as doing phrase matching to identify known scientific journals, or fingerprint matching to identify librarian curated responses.
+
+A Pattern Detector is only run when the incoming data has requested this type of information to be returned. This will take the form of requesting specific fields to be returned via GraphQL that require a Pattern Detector to populate.
+
+An appropriate Enhancer for the specific Pattern will add more detailed metadata if requested via GraphQL. This will allow the slowest portion of this data flow -- the external data lookups (Enhancers) -- to only be run if the caller has specifically asked for that data. Some users may only be interested in knowing that patterns were found and what they were, whereas others are willing to wait longer for more detailed information. And others still won't be interested in either. **The incoming GraphQL will be the driver of which algorithms we run, and which external data we request.**
+
+```mermaid
+---
+title: "Pattern Detector: detecting known patterns and selectively enhancing the output"
+---
+flowchart LR
+  accTitle: "Pattern Detector: detecting known patterns and selectively enhancing the output"
+  accDescr: A flow chart showing how input is analyzed for patterns and decisions are made based on what was found. The workflow is described fully in the paragraphs of text following this diagram.
+
+  input(input)
+  detect[PatternDetector]
+  lookup[(DataLookup)]
+  enhance(enhance)
+  found{found?}
+  details{details requested?}
+  metadata{metadata found?}
+  annotate[[annotate]]
+  output
+  enhance --> output
+
+  subgraph PatternDetector
+    direction TB
+      detect --doi--> found
+      detect --issn--> found
+      detect --isbn----> found
+      detect --journal title--> found
+      detect --pmid--> found
+      annotate
+  end
+
+  subgraph Enhancer
+    lookup --> metadata
+    metadata -- yes --> enhance[[enhance]]
+    enhance
+  end
+
+  input --> PatternDetector
+  metadata -- no --> output
+  found -- no --> output
+  found -- yes --> annotate
+  annotate --> details
+  details -- no --> output
+  details -- yes --> lookup
+  output
+```
+
+When receiving an input, first we detect known patterns such as DOI, ISSN, ISBN, PMID, or Journal Titles.
+
+If we do not find any, we exit the flow with an empty output.
+
+If we find one or more patterns, we annotate the eventual response with what we found. If the original input did not request details for found patterns, we return the annotated response with what we found.
+
+If the original input did request details for found patterns, we lookup information. If we do not find additional information, we return the annotated output. If we do find additional information, we enhance the annotation with the metadata we have found and return that in the output.
diff --git a/test/controllers/graphql_controller_test.rb b/test/controllers/graphql_controller_test.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'test_helper'
 
 class GraphqlControllerTest < ActionDispatch::IntegrationTest
@@ -48,4 +50,19 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest
     assert_equal(200, response.status)
     assert_equal Term.count, initial_term_count
   end
+
+  test 'search event query can return detected standard identifiers' do
+    post '/graphql', params: { query: '{
+                                 logSearchEvent(sourceSystem: "timdex", searchTerm: "10.1038/nphys1170") {
+                                  standardIdentifiers {
+                                        kind
+                                        value
+                                  }
+                                 }
+                               }' }
+
+    json = JSON.parse(response.body)
+    assert_equal('doi', json['data']['logSearchEvent']['standardIdentifiers'].first['kind'])
+    assert_equal('10.1038/nphys1170', json['data']['logSearchEvent']['standardIdentifiers'].first['value'])
+  end
 end