Adds data lookup for detected standard identifiers

Why are these changes being introduced: * The ability to provide additional details about what TACOS can detect will allow consuming systems to act on that information. That may mean simply displaying it, or it may mean modifying queries to other systems in more nuanced ways than the user's original raw search would have allowed. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/ENGX-244 How does this address that need: * This brings the Fact lookup models over from TIMDEX UI and modifies them slightly to be more useful to this application. * A details section has been added under standard_identifiers in GraphQL that will trigger lookups if identifiers have been detected. * Only lookups that have matches are run, and only if a user has requested details by requesting fields within the details block. Document any side effects to this change: * dotenv-rails was introduced as a dependency to simplify the test environment configuration. Omitting any of the configuration documented in the README will result in exceptions.
MITLibraries · Nov 17, 2023 · 9364206 · 9364206
1 parent c766a83
commit 9364206
Show file tree

Hide file tree

Showing 26 changed files with 1,095 additions and 1 deletion.
diff --git a/.env.test b/.env.test
@@ -0,0 +1,2 @@
+LINKRESOLVER_BASEURL=https://mit.primo.exlibrisgroup.com/discovery/openurl?institution=01MIT_INST&rfr_id=info:sid/mit.tacos.api&vid=01MIT_INST:MIT
+UNPAYWALL_EMAIL=timdex@mit.edu
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,8 @@
 # Ignore all environment files (except templates).
 /.env*
 !/.env*.erb
+# Include test env file
+!/.env.test
 
 # Ignore all logfiles and tempfiles.
 /log/*

diff --git a/Gemfile b/Gemfile
@@ -12,6 +12,9 @@ gem 'bootsnap', require: false
 # Ruby GraphQL implementation [https://github.com/rmosolgo/graphql-ruby]
 gem 'graphql'
 
+# HTTP is an easy-to-use client library for making requests from Ruby [https://github.com/httprb/http]
+gem 'http'
+
 # Use JavaScript with ESM import maps [https://github.com/rails/importmap-rails]
 gem 'importmap-rails'
 
@@ -54,6 +57,9 @@ gem 'tzinfo-data', platforms: %i[windows jruby]
 group :development, :test do
   # See https://guides.rubyonrails.org/debugging_rails_applications.html#debugging-with-the-debug-gem
   gem 'debug', platforms: %i[mri windows]
+
+  # Allow selective loading of configuration in different contexts (dev/test)
+  gem 'dotenv-rails'
 end
 
 group :development do
@@ -81,4 +87,6 @@ group :test do
   gem 'selenium-webdriver'
   gem 'simplecov'
   gem 'simplecov-lcov'
+  gem 'vcr'
+  gem 'webmock'
 end
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -98,22 +98,43 @@ GEM
       xpath (~> 3.2)
     concurrent-ruby (1.2.2)
     connection_pool (2.4.1)
+    crack (0.4.5)
+      rexml
     crass (1.0.6)
     date (3.3.4)
     debug (1.8.0)
       irb (>= 1.5.0)
       reline (>= 0.3.1)
     docile (1.4.0)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    dotenv (2.8.1)
+    dotenv-rails (2.8.1)
+      dotenv (= 2.8.1)
+      railties (>= 3.2)
     drb (2.2.0)
       ruby2_keywords
     erubi (1.12.0)
+    ffi (1.16.3)
+    ffi-compiler (1.0.1)
+      ffi (>= 1.0.0)
+      rake
     globalid (1.2.1)
       activesupport (>= 6.1)
     graphiql-rails (1.9.0)
       railties
       sprockets-rails
     graphql (2.1.6)
       racc (~> 1.4)
+    hashdiff (1.0.1)
+    http (5.1.1)
+      addressable (~> 2.8)
+      http-cookie (~> 1.0)
+      http-form_data (~> 2.2)
+      llhttp-ffi (~> 0.4.0)
+    http-cookie (1.0.5)
+      domain_name (~> 0.5)
+    http-form_data (2.3.0)
     i18n (1.14.1)
       concurrent-ruby (~> 1.0)
     importmap-rails (1.2.3)
@@ -129,6 +150,9 @@ GEM
       activesupport (>= 5.0.0)
     json (2.6.3)
     language_server-protocol (3.17.0.3)
+    llhttp-ffi (0.4.0)
+      ffi-compiler (~> 1.0)
+      rake (~> 13.0)
     loofah (2.22.0)
       crass (~> 1.0.2)
       nokogiri (>= 1.12.0)
@@ -268,12 +292,20 @@ GEM
       railties (>= 6.0.0)
     tzinfo (2.0.6)
       concurrent-ruby (~> 1.0)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.8.2)
     unicode-display_width (2.5.0)
+    vcr (6.2.0)
     web-console (4.2.1)
       actionview (>= 6.0.0)
       activemodel (>= 6.0.0)
       bindex (>= 0.4.0)
       railties (>= 6.0.0)
+    webmock (3.19.1)
+      addressable (>= 2.8.0)
+      crack (>= 0.3.2)
+      hashdiff (>= 0.4.0, < 2.0.0)
     webrick (1.8.1)
     websocket (1.2.10)
     websocket-driver (0.7.6)
@@ -293,8 +325,10 @@ DEPENDENCIES
   bootsnap
   capybara
   debug
+  dotenv-rails
   graphiql-rails
   graphql
+  http
   importmap-rails
   jbuilder
   puma (>= 5.0)
@@ -310,7 +344,9 @@ DEPENDENCIES
   stimulus-rails
   turbo-rails
   tzinfo-data
+  vcr
   web-console
+  webmock
 
 RUBY VERSION
    ruby 3.2.2p53

diff --git a/README.md b/README.md
@@ -1 +1,7 @@
-# tacos
+# tacos
+
+## Required Environment Variables
+
+`LINKRESOLVER_BASEURL`: base url for our link resolver. `https://mit.primo.exlibrisgroup.com/discovery/openurl?institution=01MIT_INST&rfr_id=info:sid/mit.tacos.api&vid=01MIT_INST:MIT` is probably the best value unless you are doing something interesting.
+
+`UNPAYWALL_EMAIL`: email address to include in Unpaywall API calls, as required by the [Unpaywall documentation](https://unpaywall.org/products/api). Your personal email is appropriate for development. For deployed environments and tests, use the timdex moira list email.
diff --git a/app/graphql/types/details_type.rb b/app/graphql/types/details_type.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+module Types
+  class DetailsType < Types::BaseObject
+    field :title, String
+    field :authors, [String]
+    field :date, String
+    field :publisher, String
+    field :oa, Boolean
+    field :oa_status, String
+    field :best_oa_location, String
+    field :issns, [String]
+    field :journal_name, String
+    field :doi, String
+    field :link_resolver_url, String
+
+    def issns
+      @object[:journal_issns]&.split(',')
+    end
+
+    def authors
+      @object[:authors]&.split(',')
+    end
+  end
+end
diff --git a/app/graphql/types/standard_identifiers_type.rb b/app/graphql/types/standard_identifiers_type.rb
@@ -4,5 +4,21 @@ module Types
   class StandardIdentifiersType < Types::BaseObject
     field :kind, String, null: false
     field :value, String, null: false
+    field :details, DetailsType
+
+    # details does external lookups and should only be run if the fields
+    # have been explicitly requested
+    def details
+      case @object[:kind]
+      when :doi
+        LookupDoi.new.info(@object[:value])
+      when :isbn
+        LookupIsbn.new.info(@object[:value])
+      when :issn
+        LookupIssn.new.info(@object[:value])
+      when :pmid
+        LookupPmid.new.info(@object[:value].split.last)
+      end
+    end
   end
 end
diff --git a/app/models/lookup_doi.rb b/app/models/lookup_doi.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+class LookupDoi
+  def info(doi)
+    external_data = fetch(doi)
+    return if external_data == 'Error'
+
+    metadata = extract_metadata(external_data)
+    metadata[:doi] = doi
+    metadata[:link_resolver_url] = link_resolver_url(metadata)
+    metadata
+  end
+
+  private
+
+  # NOTE: authors are available as objects within `'z_authors` but is somewhat
+  # complicated so wasn't implemented during this initial work
+  def extract_metadata(external_data)
+    {
+      genre: external_data['genre'],
+      title: external_data['title'],
+      date: external_data['year'],
+      publisher: external_data['publisher'],
+      oa: external_data['is_oa'],
+      oa_status: external_data['oa_status'],
+      best_oa_location: external_data['best_oa_location'],
+      journal_issns: external_data['journal_issns'],
+      journal_name: external_data['journal_name']
+    }
+  end
+
+  def url(doi)
+    "https://api.unpaywall.org/v2/#{doi}?email=#{ENV.fetch('UNPAYWALL_EMAIL')}"
+  end
+
+  def fetch(doi)
+    resp = HTTP.headers(accept: 'application/json').get(url(doi))
+    if resp.status == 200
+      JSON.parse(resp.to_s)
+    else
+      Rails.logger.debug("Fact lookup error. DOI #{doi} detected but unpaywall returned no data or otherwise errored")
+      Rails.logger.debug("URL: #{url(doi)}")
+      'Error'
+    end
+  end
+
+  def link_resolver_url(metadata)
+    "#{ENV.fetch('LINKRESOLVER_BASEURL')}&rft.atitle=#{metadata[:title]}&rft.date=#{metadata[:year]}&rft.genre=#{metadata[:genre]}&rft.jtitle=#{metadata[:journal_name]}&rft_id=info:doi/#{metadata[:doi]}"
+  end
+end
diff --git a/app/models/lookup_isbn.rb b/app/models/lookup_isbn.rb
@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+
+class LookupIsbn
+  def info(isbn)
+    json = fetch_isbn(isbn)
+    return if json == 'Error'
+
+    {
+      title: json['title'],
+      date: json['publish_date'],
+      publisher: json['publishers'].join(','),
+      authors: fetch_authors(json),
+      link_resolver_url: link_resolver_url(isbn)
+    }
+  end
+
+  def base_url
+    'https://openlibrary.org'
+  end
+
+  def fetch_isbn(isbn)
+    url = [base_url, "/isbn/#{isbn}.json"].join
+    parse_response(url)
+  end
+
+  def fetch_authors(isbn_json)
+    return unless isbn_json['authors']
+
+    authors = isbn_json['authors'].map { |a| a['key'] }
+    author_names = authors.map do |author|
+      url = [base_url, author, '.json'].join
+      json = parse_response(url)
+      json['name']
+    end
+    author_names.join(' ; ')
+  end
+
+  def parse_response(url)
+    resp = HTTP.headers(accept: 'application/json', 'Content-Type': 'application/json').follow.get(url)
+
+    if resp.status == 200
+      JSON.parse(resp.to_s)
+    else
+      Rails.logger.debug('Fact lookup error: openlibrary returned no data')
+      Rails.logger.debug("URL: #{url}")
+      'Error'
+    end
+  end
+
+  def link_resolver_url(isbn)
+    "#{ENV.fetch('LINKRESOLVER_BASEURL')}&rft.isbn=#{isbn}"
+  end
+end
diff --git a/app/models/lookup_issn.rb b/app/models/lookup_issn.rb
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+# LookupIssn assumes the ISSN being supplied has been validated prior to this Class being used.
+# In this application, we only LookupIssns that have been detected in StandardIdentifiers which performs
+# that validation for us. If extracting this logic to be used elsewhere, it is highly recommended to validate
+# ISSNs before doing an external lookup.
+class LookupIssn
+  def info(issn)
+    json = fetch(issn)
+    return if json == 'Error'
+
+    metadata = extract_metadata(json)
+    metadata[:link_resolver_url] = openurl(issn)
+    metadata
+  end
+
+  def extract_metadata(response)
+    {
+      journal_name: response['message']['title'],
+      publisher: response['message']['publisher'],
+      journal_issns: response['message']['ISSN'].join(',')
+    }
+  end
+
+  def url(issn)
+    "https://api.crossref.org/journals/#{issn}"
+  end
+
+  def fetch(issn)
+    resp = HTTP.headers(accept: 'application/json').get(url(issn))
+    if resp.status == 200
+      JSON.parse(resp.to_s)
+    else
+      Rails.logger.debug("ISSN Lookup error. ISSN #{issn} detected but crossref returned no data")
+      Rails.logger.debug("URL: #{url(issn)}")
+      'Error'
+    end
+  end
+
+  def openurl(issn)
+    "#{ENV.fetch('LINKRESOLVER_BASEURL')}&rft.issn=#{issn}"
+  end
+end
diff --git a/app/models/lookup_pmid.rb b/app/models/lookup_pmid.rb
@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+
+class LookupPmid
+  def info(pmid)
+    xml = fetch(pmid)
+    return if xml == 'Error'
+
+    metadata = extract_metadata(xml)
+    metadata[:pmid] = pmid
+    metadata[:link_resolver_url] = link_resolver_url(metadata)
+
+    if metadata.reject { |_k, v| v.empty? }.present?
+      metadata
+    else
+      Rails.logger.debug("Fact lookup error. PMID #{pmid} detected but ncbi returned no data")
+      nil
+    end
+  end
+
+  def extract_metadata(xml)
+    {
+      title: xml.xpath('//ArticleTitle').text,
+      journal_name: xml.xpath('//Journal/Title').text,
+      journal_volume: xml.xpath('//Journal/JournalIssue/Volume').text,
+      date: xml.xpath('//Journal/JournalIssue/PubDate/Year').text,
+      doi: xml.xpath('//PubmedData/ArticleIdList/ArticleId[@IdType="doi"]').text
+    }
+  end
+
+  def url(pmid)
+    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=#{pmid}&retmode=xml"
+  end
+
+  def fetch(pmid)
+    resp = HTTP.headers(accept: 'application/xml').get(url(pmid))
+
+    if resp.status == 200
+      Nokogiri::XML(resp.to_s)
+    else
+      Rails.logger.debug("Fact lookup error. PMID #{pmid} detected but ncbi an error status")
+      Rails.logger.debug("URL: #{url(pmid)}")
+      'Error'
+    end
+  end
+
+  def link_resolver_url(metadata)
+    "#{ENV.fetch('LINKRESOLVER_BASEURL')}&rft.atitle=#{metadata[:title]}&rft.date=#{metadata[:date]}&rft.jtitle=#{metadata[:journal_name]}&rft_id=info:doi/#{metadata[:doi]}"
+  end
+end