dgidb · jsstevenson · Apr 19, 2024 · Oct 9, 2023 · Dec 4, 2023 · Dec 11, 2023
diff --git a/README.md b/README.md
@@ -87,6 +87,120 @@ rails s
 
 Navigate to `localhost:3000/api/graphiql` in your browser. If the example query provided runs successfully, then you're all set.
 
+### Data loading
+
+To perform a data load from scratch, first run the `reset` task to provide a clean, seeded DB:
+
+```shell
+rake db:reset
+```
+
+Most DGIdb data comes from static files, typically called `claims.tsv`. The data loader classes expect `server/lib/data/` to contain the following files:
+
+```
+lib/data
+├── bader_lab
+│   └── claims.tsv
+├── cancer_commons
+│   └── claims.tsv
+├── caris_molecular_intelligence
+│   └── claims.tsv
+├── cgi
+│   └── claims.tsv
+├── chembl
+│   └── chembl.db
+├── clearity_foundation_biomarkers
+│   └── claims.tsv
+├── clearity_foundation_clinical_trial
+│   └── claims.tsv
+├── cosmic
+│   └── claims.csv
+├── dgene
+│   └── claims.tsv
+├── drugbank
+│   └── claims.xml
+├── dtc
+│   └── claims.csv
+├── ensembl
+│   └── claims.tsv
+├── entrez
+│   └── claims.tsv
+├── fda
+│   └── claims.tsv
+├── foundation_one_genes
+│   └── claims.tsv
+├── go
+│   └── targets.tsv
+├── guide_to_pharmacology
+│   ├── interactions.csv
+│   └── targets_and_families.csv
+├── hingorani_casas
+│   └── claims.tsv
+├── hopkins_groom
+│   └── claims.tsv
+├── human_protein_atlas
+│   └── claims.tsv
+├── idg
+│   ├── claims.json
+│   └── claims.tsv
+├── msk_impact
+│   └── claims.tsv
+├── my_cancer_genome
+│   └── claims.tsv
+├── my_cancer_genome_clinical_trial
+│   └── claims.tsv
+├── nci
+│   ├── claims.tsv
+│   └── claims.xml
+├── oncokb
+│   ├── drug_claim.csv
+│   ├── gene_claim.csv
+│   ├── gene_claim_aliases.csv
+│   ├── interaction_claim.csv
+│   ├── interaction_claim_attributes.csv
+│   └── interaction_claim_links.csv
+├── oncomine
+│   └── claims.tsv
+├── pharmgkb
+│   └── claims.tsv
+├── russ_lampel
+│   └── claims.tsv
+├── talc
+│   └── claims.tsv
+├── tdg_clinical_trial
+│   └── claims.tsv
+├── tempus
+│   └── claims.tsv
+├── tend
+│   └── claims.tsv
+└── ttd
+    └── claims.csv
+```
+
+First, load claims:
+
+```shell
+rake dgidb:import:all
+```
+
+Then, run grouping. By default, the groupers will expect a normalizer service to be running locally on port 8000; use the `THERAPY_HOSTNAME` and `GENE_HOSTNAME` environment variables to specify alternate hosts:
+
+```shell
+export THERAPY_HOSTNAME=http://localhost:7999  # trailing slash is optional
+rake dgidb:group:drugs
+export GENE_HOSTNAME=http://localhost:7998  # trailing slash is optional
+rake dgidb:group:genes
+rake dgidb:group:interactions
+```
+
+Finally, normalize remaining metadata:
+
+```shell
+rake dgidb:normalize:drug_approval_types
+rake dgidb:normalize:drug_types
+rake dgidb:normalize:populate_source_counters
+```
+
 ### Client setup
 
 Navigate to the [/client directory](/client):

diff --git a/server/lib/genome/groupers/base.rb b/server/lib/genome/groupers/base.rb
@@ -25,9 +25,9 @@ def fetch_json_response(url)
       end
 
       def fetch_source_meta
-        url = URI("#{@normalizer_url_root}search?q=")
+        url = URI("#{@normalizer_host}search?q=")
         body = fetch_json_response(url)
-        body['source_matches'].reduce({}) { |map, source| map.update(source['source'] => source['source_meta_']) }
+        body['source_matches'].transform_values { |value| value['source_meta_'] }
       end
 
       # Normalize claim terms
@@ -60,7 +60,7 @@ def normalize_claim(primary_term, claim_aliases)
             response = retrieve_normalizer_response(claim_alias.alias)
             match_type = response['match_type']
             if !response.nil? && match_type > 0
-              concept_id = response[@descriptor_name][@id_name]
+              concept_id = response['normalized_id']
               if !claim_responses.key?(concept_id)
                 claim_responses[concept_id] = response
               end
@@ -93,6 +93,10 @@ def normalize_claim(primary_term, claim_aliases)
         response
       end
 
+      def get_concept_id(response)
+        response['normalized_id'] unless response['match_type'].zero?
+      end
+
       def retrieve_extension(descriptor, type, default = nil)
         unless descriptor.fetch('extensions').blank?
           descriptor['extensions'].each do |extension|
@@ -103,7 +107,7 @@ def retrieve_extension(descriptor, type, default = nil)
       end
 
       def retrieve_normalizer_response(term)
-        body = fetch_json_response("#{@normalizer_url_root}normalize?q=#{CGI.escape(term)}")
+        body = fetch_json_response("#{@normalizer_host}normalize?q=#{CGI.escape(term)}")
         @term_to_match_dict[term.upcase] = get_concept_id(body) unless term == '' || body.nil?
 
         body
@@ -114,7 +118,7 @@ def key_non_nil_match(term)
       end
 
       def retrieve_normalizer_data(term)
-        body = fetch_json_response("#{@normalizer_url_root}normalize_unmerged?q=#{CGI.escape(term)}")
+        body = fetch_json_response("#{@normalizer_host}normalize_unmerged?q=#{CGI.escape(term)}")
         body['source_matches']
       end
     end

diff --git a/server/lib/genome/groupers/drug_grouper.rb b/server/lib/genome/groupers/drug_grouper.rb
@@ -4,8 +4,11 @@ class DrugGrouper < Genome::Groupers::Base
       attr_reader :term_to_match_dict
 
       def initialize
-        url_base = ENV['THERAPY_URL_BASE'] || 'http://localhost:8000'
-        @normalizer_url_root = "#{url_base}/therapy/"
+        url_base = ENV['THERAPY_HOSTNAME'] || 'http://localhost:8000'
+        if !url_base.ends_with? "/"
+          url_base += "/"
+        end
+        @normalizer_host = "#{url_base}therapy/"
 
         @term_to_match_dict = {}
 
@@ -28,7 +31,6 @@ def run(source_id = nil)
           puts "Grouping #{claims.length} ungrouped drug claims from #{source_name}"
         end
 
-        set_response_structure
         create_sources
 
 
@@ -40,28 +42,15 @@ def run(source_id = nil)
           if normalized_drug.is_a? String
             normalized_id = normalized_drug
           else
-            normalized_id = normalized_drug[@descriptor_name][@id_name]
-            create_new_drug(normalized_drug[@descriptor_name]) if Drug.find_by(concept_id: normalized_id).nil?
+            normalized_id = normalized_drug['normalized_id']
+            create_new_drug(normalized_drug['therapeutic_agent'], normalized_id) if Drug.find_by(concept_id: normalized_id).nil?
           end
           add_claim_to_drug(drug_claim, normalized_id)
 
           pbar.progress += 1
         end
       end
 
-      def set_response_structure
-        url = URI("#{@normalizer_url_root}search?q=")
-        body = fetch_json_response(url)
-        version = body['service_meta_']['version']
-        if version < '0.4.0'
-          @descriptor_name = 'therapy_descriptor'
-          @id_name = 'therapy_id'
-        else
-          @descriptor_name = 'therapeutic_descriptor'
-          @id_name = 'therapeutic'
-        end
-      end
-
       def create_sources
         drug_source_type = SourceType.find_by(type: 'drug')
 
@@ -165,10 +154,6 @@ def create_sources
         }
       end
 
-      def get_concept_id(response)
-        response[@descriptor_name][@id_name] unless response['match_type'].zero?
-      end
-
       def produce_concept_id_nomenclature(concept_id)
         case concept_id
         when /rxcui:/
@@ -289,8 +274,8 @@ def add_grouper_claim_aliases(claim, record)
         end
       end
 
-      def add_grouper_data(drug, descriptor)
-        drug_data = retrieve_normalizer_data(descriptor[@id_name])
+      def add_grouper_data(drug, drug_response, concept_id)
+        drug_data = retrieve_normalizer_data(concept_id)
 
         drug_data.each do |source_name, source_data|
           next if %w[DrugBank ChEMBL GuideToPHARMACOLOGY].include?(source_name)
@@ -306,15 +291,15 @@ def add_grouper_data(drug, descriptor)
         end
       end
 
-      def create_new_drug(descriptor)
-        name = if descriptor.fetch('label').blank?
-                 descriptor[@id_name]
+      def create_new_drug(drug_response, concept_id)
+        name = if drug_response['label'].nil? || drug_response['label'].blank?
+                 concept_id
                else
-                 descriptor['label']
+                 drug_response['label']
                end
-        drug = Drug.where(concept_id: descriptor[@id_name], name: name.upcase).first_or_create
+        drug = Drug.where(concept_id: concept_id, name: name.upcase).first_or_create
 
-        add_grouper_data(drug, descriptor)
+        add_grouper_data(drug, drug_response, concept_id)
       end
 
       def find_drug_attribute(drug_claim_attribute)

diff --git a/server/lib/genome/groupers/gene_grouper.rb b/server/lib/genome/groupers/gene_grouper.rb
@@ -4,8 +4,11 @@ class GeneGrouper < Genome::Groupers::Base
       attr_reader :term_to_match_dict
 
       def initialize
-        url_base = ENV['GENE_URL_BASE'] || 'http://localhost:8000'
-        @normalizer_url_root = "#{url_base}/gene/"
+        url_base = ENV['GENE_HOSTNAME'] || 'http://localhost:8000'
+        if !url_base.ends_with? "/"
+          url_base += "/"
+        end
+        @normalizer_host = "#{url_base}gene/"
 
         @term_to_match_dict = {}
         @sources = {}
@@ -33,7 +36,6 @@ def run(source_id = nil)
           puts "Grouping #{claims.length} ungrouped gene claims from #{source_name}"
         end
 
-        set_response_structure
         create_sources
 
         pbar = ProgressBar.create(title: 'Grouping genes', total: claims.size, format: "%t: %p%% %a |%B|")
@@ -44,28 +46,15 @@ def run(source_id = nil)
           if normalized_gene.is_a? String
             normalized_id = normalized_gene
           else
-            normalized_id = normalized_gene[@descriptor_name][@id_name]
-            create_new_gene normalized_gene[@descriptor_name] if Gene.find_by(concept_id: normalized_id).nil?
+            normalized_id = normalized_gene['normalized_id']
+            create_new_gene(normalized_gene['gene'], normalized_id) if Gene.find_by(concept_id: normalized_id).nil?
           end
           add_claim_to_gene(gene_claim, normalized_id)
 
           pbar.progress += 1
         end
       end
 
-      def set_response_structure
-        @descriptor_name = 'gene_descriptor'
-
-        url = URI("#{@normalizer_url_root}search?q=")
-        body = fetch_json_response(url)
-        version = body['service_meta_']['version']
-        if version < '0.2.0'
-          @id_name = 'gene_id'
-        else
-          @id_name = 'gene'
-        end
-      end
-
       def create_sources
         gene_source_type = SourceType.find_by(type: 'gene')
 
@@ -90,8 +79,8 @@ def create_sources
           source_db_version: source_meta['HGNC']['version'],
           base_url: 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/',
           site_url: 'https://www.genenames.org',
-          citation: 'Tweedie S, Braschi B, Gray K, Jones TEM, Seal RL, Yates B, Bruford EA. Genenames.org: the HGNC and VGNC resources in 2021. Nucleic Acids Res. 2021 Jan 8;49(D1):D939-D946. doi: 10.1093/nar/gkaa980. PMID: 33152070; PMCID: PMC7779007.',
-          citation_short: 'Tweedie S, et al. Genenames.org: the HGNC and VGNC resources in 2021. Nucleic Acids Res. 2021 Jan 8;49(D1):D939-D946.',
+          citation: 'Seal RL, Braschi B, Gray K, Jones TEM, Tweedie S, Haim-Vilmovsky L, Bruford EA. Genenames.org: the HGNC resources in 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D1003-D1009. doi: 10.1093/nar/gkac888. PMID: 36243972; PMCID: PMC9825485.',
+          citation_short: 'Seal RL, et al. Genenames.org: the HGNC resources in 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D1003-D1009.',
           pmid: '33152070',
           pmcid: 'PMC7779007',
           doi: '10.1093/nar/gkaa980',
@@ -105,8 +94,8 @@ def create_sources
           source_db_version: source_meta['Ensembl']['version'],
           base_url: 'https://ensembl.org/Homo_sapiens/Gene/Summary?g=',
           site_url: 'https://ensembl.org',
-          citation: 'Cunningham F, Allen JE, Allen J, Alvarez-Jarreta J, Amode MR, Armean IM, Austine-Orimoloye O, Azov AG, Barnes I, Bennett R, Berry A, Bhai J, Bignell A, Billis K, Boddu S, Brooks L, Charkhchi M, Cummins C, Da Rin Fioretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Martinez JG, Guijarro-Clarke C, Gymer A, Hardy M, Hollis Z, Hourlier T, Hunt T, Juettemann T, Kaikala V, Kay M, Lavidas I, Le T, Lemos D, Marugán JC, Mohanan S, Mushtaq A, Naven M, Ogeh DN, Parker A, Parton A, Perry M, Piližota I, Prosovetskaia I, Sakthivel MP, Salam AIA, Schmitt BM, Schuilenburg H, Sheppard D, Pérez-Silva JG, Stark W, Steed E, Sutinen K, Sukumaran R, Sumathipala D, Suner MM, Szpak M, Thormann A, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Willhoft N, Winterbottom A, Wass E, Chakiachvili M, Flint B, Frankish A, Giorgetti S, Haggerty L, Hunt SE, IIsley GR, Loveland JE, Martin FJ, Moore B, Mudge JM, Muffato M, Perry E, Ruffier M, Tate J, Thybert D, Trevanion SJ, Dyer S, Harrison PW, Howe KL, Yates AD, Zerbino DR, Flicek P. Ensembl 2022. Nucleic Acids Res. 2022 Jan 7;50(D1):D988-D995. doi: 10.1093/nar/gkab1049. PMID: 34791404; PMCID: PMC8728283.',
-          citation_short: 'Cunningham F, et al. Ensembl 2022. Nucleic Acids Res. 2022 Jan 7;50(D1):D988-D995.',
+          citation: 'Harrison PW, Amode MR, Austine-Orimoloye O, Azov AG, Barba M, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Campbell LI, Martinez MC, Charkhchi M, Chougule K, Cockburn A, Davidson C, De Silva NH, Dodiya K, Donaldson S, El Houdaigui B, Naboulsi TE, Fatima R, Giron CG, Genez T, Grigoriadis D, Ghattaoraya GS, Martinez JG, Gurbich TA, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Lodha D, Marques-Coelho D, Maslen G, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Poppleton D, Prosovetskaia I, Raj S, Pérez-Silva JG, Salam AIA, Saraf S, Saraiva-Agostinho N, Sheppard D, Sinha S, Sipos B, Sitnik V, Stark W, Steed E, Suner MM, Surapaneni L, Sutinen K, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Ware D, Wass E, Willhoft NL, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Keatley J, Loveland JE, Moore B, Mudge JM, Naamati G, Tate J, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Cunningham F, Dyer S, Finn RD, Martin FJ, Yates AD. Ensembl 2024. Nucleic Acids Res. 2024 Jan 5;52(D1):D891-D899. doi: 10.1093/nar/gkad1049. PMID: 37953337; PMCID: PMC10767893.',
+          citation_short: 'Harrison PW, et al. Ensembl 2024. Nucleic Acids Res. 2024 Jan 5;52(D1):D891-D899.',
           pmid: '34791404',
           pmcid: 'PMC8728283',
           doi: '10.1093/nar/gkab1049',
@@ -130,10 +119,6 @@ def create_sources
         }
       end
 
-      def get_concept_id(response)
-        response[@descriptor_name][@id_name] unless response['match_type'].zero?
-      end
-
       def create_gene_claim(record, source)
         GeneClaim.where(
           name: record['symbol'],
@@ -208,8 +193,8 @@ def add_grouper_claim_attribute(claim, record)
         )
       end
 
-      def add_grouper_data(gene, descriptor)
-        gene_data = retrieve_normalizer_data(descriptor[@id_name])
+      def add_grouper_data(gene, descriptor, normalized_id)
+        gene_data = retrieve_normalizer_data(normalized_id)
         gene_data.each do |source_name, source_data|
           source = @sources[source_name.to_sym]
 
@@ -223,19 +208,19 @@ def add_grouper_data(gene, descriptor)
         end
       end
 
-      def create_new_gene(descriptor)
-        name = if descriptor.fetch('label').blank?
-                 descriptor[@id_name]
+      def create_new_gene(gene_response, normalized_id)
+        name = if gene_response.fetch('label').blank?
+                 normalized_id
                else
-                 descriptor['label']
+                 gene_response['label']
                end
         gene = Gene.where(
-          concept_id: descriptor[@id_name],
+          concept_id: normalized_id,
           name: name,
-          long_name: retrieve_extension(descriptor, 'approved_name')
+          long_name: retrieve_extension(gene_response, 'approved_name')
         ).first_or_create
 
-        add_grouper_data(gene, descriptor)
+        add_grouper_data(gene, gene_response, normalized_id)
       end
 
       def add_claim_attributes(claim, gene)