From 79f22e9284dfe16ffb0a7ed269b5149947387a34 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 19 Apr 2024 10:55:26 -0400 Subject: [PATCH] fix: update grouper API (#433) --- README.md | 114 ++++++++++++++++++ server/lib/genome/groupers/base.rb | 14 ++- server/lib/genome/groupers/drug_grouper.rb | 45 +++---- server/lib/genome/groupers/gene_grouper.rb | 55 +++------ .../importers/file_importers/drugbank.rb | 12 +- .../importers/file_importers/pharmgkb.rb | 2 +- 6 files changed, 165 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index 4097589a..ffb4383d 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,120 @@ rails s Navigate to `localhost:3000/api/graphiql` in your browser. If the example query provided runs successfully, then you're all set. +### Data loading + +To perform a data load from scratch, first run the `reset` task to provide a clean, seeded DB: + +```shell +rake db:reset +``` + +Most DGIdb data comes from static files, typically called `claims.tsv`. 
The data loader classes expect `server/lib/data/` to contain the following files: + +``` +lib/data +├── bader_lab +│ └── claims.tsv +├── cancer_commons +│ └── claims.tsv +├── caris_molecular_intelligence +│ └── claims.tsv +├── cgi +│ └── claims.tsv +├── chembl +│ └── chembl.db +├── clearity_foundation_biomarkers +│ └── claims.tsv +├── clearity_foundation_clinical_trial +│ └── claims.tsv +├── cosmic +│ └── claims.csv +├── dgene +│ └── claims.tsv +├── drugbank +│ └── claims.xml +├── dtc +│ └── claims.csv +├── ensembl +│ └── claims.tsv +├── entrez +│ └── claims.tsv +├── fda +│ └── claims.tsv +├── foundation_one_genes +│ └── claims.tsv +├── go +│ └── targets.tsv +├── guide_to_pharmacology +│ ├── interactions.csv +│ └── targets_and_families.csv +├── hingorani_casas +│ └── claims.tsv +├── hopkins_groom +│ └── claims.tsv +├── human_protein_atlas +│ └── claims.tsv +├── idg +│ ├── claims.json +│ └── claims.tsv +├── msk_impact +│ └── claims.tsv +├── my_cancer_genome +│ └── claims.tsv +├── my_cancer_genome_clinical_trial +│ └── claims.tsv +├── nci +│ ├── claims.tsv +│ └── claims.xml +├── oncokb +│ ├── drug_claim.csv +│ ├── gene_claim.csv +│ ├── gene_claim_aliases.csv +│ ├── interaction_claim.csv +│ ├── interaction_claim_attributes.csv +│ └── interaction_claim_links.csv +├── oncomine +│ └── claims.tsv +├── pharmgkb +│ └── claims.tsv +├── russ_lampel +│ └── claims.tsv +├── talc +│ └── claims.tsv +├── tdg_clinical_trial +│ └── claims.tsv +├── tempus +│ └── claims.tsv +├── tend +│ └── claims.tsv +└── ttd + └── claims.csv +``` + +First, load claims: + +```shell +rake dgidb:import:all +``` + +Then, run grouping. 
By default, the groupers will expect a normalizer service to be running locally on port 8000; use the `THERAPY_HOSTNAME` and `GENE_HOSTNAME` environment variables to specify alternate hosts: + +```shell +export THERAPY_HOSTNAME=http://localhost:7999 # trailing slash is optional +rake dgidb:group:drugs +export GENE_HOSTNAME=http://localhost:7998 # trailing slash is optional +rake dgidb:group:genes +rake dgidb:group:interactions +``` + +Finally, normalize remaining metadata: + +```shell +rake dgidb:normalize:drug_approval_types +rake dgidb:normalize:drug_types +rake dgidb:normalize:populate_source_counters +``` + ### Client setup Navigate to the [/client directory](/client): diff --git a/server/lib/genome/groupers/base.rb b/server/lib/genome/groupers/base.rb index 58c73ad9..31d15395 100644 --- a/server/lib/genome/groupers/base.rb +++ b/server/lib/genome/groupers/base.rb @@ -25,9 +25,9 @@ def fetch_json_response(url) end def fetch_source_meta - url = URI("#{@normalizer_url_root}search?q=") + url = URI("#{@normalizer_host}search?q=") body = fetch_json_response(url) - body['source_matches'].reduce({}) { |map, source| map.update(source['source'] => source['source_meta_']) } + body['source_matches'].transform_values { |value| value['source_meta_'] } end # Normalize claim terms @@ -60,7 +60,7 @@ def normalize_claim(primary_term, claim_aliases) response = retrieve_normalizer_response(claim_alias.alias) match_type = response['match_type'] if !response.nil? && match_type > 0 - concept_id = response[@descriptor_name][@id_name] + concept_id = response['normalized_id'] if !claim_responses.key?(concept_id) claim_responses[concept_id] = response end @@ -93,6 +93,10 @@ def normalize_claim(primary_term, claim_aliases) response end + def get_concept_id(response) + response['normalized_id'] unless response['match_type'].zero? + end + def retrieve_extension(descriptor, type, default = nil) unless descriptor.fetch('extensions').blank? 
descriptor['extensions'].each do |extension| @@ -103,7 +107,7 @@ def retrieve_extension(descriptor, type, default = nil) end def retrieve_normalizer_response(term) - body = fetch_json_response("#{@normalizer_url_root}normalize?q=#{CGI.escape(term)}") + body = fetch_json_response("#{@normalizer_host}normalize?q=#{CGI.escape(term)}") @term_to_match_dict[term.upcase] = get_concept_id(body) unless term == '' || body.nil? body @@ -114,7 +118,7 @@ def key_non_nil_match(term) end def retrieve_normalizer_data(term) - body = fetch_json_response("#{@normalizer_url_root}normalize_unmerged?q=#{CGI.escape(term)}") + body = fetch_json_response("#{@normalizer_host}normalize_unmerged?q=#{CGI.escape(term)}") body['source_matches'] end end diff --git a/server/lib/genome/groupers/drug_grouper.rb b/server/lib/genome/groupers/drug_grouper.rb index c9c20260..3acd2b66 100644 --- a/server/lib/genome/groupers/drug_grouper.rb +++ b/server/lib/genome/groupers/drug_grouper.rb @@ -4,8 +4,11 @@ class DrugGrouper < Genome::Groupers::Base attr_reader :term_to_match_dict def initialize - url_base = ENV['THERAPY_URL_BASE'] || 'http://localhost:8000' - @normalizer_url_root = "#{url_base}/therapy/" + url_base = ENV['THERAPY_HOSTNAME'] || 'http://localhost:8000' + if !url_base.ends_with? "/" + url_base += "/" + end + @normalizer_host = "#{url_base}therapy/" @term_to_match_dict = {} @@ -28,7 +31,6 @@ def run(source_id = nil) puts "Grouping #{claims.length} ungrouped drug claims from #{source_name}" end - set_response_structure create_sources @@ -40,8 +42,8 @@ def run(source_id = nil) if normalized_drug.is_a? String normalized_id = normalized_drug else - normalized_id = normalized_drug[@descriptor_name][@id_name] - create_new_drug(normalized_drug[@descriptor_name]) if Drug.find_by(concept_id: normalized_id).nil? + normalized_id = normalized_drug['normalized_id'] + create_new_drug(normalized_drug['therapeutic_agent'], normalized_id) if Drug.find_by(concept_id: normalized_id).nil? 
end add_claim_to_drug(drug_claim, normalized_id) @@ -49,19 +51,6 @@ def run(source_id = nil) end end - def set_response_structure - url = URI("#{@normalizer_url_root}search?q=") - body = fetch_json_response(url) - version = body['service_meta_']['version'] - if version < '0.4.0' - @descriptor_name = 'therapy_descriptor' - @id_name = 'therapy_id' - else - @descriptor_name = 'therapeutic_descriptor' - @id_name = 'therapeutic' - end - end - def create_sources drug_source_type = SourceType.find_by(type: 'drug') @@ -165,10 +154,6 @@ def create_sources } end - def get_concept_id(response) - response[@descriptor_name][@id_name] unless response['match_type'].zero? - end - def produce_concept_id_nomenclature(concept_id) case concept_id when /rxcui:/ @@ -289,8 +274,8 @@ def add_grouper_claim_aliases(claim, record) end end - def add_grouper_data(drug, descriptor) - drug_data = retrieve_normalizer_data(descriptor[@id_name]) + def add_grouper_data(drug, drug_response, concept_id) + drug_data = retrieve_normalizer_data(concept_id) drug_data.each do |source_name, source_data| next if %w[DrugBank ChEMBL GuideToPHARMACOLOGY].include?(source_name) @@ -306,15 +291,15 @@ def add_grouper_data(drug, descriptor) end end - def create_new_drug(descriptor) - name = if descriptor.fetch('label').blank? - descriptor[@id_name] + def create_new_drug(drug_response, concept_id) + name = if drug_response['label'].nil? || drug_response['label'].blank? 
+ concept_id else - descriptor['label'] + drug_response['label'] end - drug = Drug.where(concept_id: descriptor[@id_name], name: name.upcase).first_or_create + drug = Drug.where(concept_id: concept_id, name: name.upcase).first_or_create - add_grouper_data(drug, descriptor) + add_grouper_data(drug, drug_response, concept_id) end def find_drug_attribute(drug_claim_attribute) diff --git a/server/lib/genome/groupers/gene_grouper.rb b/server/lib/genome/groupers/gene_grouper.rb index ae33ae87..ea4b35ea 100644 --- a/server/lib/genome/groupers/gene_grouper.rb +++ b/server/lib/genome/groupers/gene_grouper.rb @@ -4,8 +4,11 @@ class GeneGrouper < Genome::Groupers::Base attr_reader :term_to_match_dict def initialize - url_base = ENV['GENE_URL_BASE'] || 'http://localhost:8000' - @normalizer_url_root = "#{url_base}/gene/" + url_base = ENV['GENE_HOSTNAME'] || 'http://localhost:8000' + if !url_base.ends_with? "/" + url_base += "/" + end + @normalizer_host = "#{url_base}gene/" @term_to_match_dict = {} @sources = {} @@ -33,7 +36,6 @@ def run(source_id = nil) puts "Grouping #{claims.length} ungrouped gene claims from #{source_name}" end - set_response_structure create_sources pbar = ProgressBar.create(title: 'Grouping genes', total: claims.size, format: "%t: %p%% %a |%B|") @@ -44,8 +46,8 @@ def run(source_id = nil) if normalized_gene.is_a? String normalized_id = normalized_gene else - normalized_id = normalized_gene[@descriptor_name][@id_name] - create_new_gene normalized_gene[@descriptor_name] if Gene.find_by(concept_id: normalized_id).nil? + normalized_id = normalized_gene['normalized_id'] + create_new_gene(normalized_gene['gene'], normalized_id) if Gene.find_by(concept_id: normalized_id).nil? 
end add_claim_to_gene(gene_claim, normalized_id) @@ -53,19 +55,6 @@ def run(source_id = nil) end end - def set_response_structure - @descriptor_name = 'gene_descriptor' - - url = URI("#{@normalizer_url_root}search?q=") - body = fetch_json_response(url) - version = body['service_meta_']['version'] - if version < '0.2.0' - @id_name = 'gene_id' - else - @id_name = 'gene' - end - end - def create_sources gene_source_type = SourceType.find_by(type: 'gene') @@ -90,8 +79,8 @@ def create_sources source_db_version: source_meta['HGNC']['version'], base_url: 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', site_url: 'https://www.genenames.org', - citation: 'Tweedie S, Braschi B, Gray K, Jones TEM, Seal RL, Yates B, Bruford EA. Genenames.org: the HGNC and VGNC resources in 2021. Nucleic Acids Res. 2021 Jan 8;49(D1):D939-D946. doi: 10.1093/nar/gkaa980. PMID: 33152070; PMCID: PMC7779007.', - citation_short: 'Tweedie S, et al. Genenames.org: the HGNC and VGNC resources in 2021. Nucleic Acids Res. 2021 Jan 8;49(D1):D939-D946.', + citation: 'Seal RL, Braschi B, Gray K, Jones TEM, Tweedie S, Haim-Vilmovsky L, Bruford EA. Genenames.org: the HGNC resources in 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D1003-D1009. doi: 10.1093/nar/gkac888. PMID: 36243972; PMCID: PMC9825485.', + citation_short: 'Seal RL, et al. Genenames.org: the HGNC resources in 2023. Nucleic Acids Res. 
2023 Jan 6;51(D1):D1003-D1009.', pmid: '33152070', pmcid: 'PMC7779007', doi: '10.1093/nar/gkaa980', @@ -105,8 +94,8 @@ def create_sources source_db_version: source_meta['Ensembl']['version'], base_url: 'https://ensembl.org/Homo_sapiens/Gene/Summary?g=', site_url: 'https://ensembl.org', - citation: 'Cunningham F, Allen JE, Allen J, Alvarez-Jarreta J, Amode MR, Armean IM, Austine-Orimoloye O, Azov AG, Barnes I, Bennett R, Berry A, Bhai J, Bignell A, Billis K, Boddu S, Brooks L, Charkhchi M, Cummins C, Da Rin Fioretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Martinez JG, Guijarro-Clarke C, Gymer A, Hardy M, Hollis Z, Hourlier T, Hunt T, Juettemann T, Kaikala V, Kay M, Lavidas I, Le T, Lemos D, Marugán JC, Mohanan S, Mushtaq A, Naven M, Ogeh DN, Parker A, Parton A, Perry M, Piližota I, Prosovetskaia I, Sakthivel MP, Salam AIA, Schmitt BM, Schuilenburg H, Sheppard D, Pérez-Silva JG, Stark W, Steed E, Sutinen K, Sukumaran R, Sumathipala D, Suner MM, Szpak M, Thormann A, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Willhoft N, Winterbottom A, Wass E, Chakiachvili M, Flint B, Frankish A, Giorgetti S, Haggerty L, Hunt SE, IIsley GR, Loveland JE, Martin FJ, Moore B, Mudge JM, Muffato M, Perry E, Ruffier M, Tate J, Thybert D, Trevanion SJ, Dyer S, Harrison PW, Howe KL, Yates AD, Zerbino DR, Flicek P. Ensembl 2022. Nucleic Acids Res. 2022 Jan 7;50(D1):D988-D995. doi: 10.1093/nar/gkab1049. PMID: 34791404; PMCID: PMC8728283.', - citation_short: 'Cunningham F, et al. Ensembl 2022. Nucleic Acids Res. 
2022 Jan 7;50(D1):D988-D995.', + citation: 'Harrison PW, Amode MR, Austine-Orimoloye O, Azov AG, Barba M, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Campbell LI, Martinez MC, Charkhchi M, Chougule K, Cockburn A, Davidson C, De Silva NH, Dodiya K, Donaldson S, El Houdaigui B, Naboulsi TE, Fatima R, Giron CG, Genez T, Grigoriadis D, Ghattaoraya GS, Martinez JG, Gurbich TA, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Lodha D, Marques-Coelho D, Maslen G, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Poppleton D, Prosovetskaia I, Raj S, Pérez-Silva JG, Salam AIA, Saraf S, Saraiva-Agostinho N, Sheppard D, Sinha S, Sipos B, Sitnik V, Stark W, Steed E, Suner MM, Surapaneni L, Sutinen K, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Ware D, Wass E, Willhoft NL, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Keatley J, Loveland JE, Moore B, Mudge JM, Naamati G, Tate J, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Cunningham F, Dyer S, Finn RD, Martin FJ, Yates AD. Ensembl 2024. Nucleic Acids Res. 2024 Jan 5;52(D1):D891-D899. doi: 10.1093/nar/gkad1049. PMID: 37953337; PMCID: PMC10767893.', + citation_short: 'Harrison PW, et al. Ensembl 2024. Nucleic Acids Res. 2024 Jan 5;52(D1):D891-D899.', pmid: '34791404', pmcid: 'PMC8728283', doi: '10.1093/nar/gkab1049', @@ -130,10 +119,6 @@ def create_sources } end - def get_concept_id(response) - response[@descriptor_name][@id_name] unless response['match_type'].zero? 
- end - def create_gene_claim(record, source) GeneClaim.where( name: record['symbol'], @@ -208,8 +193,8 @@ def add_grouper_claim_attribute(claim, record) ) end - def add_grouper_data(gene, descriptor) - gene_data = retrieve_normalizer_data(descriptor[@id_name]) + def add_grouper_data(gene, descriptor, normalized_id) + gene_data = retrieve_normalizer_data(normalized_id) gene_data.each do |source_name, source_data| source = @sources[source_name.to_sym] @@ -223,19 +208,19 @@ def add_grouper_data(gene, descriptor) end end - def create_new_gene(descriptor) - name = if descriptor.fetch('label').blank? - descriptor[@id_name] + def create_new_gene(gene_response, normalized_id) + name = if gene_response.fetch('label').blank? + normalized_id else - descriptor['label'] + gene_response['label'] end gene = Gene.where( - concept_id: descriptor[@id_name], + concept_id: normalized_id, name: name, - long_name: retrieve_extension(descriptor, 'approved_name') + long_name: retrieve_extension(gene_response, 'approved_name') ).first_or_create - add_grouper_data(gene, descriptor) + add_grouper_data(gene, gene_response, normalized_id) end def add_claim_attributes(claim, gene) diff --git a/server/lib/genome/importers/file_importers/drugbank.rb b/server/lib/genome/importers/file_importers/drugbank.rb index 7547848a..a200c0c3 100644 --- a/server/lib/genome/importers/file_importers/drugbank.rb +++ b/server/lib/genome/importers/file_importers/drugbank.rb @@ -26,12 +26,12 @@ def create_new_source { base_url: 'https://go.drugbank.com/drugs', site_url: 'https://go.drugbank.com/', - citation: 'Wishart DS, Feunang YD, Guo AC, Lo EJ, Marcu A, Grant JR, Sajed T, Johnson D, Li C, Sayeeda Z, Assempour N, Iynkkaran I, Liu Y, Maciejewski A, Gale N, Wilson A, Chin L, Cummings R, Le D, Pon A, Knox C, Wilson M. DrugBank 5.0: a major update to the DrugBank database for 2018. Nucleic Acids Res. 2018 Jan 4;46(D1):D1074-D1082. doi: 10.1093/nar/gkx1037. 
PMID: 29126136; PMCID: PMC5753335.', - citation_short: 'Wishart DS, et al. DrugBank 5.0: a major update to the DrugBank database for 2018. Nucleic Acids Res. 2018 Jan 4;46(D1):D1074-D1082.', - pmid: '29126136', - pmcid: 'PMC5753335', - doi: '10.1093/nar/gkx1037', - source_db_version: '5.1.10', + citation: 'Knox C, Wilson M, Klinger CM, Franklin M, Oler E, Wilson A, Pon A, Cox J, Chin NEL, Strawbridge SA, Garcia-Patino M, Kruger R, Sivakumaran A, Sanford S, Doshi R, Khetarpal N, Fatokun O, Doucet D, Zubkowski A, Rayat DY, Jackson H, Harford K, Anjum A, Zakir M, Wang F, Tian S, Lee B, Liigand J, Peters H, Wang RQR, Nguyen T, So D, Sharp M, da Silva R, Gabriel C, Scantlebury J, Jasinski M, Ackerman D, Jewison T, Sajed T, Gautam V, Wishart DS. DrugBank 6.0: the DrugBank Knowledgebase for 2024. Nucleic Acids Res. 2024 Jan 5;52(D1):D1265-D1275. doi: 10.1093/nar/gkad976. PMID: 37953279; PMCID: PMC10767804.', + citation_short: 'Knox C, et al. DrugBank 6.0: the DrugBank Knowledgebase for 2024. Nucleic Acids Res. 2024 Jan 5;52(D1):D1265-D1275.', + pmid: '37953279', + pmcid: 'PMC10767804', + doi: '10.1093/nar/gkad976', + source_db_version: '5.1.12', source_db_name: 'DrugBank', full_name: 'DrugBank - Open Data Drug & Drug Target Database', license: License::CUSTOM_NON_COMMERCIAL, diff --git a/server/lib/genome/importers/file_importers/pharmgkb.rb b/server/lib/genome/importers/file_importers/pharmgkb.rb index 338855ad..88dbfdec 100644 --- a/server/lib/genome/importers/file_importers/pharmgkb.rb +++ b/server/lib/genome/importers/file_importers/pharmgkb.rb @@ -23,7 +23,7 @@ def create_new_source pmid: '34216021', pmcid: 'PMC8457105', doi: '10.1002/cpt.2350', - source_db_version: '2020-08-18', # using static file, see issue #420 + source_db_version: '2024-04-05', # using static file, see issue #420 source_db_name: source_db_name, full_name: 'PharmGKB - The Pharmacogenomics Knowledgebase', license: License::CC_BY_SA_4_0,