diff --git a/examples/compare_ontologies.rs b/examples/compare_ontologies.rs index 6e79b87..3b39ca6 100644 --- a/examples/compare_ontologies.rs +++ b/examples/compare_ontologies.rs @@ -16,6 +16,15 @@ fn ontology(path_arg: &str) -> Ontology { } } +fn ontology2(path_arg: &str) -> Ontology { + let path = Path::new(path_arg); + + match path.is_file() { + true => Ontology::from_binary(path).unwrap(), + false => Ontology::from_standard_transitive(&path.to_string_lossy()).unwrap(), + } +} + /// Prints some basic stats about the differences /// between two Ontologies fn overview(diffs: &Comparison) { @@ -89,15 +98,19 @@ fn changed_terms(diffs: &Comparison) { /// Prints info about Gene-specific changes fn changed_genes(diffs: &Comparison) { - println!("#Gene Delta\tID\tOld Name:New Name\tAdded Parents\tRemoved Parents"); - for term in diffs.changed_genes() { - print!("Delta\t{}", term.id()); - if let Some(names) = term.changed_name() { + println!( + "#Gene Delta\tID\tOld Name:New Name\tn Terms Old\tn Terms New\tAdded Terms\tRemoved Terms" + ); + for gene in diffs.changed_genes() { + print!("Delta\t{}", gene.id()); + if let Some(names) = gene.changed_name() { print!("\t{}:{}", names.0, names.1); } else { print!("\t."); } - if let Some(added) = term.added_terms() { + let (n_l, n_r) = gene.n_terms(); + print!("\t{n_l}\t{n_r}"); + if let Some(added) = gene.added_terms() { print!( "\t{}", added @@ -109,7 +122,7 @@ fn changed_genes(diffs: &Comparison) { } else { print!("\t."); } - if let Some(removed) = term.removed_terms() { + if let Some(removed) = gene.removed_terms() { print!( "\t{}", removed @@ -127,15 +140,19 @@ fn changed_genes(diffs: &Comparison) { /// Prints info about Gene-specific changes fn changed_diseases(diffs: &Comparison) { - println!("#Disease Delta\tID\tOld Name:New Name\tAdded Parents\tRemoved Parents"); - for term in diffs.changed_omim_diseases() { - print!("Delta\t{}", term.id()); - if let Some(names) = term.changed_name() { + println!("#Disease Delta\tID\tOld Name:New Name\tn Terms Old\tn Terms New\tAdded Terms\tRemoved Terms"); + for disease in diffs.changed_omim_diseases() { + print!("Delta\t{}", disease.id()); + if let Some(names) = disease.changed_name() { print!("\t{}:{}", names.0, names.1); } else { print!("\t."); } - if let Some(added) = term.added_terms() { + + let (n_l, n_r) = disease.n_terms(); + print!("\t{n_l}\t{n_r}"); + + if let Some(added) = disease.added_terms() { print!( "\t{}", added @@ -147,7 +164,7 @@ fn changed_diseases(diffs: &Comparison) { } else { print!("\t."); } - if let Some(removed) = term.removed_terms() { + if let Some(removed) = disease.removed_terms() { print!( "\t{}", removed @@ -206,7 +223,7 @@ fn main() { let arg_new = args.next().unwrap(); let lhs = ontology(&arg_old); - let rhs = ontology(&arg_new); + let rhs = ontology2(&arg_new); let diffs = lhs.compare(&rhs); diff --git a/src/lib.rs b/src/lib.rs index ef4efc7..ee1a777 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,6 +31,7 @@ const MAX_HPO_ID_INTEGER: usize = 10_000_000; const OBO_FILENAME: &str = "hp.obo"; const GENE_FILENAME: &str = "phenotype_to_genes.txt"; +const GENE_TO_PHENO_FILENAME: &str = "genes_to_phenotype.txt"; const DISEASE_FILENAME: &str = "phenotype.hpoa"; /// The `HpoTermId` of `HP:0000118 | Phenotypic abnormality` diff --git a/src/ontology.rs b/src/ontology.rs index 4d90a2a..20875f6 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -72,7 +72,8 @@ use termarena::Arena; /// Then use [`Ontology::from_standard`] to load the data. /// You need the following files: /// - `phenotype.hpoa` (Required to connect [`OmimDisease`]s to [`HpoTerm`]s) -/// - `phenotype_to_genes.txt` (Required to connect [`Gene`]s to [`HpoTerm`]s) +/// - `genes_to_phenotype.txt` (Required to connect [`Gene`]s to [`HpoTerm`]s) +/// - alternatively: `phenotype_to_genes.txt` (use [`Ontology::from_standard_transitive`]) /// - `hp.obo` (Required for [`HpoTerm`]s and their connection to each other) /// 2. Load the ontology from a binary build using [`Ontology::from_binary`]. /// @@ -112,6 +113,15 @@ use termarena::Arena; /// [`Ontology`] does not contain a direct relationship between genes and diseases. This relation /// is only present indirectly via the connected [`HpoTerm`]s. /// +/// # Transivity of relations +/// +/// **New in 0.9.0** +/// +/// During the construction of the Ontology, every [`HpoTerm`] inherits all gene and disease +/// association of its child terms. +/// But [`Gene`]s and [`OmimDisease`]s will only contain links to *direct* [`HpoTerm`]s. The annotations +/// are not transitiv. +/// /// ```mermaid /// erDiagram /// ONTOLOGY ||--|{ HPOTERM : contains @@ -368,13 +378,22 @@ impl Ontology { /// - [`Ontology::add_gene`] failed /// - [`Ontology::add_omim_disease`] failed /// + /// + /// # Note + /// + /// Since version `0.9.0` this method will not load genes transitively. That means + /// only directly linked [`HpoTerm`]s are connected to each gene. However, every + /// [`HpoTerm`] will still inherit all gene and disease associations from its children. + /// See [this discussion](https://github.com/anergictcell/hpo/issues/44) for a more detailed + /// explanation + /// /// # Examples /// /// ```no_run /// use hpo::Ontology; /// use hpo::HpoTermId; /// - /// let ontology = Ontology::from_binary("/path/to/jax_hpo_data/").unwrap(); + /// let ontology = Ontology::from_standard("/path/to/jax_hpo_data/").unwrap(); /// /// assert!(ontology.len() == 26); /// @@ -387,12 +406,68 @@ impl Ontology { /// ``` /// pub fn from_standard(folder: &str) -> HpoResult { + let mut ont = Ontology::default(); + let path = Path::new(folder); + let obo = path.join(crate::OBO_FILENAME); + let gene = path.join(crate::GENE_TO_PHENO_FILENAME); + let disease = path.join(crate::DISEASE_FILENAME); + parser::load_from_jax_files(&obo, &gene, &disease, &mut ont)?; + ont.calculate_information_content()?; + ont.set_default_categories()?; + ont.set_default_modifier()?; + Ok(ont) + } + + /// Initialize the [`Ontology`] from data provided by [Jax HPO](https://hpo.jax.org/) + /// + /// You must download: + /// + /// - Actual OBO data: [`hp.obo`](https://hpo.jax.org/app/data/ontology) + /// - Links between HPO and OMIM diseases: [`phenotype.hpoa`](https://hpo.jax.org/app/data/annotations) + /// - Links between HPO and Genes: [`genes_to_phenotypes.txt`](http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt) + /// + /// and then specify the folder where the data is stored. + /// + /// # Errors + /// + /// This method can fail for various reasons: + /// + /// - obo file not present or available: [`HpoError::CannotOpenFile`] + /// - [`Ontology::add_gene`] failed + /// - [`Ontology::add_omim_disease`] failed + /// + /// # Note + /// + /// This method has one difference to [`Ontology::from_standard`] in that every [`Gene`] + /// contains directly linked [`HpoTerm`] and all their ancestor terms. + /// See [this discussion](https://github.com/anergictcell/hpo/issues/44) for a more detailed + /// explanation + /// + /// # Examples + /// + /// ```no_run + /// use hpo::Ontology; + /// use hpo::HpoTermId; + /// + /// let ontology = Ontology::from_standard_transitive("/path/to/jax_hpo_data/").unwrap(); + /// + /// assert!(ontology.len() == 26); + /// + /// let absent_term = HpoTermId::try_from("HP:9999999").unwrap(); + /// assert!(ontology.hpo(absent_term).is_none()); + /// + /// let present_term = HpoTermId::try_from("HP:0000001").unwrap(); + /// let root_term = ontology.hpo(present_term).unwrap(); + /// assert_eq!(root_term.name(), "All"); + /// ``` + /// + pub fn from_standard_transitive(folder: &str) -> HpoResult { let mut ont = Ontology::default(); let path = Path::new(folder); let obo = path.join(crate::OBO_FILENAME); let gene = path.join(crate::GENE_FILENAME); let disease = path.join(crate::DISEASE_FILENAME); - parser::load_from_standard_files(&obo, &gene, &disease, &mut ont)?; + parser::load_from_jax_files_with_transivitve_genes(&obo, &gene, &disease, &mut ont)?; ont.calculate_information_content()?; ont.set_default_categories()?; ont.set_default_modifier()?; diff --git a/src/ontology/comparison.rs b/src/ontology/comparison.rs index 0fed4af..f7dab4b 100644 --- a/src/ontology/comparison.rs +++ b/src/ontology/comparison.rs @@ -286,6 +286,7 @@ impl HpoTermDelta { pub struct AnnotationDelta { id: String, names: (String, String), + n_terms: (usize, usize), added_terms: Vec, removed_terms: Vec, } @@ -337,6 +338,7 @@ impl<'a> AnnotationDelta { Some(Self { id, names, + n_terms: (lhs_terms.len(), rhs_terms.len()), added_terms, removed_terms, }) @@ -384,4 +386,9 @@ impl<'a> AnnotationDelta { pub fn id(&self) -> &str { &self.id } + + /// Returns the number of terms linked to `old` and `new` + pub fn n_terms(&self) -> (usize, usize) { + self.n_terms + } } diff --git a/src/parser.rs b/src/parser.rs index 5ad5fe6..716072c 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -8,6 +8,41 @@ pub(crate) mod binary; /// Module to parse `hp.obo` file pub(crate) mod hp_obo; +/// Module to parse HPO - `Gene` associations from `genes_to_phenotype.txt` file +pub(crate) mod genes_to_phenotype { + use crate::parser::Path; + use crate::HpoResult; + use std::fs::File; + use std::io::BufRead; + use std::io::BufReader; + + use crate::HpoTermId; + use crate::Ontology; + + /// Quick and dirty parser for development and debugging + pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { + let file = File::open(file).unwrap(); + let reader = BufReader::new(file); + for line in reader.lines() { + let line = line.unwrap(); + // TODO: Check for the header outside of the `lines` iterator + if line.starts_with('#') || line.starts_with("ncbi_gene_id") { + continue; + } + let cols: Vec<&str> = line.trim().split('\t').collect(); + let gene_id = ontology.add_gene(cols[1], cols[0])?; + let term_id = HpoTermId::try_from(cols[2])?; + ontology.link_gene_term(term_id, gene_id)?; + + ontology + .gene_mut(&gene_id) + .expect("Cannot find gene {gene_id}") + .add_term(term_id); + } + Ok(()) + } +} + /// Module to parse HPO - `Gene` associations from `phenotype_to_genes.txt` file pub(crate) mod phenotype_to_genes { use crate::parser::Path; @@ -126,7 +161,7 @@ pub(crate) mod phenotype_hpoa { } } -pub(crate) fn load_from_standard_files>( +pub(crate) fn load_from_jax_files_with_transivitve_genes>( obo_file: P, gene_file: P, disease_file: P, @@ -137,3 +172,15 @@ pub(crate) fn load_from_standard_files>( phenotype_hpoa::parse(disease_file, ontology)?; Ok(()) } + +pub(crate) fn load_from_jax_files>( + obo_file: P, + gene_file: P, + disease_file: P, + ontology: &mut Ontology, +) -> HpoResult<()> { + hp_obo::read_obo_file(obo_file, ontology)?; + genes_to_phenotype::parse(gene_file, ontology)?; + phenotype_hpoa::parse(disease_file, ontology)?; + Ok(()) +} diff --git a/tests/ontology.hpo b/tests/ontology.hpo index 9106607..9c5dfae 100644 Binary files a/tests/ontology.hpo and b/tests/ontology.hpo differ