From f698a9e591f98ad877f39c36fc9193715ee6931e Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Thu, 8 Aug 2024 06:39:39 +0200 Subject: [PATCH 1/8] Add first draft of Ontology-Builder --- src/ontology/builder.rs | 669 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 669 insertions(+) create mode 100644 src/ontology/builder.rs diff --git a/src/ontology/builder.rs b/src/ontology/builder.rs new file mode 100644 index 0000000..3d515e9 --- /dev/null +++ b/src/ontology/builder.rs @@ -0,0 +1,669 @@ +use crate::annotations::Disease; +use crate::term::internal::HpoTermInternal; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::marker::PhantomData; +use std::ops::BitOr; + +use crate::annotations::{Gene, GeneId}; +use crate::annotations::{OmimDisease, OmimDiseaseId}; +use crate::annotations::{OrphaDisease, OrphaDiseaseId}; +use crate::parser::binary::{BinaryTermBuilder, BinaryVersion, Bytes}; +use crate::term::HpoGroup; + +use crate::{u32_from_bytes, HpoTermId, Ontology}; +use crate::HpoResult; +use crate::HpoError; + +use crate::ontology::termarena::Arena; + + +pub struct LooseCollection; +pub struct AllTerms; +pub struct ConnectedTerms; +pub struct FullyAnnotated; + +pub trait AddAnotation{} +impl AddAnotation for LooseCollection{} +impl AddAnotation for AllTerms{} +impl AddAnotation for ConnectedTerms{} + +fn transition_state(builder: Builder) -> Builder { + Builder::{ + hpo_terms: builder.hpo_terms, + genes: builder.genes, + omim_diseases: builder.omim_diseases, + orpha_diseases: builder.orpha_diseases, + hpo_version: builder.hpo_version, + categories: builder.categories, + modifier: builder.modifier, + state: PhantomData + } +} + + +pub struct Builder { + hpo_terms: Arena, + genes: HashMap, + omim_diseases: HashMap, + orpha_diseases: HashMap, + hpo_version: (u16, u8, u8), + categories: HpoGroup, + modifier: HpoGroup, + state: PhantomData +} + + +impl Builder { + pub fn add_gene(&mut self, gene_name: &str, gene_id: GeneId) { + if 
let Entry::Vacant(entry) = self.genes.entry(gene_id) { + entry.insert(Gene::new(gene_id, gene_name)); + } + } + + pub fn add_omim_disease( + &mut self, + omim_disease_name: &str, + omim_disease_id: &str, + ) -> HpoResult { + let id = OmimDiseaseId::try_from(omim_disease_id)?; + match self.omim_diseases.entry(id) { + std::collections::hash_map::Entry::Occupied(_) => Ok(id), + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(OmimDisease::new(id, omim_disease_name)); + Ok(id) + } + } + } + + pub fn add_orpha_disease( + &mut self, + orpha_disease_name: &str, + orpha_disease_id: &str, + ) -> HpoResult { + let id = OrphaDiseaseId::try_from(orpha_disease_id)?; + match self.orpha_diseases.entry(id) { + std::collections::hash_map::Entry::Occupied(_) => Ok(id), + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(OrphaDisease::new(id, orpha_disease_name)); + Ok(id) + } + } + } +} + + + +impl Builder{ + pub fn new() -> Builder { + Builder:: { + hpo_terms: Arena::default(), + genes: HashMap::default(), + omim_diseases: HashMap::default(), + orpha_diseases: HashMap::default(), + hpo_version: (0u16, 0u8, 0u8), + categories: HpoGroup::default(), + modifier: HpoGroup::default(), + state: PhantomData + } + } + + /// Adds an [`HpoTerm`] to the ontology + /// + /// This method is part of the Ontology-building, based on the binary + /// data format and requires a specified data layout. + /// + /// The method assumes that the data is in the right format and also + /// assumes that the caller takes care of handling all consistencies + /// like parent-child connection etc. + /// + /// See [`HpoTermInternal::as_bytes`] for explanation of the binary layout. 
+ pub (crate) fn add_terms_from_bytes(&mut self, bytes: Bytes) { + for term in BinaryTermBuilder::new(bytes) { + self.add_term(term); + } + } + + /// Insert an `HpoTermInternal` to the ontology + /// + /// This method does not link the term to its parents or to any annotations + pub(crate) fn add_term(&mut self, term: HpoTermInternal) -> HpoTermId { + let id = *term.id(); + self.hpo_terms.insert(term); + id + } + + #[must_use] + pub fn terms_complete(self) -> Builder { + transition_state(self) + } +} + +impl Builder { + /// Add a connection from an [`HpoTerm`] to its parent + /// + /// This method is called once for every dependency in the Ontology during the initialization. + /// + /// There should rarely be a need to call this method outside of the ontology building + /// + /// # Panics + /// + /// This method will panic if the `parent_id` or `child_id` is not present in the Ontology + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// + /// let mut ontology = Ontology::default(); + /// ontology.insert_term("Foo".into(), 1u32); + /// ontology.insert_term("Bar".into(), 2u32); + /// + /// ontology.add_parent(1u32, 2u32); + /// + /// assert!(ontology.hpo(2u32).unwrap().parent_ids().contains(&1u32.into())); + /// ``` + pub fn add_parent + Copy, J: Into + Copy>( + &mut self, + parent_id: I, + child_id: J, + ) { + let parent = self.hpo_terms.get_unchecked_mut(parent_id.into()); + parent.add_child(child_id); + + let child = self.hpo_terms.get_unchecked_mut(child_id.into()); + child.add_parent(parent_id); + } + + /// Connects an [`HpoTerm`] to its parent term + /// + /// This method is part of the Ontology-building, based on the binary + /// data format and requires a specified data layout. + /// + /// The method assumes that the data is in the right format and also + /// assumes that the caller will populate the `all_parents` caches for + /// each term. + /// + /// See [`HpoTermInternal::parents_as_byte`] for explanation of the binary layout. 
+ /// + /// # Panics + /// + /// This method will panic if the length of bytes does not exactly correspond + /// to the contained data + pub(crate) fn add_parent_from_bytes(&mut self, bytes: &[u8]) { + let mut idx: usize = 0; + loop { + if idx == bytes.len() { + break; + } + let n_parents = u32_from_bytes(&bytes[idx..]) as usize; + + idx += 4; + let term = + HpoTermId::from([bytes[idx], bytes[idx + 1], bytes[idx + 2], bytes[idx + 3]]); + idx += 4; + for _ in 0..n_parents { + let parent = + HpoTermId::from([bytes[idx], bytes[idx + 1], bytes[idx + 2], bytes[idx + 3]]); + self.add_parent(parent, term); + idx += 4; + } + } + } + + + /// Creates and caches the `all_parents` values for every term + /// + /// This method can only be called once and afterwards no new terms + /// should be added to the Ontology anymore and no new term-parent connection + /// should be created. + /// Since this method caches the results, rerunning it will not cause a new + /// calculation. + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// + /// let mut ontology = Ontology::default(); + /// ontology.insert_term("Root".into(), 1u32); + /// ontology.insert_term("Foo".into(), 2u32); + /// ontology.insert_term("Bar".into(), 3u32); + /// + /// ontology.add_parent(1u32, 2u32); + /// ontology.add_parent(2u32, 3u32); + /// + /// // At this point #3 does not have info about grandparents + /// assert!(!ontology.hpo(3u32).unwrap().all_parent_ids().contains(&1u32.into())); + /// + /// ontology.create_cache(); + /// assert!(ontology.hpo(3u32).unwrap().all_parent_ids().contains(&1u32.into())); + /// ``` + #[must_use] + pub fn connect_all_terms(mut self) -> Builder { + let term_ids: Vec = self.hpo_terms.keys(); + + for id in term_ids { + self.create_cache_of_grandparents(id); + } + transition_state(self) + } + + /// This method is part of the cache creation to link all terms to their + /// direct and indirect parents (grandparents) + /// + /// It will (somewhat) recursively iterate all
parents and copy all their parents. + /// During this recursion, the list of `all_parents` is cached in each term that was + /// iterated. + /// + /// The logic is that the recursion bubbles up all the way to the top of the ontology + /// and then caches the list of direct and indirect parents for every term bubbling + /// back down. The recursion does not reach the top level again, because it will stop + /// once it reaches a term with already cached `all_parents`. + /// + /// # Panics + /// + /// This method will panic if the `term_id` is not present in the Ontology + fn create_cache_of_grandparents(&mut self, term_id: HpoTermId) { + let mut res = HpoGroup::default(); + let parents = self.hpo_terms.get_unchecked(term_id).parents().clone(); + for parent in &parents { + let grandparents = self.all_grandparents(parent); + for gp in grandparents { + res.insert(gp); + } + } + let term = self.hpo_terms.get_unchecked_mut(term_id); + *term.all_parents_mut() = res.bitor(&parents); + } + + /// This method is part of the cache creation to link all terms to their + /// direct and indirect parents (grandparents) + /// + /// # Panics + /// + /// This method will panic if the `term_id` is not present in the Ontology + fn all_grandparents(&mut self, term_id: HpoTermId) -> &HpoGroup { + if !self.hpo_terms.get_unchecked(term_id).parents_cached() { + self.create_cache_of_grandparents(term_id); + } + let term = self.hpo_terms.get_unchecked(term_id); + term.all_parents() + } +} + + +impl Builder { + /// Adds genes to the ontology and connects them to connected terms + /// + /// This method is part of the Ontology-building, based on the binary + /// data format and requires a specified data layout. + /// + /// It connects all connected terms and their parents properly. The + /// method assumes that the bytes encode all gene-term connections.
+ /// + /// See [`Gene::as_bytes`] for explanation of the binary layout + pub (crate) fn add_genes_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { + let mut idx: usize = 0; + loop { + if idx >= bytes.len() { + break; + } + let gene_len = u32_from_bytes(&bytes[idx..]) as usize; + let gene = Gene::try_from(&bytes[idx..idx + gene_len])?; + for term in gene.hpo_terms() { + self.link_gene_term(term, *gene.id())?; + } + self.genes.insert(*gene.id(), gene); + idx += gene_len; + } + Ok(()) + } + + /// Adds [`OmimDisease`]s to the ontology and connects them to connected terms + /// + /// This method is part of the Ontology-building, based on the binary + /// data format and requires a specified data layout. + /// + /// It connects all connected terms and their parents properly. The + /// method assumes that the bytes encode all Disease-term connections. + /// + /// See [`OmimDisease::as_bytes`] for explanation of the binary layout + pub (crate) fn add_omim_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { + let mut idx: usize = 0; + loop { + if idx >= bytes.len() { + break; + } + let disease_len = u32_from_bytes(&bytes[idx..]) as usize; + let disease = OmimDisease::try_from(&bytes[idx..idx + disease_len])?; + for term in disease.hpo_terms() { + self.link_omim_disease_term(term, *disease.id())?; + } + self.omim_diseases.insert(*disease.id(), disease); + idx += disease_len; + } + Ok(()) + } + + /// Adds [`OrphaDisease`]s to the ontology and connects them to connected terms + /// + /// This method is part of the Ontology-building, based on the binary + /// data format and requires a specified data layout. + /// + /// It connects all connected terms and their parents properly. The + /// method assumes that the bytes encode all Disease-term connections.
+ /// + /// See [`OrphaDisease::as_bytes`] for explanation of the binary layout + pub (crate) fn add_orpha_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { + let mut idx: usize = 0; + loop { + if idx >= bytes.len() { + break; + } + let disease_len = u32_from_bytes(&bytes[idx..]) as usize; + let disease = OrphaDisease::try_from(&bytes[idx..idx + disease_len])?; + for term in disease.hpo_terms() { + self.link_orpha_disease_term(term, *disease.id())?; + } + self.orpha_diseases.insert(*disease.id(), disease); + idx += disease_len; + } + Ok(()) + } + + /// Add the [`Gene`] as annotation to the [`HpoTerm`] + /// + /// The gene will be recursively connected to all parent `HpoTerms` as well. + /// + /// This method does not add the HPO-term to the [`Gene`], this must be handled + /// by the client. + /// + /// # Errors + /// + /// If the HPO term is not present, an [`HpoError::DoesNotExist`] is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// use hpo::annotations::GeneId; + /// + /// let mut ontology = Ontology::default(); + /// ontology.insert_term("Term-Foo".into(), 1u32); + /// ontology.add_gene("Foo", GeneId::from(5)); + /// ontology.link_gene_term(1u32, GeneId::from(5u32)).unwrap(); + /// + /// let term = ontology.hpo(1u32).unwrap(); + /// assert_eq!(term.genes().next().unwrap().name(), "Foo"); + /// ``` + pub fn link_gene_term>( + &mut self, + term_id: I, + gene_id: GeneId, + ) -> HpoResult<()> { + let term = self.hpo_terms.get_mut(term_id.into()).ok_or(HpoError::DoesNotExist)?; + + if term.add_gene(gene_id) { + // If the gene is already associated to the term, this branch will + // be skipped. 
That is desired, because by definition + // all parent terms are already linked as well + let parents = term.all_parents().clone(); + for parent in &parents { + self.link_gene_term(parent, gene_id)?; + } + } + Ok(()) + } + + /// Add the [`OmimDisease`] as annotation to the [`HpoTerm`] + /// + /// The disease will be recursively connected to all parent `HpoTerms` as well. + /// + /// This method does not add the HPO-term to the [`OmimDisease`], this + /// must be handled by the client. + /// + /// # Errors + /// + /// If the HPO term is not present, an [`HpoError`] is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// use hpo::annotations::{Disease, OmimDiseaseId}; + /// + /// let mut ontology = Ontology::default(); + /// ontology.insert_term("Term-Foo".into(), 1u32); + /// ontology.add_omim_disease("Foo", "5"); + /// ontology.link_omim_disease_term(1u32, OmimDiseaseId::from(5u32)).unwrap(); + /// + /// let term = ontology.hpo(1u32).unwrap(); + /// assert_eq!(term.omim_diseases().next().unwrap().name(), "Foo"); + /// ``` + pub fn link_omim_disease_term>( + &mut self, + term_id: I, + omim_disease_id: OmimDiseaseId, + ) -> HpoResult<()> { + let term = self.hpo_terms.get_mut(term_id.into()).ok_or(HpoError::DoesNotExist)?; + + if term.add_omim_disease(omim_disease_id) { + // If the disease is already associated to the term, this branch will + // be skipped. That is desired, because by definition + // all parent terms are already linked as well + let parents = term.all_parents().clone(); + for parent in &parents { + self.link_omim_disease_term(parent, omim_disease_id)?; + } + } + Ok(()) + } + + /// Add the [`OrphaDisease`] as annotation to the [`HpoTerm`] + /// + /// The disease will be recursively connected to all parent `HpoTerms` as well. + /// + /// This method does not add the HPO-term to the [`OrphaDisease`], this + /// must be handled by the client. 
+ /// + /// # Errors + /// + /// If the HPO term is not present, an [`HpoError`] is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// use hpo::annotations::{Disease, OrphaDiseaseId}; + /// + /// let mut ontology = Ontology::default(); + /// ontology.insert_term("Term-Foo".into(), 1u32); + /// ontology.add_orpha_disease("Foo", "5"); + /// ontology.link_orpha_disease_term(1u32, OrphaDiseaseId::from(5u32)).unwrap(); + /// + /// let term = ontology.hpo(1u32).unwrap(); + /// assert_eq!(term.orpha_diseases().next().unwrap().name(), "Foo"); + /// ``` + pub fn link_orpha_disease_term>( + &mut self, + term_id: I, + orpha_disease_id: OrphaDiseaseId, + ) -> HpoResult<()> { + let term = self.hpo_terms.get_mut(term_id.into()).ok_or(HpoError::DoesNotExist)?; + + if term.add_orpha_disease(orpha_disease_id) { + // If the disease is already associated to the term, this branch will + // be skipped. That is desired, because by definition + // all parent terms are already linked as well + let parents = term.all_parents().clone(); + for parent in &parents { + self.link_orpha_disease_term(parent, orpha_disease_id)?; + } + } + Ok(()) + } + + /// Calculates the [`crate::term::InformationContent`]s for every term + /// + /// This method should only be called **after** all terms are added, + /// connected and all genes and diseases are linked as well. + /// + /// It can be called repeatedly, all values are recalculated each time, + /// as long as the Ontology contains at least 1 gene/disease. + /// When no genes/diseases are present, the IC is not calculated nor updated. + /// + /// # Errors + /// + /// This method returns an error if there are more Genes or Terms than `u16::MAX` + /// because larger numbers can't be safely converted to `f32` + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// + /// let mut ontology = Ontology::default(); + /// + /// // [all kind of logic to add terms, diseases, genes....] 
+ /// + /// ontology.calculate_information_content().unwrap(); + /// ``` + #[must_use] + pub fn calculate_information_content(mut self) -> HpoResult> { + self.calculate_gene_ic()?; + self.calculate_omim_disease_ic()?; + self.calculate_orpha_disease_ic()?; + + Ok(transition_state(self)) + } + + /// Calculates the gene-specific Information Content for every term + /// + /// If no genes are present in the Ontology, no IC are calculated + fn calculate_gene_ic(&mut self) -> HpoResult<()> { + let n_genes = self.genes.len(); + for term in self.hpo_terms.values_mut() { + let current_genes = term.genes().len(); + term.information_content_mut() + .set_gene(n_genes, current_genes)?; + } + Ok(()) + } + + /// Calculates the Omim-Disease-specific Information Content for every term + /// + /// If no diseases are present in the Ontology, no IC are calculated + fn calculate_omim_disease_ic(&mut self) -> HpoResult<()> { + let n_omim_diseases = self.omim_diseases.len(); + + for term in self.hpo_terms.values_mut() { + let current_diseases = term.omim_diseases().len(); + term.information_content_mut() + .set_omim_disease(n_omim_diseases, current_diseases)?; + } + Ok(()) + } + + /// Calculates the Orpha-Disease-specific Information Content for every term + /// + /// If no diseases are present in the Ontology, no IC are calculated + fn calculate_orpha_disease_ic(&mut self) -> HpoResult<()> { + let n_orpha_diseases = self.orpha_diseases.len(); + + for term in self.hpo_terms.values_mut() { + let current_diseases = term.orpha_diseases().len(); + term.information_content_mut() + .set_orpha_disease(n_orpha_diseases, current_diseases)?; + } + Ok(()) + } +} + + +impl Builder { + pub fn build_with_defaults(self) -> HpoResult { + let mut ont = Ontology { + hpo_terms: self.hpo_terms, + genes: self.genes, + omim_diseases: self.omim_diseases, + orpha_diseases: self.orpha_diseases, + hpo_version: self.hpo_version, + ..Default::default() + }; + ont.set_default_categories()?; + 
ont.set_default_modifier()?; + Ok(ont) + } +} + +impl Builder { + pub fn set_hpo_version(&mut self, version: (u16, u8, u8)) { + self.hpo_version = version; + } + + /// Parses `Bytes` into the Jax-Ontology release version + pub (crate) fn hpo_version_from_bytes(&mut self, bytes: &Bytes) -> HpoResult { + if bytes.version() == BinaryVersion::V1 { + self.set_hpo_version((0u16, 0u8, 0u8)); + Ok(0) + } else { + if bytes.len() < 4 { + return Err(HpoError::ParseBinaryError); + } + let year = u16::from_be_bytes([bytes[0], bytes[1]]); + let month = u8::from_be_bytes([bytes[2]]); + let day = u8::from_be_bytes([bytes[3]]); + self.set_hpo_version((year, month, day)); + Ok(4) + } + } +} + + +/* +struct OntologyBuilder{} +impl OntologyBuilder { + fn add_terms(&mut self, ) +} + +1. add terms +2. connect terms to parents +3. add genes or diseases +4. connect genes or diseases with terms (2 must be finished) + +stateDiagram-v2 + Builder --> Builder : add terms + Builder --> Builder: add annotations (gene, disease) + Builder --> Builder2: connect parents and children (add_parent()) + Builder2 --> Builder3: cache all terms and parents (create_cache()) + Builder2 --> Builder2: add annotations (gene, disease) + Builder3 --> Builder3: add annotations (gene, disease) + Builder3 --> Builder3: set_categories, set_modifier + Builder3 --> Builder3: link annotations to terms + Builder3 --> Builder4: calculate information content + Builder4 --> Builder4: set_categories, set_modifier + + +Builder +| +add_parent() +| +V +Builder +| +crate_cache() +| +V +Builder +| +calculate_information_content() +| +V +Builder +| +ontology() +| +V +Ontology +*/ From 15495c4db412efa0d55ace358da8d95d5bb391a9 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Wed, 21 Aug 2024 19:53:16 +0200 Subject: [PATCH 2/8] Add Builder struct to build the Ontology --- README.md | 18 +- clippy.toml | 1 + src/annotations.rs | 12 +- src/annotations/gene.rs | 4 +- src/annotations/omim_disease.rs | 2 +- 
src/annotations/orpha_disease.rs | 2 +- src/lib.rs | 1 + src/ontology.rs | 861 +++++-------------------------- src/ontology/builder.rs | 810 ++++++++++++++++++++--------- src/parser.rs | 100 ++-- src/parser/hp_obo.rs | 32 +- src/set.rs | 56 +- 12 files changed, 842 insertions(+), 1057 deletions(-) create mode 100644 clippy.toml diff --git a/README.md b/README.md index 6dc1cf0..0bc4c51 100644 --- a/README.md +++ b/README.md @@ -51,16 +51,20 @@ HPO data must be downloaded first from [Jax HPO](https://hpo.jax.org/) itself. 1. Data can be loaded directly from the code with [`Ontology::from_standard`]: ```no_run - use hpo::Ontology; - let ontology = Ontology::from_standard("/path/to/master-data/").unwrap(); +use hpo::Ontology; +let ontology = Ontology::from_standard("/path/to/master-data/").unwrap(); +``` + +2. Or it can be converted to a local binary by copying `examples/obo_to_bin.rs` into your project, then run +```sh +cargo run --example --release obo_to_bin ` ``` -2. Or it can be converted to a localy binary by copy `examples/obo_to_bin.rs` into your project, then run . -`cargo run --example --release obo_to_bin ` Finally, load the data using [`Ontology::from_binary`]: + ```no_run - use hpo::Ontology; - let ontology = Ontology::from_binary("your-hpo-binary.hpo").unwrap(); +use hpo::Ontology; +let ontology = Ontology::from_binary("your-hpo-binary.hpo").unwrap(); ``` 3. Another possibility is to use the snapshot from the [Github repository](https://github.com/anergictcell/hpo) of this crate which contains a binary build of the ontology . IT will not always be up to date, so please double-check yourself. @@ -164,7 +168,7 @@ fn example() { ``` ### Enrichment -Identify which genes (or diseases) are enriched in a set of HpoTerms, e.g. in +Identify which genes (or diseases) are enriched in a set of `HpoTerm`s, e.g.
in the clinical information of a patient or patient cohort ```rust diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..8cd768c --- /dev/null +++ b/clippy.toml @@ -0,0 +1 @@ +doc-valid-idents = ["MacBook", ".."] \ No newline at end of file diff --git a/src/annotations.rs b/src/annotations.rs index 910a5c2..3b38ff4 100644 --- a/src/annotations.rs +++ b/src/annotations.rs @@ -30,7 +30,17 @@ pub use orpha_disease::{OrphaDisease, OrphaDiseaseId, OrphaDiseaseIterator, Orph /// The ID must be unique only within the annotation type, i.e. a gene and a disease /// can have the same ID. pub trait AnnotationId: - Clone + Copy + Debug + Hash + PartialEq + PartialOrd + Eq + Ord + Display + From + Clone + + Copy + + Debug + + Hash + + PartialEq + + PartialOrd + + Eq + + Ord + + Display + + From + + for<'a> TryFrom<&'a str> { /// Return the integer representation of the annotation ID fn as_u32(&self) -> u32; diff --git a/src/annotations/gene.rs b/src/annotations/gene.rs index d92adcf..d68c637 100644 --- a/src/annotations/gene.rs +++ b/src/annotations/gene.rs @@ -77,7 +77,7 @@ impl Gene { /// Initializes a new Gene /// /// This method should rarely, if ever, be used directly. The - /// preferred way to create new genes is through [`Ontology::add_gene`] + /// preferred way to create new genes is through [`Builder::annotate_gene`](`crate::builder::Builder::annotate_gene`) /// to ensure that each gene exists only once. pub fn new(id: GeneId, name: &str) -> Gene { Gene { @@ -90,7 +90,7 @@ impl Gene { /// Initializes a new Gene from `str` values /// /// This method should rarely, if ever, be used directly. The - /// preferred way to create new genes is through [`Ontology::add_gene`] + /// preferred way to create new genes is through [`Builder::annotate_gene`](`crate::builder::Builder::annotate_gene`) /// to ensure that each gene exists only once. 
/// /// # Errors diff --git a/src/annotations/omim_disease.rs b/src/annotations/omim_disease.rs index 1eec5a6..1271dbd 100644 --- a/src/annotations/omim_disease.rs +++ b/src/annotations/omim_disease.rs @@ -72,7 +72,7 @@ impl Disease for OmimDisease { /// Initializes a new OMIM disease /// /// This method should rarely, if ever, be used directly. The - /// preferred way to create new genes is through [`crate::Ontology::add_omim_disease`] + /// preferred way to create new diseases is through [`Builder::annotate_omim_disease`](`crate::builder::Builder::annotate_omim_disease`) /// to ensure that each disease exists only once. fn new(id: Self::AnnoID, name: &str) -> OmimDisease { Self { diff --git a/src/annotations/orpha_disease.rs b/src/annotations/orpha_disease.rs index 4f073a9..d5d8357 100644 --- a/src/annotations/orpha_disease.rs +++ b/src/annotations/orpha_disease.rs @@ -72,7 +72,7 @@ impl Disease for OrphaDisease { /// Initializes a new Orpha disease /// /// This method should rarely, if ever, be used directly. The - /// preferred way to create new genes is through [`crate::Ontology::add_orpha_disease`] + /// preferred way to create new diseases is through [`Builder::annotate_orpha_disease`](`crate::builder::Builder::annotate_orpha_disease`) /// to ensure that each disease exists only once.
fn new(id: Self::AnnoID, name: &str) -> OrphaDisease { Self { diff --git a/src/lib.rs b/src/lib.rs index 3d631da..ab0ec3e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,6 +18,7 @@ pub mod stats; pub mod term; pub mod utils; +pub use ontology::builder; pub use ontology::comparison; pub use ontology::Ontology; pub use set::HpoSet; diff --git a/src/ontology.rs b/src/ontology.rs index f891a95..2f6b298 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -1,32 +1,32 @@ use crate::annotations::Disease; use core::fmt::Debug; -use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::Read; -use std::ops::BitOr; use std::path::Path; use tracing::debug; -use crate::annotations::AnnotationId; use crate::annotations::{Gene, GeneId}; use crate::annotations::{OmimDisease, OmimDiseaseFilter, OmimDiseaseId}; use crate::annotations::{OrphaDisease, OrphaDiseaseId}; use crate::parser; -use crate::parser::binary::{BinaryTermBuilder, BinaryVersion, Bytes}; +use crate::parser::binary::BinaryVersion; use crate::term::internal::HpoTermInternal; use crate::term::{HpoGroup, HpoTerm}; use crate::u32_from_bytes; use crate::HpoResult; use crate::{HpoError, HpoTermId}; +pub mod builder; pub mod comparison; mod termarena; use comparison::Comparison; use termarena::Arena; +pub use builder::Builder; + #[cfg_attr(doc, aquamarine::aquamarine)] /// `Ontology` is the main interface of the `hpo` crate and contains all data /// @@ -356,8 +356,9 @@ impl Ontology { /// This method can fail for various reasons: /// /// - obo file not present or available: [`HpoError::CannotOpenFile`] - /// - [`Ontology::add_gene`] failed - /// - [`Ontology::add_omim_disease`] failed + /// - annotation file(s) not present: [`HpoError::CannotOpenFile`] + /// - invalid data in the annotation file(s): [`HpoError::InvalidInput`] + /// - annotation file(s) contain references to non-existing HPO terms: [`HpoError::DoesNotExist`] /// /// /// # Note @@ -387,16 +388,11 @@ impl 
Ontology { /// ``` /// pub fn from_standard(folder: &str) -> HpoResult { - let mut ont = Ontology::default(); let path = Path::new(folder); let obo = path.join(crate::OBO_FILENAME); let gene = path.join(crate::GENE_TO_PHENO_FILENAME); let disease = path.join(crate::DISEASE_FILENAME); - parser::load_from_jax_files(&obo, &gene, &disease, &mut ont)?; - ont.calculate_information_content()?; - ont.set_default_categories()?; - ont.set_default_modifier()?; - Ok(ont) + parser::load_from_jax_files(&obo, &gene, &disease) } /// Initialize the [`Ontology`] from data provided by [Jax HPO](https://hpo.jax.org/) @@ -414,8 +410,9 @@ impl Ontology { /// This method can fail for various reasons: /// /// - obo file not present or available: [`HpoError::CannotOpenFile`] - /// - [`Ontology::add_gene`] failed - /// - [`Ontology::add_omim_disease`] failed + /// - annotation file(s) not present: [`HpoError::CannotOpenFile`] + /// - invalid data in the annotation file(s): [`HpoError::InvalidInput`] + /// - annotation file(s) contain references to non-existing HPO terms: [`HpoError::DoesNotExist`] /// /// # Note /// @@ -443,16 +440,11 @@ impl Ontology { /// ``` /// pub fn from_standard_transitive(folder: &str) -> HpoResult { - let mut ont = Ontology::default(); let path = Path::new(folder); let obo = path.join(crate::OBO_FILENAME); let gene = path.join(crate::GENE_FILENAME); let disease = path.join(crate::DISEASE_FILENAME); - parser::load_from_jax_files_with_transivitve_genes(&obo, &gene, &disease, &mut ont)?; - ont.calculate_information_content()?; - ont.set_default_categories()?; - ont.set_default_modifier()?; - Ok(ont) + parser::load_from_jax_files_with_transivitve_genes(&obo, &gene, &disease) } /// Build an Ontology from a binary data blob @@ -561,9 +553,10 @@ impl Ontology { pub fn from_bytes(bytes: &[u8]) -> HpoResult { let bytes = parser::binary::ontology::version(bytes)?; debug!("Parsing from bytes v{}", bytes.version()); - let mut ont = Ontology::default(); - let offset = 
ont.hpo_version_from_bytes(&bytes)?; + let mut builder = Builder::new(); + + let offset = builder.hpo_version_from_bytes(&bytes)?; let mut section_start = offset; let mut section_end: usize; @@ -571,41 +564,42 @@ impl Ontology { // Terms let mut section_len = u32_from_bytes(&bytes[section_start..]) as usize; section_end = section_start + 4 + section_len; - ont.add_terms_from_bytes(bytes.subset(section_start + 4..section_end)); + builder.add_terms_from_bytes(bytes.subset(section_start + 4..section_end)); section_start += section_len + 4; + let mut builder = builder.terms_complete(); + // Term - Parents section_len = u32_from_bytes(&bytes[section_start..]) as usize; section_end += 4 + section_len; - ont.add_parent_from_bytes(&bytes[section_start + 4..section_end]); - ont.create_cache(); + builder.add_parent_from_bytes(&bytes[section_start + 4..section_end]); + let mut builder = builder.connect_all_terms(); section_start += section_len + 4; // Genes section_len = u32_from_bytes(&bytes[section_start..]) as usize; section_end += 4 + section_len; - ont.add_genes_from_bytes(&bytes[section_start + 4..section_end])?; + builder.add_genes_from_bytes(&bytes[section_start + 4..section_end])?; section_start += section_len + 4; // Omim Diseases section_len = u32_from_bytes(&bytes[section_start..]) as usize; section_end += 4 + section_len; - ont.add_omim_disease_from_bytes(&bytes[section_start + 4..section_end])?; + builder.add_omim_disease_from_bytes(&bytes[section_start + 4..section_end])?; section_start += section_len + 4; // Orpha Diseases if bytes.version() > BinaryVersion::V2 { section_len = u32_from_bytes(&bytes[section_start..]) as usize; section_end += 4 + section_len; - ont.add_orpha_disease_from_bytes(&bytes[section_start + 4..section_end])?; + builder.add_orpha_disease_from_bytes(&bytes[section_start + 4..section_end])?; section_start += section_len + 4; } if section_start == bytes.len() { - ont.calculate_information_content()?; - ont.set_default_categories()?; - 
ont.set_default_modifier()?; - Ok(ont) + builder + .calculate_information_content()? + .build_with_defaults() } else { Err(HpoError::ParseBinaryError) } @@ -843,12 +837,13 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::builder::Builder; /// use hpo::annotations::GeneId; /// /// let ontology_1 = Ontology::from_binary("tests/example.hpo").unwrap(); - /// let mut ontology_2 = Ontology::default(); - /// - /// ontology_2.add_gene("FOOBAR", GeneId::from(666666)); + /// let mut builder = Builder::default().terms_complete().connect_all_terms(); + /// builder.add_gene("FOOBAR", 666666.into()); + /// let ontology_2 = builder.calculate_information_content().unwrap().build_minimal(); /// /// let compare = ontology_1.compare(&ontology_2); /// assert_eq!(compare.added_hpo_terms().len(), 0); @@ -1048,22 +1043,26 @@ impl Ontology { // should be connected to let ids: HpoGroup = terms.iter().map(|term| *term.id()).collect(); - let mut ont = Self::default(); + let mut builder = Builder::new(); + for &term in &terms { let mut copied_term = HpoTermInternal::new(term.name().to_string(), *term.id()); *copied_term.obsolete_mut() = term.obsolete(); *copied_term.replacement_mut() = term.replacement(); - ont.add_term(copied_term); + builder.add_term(copied_term); } + + let mut builder = builder.terms_complete(); + for term in &terms { for parent in term.parents() { if ids.contains(&parent) { - ont.add_parent(parent, *term.id()); + builder.add_parent_unchecked(parent, *term.id()); } } } - ont.create_cache(); + let mut builder = builder.connect_all_terms(); // We only want to add genes and diseases to the ontology that are // associated with an actual phenotype of the ontology and not just @@ -1082,18 +1081,10 @@ impl Ontology { if (gene.hpo_terms() & &phenotype_ids).is_empty() { continue; } - ont.add_gene( - self.gene(gene.id()).ok_or(HpoError::DoesNotExist)?.name(), - *gene.id(), - ); - // Link the gene to every term in the new ontology // --> also modifier terms for 
term in &(gene.hpo_terms() & &ids) { - ont.link_gene_term(term, *gene.id())?; - ont.gene_mut(gene.id()) - .ok_or(HpoError::DoesNotExist)? - .add_term(term); + builder.annotate_gene(*gene.id(), gene.name(), term)?; } } @@ -1104,20 +1095,11 @@ impl Ontology { if (omim_disease.hpo_terms() & &phenotype_ids).is_empty() { continue; } - let omim_disease_id = ont.add_omim_disease( - self.omim_disease(omim_disease.id()) - .ok_or(HpoError::DoesNotExist)? - .name(), - &omim_disease.id().as_u32().to_string(), - )?; // Link the omim_disease to every term in the new ontology // --> also modifier terms for term in &(omim_disease.hpo_terms() & &ids) { - ont.link_omim_disease_term(term, omim_disease_id)?; - ont.omim_disease_mut(&omim_disease_id) - .ok_or(HpoError::DoesNotExist)? - .add_term(term); + builder.annotate_omim_disease(*omim_disease.id(), omim_disease.name(), term)?; } } @@ -1128,26 +1110,15 @@ impl Ontology { if (orpha_disease.hpo_terms() & &phenotype_ids).is_empty() { continue; } - let orpha_disease_id = ont.add_orpha_disease( - self.orpha_disease(orpha_disease.id()) - .ok_or(HpoError::DoesNotExist)? - .name(), - &orpha_disease.id().as_u32().to_string(), - )?; // Link the orpha_disease to every term in the new ontology // --> also modifier terms for term in &(orpha_disease.hpo_terms() & &ids) { - ont.link_orpha_disease_term(term, orpha_disease_id)?; - ont.orpha_disease_mut(&orpha_disease_id) - .ok_or(HpoError::DoesNotExist)? 
- .add_term(term); + builder.annotate_orpha_disease(*orpha_disease.id(), orpha_disease.name(), term)?; } } - ont.calculate_information_content()?; - - Ok(ont) + Ok(builder.calculate_information_content()?.build_minimal()) } /// Returns the code to create a `Mermaid` flow diagram @@ -1188,9 +1159,11 @@ impl Ontology { let term_name = term.name().replace(' ', "\n"); let child_name = child.name().replace(' ', "\n"); code.push_str(&format!("\"{term_name}\" -> \"{child_name}\"\n")); + println!("In function: {code}"); } } code.push_str("}\n"); + println!("At the end: {code}"); code } } @@ -1293,367 +1266,6 @@ impl Ontology { Ok(()) } - /// Crates and inserts a new term to the ontology - /// - /// This method does not link the term to its parents or to any annotations - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("FooBar".into(), 1u32); - /// - /// assert_eq!(ontology.len(), 1); - /// ``` - pub fn insert_term>(&mut self, name: String, id: I) { - let term = HpoTermInternal::new(name, id.into()); - self.hpo_terms.insert(term); - } - - /// Add a connection from an [`HpoTerm`] to its parent - /// - /// This method is called once for every dependency in the Ontology during the initialization. 
- /// - /// There should rarely be a need to call this method outside of the ontology building - /// - /// # Panics - /// - /// This method will panic if the `parent_id` or `child_id` is not present in the Ontology - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Foo".into(), 1u32); - /// ontology.insert_term("Bar".into(), 2u32); - /// - /// ontology.add_parent(1u32, 2u32); - /// - /// assert!(ontology.hpo(2u32).unwrap().parent_ids().contains(&1u32.into())); - /// ``` - pub fn add_parent + Copy, J: Into + Copy>( - &mut self, - parent_id: I, - child_id: J, - ) { - let parent = self.get_unchecked_mut(parent_id); - parent.add_child(child_id); - - let child = self.get_unchecked_mut(child_id); - child.add_parent(parent_id); - } - - /// Crates and caches the `all_parents` values for every term - /// - /// This method can only be called once and afterwards no new terms - /// should be added to the Ontology anymore and no new term-parent connection - /// should be created. - /// Since this method caches the results, rerunning it will not cause a new - /// calculation. 
- /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Root".into(), 1u32); - /// ontology.insert_term("Foo".into(), 2u32); - /// ontology.insert_term("Bar".into(), 3u32); - /// - /// ontology.add_parent(1u32, 2u32); - /// ontology.add_parent(2u32, 3u32); - /// - /// // At this point #3 does not have info about grandparents - /// assert!(!ontology.hpo(3u32).unwrap().all_parent_ids().contains(&1u32.into())); - /// - /// ontology.create_cache(); - /// assert!(ontology.hpo(3u32).unwrap().all_parent_ids().contains(&1u32.into())); - /// ``` - pub fn create_cache(&mut self) { - let term_ids: Vec = self.hpo_terms.keys(); - - for id in term_ids { - self.create_cache_of_grandparents(id); - } - } - - /// Add a gene to the Ontology - /// - /// If the gene does not yet exist, a new [`Gene`] entity is created - /// and stored in the Ontology. - /// If the gene already exists in the ontology, it is not added again. - /// - /// # Note - /// - /// Adding a gene does not connect it to any HPO terms. - /// Use [`Ontology::link_gene_term`] for creating connections. - /// - /// This method was changed to receive the `gene_id` as [`GeneId`] - /// instead of `str` in `0.10` and does not return a `Result` anymore. - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::GeneId; - /// - /// let mut ontology = Ontology::default(); - /// assert!(ontology.gene(&1u32.into()).is_none()); - /// - /// ontology.add_gene("Foo", GeneId::from(1)); - /// - /// // Genes can be iterated... - /// let mut gene_iterator = ontology.genes(); - /// let gene = gene_iterator.next().unwrap(); - /// assert_eq!(gene.name(), "Foo"); - /// assert!(gene_iterator.next().is_none()); - /// - /// // .. 
or accessed directly - /// assert!(ontology.gene(&1u32.into()).is_some()); - /// ``` - pub fn add_gene(&mut self, gene_name: &str, gene_id: GeneId) { - if let Entry::Vacant(entry) = self.genes.entry(gene_id) { - entry.insert(Gene::new(gene_id, gene_name)); - } - } - - /// Add an OMIM disease to the Ontology and return the [`OmimDiseaseId`] - /// - /// If the disease does not yet exist, a new [`OmimDisease`] entity is - /// created and stored in the Ontology. - /// If the disease already exists in the ontology, it is not added again. - /// - /// # Note - /// - /// Adding a disease does not connect it to any HPO terms. - /// Use [`Ontology::link_omim_disease_term`] for creating connections. - /// - /// # Errors - /// - /// If the `omim_disease_id` is invalid, an [`HpoError::ParseIntError`] is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::Disease; - /// - /// let mut ontology = Ontology::default(); - /// assert!(ontology.omim_disease(&1u32.into()).is_none()); - /// - /// ontology.add_omim_disease("Foo", "1"); - /// - /// // Diseases can be iterated... - /// let mut disease_iterator = ontology.omim_diseases(); - /// let omim_disease = disease_iterator.next().unwrap(); - /// assert_eq!(omim_disease.name(), "Foo"); - /// assert!(disease_iterator.next().is_none()); - /// - /// // .. 
or accessed directly - /// assert!(ontology.omim_disease(&1u32.into()).is_some()); - /// ``` - pub fn add_omim_disease( - &mut self, - omim_disease_name: &str, - omim_disease_id: &str, - ) -> HpoResult { - let id = OmimDiseaseId::try_from(omim_disease_id)?; - match self.omim_diseases.entry(id) { - std::collections::hash_map::Entry::Occupied(_) => Ok(id), - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(OmimDisease::new(id, omim_disease_name)); - Ok(id) - } - } - } - - /// Add an ORPHA disease to the Ontology and return the [`OrphaDiseaseId`] - /// - /// If the disease does not yet exist, a new [`OrphaDisease`] entity is - /// created and stored in the Ontology. - /// If the disease already exists in the ontology, it is not added again. - /// - /// # Note - /// - /// Adding a disease does not connect it to any HPO terms. - /// Use [`Ontology::link_orpha_disease_term`] for creating connections. - /// - /// # Errors - /// - /// If the `orpha_disease_id` is invalid, an [`HpoError::ParseIntError`] is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::Disease; - /// - /// let mut ontology = Ontology::default(); - /// assert!(ontology.orpha_disease(&1u32.into()).is_none()); - /// - /// ontology.add_orpha_disease("Foo", "1"); - /// - /// // Diseases can be iterated... - /// let mut disease_iterator = ontology.orpha_diseases(); - /// let orpha_disease = disease_iterator.next().unwrap(); - /// assert_eq!(orpha_disease.name(), "Foo"); - /// assert!(disease_iterator.next().is_none()); - /// - /// // .. 
or accessed directly - /// assert!(ontology.orpha_disease(&1u32.into()).is_some()); - /// ``` - pub fn add_orpha_disease( - &mut self, - orpha_disease_name: &str, - orpha_disease_id: &str, - ) -> HpoResult { - let id = OrphaDiseaseId::try_from(orpha_disease_id)?; - match self.orpha_diseases.entry(id) { - std::collections::hash_map::Entry::Occupied(_) => Ok(id), - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(OrphaDisease::new(id, orpha_disease_name)); - Ok(id) - } - } - } - - /// Add the [`Gene`] as annotation to the [`HpoTerm`] - /// - /// The gene will be recursively connected to all parent `HpoTerms` as well. - /// - /// This method does not add the HPO-term to the [`Gene`], this must be handled - /// by the client. - /// - /// # Errors - /// - /// If the HPO term is not present, an [`HpoError::DoesNotExist`] is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::GeneId; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Term-Foo".into(), 1u32); - /// ontology.add_gene("Foo", GeneId::from(5)); - /// ontology.link_gene_term(1u32, GeneId::from(5u32)).unwrap(); - /// - /// let term = ontology.hpo(1u32).unwrap(); - /// assert_eq!(term.genes().next().unwrap().name(), "Foo"); - /// ``` - pub fn link_gene_term>( - &mut self, - term_id: I, - gene_id: GeneId, - ) -> HpoResult<()> { - let term = self.get_mut(term_id).ok_or(HpoError::DoesNotExist)?; - - if term.add_gene(gene_id) { - // If the gene is already associated to the term, this branch will - // be skipped. That is desired, because by definition - // all parent terms are already linked as well - let parents = term.all_parents().clone(); - for parent in &parents { - self.link_gene_term(parent, gene_id)?; - } - } - Ok(()) - } - - /// Add the [`OmimDisease`] as annotation to the [`HpoTerm`] - /// - /// The disease will be recursively connected to all parent `HpoTerms` as well. 
- /// - /// This method does not add the HPO-term to the [`OmimDisease`], this - /// must be handled by the client. - /// - /// # Errors - /// - /// If the HPO term is not present, an [`HpoError`] is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::{Disease, OmimDiseaseId}; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Term-Foo".into(), 1u32); - /// ontology.add_omim_disease("Foo", "5"); - /// ontology.link_omim_disease_term(1u32, OmimDiseaseId::from(5u32)).unwrap(); - /// - /// let term = ontology.hpo(1u32).unwrap(); - /// assert_eq!(term.omim_diseases().next().unwrap().name(), "Foo"); - /// ``` - pub fn link_omim_disease_term>( - &mut self, - term_id: I, - omim_disease_id: OmimDiseaseId, - ) -> HpoResult<()> { - let term = self.get_mut(term_id).ok_or(HpoError::DoesNotExist)?; - - if term.add_omim_disease(omim_disease_id) { - // If the disease is already associated to the term, this branch will - // be skipped. That is desired, because by definition - // all parent terms are already linked as well - let parents = term.all_parents().clone(); - for parent in &parents { - self.link_omim_disease_term(parent, omim_disease_id)?; - } - } - Ok(()) - } - - /// Add the [`OrphaDisease`] as annotation to the [`HpoTerm`] - /// - /// The disease will be recursively connected to all parent `HpoTerms` as well. - /// - /// This method does not add the HPO-term to the [`OrphaDisease`], this - /// must be handled by the client. 
- /// - /// # Errors - /// - /// If the HPO term is not present, an [`HpoError`] is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::{Disease, OrphaDiseaseId}; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Term-Foo".into(), 1u32); - /// ontology.add_orpha_disease("Foo", "5"); - /// ontology.link_orpha_disease_term(1u32, OrphaDiseaseId::from(5u32)).unwrap(); - /// - /// let term = ontology.hpo(1u32).unwrap(); - /// assert_eq!(term.orpha_diseases().next().unwrap().name(), "Foo"); - /// ``` - pub fn link_orpha_disease_term>( - &mut self, - term_id: I, - orpha_disease_id: OrphaDiseaseId, - ) -> HpoResult<()> { - let term = self.get_mut(term_id).ok_or(HpoError::DoesNotExist)?; - - if term.add_orpha_disease(orpha_disease_id) { - // If the disease is already associated to the term, this branch will - // be skipped. That is desired, because by definition - // all parent terms are already linked as well - let parents = term.all_parents().clone(); - for parent in &parents { - self.link_orpha_disease_term(parent, orpha_disease_id)?; - } - } - Ok(()) - } - /// Returns a mutable reference to the [`Gene`] of the provided [`GeneId`] /// /// If no such gene is present, `None` is returned @@ -1718,74 +1330,12 @@ impl Ontology { pub fn orpha_disease_mut(&mut self, disease_id: &OrphaDiseaseId) -> Option<&mut OrphaDisease> { self.orpha_diseases.get_mut(disease_id) } - - /// Calculates the [`crate::term::InformationContent`]s for every term - /// - /// This method should only be called **after** all terms are added, - /// connected and all genes and diseases are linked as well. - /// - /// It can be called repeatedly, all values are recalculated each time, - /// as long as the Ontology contains at least 1 gene/disease. - /// When no genes/diseases are present, the IC is not calculated nor updated. 
- /// - /// # Errors - /// - /// This method returns an error if there are more Genes or Terms than `u16::MAX` - /// because larger numbers can't be safely converted to `f32` - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// - /// let mut ontology = Ontology::default(); - /// - /// // [all kind of logic to add terms, diseases, genes....] - /// - /// ontology.calculate_information_content().unwrap(); - /// ``` - pub fn calculate_information_content(&mut self) -> HpoResult<()> { - self.calculate_gene_ic()?; - self.calculate_omim_disease_ic()?; - self.calculate_orpha_disease_ic()?; - Ok(()) - } } /// Crate-only functions for setting up and building the Ontology /// /// Those methods should not be exposed publicly impl Ontology { - /// Insert an `HpoTermInternal` to the ontology - /// - /// This method does not link the term to its parents or to any annotations - pub(crate) fn add_term(&mut self, term: HpoTermInternal) -> HpoTermId { - let id = *term.id(); - self.hpo_terms.insert(term); - id - } - - pub(crate) fn set_hpo_version(&mut self, version: (u16, u8, u8)) { - self.hpo_version = version; - } - - /// Parses `Bytes` into the Jax-Ontology release version - fn hpo_version_from_bytes(&mut self, bytes: &Bytes) -> HpoResult { - if bytes.version() == BinaryVersion::V1 { - self.set_hpo_version((0u16, 0u8, 0u8)); - Ok(0) - } else { - if bytes.len() < 4 { - return Err(HpoError::ParseBinaryError); - } - let year = u16::from_be_bytes([bytes[0], bytes[1]]); - let month = u8::from_be_bytes([bytes[2]]); - let day = u8::from_be_bytes([bytes[3]]); - self.set_hpo_version((year, month, day)); - Ok(4) - } - } - /// Returns a binary representation of the Ontology's metadata /// /// It adds the HPO-identifying bytes `HPO`, the version @@ -1804,178 +1354,6 @@ impl Ontology { bytes } - /// Adds an [`HpoTerm`] to the ontology - /// - /// This method is part of the Ontology-building, based on the binary - /// data format and requires a specified data layout. 
- /// - /// The method assumes that the data is in the right format and also - /// assumes that the caller takes care of handling all consistencies - /// like parent-child connection etc. - /// - /// See [`HpoTermInternal::as_bytes`] for explanation of the binary layout. - fn add_terms_from_bytes(&mut self, bytes: Bytes) { - for term in BinaryTermBuilder::new(bytes) { - self.add_term(term); - } - } - - /// Connects an [`HpoTerm`] to its parent term - /// - /// This method is part of the Ontology-building, based on the binary - /// data format and requires a specified data layout. - /// - /// The method assumes that the data is in the right format and also - /// assumes that the caller will populate the `all_parents` caches for - /// each term. - /// - /// See [`HpoTermInternal::parents_as_byte`] for explanation of the binary layout. - /// - /// # Panics - /// - /// This method will panic if the length of bytes does not exactly correspond - /// to the contained data - fn add_parent_from_bytes(&mut self, bytes: &[u8]) { - let mut idx: usize = 0; - loop { - if idx == bytes.len() { - break; - } - let n_parents = u32_from_bytes(&bytes[idx..]) as usize; - - idx += 4; - let term = - HpoTermId::from([bytes[idx], bytes[idx + 1], bytes[idx + 2], bytes[idx + 3]]); - idx += 4; - for _ in 0..n_parents { - let parent = - HpoTermId::from([bytes[idx], bytes[idx + 1], bytes[idx + 2], bytes[idx + 3]]); - self.add_parent(parent, term); - idx += 4; - } - } - } - - /// Adds genes to the ontoloigy and connects them to connected terms - /// - /// This method is part of the Ontology-building, based on the binary - /// data format and requires a specified data layout. - /// - /// It connects all connected terms and their parents properly. The - /// method assumes that the bytes encode all gene-term connections. 
- /// - /// See [`Gene::as_bytes`] for explanation of the binary layout - fn add_genes_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { - let mut idx: usize = 0; - loop { - if idx >= bytes.len() { - break; - } - let gene_len = u32_from_bytes(&bytes[idx..]) as usize; - let gene = Gene::try_from(&bytes[idx..idx + gene_len])?; - for term in gene.hpo_terms() { - self.link_gene_term(term, *gene.id())?; - } - self.genes.insert(*gene.id(), gene); - idx += gene_len; - } - Ok(()) - } - - /// Adds [`OmimDisease`]s to the ontoloigy and connects them to connected terms - /// - /// This method is part of the Ontology-building, based on the binary - /// data format and requires a specified data layout. - /// - /// It connects all connected terms and their parents properly. The - /// method assumes that the bytes encode all Disease-term connections. - /// - /// See [`OmimDisease::as_bytes`] for explanation of the binary layout - fn add_omim_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { - let mut idx: usize = 0; - loop { - if idx >= bytes.len() { - break; - } - let disease_len = u32_from_bytes(&bytes[idx..]) as usize; - let disease = OmimDisease::try_from(&bytes[idx..idx + disease_len])?; - for term in disease.hpo_terms() { - self.link_omim_disease_term(term, *disease.id())?; - } - self.omim_diseases.insert(*disease.id(), disease); - idx += disease_len; - } - Ok(()) - } - - /// Adds [`OrphaDisease`]s to the ontoloigy and connects them to connected terms - /// - /// This method is part of the Ontology-building, based on the binary - /// data format and requires a specified data layout. - /// - /// It connects all connected terms and their parents properly. The - /// method assumes that the bytes encode all Disease-term connections. 
- /// - /// See [`OrphaDisease::as_bytes`] for explanation of the binary layout - fn add_orpha_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { - let mut idx: usize = 0; - loop { - if idx >= bytes.len() { - break; - } - let disease_len = u32_from_bytes(&bytes[idx..]) as usize; - let disease = OrphaDisease::try_from(&bytes[idx..idx + disease_len])?; - for term in disease.hpo_terms() { - self.link_orpha_disease_term(term, *disease.id())?; - } - self.orpha_diseases.insert(*disease.id(), disease); - idx += disease_len; - } - Ok(()) - } - - /// This method is part of the cache creation to link all terms to their - /// direct and indirect parents (grandparents) - /// - /// # Panics - /// - /// This method will panic if the `term_id` is not present in the Ontology - fn all_grandparents(&mut self, term_id: HpoTermId) -> &HpoGroup { - if !self.get_unchecked(term_id).parents_cached() { - self.create_cache_of_grandparents(term_id); - } - let term = self.get_unchecked(term_id); - term.all_parents() - } - - /// This method is part of the cache creation to link all terms to their - /// direct and indirect parents (grandparents) - /// - /// It will (somewhat) recursively iterate all parents and copy all their parents. - /// During this recursion, the list of `all_parents` is cached in each term that was - /// iterated. - /// - /// The logic is that the recursion bubbles up all the way to the top of the ontolgy - /// and then caches the list of direct and indirect parents for every term bubbling - /// back down. The recursion does not reach the top level again, because it will stop - /// once it reaches a term with already cached `all_parents`. 
- /// - /// # Panics - /// - /// This method will panic if the `term_id` is not present in the Ontology - fn create_cache_of_grandparents(&mut self, term_id: HpoTermId) { - let mut res = HpoGroup::default(); - let parents = self.get_unchecked(term_id).parents().clone(); - for parent in &parents { - let grandparents = self.all_grandparents(parent); - for gp in grandparents { - res.insert(gp); - } - } - let term = self.get_unchecked_mut(term_id); - *term.all_parents_mut() = res.bitor(&parents); - } - /// Returns the `HpoTermInternal` with the given `HpoTermId` /// /// Returns `None` if no such term is present @@ -1994,66 +1372,6 @@ impl Ontology { pub(crate) fn get_unchecked>(&self, term_id: I) -> &HpoTermInternal { self.hpo_terms.get_unchecked(term_id.into()) } - - /// Returns a mutable reference to the `HpoTermInternal` with the given `HpoTermId` - /// - /// Returns `None` if no such term is present - fn get_mut>(&mut self, term_id: I) -> Option<&mut HpoTermInternal> { - self.hpo_terms.get_mut(term_id.into()) - } - - /// Returns a mutable reference to the `HpoTermInternal` with the given `HpoTermId` - /// - /// This method should only be called if the caller is sure that the term actually - /// exists, e.g. during an iteration of all `HpoTermId`s. 
- /// - /// # Panics - /// - /// This method will panic if the `term_id` is not present in the Ontology - fn get_unchecked_mut>(&mut self, term_id: I) -> &mut HpoTermInternal { - self.hpo_terms.get_unchecked_mut(term_id.into()) - } - - /// Calculates the gene-specific Information Content for every term - /// - /// If no genes are present in the Ontology, no IC are calculated - fn calculate_gene_ic(&mut self) -> HpoResult<()> { - let n_genes = self.genes.len(); - for term in self.hpo_terms.values_mut() { - let current_genes = term.genes().len(); - term.information_content_mut() - .set_gene(n_genes, current_genes)?; - } - Ok(()) - } - - /// Calculates the Omim-Disease-specific Information Content for every term - /// - /// If no diseases are present in the Ontology, no IC are calculated - fn calculate_omim_disease_ic(&mut self) -> HpoResult<()> { - let n_omim_diseases = self.omim_diseases.len(); - - for term in self.hpo_terms.values_mut() { - let current_diseases = term.omim_diseases().len(); - term.information_content_mut() - .set_omim_disease(n_omim_diseases, current_diseases)?; - } - Ok(()) - } - - /// Calculates the Orpha-Disease-specific Information Content for every term - /// - /// If no diseases are present in the Ontology, no IC are calculated - fn calculate_orpha_disease_ic(&mut self) -> HpoResult<()> { - let n_orpha_diseases = self.orpha_diseases.len(); - - for term in self.hpo_terms.values_mut() { - let current_diseases = term.orpha_diseases().len(); - term.information_content_mut() - .set_orpha_disease(n_orpha_diseases, current_diseases)?; - } - Ok(()) - } } /// Iterates the Ontology and yields [`HpoTerm`]s @@ -2086,6 +1404,8 @@ impl<'a> IntoIterator for &'a Ontology { #[cfg(test)] mod test { + use crate::parser::binary::Bytes; + use super::*; #[test] @@ -2097,7 +1417,7 @@ mod test { ("Abnormality", 4u32), ]; - let mut ont = Ontology::default(); + let mut ont = Builder::new(); let mut v: Vec = Vec::new(); for (name, id) in test_terms { @@ -2105,6 
+1425,12 @@ mod test { v.append(&mut t.as_bytes()); } ont.add_terms_from_bytes(Bytes::new(&v, parser::binary::BinaryVersion::V1)); + let ont = ont + .terms_complete() + .connect_all_terms() + .calculate_information_content() + .expect("Test can calculate IC") + .build_minimal(); assert_eq!(ont.len(), 4); } @@ -2117,12 +1443,13 @@ mod test { ("Abnormality", 4u32), ]; - let mut ont = Ontology::default(); + let mut ont = Builder::new(); for (name, id) in test_terms { ont.add_term(HpoTermInternal::new(String::from(name), id.into())); } - assert_eq!(ont.len(), 4); + + // assert_eq!(ont.len(), 4); // The fake term has the same HpoTermId as one of of the Test ontology let mut fake_term = HpoTermInternal::new(String::new(), 3u32.into()); @@ -2131,26 +1458,64 @@ mod test { let bytes = fake_term.parents_as_byte(); + let mut ont = ont.terms_complete(); ont.add_parent_from_bytes(&bytes[..]); - assert_eq!(ont.get_unchecked(3u32).parents().len(), 2); - assert_eq!(ont.get_unchecked(1u32).children().len(), 1); - assert_eq!(ont.get_unchecked(2u32).children().len(), 1); + let ont = ont + .connect_all_terms() + .calculate_information_content() + .expect("Test can calculate IC") + .build_minimal(); + assert_eq!( + ont.hpo(3u32) + .expect("Term added before in test") + .parents() + .count(), + 2 + ); + assert_eq!( + ont.hpo(1u32) + .expect("Term added before in test") + .children() + .count(), + 1 + ); + assert_eq!( + ont.hpo(2u32) + .expect("Term added before in test") + .children() + .count(), + 1 + ); } #[test] fn parse_hpo_version() { - let mut ont = Ontology::default(); + let mut ont = Builder::new(); // 7*256 + 231 == 2023 let v = [7u8, 231u8, 1u8, 31u8]; ont.hpo_version_from_bytes(&Bytes::new(&v, BinaryVersion::V2)) .unwrap(); + + let ont = ont + .terms_complete() + .connect_all_terms() + .calculate_information_content() + .expect("Test can calculate IC") + .build_minimal(); assert_eq!(ont.hpo_version, (2023, 1, 31)); assert_eq!(ont.hpo_version(), "2023-01-31"); + let mut ont = 
Builder::new(); ont.hpo_version_from_bytes(&Bytes::new(&v, BinaryVersion::V1)) .unwrap(); + let ont = ont + .terms_complete() + .connect_all_terms() + .calculate_information_content() + .expect("Test can calculate IC") + .build_minimal(); assert_eq!(ont.hpo_version(), "0000-00-00"); } @@ -2218,14 +1583,32 @@ mod test { } #[test] + #[ignore = "fails with weird \0 extra characters"] fn graphiv() { - let mut ontology = Ontology::default(); - ontology.insert_term("Root".into(), 1u32); - ontology.insert_term("A very long name".into(), 2u32); - ontology.insert_term("A small name".into(), 3u32); + let test_terms = [ + ("Root", 1u32), + ("A very long name", 2u32), + ("A small name", 3u32), + ]; + + let mut ont = Builder::new(); + + let mut v: Vec = Vec::new(); + for (name, id) in test_terms { + let t = HpoTermInternal::new(String::from(name), id.into()); + v.append(&mut t.as_bytes()); + } + ont.add_terms_from_bytes(Bytes::new(&v, parser::binary::BinaryVersion::V1)); + let mut ont = ont.terms_complete(); + + ont.add_parent_unchecked(1u32, 2u32); + ont.add_parent_unchecked(1u32, 3u32); + let ontology = ont + .connect_all_terms() + .calculate_information_content() + .expect("Test can calculate IC") + .build_minimal(); - ontology.add_parent(1u32, 2u32); - ontology.add_parent(1u32, 3u32); let graph = ontology.as_graphviz("fdp"); assert_eq!(graph, "digraph G {\nlayout=fdp\n\"Root\" -> \"A\nvery\nlong\nname\"\n\"Root\" -> \"A\nsmall\nname\"\n}\n"); } diff --git a/src/ontology/builder.rs b/src/ontology/builder.rs index 3d515e9..03b8f12 100644 --- a/src/ontology/builder.rs +++ b/src/ontology/builder.rs @@ -1,3 +1,8 @@ +//! [`Builder`] can be used to manually create custom Ontologies +//! +//! In most cases, this is not recommended, use the +//! built-in functions in [`Ontology`](`crate::Ontology`) instead. 
+ use crate::annotations::Disease; use crate::term::internal::HpoTermInternal; use std::collections::hash_map::Entry; @@ -11,25 +16,27 @@ use crate::annotations::{OrphaDisease, OrphaDiseaseId}; use crate::parser::binary::{BinaryTermBuilder, BinaryVersion, Bytes}; use crate::term::HpoGroup; -use crate::{u32_from_bytes, HpoTermId, Ontology}; -use crate::HpoResult; use crate::HpoError; +use crate::HpoResult; +use crate::{u32_from_bytes, HpoTermId, Ontology}; use crate::ontology::termarena::Arena; - +/// State of [`Builder`] that only contains some 'loose', unconnected terms pub struct LooseCollection; + +/// State of [`Builder`] that only contains all terms of the ontology, +/// but not yet connected to each other. pub struct AllTerms; + +/// State of [`Builder`] that contains all terms, connected to each other pub struct ConnectedTerms; -pub struct FullyAnnotated; -pub trait AddAnotation{} -impl AddAnotation for LooseCollection{} -impl AddAnotation for AllTerms{} -impl AddAnotation for ConnectedTerms{} +/// State of [`Builder`] that contains all terms and all gene and disease annotation +pub struct FullyAnnotated; fn transition_state(builder: Builder) -> Builder { - Builder::{ + Builder:: { hpo_terms: builder.hpo_terms, genes: builder.genes, omim_diseases: builder.omim_diseases, @@ -37,11 +44,80 @@ fn transition_state(builder: Builder) -> Builder { hpo_version: builder.hpo_version, categories: builder.categories, modifier: builder.modifier, - state: PhantomData + state: PhantomData, } } - +/// Builder to manually create an Ontology +/// +/// There should rarely, if ever, be any need to build custom Ontologies. +/// The connections within the HPO, along with gene and disease annotations +/// are quite complex and it's trivially easy to mess this up, when doing it +/// manually. 
+/// +/// To build a full ontology, the builder transitions between the following states: +/// +/// ```text +/// Builder : Add individual terms to the Ontology +/// | +/// terms_complete() +/// | +/// v +/// Builder : Define parent-child relationships +/// | +/// connect_all_terms() +/// | +/// v +/// Builder: Annotate terms and their ancestors with genes/diseases +/// | +/// calculate_information_content() +/// | +/// v +/// Builder +/// | +/// build_with_defaults() or build_minimal() +/// | +/// v +/// Ontology +/// ``` +/// +/// # Examples +/// +/// ``` +/// use hpo::builder::Builder; +/// +/// let mut builder = Builder::new(); +/// +/// // Add three terms +/// builder.new_term("Root", 1u32); +/// builder.new_term("First child", 2u32); +/// builder.new_term("Second child", 3u32); +/// +/// // before connecting terms, indicate that all terms have been added +/// let mut builder = builder.terms_complete(); +/// +/// // Connect both childs to the root term +/// builder.add_parent(1u32, 2u32); +/// builder.add_parent(1u32, 3u32); +/// +/// // Build all connections and cache the connections +/// let mut builder = builder.connect_all_terms(); +/// +/// builder.annotate_gene(11u32.into(), "Gene1", 2u32.into()).unwrap(); +/// builder.annotate_omim_disease(22u32.into(), "Disease 1", 3u32.into()).unwrap(); +/// +/// // Indicate that all annotations are added an calculate the information content +/// let mut builder = builder.calculate_information_content().unwrap(); +/// +/// // Build an Ontology +/// let ontology = builder.build_minimal(); +/// +/// assert_eq!(ontology.len(), 3); +/// +/// let root_term = ontology.hpo(1u32).unwrap(); +/// assert_eq!(root_term.name(), "Root"); +/// ``` +/// pub struct Builder { hpo_terms: Arena, genes: HashMap, @@ -50,51 +126,20 @@ pub struct Builder { hpo_version: (u16, u8, u8), categories: HpoGroup, modifier: HpoGroup, - state: PhantomData + state: PhantomData, } - -impl Builder { - pub fn add_gene(&mut self, gene_name: &str, gene_id: 
GeneId) { - if let Entry::Vacant(entry) = self.genes.entry(gene_id) { - entry.insert(Gene::new(gene_id, gene_name)); - } - } - - pub fn add_omim_disease( - &mut self, - omim_disease_name: &str, - omim_disease_id: &str, - ) -> HpoResult { - let id = OmimDiseaseId::try_from(omim_disease_id)?; - match self.omim_diseases.entry(id) { - std::collections::hash_map::Entry::Occupied(_) => Ok(id), - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(OmimDisease::new(id, omim_disease_name)); - Ok(id) - } - } - } - - pub fn add_orpha_disease( - &mut self, - orpha_disease_name: &str, - orpha_disease_id: &str, - ) -> HpoResult { - let id = OrphaDiseaseId::try_from(orpha_disease_id)?; - match self.orpha_diseases.entry(id) { - std::collections::hash_map::Entry::Occupied(_) => Ok(id), - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(OrphaDisease::new(id, orpha_disease_name)); - Ok(id) - } - } +impl Default for Builder { + fn default() -> Self { + Self::new() } } - - -impl Builder{ +/// In this state, the Builder contains a loose collection +/// of HPO-terms. They don't yet have any relation to each +/// other or any associated annotations +impl Builder { + /// Creates a new `Builder` instance to manually crate an Ontology pub fn new() -> Builder { Builder:: { hpo_terms: Arena::default(), @@ -104,11 +149,11 @@ impl Builder{ hpo_version: (0u16, 0u8, 0u8), categories: HpoGroup::default(), modifier: HpoGroup::default(), - state: PhantomData + state: PhantomData, } - } + } - /// Adds an [`HpoTerm`] to the ontology + /// Adds [`crate::HpoTerm`]s to the ontology /// /// This method is part of the Ontology-building, based on the binary /// data format and requires a specified data layout. @@ -118,7 +163,7 @@ impl Builder{ /// like parent-child connection etc. /// /// See [`HpoTermInternal::as_bytes`] for explanation of the binary layout. 
- pub (crate) fn add_terms_from_bytes(&mut self, bytes: Bytes) { + pub(crate) fn add_terms_from_bytes(&mut self, bytes: Bytes) { for term in BinaryTermBuilder::new(bytes) { self.add_term(term); } @@ -126,13 +171,51 @@ impl Builder{ /// Insert an `HpoTermInternal` to the ontology /// - /// This method does not link the term to its parents or to any annotations - pub(crate) fn add_term(&mut self, term: HpoTermInternal) -> HpoTermId { - let id = *term.id(); + /// This method does not link the term to its parents or to any annotations. + /// Since `HpoTermInternal` is a crate-private struct, this method + /// is only available in-crate. + pub(crate) fn add_term(&mut self, term: HpoTermInternal) { self.hpo_terms.insert(term); - id } + /// Adds a new term to the ontology + /// + /// The term does not have any connections to other terms or any gene/disease + /// annotations. Parents and children of terms can be added in the + /// `Builder` state. + /// + /// # Examples + /// + /// ``` + /// use hpo::builder::Builder; + /// + /// let mut builder = Builder::new(); + /// + /// // Add three terms + /// builder.new_term("Root", 1u32); + /// builder.new_term("First child", 2u32); + /// builder.new_term("Second child", 3u32); + /// + /// // quickly transition through all stages to build the ontology + /// let ontology = builder + /// .terms_complete() + /// .connect_all_terms() + /// .calculate_information_content().unwrap() + /// .build_minimal(); + /// + /// assert_eq!(ontology.len(), 3); + /// ``` + pub fn new_term>(&mut self, name: &str, id: I) { + let term = HpoTermInternal::new(name.to_string(), id.into()); + self.add_term(term); + } + + /// Transitions the state to `Builder` + /// + /// This method indicates that all terms have been added. It is not possible + /// to add new terms afterwards. + /// Transitioning to `Builder` is required to crate parent-child + /// connections between terms. 
#[must_use] pub fn terms_complete(self) -> Builder { transition_state(self) @@ -140,26 +223,39 @@ impl Builder{ } impl Builder { - /// Add a connection from an [`HpoTerm`] to its parent + /// Add a connection from an [`HpoTerm`](`crate::HpoTerm`) to its parent /// /// This method is called once for every dependency in the Ontology during the initialization. /// - /// There should rarely be a need to call this method outside of the ontology building - /// - /// # Panics + /// # Errors /// - /// This method will panic if the `parent_id` or `child_id` is not present in the Ontology + /// This method will return `HpoError::DoesNotExist` when the parent or child + /// `HpoTerm` does not exist. /// /// # Examples /// /// ``` - /// use hpo::Ontology; + /// use hpo::builder::Builder; + /// # use hpo::builder::AllTerms; + /// + /// fn example_builder() -> Builder + /// # { + /// # let mut builder = Builder::new(); + /// # builder.new_term("Foo", 1u32); + /// # builder.new_term("Bar", 2u32); + /// # builder.terms_complete() + /// # } + /// + /// let mut builder: Builder = example_builder(); /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Foo".into(), 1u32); - /// ontology.insert_term("Bar".into(), 2u32); + /// // connect a term to its parent + /// builder.add_parent(1u32, 2u32).unwrap(); /// - /// ontology.add_parent(1u32, 2u32); + /// // quickly transition through all stages to build the ontology + /// let ontology = builder + /// .connect_all_terms() + /// .calculate_information_content().unwrap() + /// .build_minimal(); /// /// assert!(ontology.hpo(2u32).unwrap().parent_ids().contains(&1u32.into())); /// ``` @@ -167,6 +263,33 @@ impl Builder { &mut self, parent_id: I, child_id: J, + ) -> HpoResult<()> { + let parent = self + .hpo_terms + .get_mut(parent_id.into()) + .ok_or(HpoError::DoesNotExist)?; + parent.add_child(child_id); + + let child = self + .hpo_terms + .get_mut(child_id.into()) + .ok_or(HpoError::DoesNotExist)?; + 
child.add_parent(parent_id); + Ok(()) + } + + /// Add a connection from an [`HpoTerm`](`crate::HpoTerm`) to its parent + /// + /// This method is called once for every dependency in the Ontology during the initialization. + /// + /// # Panics + /// + /// This method will panic if the `parent_id` or `child_id` is not present in the Ontology + /// + pub(crate) fn add_parent_unchecked + Copy, J: Into + Copy>( + &mut self, + parent_id: I, + child_id: J, ) { let parent = self.hpo_terms.get_unchecked_mut(parent_id.into()); parent.add_child(child_id); @@ -175,7 +298,7 @@ impl Builder { child.add_parent(parent_id); } - /// Connects an [`HpoTerm`] to its parent term + /// Connects an [`HpoTerm`](`crate::HpoTerm`) to its parent term /// /// This method is part of the Ontology-building, based on the binary /// data format and requires a specified data layout. @@ -205,45 +328,48 @@ impl Builder { for _ in 0..n_parents { let parent = HpoTermId::from([bytes[idx], bytes[idx + 1], bytes[idx + 2], bytes[idx + 3]]); - self.add_parent(parent, term); + self.add_parent_unchecked(parent, term); idx += 4; } } } - - /// Crates and caches the `all_parents` values for every term + /// Transitions the state to `Builder` /// - /// This method can only be called once and afterwards no new terms - /// should be added to the Ontology anymore and no new term-parent connection - /// should be created. - /// Since this method caches the results, rerunning it will not cause a new - /// calculation. + /// After changing the state no new terms can be added to the Ontology + /// anymore and no new term-parent connection can should be created. 
/// /// # Examples /// /// ``` - /// use hpo::Ontology; + /// use hpo::builder::Builder; + /// # use hpo::builder::{AllTerms, ConnectedTerms}; + /// + /// fn example_builder() -> Builder + /// # { + /// # let mut builder = Builder::new(); + /// # builder.new_term("Foo", 1u32); + /// # builder.new_term("Bar", 2u32); + /// # let mut builder = builder.terms_complete(); + /// # builder.add_parent(1u32, 2u32).unwrap(); + /// # builder + /// # } /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Root".into(), 1u32); - /// ontology.insert_term("Foo".into(), 2u32); - /// ontology.insert_term("Bar".into(), 3u32); + /// let mut builder: Builder = example_builder(); /// - /// ontology.add_parent(1u32, 2u32); - /// ontology.add_parent(2u32, 3u32); + /// // connect all the terms and return a `Builder` + /// let builder: Builder = builder.connect_all_terms(); /// - /// // At this point #3 does not have info about grandparents - /// assert!(!ontology.hpo(3u32).unwrap().all_parent_ids().contains(&1u32.into())); + /// // quickly transition through all stages to build the ontology + /// let ontology = builder + /// .calculate_information_content().unwrap() + /// .build_minimal(); /// - /// ontology.create_cache(); - /// assert!(ontology.hpo(3u32).unwrap().all_parent_ids().contains(&1u32.into())); + /// assert!(ontology.hpo(2u32).unwrap().parent_ids().contains(&1u32.into())); /// ``` #[must_use] pub fn connect_all_terms(mut self) -> Builder { - let term_ids: Vec = self.hpo_terms.keys(); - - for id in term_ids { + for id in self.hpo_terms.keys() { self.create_cache_of_grandparents(id); } transition_state(self) @@ -292,8 +418,294 @@ impl Builder { } } - impl Builder { + /// Add the [`Gene`] as annotation to the [`HpoTerm`](`crate::HpoTerm`) + /// + /// The gene will be recursively connected to all parent `HpoTerms` as well. 
+ /// + /// # Errors + /// + /// If the HPO term is not present, an [`HpoError::DoesNotExist`] is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::HpoTermId; + /// use hpo::annotations::GeneId; + /// use hpo::builder::Builder; + /// # use hpo::builder::ConnectedTerms; + /// + /// fn example_builder() -> Builder + /// # { + /// # let mut builder = Builder::new(); + /// # builder.new_term("Foo", 1u32); + /// # builder.new_term("Bar", 2u32); + /// # let mut builder = builder.terms_complete(); + /// # builder.add_parent(1u32, 2u32).unwrap(); + /// # builder.connect_all_terms() + /// # } + /// + /// let mut builder: Builder = example_builder(); + /// + /// builder.annotate_gene(GeneId::from(5), "Gene 1", HpoTermId::from(1u32)); + /// + /// // quickly transition through all stages to build the ontology + /// let ontology = builder + /// .calculate_information_content().unwrap() + /// .build_minimal(); + /// + /// let term = ontology.hpo(1u32).unwrap(); + /// assert!(term.genes().find(|gene| gene.name() == "Gene 1").is_some()); + /// assert!(term.genes().find(|gene| gene.name() == "Foobar").is_none()); + /// ``` + #[allow(clippy::missing_panics_doc)] + pub fn annotate_gene( + &mut self, + gene_id: GeneId, + gene_name: &str, + term_id: HpoTermId, + ) -> HpoResult<()> { + self.add_gene(gene_name, gene_id); + let gene = self + .genes + .get_mut(&gene_id) + .expect("Gene is present because it was just add_omim_disease"); + + gene.add_term(term_id); + self.link_gene_term(term_id, gene_id)?; + Ok(()) + } + + /// Add the [`OmimDisease`] as annotation to the [`HpoTerm`](`crate::HpoTerm`) + /// + /// The disease will be recursively connected to all parent `HpoTerms` as well. 
+ /// + /// # Errors + /// + /// If the HPO term is not present, an [`HpoError::DoesNotExist`] is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::HpoTermId; + /// use hpo::annotations::{Disease, OmimDiseaseId}; + /// use hpo::builder::Builder; + /// # use hpo::builder::ConnectedTerms; + /// + /// fn example_builder() -> Builder + /// # { + /// # let mut builder = Builder::new(); + /// # builder.new_term("Foo", 1u32); + /// # builder.new_term("Bar", 2u32); + /// # let mut builder = builder.terms_complete(); + /// # builder.add_parent(1u32, 2u32).unwrap(); + /// # builder.connect_all_terms() + /// # } + /// + /// let mut builder: Builder = example_builder(); + /// + /// builder.annotate_omim_disease(OmimDiseaseId::from(5), "Disease 1", HpoTermId::from(1u32)); + /// + /// // quickly transition through all stages to build the ontology + /// let ontology = builder + /// .calculate_information_content().unwrap() + /// .build_minimal(); + /// + /// let term = ontology.hpo(1u32).unwrap(); + /// assert!(term.omim_diseases().find(|disease| disease.name() == "Disease 1").is_some()); + /// assert!(term.omim_diseases().find(|disease| disease.name() == "Foobar").is_none()); + /// ``` + #[allow(clippy::missing_panics_doc)] + pub fn annotate_omim_disease( + &mut self, + omim_id: OmimDiseaseId, + omim_name: &str, + term_id: HpoTermId, + ) -> HpoResult<()> { + self.add_omim_disease(omim_name, omim_id); + let gene = self + .omim_diseases + .get_mut(&omim_id) + .expect("Gene is present because it was just add_omim_disease"); + + gene.add_term(term_id); + self.link_omim_disease_term(term_id, omim_id)?; + + Ok(()) + } + + /// Add the [`OrphaDisease`] as annotation to the [`HpoTerm`](`crate::HpoTerm`) + /// + /// The disease will be recursively connected to all parent `HpoTerms` as well. 
+ /// + /// # Errors + /// + /// If the HPO term is not present, an [`HpoError::DoesNotExist`] is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::HpoTermId; + /// use hpo::annotations::{Disease, OrphaDiseaseId}; + /// use hpo::builder::Builder; + /// # use hpo::builder::ConnectedTerms; + /// + /// fn example_builder() -> Builder + /// # { + /// # let mut builder = Builder::new(); + /// # builder.new_term("Foo", 1u32); + /// # builder.new_term("Bar", 2u32); + /// # let mut builder = builder.terms_complete(); + /// # builder.add_parent(1u32, 2u32).unwrap(); + /// # builder.connect_all_terms() + /// # } + /// + /// let mut builder: Builder = example_builder(); + /// + /// builder.annotate_orpha_disease(OrphaDiseaseId::from(5), "Disease 1", HpoTermId::from(1u32)); + /// + /// // quickly transition through all stages to build the ontology + /// let ontology = builder + /// .calculate_information_content().unwrap() + /// .build_minimal(); + /// + /// let term = ontology.hpo(1u32).unwrap(); + /// assert!(term.orpha_diseases().find(|disease| disease.name() == "Disease 1").is_some()); + /// assert!(term.orpha_diseases().find(|disease| disease.name() == "Foobar").is_none()); + /// ``` + #[allow(clippy::missing_panics_doc)] + pub fn annotate_orpha_disease( + &mut self, + orpha_id: OrphaDiseaseId, + orpha_name: &str, + term_id: HpoTermId, + ) -> HpoResult<()> { + self.add_orpha_disease(orpha_name, orpha_id); + let gene = self + .orpha_diseases + .get_mut(&orpha_id) + .expect("Gene is present because it was just add_orpha_disease"); + + gene.add_term(term_id); + self.link_orpha_disease_term(term_id, orpha_id)?; + + Ok(()) + } + + /// Calculates the [`crate::term::InformationContent`]s for every term + /// and transitions to the `FullyAnnotated` state + /// + /// This method should only be called **after** all terms are added, + /// connected and all genes and diseases are linked as well. 
+ /// + /// # Errors + /// + /// This method returns an error if there are more Genes or Terms than `u16::MAX` + /// because larger numbers can't be safely converted to `f32` + /// + /// # Examples + /// + /// ``` + /// use hpo::HpoTermId; + /// use hpo::annotations::GeneId; + /// use hpo::builder::Builder; + /// # use hpo::builder::ConnectedTerms; + /// + /// fn example_builder() -> Builder + /// # { + /// # let mut builder = Builder::new(); + /// # builder.new_term("Foo", 1u32); + /// # builder.new_term("Bar", 2u32); + /// # let mut builder = builder.terms_complete(); + /// # builder.add_parent(1u32, 2u32).unwrap(); + /// # let mut builder = builder.connect_all_terms(); + /// # builder + /// # } + /// + /// let mut builder: Builder = example_builder(); + /// builder.annotate_gene(GeneId::from(1), "Gene 1", HpoTermId::from(1u32)); + /// builder.annotate_gene(GeneId::from(2), "Gene 2", HpoTermId::from(2u32)); + /// + /// // transition to final state of `Builder` + /// let builder = builder.calculate_information_content().unwrap(); + /// + /// let ontology = builder.build_minimal(); + /// + /// let gene_ic = ontology + /// .hpo(2u32).unwrap() + /// .information_content() + /// .gene(); + /// assert!(gene_ic > 0.0, "{gene_ic}"); + /// ``` + pub fn calculate_information_content(mut self) -> HpoResult> { + self.calculate_gene_ic()?; + self.calculate_omim_disease_ic()?; + self.calculate_orpha_disease_ic()?; + + Ok(transition_state(self)) + } + + /// Adds a [`Gene`](`crate::annotations::Gene`) to the ontology + /// + /// The gene is not yet linked to any terms, this must be done + /// through [`Builder::annotate_gene`](`Builder::annotate_gene`) + /// + /// # Note: + /// + /// There is rarely need to call this method directly. 
The preferred way is to use + /// [`Builder::annotate_gene`](`Builder::annotate_gene`) + pub fn add_gene(&mut self, gene_name: &str, gene_id: GeneId) { + if let Entry::Vacant(entry) = self.genes.entry(gene_id) { + entry.insert(Gene::new(gene_id, gene_name)); + } + } + + /// Adds an [`OmimDisease`](`crate::annotations::OmimDisease`) to the ontology + /// + /// The gene is not yet linked to any terms, this must be done + /// through [`Builder::annotate_omim_disease`](`Builder::annotate_omim_disease`) + /// + /// # Note: + /// + /// There is rarely need to call this method directly. The preferred way is to use + /// [`Builder::annotate_omim_disease`](`Builder::annotate_omim_disease`) + pub fn add_omim_disease( + &mut self, + omim_disease_name: &str, + omim_disease_id: OmimDiseaseId, + ) -> OmimDiseaseId { + match self.omim_diseases.entry(omim_disease_id) { + std::collections::hash_map::Entry::Occupied(_) => omim_disease_id, + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(OmimDisease::new(omim_disease_id, omim_disease_name)); + omim_disease_id + } + } + } + + /// Adds an [`OrphaDisease`](`crate::annotations::OrphaDisease`) to the ontology + /// + /// The gene is not yet linked to any terms, this must be done + /// through [`Builder::annotate_orpha_disease`](`Builder::annotate_orpha_disease`) + /// + /// # Note: + /// + /// There is rarely need to call this method directly. 
The preferred way is to use + /// [`Builder::annotate_orpha_disease`](`Builder::annotate_orpha_disease`) + pub fn add_orpha_disease( + &mut self, + orpha_disease_name: &str, + orpha_disease_id: OrphaDiseaseId, + ) -> OrphaDiseaseId { + match self.orpha_diseases.entry(orpha_disease_id) { + std::collections::hash_map::Entry::Occupied(_) => orpha_disease_id, + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(OrphaDisease::new(orpha_disease_id, orpha_disease_name)); + orpha_disease_id + } + } + } + /// Adds genes to the ontoloigy and connects them to connected terms /// /// This method is part of the Ontology-building, based on the binary @@ -303,7 +715,7 @@ impl Builder { /// method assumes that the bytes encode all gene-term connections. /// /// See [`Gene::as_bytes`] for explanation of the binary layout - pub (crate) fn add_genes_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { + pub(crate) fn add_genes_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { let mut idx: usize = 0; loop { if idx >= bytes.len() { @@ -329,7 +741,7 @@ impl Builder { /// method assumes that the bytes encode all Disease-term connections. /// /// See [`OmimDisease::as_bytes`] for explanation of the binary layout - pub (crate) fn add_omim_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { + pub(crate) fn add_omim_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { let mut idx: usize = 0; loop { if idx >= bytes.len() { @@ -355,7 +767,7 @@ impl Builder { /// method assumes that the bytes encode all Disease-term connections. 
/// /// See [`OrphaDisease::as_bytes`] for explanation of the binary layout - pub (crate) fn add_orpha_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { + pub(crate) fn add_orpha_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { let mut idx: usize = 0; loop { if idx >= bytes.len() { @@ -372,37 +784,22 @@ impl Builder { Ok(()) } - /// Add the [`Gene`] as annotation to the [`HpoTerm`] + /// Add the [`Gene`] as annotation to the [`HpoTerm`](`crate::HpoTerm`) /// /// The gene will be recursively connected to all parent `HpoTerms` as well. /// - /// This method does not add the HPO-term to the [`Gene`], this must be handled - /// by the client. + /// This method does not add the HPO-term to the [`Gene`], this + /// must be handled by the client. /// /// # Errors /// - /// If the HPO term is not present, an [`HpoError::DoesNotExist`] is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::GeneId; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Term-Foo".into(), 1u32); - /// ontology.add_gene("Foo", GeneId::from(5)); - /// ontology.link_gene_term(1u32, GeneId::from(5u32)).unwrap(); + /// If the HPO term is not present, an [`HpoError`] is returned /// - /// let term = ontology.hpo(1u32).unwrap(); - /// assert_eq!(term.genes().next().unwrap().name(), "Foo"); - /// ``` - pub fn link_gene_term>( - &mut self, - term_id: I, - gene_id: GeneId, - ) -> HpoResult<()> { - let term = self.hpo_terms.get_mut(term_id.into()).ok_or(HpoError::DoesNotExist)?; + fn link_gene_term(&mut self, term_id: HpoTermId, gene_id: GeneId) -> HpoResult<()> { + let term = self + .hpo_terms + .get_mut(term_id) + .ok_or(HpoError::DoesNotExist)?; if term.add_gene(gene_id) { // If the gene is already associated to the term, this branch will @@ -416,7 +813,7 @@ impl Builder { Ok(()) } - /// Add the [`OmimDisease`] as annotation to the [`HpoTerm`] + /// Add the [`OmimDisease`] as annotation to the 
[`HpoTerm`](`crate::HpoTerm`) /// /// The disease will be recursively connected to all parent `HpoTerms` as well. /// @@ -427,26 +824,15 @@ impl Builder { /// /// If the HPO term is not present, an [`HpoError`] is returned /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::{Disease, OmimDiseaseId}; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Term-Foo".into(), 1u32); - /// ontology.add_omim_disease("Foo", "5"); - /// ontology.link_omim_disease_term(1u32, OmimDiseaseId::from(5u32)).unwrap(); - /// - /// let term = ontology.hpo(1u32).unwrap(); - /// assert_eq!(term.omim_diseases().next().unwrap().name(), "Foo"); - /// ``` - pub fn link_omim_disease_term>( + fn link_omim_disease_term( &mut self, - term_id: I, + term_id: HpoTermId, omim_disease_id: OmimDiseaseId, ) -> HpoResult<()> { - let term = self.hpo_terms.get_mut(term_id.into()).ok_or(HpoError::DoesNotExist)?; + let term = self + .hpo_terms + .get_mut(term_id) + .ok_or(HpoError::DoesNotExist)?; if term.add_omim_disease(omim_disease_id) { // If the disease is already associated to the term, this branch will @@ -460,7 +846,7 @@ impl Builder { Ok(()) } - /// Add the [`OrphaDisease`] as annotation to the [`HpoTerm`] + /// Add the [`OrphaDisease`] as annotation to the [`HpoTerm`](`crate::HpoTerm`) /// /// The disease will be recursively connected to all parent `HpoTerms` as well. 
/// @@ -471,26 +857,15 @@ impl Builder { /// /// If the HPO term is not present, an [`HpoError`] is returned /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::{Disease, OrphaDiseaseId}; - /// - /// let mut ontology = Ontology::default(); - /// ontology.insert_term("Term-Foo".into(), 1u32); - /// ontology.add_orpha_disease("Foo", "5"); - /// ontology.link_orpha_disease_term(1u32, OrphaDiseaseId::from(5u32)).unwrap(); - /// - /// let term = ontology.hpo(1u32).unwrap(); - /// assert_eq!(term.orpha_diseases().next().unwrap().name(), "Foo"); - /// ``` - pub fn link_orpha_disease_term>( + fn link_orpha_disease_term( &mut self, - term_id: I, + term_id: HpoTermId, orpha_disease_id: OrphaDiseaseId, ) -> HpoResult<()> { - let term = self.hpo_terms.get_mut(term_id.into()).ok_or(HpoError::DoesNotExist)?; + let term = self + .hpo_terms + .get_mut(term_id) + .ok_or(HpoError::DoesNotExist)?; if term.add_orpha_disease(orpha_disease_id) { // If the disease is already associated to the term, this branch will @@ -504,40 +879,6 @@ impl Builder { Ok(()) } - /// Calculates the [`crate::term::InformationContent`]s for every term - /// - /// This method should only be called **after** all terms are added, - /// connected and all genes and diseases are linked as well. - /// - /// It can be called repeatedly, all values are recalculated each time, - /// as long as the Ontology contains at least 1 gene/disease. - /// When no genes/diseases are present, the IC is not calculated nor updated. - /// - /// # Errors - /// - /// This method returns an error if there are more Genes or Terms than `u16::MAX` - /// because larger numbers can't be safely converted to `f32` - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// - /// let mut ontology = Ontology::default(); - /// - /// // [all kind of logic to add terms, diseases, genes....] 
- /// - /// ontology.calculate_information_content().unwrap(); - /// ``` - #[must_use] - pub fn calculate_information_content(mut self) -> HpoResult> { - self.calculate_gene_ic()?; - self.calculate_omim_disease_ic()?; - self.calculate_orpha_disease_ic()?; - - Ok(transition_state(self)) - } - /// Calculates the gene-specific Information Content for every term /// /// If no genes are present in the Ontology, no IC are calculated @@ -580,30 +921,57 @@ impl Builder { } } - impl Builder { + /// Builds the [`Ontology`] with default settings + /// + /// This method can only be used with the standard HPO from Jax + /// and will most likely not work with custom ontologies. + /// + /// # Errors + /// + /// This method requires that the main-category terms: + /// + /// - `HP:0000001 | All` + /// - `HP:0000118 | Phenotypic abnormality` + /// + /// are present in the Ontology. pub fn build_with_defaults(self) -> HpoResult { - let mut ont = Ontology { + let mut ont = self.build_minimal(); + ont.set_default_categories()?; + ont.set_default_modifier()?; + Ok(ont) + } + + /// Builds the [`Ontology`] + /// + /// This method will not specify different phenotype + /// categories or modifier terms. + /// + /// Use this method only with custom ontologies. When using the standard + /// Jax ontology, use the recommended [`Builder::build_with_defaults`] + /// method. + pub fn build_minimal(self) -> Ontology { + Ontology { hpo_terms: self.hpo_terms, genes: self.genes, omim_diseases: self.omim_diseases, orpha_diseases: self.orpha_diseases, hpo_version: self.hpo_version, ..Default::default() - }; - ont.set_default_categories()?; - ont.set_default_modifier()?; - Ok(ont) + } } } impl Builder { + /// Defines the HPO version of the Ontology + /// The version should be specified as \[YEAR\]-\[MONTH\]-\[DAY\], e.g. 
+ /// `2024-08-21` pub fn set_hpo_version(&mut self, version: (u16, u8, u8)) { self.hpo_version = version; } /// Parses `Bytes` into the Jax-Ontology release version - pub (crate) fn hpo_version_from_bytes(&mut self, bytes: &Bytes) -> HpoResult { + pub(crate) fn hpo_version_from_bytes(&mut self, bytes: &Bytes) -> HpoResult { if bytes.version() == BinaryVersion::V1 { self.set_hpo_version((0u16, 0u8, 0u8)); Ok(0) @@ -619,51 +987,3 @@ impl Builder { } } } - - -/* -struct OntologyBuilder{} -impl OntologyBuilder { - fn add_terms(&mut self, ) -} - -1. add terms -2. connect terms to parents -3. add genes or diseases -4. connect genes or diseases with terms (2 must be finished) - -stateDiagram-v2 - Builder --> Builder : add terms - Builder --> Builder: add annotations (gene, disease) - Builder --> Builder2: connect parents and children (add_parent()) - Builder2 --> Builder3: cache all terms and parents (create_cache()) - Builder2 --> Builder2: add annotations (gene, disease) - Builder3 --> Builder3: add annotations (gene, disease) - Builder3 --> Builder3: set_categories, set_modifier - Builder3 --> Builder3: link annotations to terms - Builder3 --> Builder4: calculate information content - Builder4 --> Builder4: set_categories, set_modifier - - -Builder -| -add_parent() -| -V -Builder -| -crate_cache() -| -V -Builder -| -calculate_information_content() -| -V -Builder -| -ontology() -| -V -Ontology -*/ diff --git a/src/parser.rs b/src/parser.rs index 0b90a4a..b60b74f 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -2,7 +2,9 @@ use std::path::Path; -use crate::{HpoResult, Ontology}; +use builder::Builder; + +use crate::{ontology::builder, HpoResult, Ontology}; pub(crate) mod binary; /// Module to parse `hp.obo` file @@ -15,6 +17,8 @@ pub(crate) mod hp_obo; pub(crate) mod gene_to_hpo { use crate::annotations::GeneId; + use crate::ontology::builder::ConnectedTerms; + use crate::ontology::Builder; use crate::parser::Path; use crate::HpoError; use crate::HpoResult; @@ -23,7 
+27,6 @@ pub(crate) mod gene_to_hpo { use std::io::BufReader; use crate::HpoTermId; - use crate::Ontology; struct ParsedGene<'a> { ncbi_id: GeneId, @@ -131,9 +134,9 @@ pub(crate) mod gene_to_hpo { /// ``` pub fn parse_genes_to_phenotype>( file: P, - ontology: &mut Ontology, + builder: &mut Builder, ) -> HpoResult<()> { - parse(file, ontology, genes_to_phenotype_line) + parse(file, builder, genes_to_phenotype_line) } /// Parse `phenotype_to_genes.txt` file @@ -146,15 +149,15 @@ pub(crate) mod gene_to_hpo { /// ``` pub fn parse_phenotype_to_genes>( file: P, - ontology: &mut Ontology, + builder: &mut Builder, ) -> HpoResult<()> { - parse(file, ontology, phenotype_to_gene_line) + parse(file, builder, phenotype_to_gene_line) } /// Parses a file to connect genes to HPO terms fn parse, F: Fn(&str) -> HpoResult>>( file: P, - ontology: &mut Ontology, + builder: &mut Builder, parse_line: F, ) -> HpoResult<()> { let filename = file.as_ref().display().to_string(); @@ -169,13 +172,7 @@ pub(crate) mod gene_to_hpo { })?; let gene = parse_line(&line)?; - - ontology.add_gene(gene.symbol, gene.ncbi_id); - ontology.link_gene_term(gene.hpo, gene.ncbi_id)?; - ontology - .gene_mut(&gene.ncbi_id) - .expect("Gene is present because it was just add_omim_disease") - .add_term(gene.hpo); + builder.annotate_gene(gene.ncbi_id, gene.symbol, gene.hpo)?; } Ok(()) } @@ -342,7 +339,10 @@ pub(crate) mod gene_to_hpo { /// ``` /// pub(crate) mod disease_to_hpo { - use crate::annotations::Disease; + use crate::annotations::OmimDiseaseId; + use crate::annotations::OrphaDiseaseId; + use crate::ontology::builder::ConnectedTerms; + use crate::ontology::Builder; use crate::HpoError; use crate::HpoResult; use crate::HpoTermId; @@ -351,8 +351,6 @@ pub(crate) mod disease_to_hpo { use std::io::BufReader; use std::path::Path; - use crate::Ontology; - enum DiseaseKind<'a> { Omim(DiseaseComponents<'a>), Orpha(DiseaseComponents<'a>), @@ -364,6 +362,16 @@ pub(crate) mod disease_to_hpo { hpo_id: HpoTermId, } + 
impl<'a> DiseaseComponents<'a> { + fn omim_disease_id(&self) -> HpoResult { + OmimDiseaseId::try_from(self.id) + } + + fn orpha_disease_id(&self) -> HpoResult { + OrphaDiseaseId::try_from(self.id) + } + } + fn parse_line(line: &str) -> HpoResult>> { if line.starts_with("OMIM") { Ok(parse_disease_components(line)?.map(DiseaseKind::Omim)) @@ -374,7 +382,7 @@ pub(crate) mod disease_to_hpo { } } - fn parse_disease_components(line: &str) -> HpoResult>> { + fn parse_disease_components(line: &str) -> HpoResult> { let mut cols = line.trim().splitn(5, '\t'); let Some(id_col) = cols.next() else { @@ -413,7 +421,7 @@ pub(crate) mod disease_to_hpo { /// - [`HpoError::CannotOpenFile`]: Source file not present or can't be opened /// - [`HpoError::ParseIntError`]: A line contains an invalid `omim_disease_id` /// - [`HpoError::DoesNotExist`]: A line contains a non-existing [`HpoTermId`] - pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { + pub fn parse>(file: P, builder: &mut Builder) -> HpoResult<()> { let filename = file.as_ref().display().to_string(); let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; let reader = BufReader::new(file); @@ -421,22 +429,18 @@ pub(crate) mod disease_to_hpo { let line = line.unwrap(); match parse_line(&line)? { Some(DiseaseKind::Omim(omim)) => { - let omim_disease_id = ontology.add_omim_disease(omim.name, omim.id)?; - ontology.link_omim_disease_term(omim.hpo_id, omim_disease_id)?; - - ontology - .omim_disease_mut(&omim_disease_id) - .ok_or(HpoError::DoesNotExist)? - .add_term(omim.hpo_id); + builder.annotate_omim_disease( + omim.omim_disease_id()?, + omim.name, + omim.hpo_id, + )?; } Some(DiseaseKind::Orpha(orpha)) => { - let orpha_disease_id = ontology.add_orpha_disease(orpha.name, orpha.id)?; - ontology.link_orpha_disease_term(orpha.hpo_id, orpha_disease_id)?; - - ontology - .orpha_disease_mut(&orpha_disease_id) - .ok_or(HpoError::DoesNotExist)? 
- .add_term(orpha.hpo_id); + builder.annotate_orpha_disease( + orpha.orpha_disease_id()?, + orpha.name, + orpha.hpo_id, + )?; } _ => {} } @@ -526,22 +530,28 @@ pub(crate) fn load_from_jax_files_with_transivitve_genes>( obo_file: P, gene_file: P, disease_file: P, - ontology: &mut Ontology, -) -> HpoResult<()> { - hp_obo::read_obo_file(obo_file, ontology)?; - gene_to_hpo::parse_phenotype_to_genes(gene_file, ontology)?; - disease_to_hpo::parse(disease_file, ontology)?; - Ok(()) +) -> HpoResult { + let builder = Builder::new(); + let builder = hp_obo::read_obo_file(obo_file, builder)?; + let mut builder = builder.connect_all_terms(); + gene_to_hpo::parse_phenotype_to_genes(gene_file, &mut builder)?; + disease_to_hpo::parse(disease_file, &mut builder)?; + builder + .calculate_information_content()? + .build_with_defaults() } pub(crate) fn load_from_jax_files>( obo_file: P, gene_file: P, disease_file: P, - ontology: &mut Ontology, -) -> HpoResult<()> { - hp_obo::read_obo_file(obo_file, ontology)?; - gene_to_hpo::parse_genes_to_phenotype(gene_file, ontology)?; - disease_to_hpo::parse(disease_file, ontology)?; - Ok(()) +) -> HpoResult { + let builder = Builder::new(); + let builder = hp_obo::read_obo_file(obo_file, builder)?; + let mut builder = builder.connect_all_terms(); + gene_to_hpo::parse_genes_to_phenotype(gene_file, &mut builder)?; + disease_to_hpo::parse(disease_file, &mut builder)?; + builder + .calculate_information_content()? 
+ .build_with_defaults() } diff --git a/src/parser/hp_obo.rs b/src/parser/hp_obo.rs index 1e0b843..aaf6d72 100644 --- a/src/parser/hp_obo.rs +++ b/src/parser/hp_obo.rs @@ -3,7 +3,10 @@ use tracing::{error, trace, warn}; use crate::{parser::Path, HpoError, HpoResult}; use std::fs; -use crate::{term::internal::HpoTermInternal, HpoTermId, Ontology}; +use crate::{term::internal::HpoTermInternal, HpoTermId}; + +use crate::ontology::builder::{AllTerms, LooseCollection}; +use crate::ontology::Builder; /// Links terms to each other (Child - Parent) /// @@ -21,7 +24,10 @@ type Connections = Vec<(HpoTermId, HpoTermId)>; /// /// If you use this function you cannot add additional terms or /// parents afterwards, since all dependency data will be already cached. -pub(super) fn read_obo_file>(filename: P, ontology: &mut Ontology) -> HpoResult<()> { +pub(super) fn read_obo_file>( + filename: P, + mut builder: Builder, +) -> HpoResult> { // stores tuples of Term - Parent let mut connections: Connections = Vec::new(); @@ -34,14 +40,15 @@ pub(super) fn read_obo_file>(filename: P, ontology: &mut Ontology for term in file_content.split("\n\n") { if let Some(term) = term.strip_prefix("[Term]\n") { if let Some(raw_term) = term_from_obo(term) { - let id = ontology.add_term(raw_term); + let id = *raw_term.id(); + builder.add_term(raw_term); add_connections(&mut connections, term, id); } else { warn!("Unable to parse: {}", term); } } else if term.starts_with("format-version: 1.2") { trace!("Parsing the header"); - ontology.set_hpo_version(version_from_obo(term).unwrap_or_else(|| { + builder.set_hpo_version(version_from_obo(term).unwrap_or_else(|| { warn!("No HPO Ontology version detected"); (0u16, 0u8, 0u8) })); @@ -50,12 +57,13 @@ pub(super) fn read_obo_file>(filename: P, ontology: &mut Ontology } } + let mut builder = builder.terms_complete(); + for (child, parent) in connections { - ontology.add_parent(parent, child); + builder.add_parent_unchecked(parent, child); } - 
ontology.create_cache(); - Ok(()) + Ok(builder) } fn version_from_obo(header: &str) -> Option<(u16, u8, u8)> { @@ -124,12 +132,16 @@ mod test { // use std::fs; use super::*; - use crate::Ontology; #[test] fn split_terms() { - let mut ont = Ontology::default(); - read_obo_file("tests/small.obo", &mut ont).unwrap(); + let builder = Builder::new(); + let builder = read_obo_file("tests/small.obo", builder).unwrap(); + let ont = builder + .connect_all_terms() + .calculate_information_content() + .unwrap() + .build_minimal(); assert_eq!(ont.len(), 4); diff --git a/src/set.rs b/src/set.rs index 143fec1..72df83b 100644 --- a/src/set.rs +++ b/src/set.rs @@ -759,12 +759,33 @@ impl<'b, 'a> Extend> for HpoSet<'a> { #[cfg(test)] mod test { + use crate::ontology::builder::{AllTerms, ConnectedTerms, LooseCollection}; + use crate::ontology::Builder; use crate::similarity::{Builtins, StandardCombiner}; use crate::term::internal::HpoTermInternal; use crate::term::HpoGroup; use crate::term::InformationContentKind; use crate::{HpoSet, Ontology}; + fn builder_from_ontology(ont: &Ontology) -> Builder { + let mut builder = Builder::new(); + for term in ont { + builder.add_term(ont.get_unchecked(term.id()).clone()); + } + builder + } + + fn connect_terms(ont: &Ontology, mut builder: Builder) -> Builder { + for term in ont { + for parent in term.parents() { + builder + .add_parent(parent.id(), term.id()) + .expect("Term or parent must be present in builder"); + } + } + builder.connect_all_terms() + } + #[test] fn test() { let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); @@ -798,12 +819,19 @@ mod test { #[test] fn test_obsolete() { - let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + let mut builder = builder_from_ontology(&ontology); let mut obsolete_term = HpoTermInternal::new("Obsolete: Foo".to_string(), 666u32.into()); *obsolete_term.obsolete_mut() = true; + 
builder.add_term(obsolete_term); - ontology.add_term(obsolete_term); + let builder = connect_terms(&ontology, builder.terms_complete()); + let ontology = builder + .calculate_information_content() + .expect("Able to calculate IC in tests") + .build_with_defaults() + .expect("Able to build Ontology in tests"); let mut hpos = HpoGroup::new(); hpos.insert(707u32); @@ -824,11 +852,19 @@ mod test { #[test] fn test_with_replaced_obsolete() { - let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + let mut builder = builder_from_ontology(&ontology); let mut obsolete_term = HpoTermInternal::new("Obsolete: Foo".to_string(), 666u32.into()); *obsolete_term.replacement_mut() = Some(25454u32.into()); - ontology.add_term(obsolete_term.clone()); + builder.add_term(obsolete_term); + + let builder = connect_terms(&ontology, builder.terms_complete()); + let ontology = builder + .calculate_information_content() + .expect("Able to calculate IC in tests") + .build_with_defaults() + .expect("Able to build Ontology in tests"); let mut hpos = HpoGroup::new(); hpos.insert(707u32); @@ -852,11 +888,19 @@ mod test { #[test] fn test_replace_obsolete() { - let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + let mut builder = builder_from_ontology(&ontology); let mut obsolete_term = HpoTermInternal::new("Obsolete: Foo".to_string(), 666u32.into()); *obsolete_term.replacement_mut() = Some(25454u32.into()); - ontology.add_term(obsolete_term.clone()); + builder.add_term(obsolete_term); + + let builder = connect_terms(&ontology, builder.terms_complete()); + let ontology = builder + .calculate_information_content() + .expect("Able to calculate IC in tests") + .build_with_defaults() + .expect("Able to build Ontology in tests"); let mut hpos = HpoGroup::new(); hpos.insert(707u32); From ba57531b093de80a07eb0b16ee9b9a74c53aa978 
Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Wed, 21 Aug 2024 19:59:24 +0200 Subject: [PATCH 3/8] Remove methods to modify genes or diseases within Ontology --- src/ontology.rs | 76 ------------------------------------------------- 1 file changed, 76 deletions(-) diff --git a/src/ontology.rs b/src/ontology.rs index 2f6b298..981b5ab 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -1166,13 +1166,7 @@ impl Ontology { println!("At the end: {code}"); code } -} -/// Methods to add annotations -/// -/// These methods should rarely (if ever) be used by clients. -/// Calling these functions might disrupt the Ontology and associated terms. -impl Ontology { /// Returns a mutable reference to the categories vector /// /// This is a vector that should contain top-level `HpoTermId`s used for @@ -1266,76 +1260,6 @@ impl Ontology { Ok(()) } - /// Returns a mutable reference to the [`Gene`] of the provided [`GeneId`] - /// - /// If no such gene is present, `None` is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// - /// let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// - /// let mut gene = ontology.gene_mut(&2175u32.into()).unwrap(); - /// assert_eq!(gene.hpo_terms().len(), 3); - /// gene.add_term(1u32); - /// assert_eq!(gene.hpo_terms().len(), 4); - /// ``` - pub fn gene_mut(&mut self, gene_id: &GeneId) -> Option<&mut Gene> { - self.genes.get_mut(gene_id) - } - - /// Returns a mutable reference to the [`OmimDisease`] of the provided [`OmimDiseaseId`] - /// - /// If no such disease is present, `None` is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::Disease; - /// - /// let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// - /// let mut disease = ontology.omim_disease_mut(&269880u32.into()).unwrap(); - /// assert_eq!(disease.hpo_terms().len(), 1); - /// disease.add_term(1u32); - /// assert_eq!(disease.hpo_terms().len(), 2); - /// ```
- pub fn omim_disease_mut( - &mut self, - omim_disease_id: &OmimDiseaseId, - ) -> Option<&mut OmimDisease> { - self.omim_diseases.get_mut(omim_disease_id) - } - - /// Returns a mutable reference to the [`OrphaDisease`] of the provided [`OrphaDiseaseId`] - /// - /// If no such disease is present, `None` is returned - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// use hpo::annotations::Disease; - /// - /// let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// - /// let mut disease = ontology.orpha_disease_mut(&110u32.into()).unwrap(); - /// assert_eq!(disease.hpo_terms().len(), 1); - /// disease.add_term(1u32); - /// assert_eq!(disease.hpo_terms().len(), 2); - /// ``` - pub fn orpha_disease_mut(&mut self, disease_id: &OrphaDiseaseId) -> Option<&mut OrphaDisease> { - self.orpha_diseases.get_mut(disease_id) - } -} - -/// Crate-only functions for setting up and building the Ontology -/// -/// Those methods should not be exposed publicly -impl Ontology { /// Returns a binary representation of the Ontology's metadata /// /// It adds the HPO-identifying bytes `HPO`, the version From e325dc29aad05beb249f0c41b976fa712acc100a Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Wed, 21 Aug 2024 20:06:38 +0200 Subject: [PATCH 4/8] Fix embarrassing bug in graphviz test --- src/ontology.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ontology.rs b/src/ontology.rs index 981b5ab..6da48ad 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -1159,11 +1159,9 @@ impl Ontology { let term_name = term.name().replace(' ', "\n"); let child_name = child.name().replace(' ', "\n"); code.push_str(&format!("\"{term_name}\" -> \"{child_name}\"\n")); - println!("In function: {code}"); } } code.push_str("}\n"); - println!("At the end: {code}"); code } @@ -1507,7 +1505,6 @@ mod test { } #[test] - #[ignore = "fails with weird \0 extra characters"] fn graphiv() { let test_terms = [ ("Root", 1u32), @@ -1519,10 +1516,10 @@ mod
test { let mut v: Vec = Vec::new(); for (name, id) in test_terms { - let t = HpoTermInternal::new(String::from(name), id.into()); + let t = HpoTermInternal::new(name.into(), id.into()); v.append(&mut t.as_bytes()); } - ont.add_terms_from_bytes(Bytes::new(&v, parser::binary::BinaryVersion::V1)); + ont.add_terms_from_bytes(Bytes::new(&v, parser::binary::BinaryVersion::V3)); let mut ont = ont.terms_complete(); ont.add_parent_unchecked(1u32, 2u32); From c526b08dd9d4c2741e7788a005da32eeeb099ec2 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Wed, 12 Jun 2024 06:35:57 +0200 Subject: [PATCH 5/8] Add text file that contains Diagrams needed for documentation --- Diagrams.txt | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 Diagrams.txt diff --git a/Diagrams.txt b/Diagrams.txt new file mode 100644 index 0000000..5e340e8 --- /dev/null +++ b/Diagrams.txt @@ -0,0 +1,97 @@ +Diagrams + + + .----------. + | Ontology | + '----------' + | + | + contains + | + | + ^ + .---------. + | |----------. + | HPOTerm | | + | |>---is a -' + '---------' + v v + | | + | | + ^ ^ +.------. .---------. +| Gene | | Disease | +'------' '---------' + + + + + ============== + || Ontology || + ============== ============ + | || HPOSet || + | ============ + v | + ................ | + : IntoIterator : v + '''''''''''''''' .----------. ................ + | | Combined | -------------> : IntoIterator : + | '----------' '''''''''''''''' + v ^ | + .----------------. | | + | ontology::Iter | | | + '----------------' ...ancestors() | + | | | + | | v + | ============= .------------. 
============== + iterates ------------> || HpoTerm || <-- iterates -- | term::Iter | ----collect----> || HpoGroup || + ============= '------------' ============== + | ^ + | | + '--- parents()/children() ---' + + + + + + ++--------------------+ +| HP:0000707 | +| | +| Abnormality of the | +| nervous system | ++--------------------+ + + ++---------------------+ +| HP:0000005 | +| | +| Mode of inheritance | ++---------------------+ + ++----------------------------+ +| HP:0002011 | +| | +| Morphological central | +| nervous system abnormality | ++----------------------------+ + ++-------------------------+ +| HP:0025454 | +| | +| Abnormal CSF metabolite | +| concentration | ++-------------------------+ + ++-------------+ +| HP:0003581 | +| | +| Adult onset | ++-------------+ + ++-------------------+ +| HP:0012639 | +| | +| Abnormal nervous | +| system morphology | ++-------------------+ \ No newline at end of file From d8a855c06e8b9590e0f232e35b6652fde24ae53f Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Thu, 11 Jul 2024 11:53:01 +0200 Subject: [PATCH 6/8] Updated Diagrams text file --- Diagrams.txt | 117 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 40 deletions(-) diff --git a/Diagrams.txt b/Diagrams.txt index 5e340e8..be39127 100644 --- a/Diagrams.txt +++ b/Diagrams.txt @@ -53,45 +53,82 @@ Diagrams + +----------------+ + | HP:0000001 | + | | + | All | + +----------------+ + + +----------------+ +----------------+ + | HP:0000118 | | HP:0012823 | + | | | | + | Phenotypic | | Clinical | + | abnormality | | modifier | + +----------------+ +----------------+ + ++----------------+ +----------------+ +| HP:0000707 | | HP:0001939 | +| | | | +| Abnormality | | Abnormality | +| of the | | of | +| nervous system | | metabolism/ | ++----------------+ | homeostasis | + +----------------+ + + ++----------------+ +| HP:0000005 | +| | +| Mode of | +| inheritance | ++----------------+ + ++----------------+ +| HP:0034345 | +| | +| 
Mendelian | +| inheritance | ++----------------+ + ++----------------+ +| HP:0000007 | +| | +| Autosomal | +| recessive | +| inheritance | ++----------------+ + + + ++----------------+ +----------------+ +| HP:0012638 | | HP:0012639 | +| | | | +| Abnormal | | Abnormal | +| nervous system | | nervous system | +| physiology | | morphology | ++----------------+ +----------------+ - -+--------------------+ -| HP:0000707 | -| | -| Abnormality of the | -| nervous system | -+--------------------+ - - -+---------------------+ -| HP:0000005 | -| | -| Mode of inheritance | -+---------------------+ -+----------------------------+ -| HP:0002011 | -| | -| Morphological central | -| nervous system abnormality | -+----------------------------+ - -+-------------------------+ -| HP:0025454 | -| | -| Abnormal CSF metabolite | -| concentration | -+-------------------------+ - -+-------------+ -| HP:0003581 | -| | -| Adult onset | -+-------------+ - -+-------------------+ -| HP:0012639 | -| | -| Abnormal nervous | -| system morphology | -+-------------------+ \ No newline at end of file ++----------------+ +| HP:0002011 | +| | +| Morphological | +| central | +| nervous system | +| abnormality | ++----------------+ + ++----------------+ +| HP:0025454 | +| | +| Abnormal | +| CSF metabolite | +| concentration | ++----------------+ + ++----------------+ +| HP:0003581 | +| | +| Adult onset | ++----------------+ + From 4badb4b674f3ffd710487c4c88c0605732ae678d Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Sun, 25 Aug 2024 09:02:49 +0200 Subject: [PATCH 7/8] Remove aquamarine dependency This removes mermaid graphs replaces them with ASCII diagrams and a PNG of the example ontology --- Cargo.toml | 1 - Diagrams.txt | 134 ---------------------------- src/ontology.rs | 228 +++++++++++------------------------------------- 3 files changed, 49 insertions(+), 314 deletions(-) delete mode 100644 Diagrams.txt diff --git a/Cargo.toml b/Cargo.toml index 31894f6..87d41a6 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -14,7 +14,6 @@ categories = ["science", "data-structures", "parser-implementations"] [dependencies] thiserror = "1.0" -aquamarine = "0" # used in Docs tracing = "0.1" smallvec = "1" diff --git a/Diagrams.txt b/Diagrams.txt deleted file mode 100644 index be39127..0000000 --- a/Diagrams.txt +++ /dev/null @@ -1,134 +0,0 @@ -Diagrams - - - .----------. - | Ontology | - '----------' - | - | - contains - | - | - ^ - .---------. - | |----------. - | HPOTerm | | - | |>---is a -' - '---------' - v v - | | - | | - ^ ^ -.------. .---------. -| Gene | | Disease | -'------' '---------' - - - - - ============== - || Ontology || - ============== ============ - | || HPOSet || - | ============ - v | - ................ | - : IntoIterator : v - '''''''''''''''' .----------. ................ - | | Combined | -------------> : IntoIterator : - | '----------' '''''''''''''''' - v ^ | - .----------------. | | - | ontology::Iter | | | - '----------------' ...ancestors() | - | | | - | | v - | ============= .------------. 
============== - iterates ------------> || HpoTerm || <-- iterates -- | term::Iter | ----collect----> || HpoGroup || - ============= '------------' ============== - | ^ - | | - '--- parents()/children() ---' - - - - - +----------------+ - | HP:0000001 | - | | - | All | - +----------------+ - - +----------------+ +----------------+ - | HP:0000118 | | HP:0012823 | - | | | | - | Phenotypic | | Clinical | - | abnormality | | modifier | - +----------------+ +----------------+ - -+----------------+ +----------------+ -| HP:0000707 | | HP:0001939 | -| | | | -| Abnormality | | Abnormality | -| of the | | of | -| nervous system | | metabolism/ | -+----------------+ | homeostasis | - +----------------+ - - -+----------------+ -| HP:0000005 | -| | -| Mode of | -| inheritance | -+----------------+ - -+----------------+ -| HP:0034345 | -| | -| Mendelian | -| inheritance | -+----------------+ - -+----------------+ -| HP:0000007 | -| | -| Autosomal | -| recessive | -| inheritance | -+----------------+ - - - -+----------------+ +----------------+ -| HP:0012638 | | HP:0012639 | -| | | | -| Abnormal | | Abnormal | -| nervous system | | nervous system | -| physiology | | morphology | -+----------------+ +----------------+ - - -+----------------+ -| HP:0002011 | -| | -| Morphological | -| central | -| nervous system | -| abnormality | -+----------------+ - -+----------------+ -| HP:0025454 | -| | -| Abnormal | -| CSF metabolite | -| concentration | -+----------------+ - -+----------------+ -| HP:0003581 | -| | -| Adult onset | -+----------------+ - diff --git a/src/ontology.rs b/src/ontology.rs index 6da48ad..3dc8399 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -27,7 +27,6 @@ use termarena::Arena; pub use builder::Builder; -#[cfg_attr(doc, aquamarine::aquamarine)] /// `Ontology` is the main interface of the `hpo` crate and contains all data /// /// The [`Ontology`] struct holds all information about the ontology @@ -117,6 +116,30 @@ pub use builder::Builder; /// [`Ontology`] 
does not contain a direct relationship between genes and diseases. This relation /// is only present indirectly via the connected [`HpoTerm`]s. /// +/// ```text +/// .----------. +/// | Ontology | +/// '----------' +/// | +/// | +/// contains +/// | +/// | +/// ^ +/// .---------. +/// | |----------. +/// | HPOTerm | | +/// | |>---is a -' +/// '---------' +/// v v +/// | | +/// | | +/// ^ ^ +/// .------. .---------. +/// | Gene | | Disease | +/// '------' '---------' +/// ``` +/// /// # Transivity of relations /// /// **New in 0.9.0** @@ -126,32 +149,6 @@ pub use builder::Builder; /// But [`Gene`]s and [`OmimDisease`]s will only contain links to *direct* [`HpoTerm`]s. The annotations /// are not transitiv. /// -/// ```mermaid -/// erDiagram -/// ONTOLOGY ||--|{ HPOTERM : contains -/// HPOTERM ||--|{ HPOTERM : is_a -/// HPOTERM }|--o{ DISEASE : phenotype_of -/// HPOTERM }|--o{ GENE : phenotype_of -/// HPOTERM { -/// str name -/// HpoTermId id -/// HpoTerms parents -/// HpoTerms children -/// Genes genes -/// OmimDiseases omim_diseases -/// } -/// DISEASE { -/// str name -/// OmimDiseaseId id -/// HpoGroup hpo_terms -/// } -/// GENE { -/// str name -/// GeneId id -/// HpoGroup hpo_terms -/// } -/// ``` -/// /// # Relations of different public struct in this module /// /// The below diagram looks complicated at first, but the @@ -160,77 +157,31 @@ pub use builder::Builder; /// The `HpoGroup` is more relevant for internal use, but can also be /// useful for fast set-based operations. 
/// -/// ```mermaid -/// classDiagram -/// class Ontology { -/// into_iter() -/// } -/// -/// class HpoTerm{ -/// - HpoTermId id -/// - &Ontology -/// parents() HpoTerms -/// parent_ids() HpoGroup -/// all_parent_ids() HpoGroup -/// children() HpoTerms -/// children_ids() HpoTerms -/// common_ancestors() Combine -/// union_ancestors() Combine -/// many-more() -/// } -/// -/// class HpoGroup { -/// - Set~HpoTermId~ -/// into_iter() -/// terms() -/// } -/// -/// class HpoSet { -/// - HpoGroup -/// - &Ontology -/// similarity(...) f32 -/// information_content() -/// } -/// -/// class HpoTermId { -/// - u32: id -/// } -/// -/// class `ontology::Iter` { -/// next() HpoTerm -/// } +/// ```text +/// ============== +/// || Ontology || +/// ============== ============ +/// | || HPOSet || +/// | ============ +/// v | +/// ................ | +/// : IntoIterator : v +/// '''''''''''''''' .----------. ................ +/// | | Combined | -------------> : IntoIterator : +/// | '----------' '''''''''''''''' +/// v ^ | +/// .----------------. | | +/// | ontology::Iter | | | +/// '----------------' ...ancestors() | +/// | | | +/// | | v +/// | ============= .------------. 
============== +/// iterates ------------> || HpoTerm || <-- iterates -- | term::Iter | ----collect----> || HpoGroup || +/// ============= '------------' ============== +/// | ^ +/// | | +/// '--- parents()/children() ---' /// -/// class `term::Iter` { -/// next() HpoTerm -/// } -/// -/// class `group::Iter` { -/// next() HpoTermId -/// } -/// -/// class Combine { -/// - HpoGroup -/// into_iter() -/// } -/// -/// Ontology ..|> `ontology::Iter`: hpos() -/// HpoSet ..|> `term::Iter`: iter() -/// HpoGroup ..|> `group::Iter`: iter() -/// HpoGroup ..|> `term::Iter`: terms() -/// Combine ..|> `term::Iter`: iter() -/// -/// `ontology::Iter` --o HpoGroup: collect() -/// `ontology::Iter` --* HpoTerm: iterates() -/// -/// `term::Iter` --* HpoTerm: iterates() -/// `term::Iter` --o HpoGroup: collect() -/// -/// `group::Iter` --* HpoTermId: iterates() -/// `group::Iter` --o HpoGroup: collect() -/// -/// HpoTerm ..|> HpoGroup: parent_ids()/children_ids() -/// HpoTerm ..|> `term::Iter`: parents()/children() -/// HpoTerm ..|> `Combine`: ..._ancestors() /// ``` /// /// # Example ontology @@ -238,88 +189,7 @@ pub use builder::Builder; /// For all examples and tests in this documentation, we're using the /// following small subset of the full Ontology: /// -/// ```mermaid -/// graph TD -/// HP:0011017["HP:0011017
-/// Abnormal cellular physiology"] -/// HP:0010662["HP:0010662
-/// Abnormality of the diencephalon"] -/// HP:0010662 --> HP:0012285 -/// HP:0000005["HP:0000005
-/// Mode of inheritance"] -/// HP:0000005 --> HP:0034345 -/// HP:0012648["HP:0012648
-/// Decreased inflammatory response"] -/// HP:0012443["HP:0012443
-/// Abnormality of brain morphology"] -/// HP:0012443 --> HP:0100547 -/// HP:0003674["HP:0003674
-/// Onset"] -/// HP:0003674 --> HP:0003581 -/// HP:0010978["HP:0010978
-/// Abnormality of immune system physiology"] -/// HP:0010978 --> HP:0012647 -/// HP:0000707["HP:0000707
-/// Abnormality of the nervous system"] -/// HP:0000707 --> HP:0012638 -/// HP:0000707 --> HP:0012639 -/// HP:0034345["HP:0034345
-/// Mendelian inheritance"] -/// HP:0034345 --> HP:0000007 -/// HP:0000001["HP:0000001
-/// All"] -/// HP:0000001 -----> HP:0000005 -/// HP:0000001 --> HP:0000118 -/// HP:0000001 --> HP:0012823 -/// HP:0000818["HP:0000818
-/// Abnormality of the endocrine system"] -/// HP:0000818 --> HP:0000864 -/// HP:0100547["HP:0100547
-/// Abnormal forebrain morphology"] -/// HP:0100547 ----> HP:0010662 -/// HP:0012647["HP:0012647
-/// Abnormal inflammatory response"] -/// HP:0012647 --> HP:0012648 -/// HP:0001939["HP:0001939
-/// Abnormality of metabolism/homeostasis"] -/// HP:0001939 --> HP:0011017 -/// HP:0001939 ---> HP:0025454 -/// HP:0003581["HP:0003581
-/// Adult onset"] -/// HP:0012823["HP:0012823
-/// Clinical modifier"] -/// HP:0012823 --> HP:0031797 -/// HP:0012285["HP:0012285
-/// Abnormal hypothalamus physiology"] -/// HP:0012638["HP:0012638
-/// Abnormal nervous system physiology"] -/// HP:0012638 ----> HP:0012285 -/// HP:0000118["HP:0000118
-/// Phenotypic abnormality"] -/// HP:0000118 --> HP:0000707 -/// HP:0000118 --> HP:0000818 -/// HP:0000118 --> HP:0001939 -/// HP:0000118 -----> HP:0002715 -/// HP:0002011["HP:0002011
-/// Morphological central nervous system abnormality"] -/// HP:0002011 --> HP:0012443 -/// HP:0031797["HP:0031797
-/// Clinical course"] -/// HP:0031797 --> HP:0003674 -/// HP:0012639["HP:0012639
-/// Abnormal nervous system morphology"] -/// HP:0012639 --> HP:0002011 -/// HP:0002715["HP:0002715
-/// Abnormality of the immune system"] -/// HP:0002715 --> HP:0010978 -/// HP:0025454["HP:0025454
-/// Abnormal CSF metabolite concentration"] -/// HP:0000007["HP:0000007
-/// Autosomal recessive inheritance"] -/// HP:0000864["HP:0000864
-/// Abnormality of the hypothalamus-pituitary axis"] -/// HP:0000864 ---> HP:0012285 -/// ``` +/// ![Diagram of Example ontology](https://github.com/user-attachments/assets/0fa29033-e4cc-4bc6-aed3-123162629ca4) #[derive(Default, Clone)] pub struct Ontology { hpo_terms: Arena, From 31f466f3ad604c1c53d30cbdbf7eef2b43a03f31 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Sun, 25 Aug 2024 09:45:39 +0200 Subject: [PATCH 8/8] Update documentation --- src/ontology.rs | 111 ++++++++++++++++++++-------------- src/ontology/builder.rs | 9 ++- src/parser/binary/ontology.rs | 8 +++ 3 files changed, 80 insertions(+), 48 deletions(-) diff --git a/src/ontology.rs b/src/ontology.rs index 3dc8399..e63ec94 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -69,50 +69,66 @@ pub use builder::Builder; /// /// # Construction /// -/// There are two main ways to build the Ontology -/// 1. Download the standard annotation data from -/// [Jax HPO](https://hpo.jax.org/) itself. -/// Then use [`Ontology::from_standard`] to load the data. -/// You need the following files: -/// - `phenotype.hpoa` (Required to connect [`OmimDisease`]s to [`HpoTerm`]s) -/// - `genes_to_phenotype.txt` (Required to connect [`Gene`]s to [`HpoTerm`]s) -/// - alternatively: `phenotype_to_genes.txt` (use [`Ontology::from_standard_transitive`]) -/// - `hp.obo` (Required for [`HpoTerm`]s and their connection to each other) -/// 2. Load the ontology from a binary build using [`Ontology::from_binary`]. +/// There are several ways to build the Ontology /// -/// The [Github repository](https://github.com/anergictcell/hpo) of this crate -/// contains a binary build of the ontology -/// . -/// The snapshot will not always be up to date, so please double-check yourself. +/// ## Using the built-in binary Ontology /// -/// You can crate your own binary build of the ontology using the -/// `examples/obo_to_bin.rs` example.
+/// The [Github repository](https://github.com/anergictcell/hpo) of this crate +/// contains a binary build of the ontology +/// that can +/// be used to construct an Ontology. This is the most convenient way, but has +/// one small downside: The snapshot of the binary build will not always be up +/// to date, so please double-check the version yourself or create your own copy. /// -/// `cargo run --example --release obo_to_bin ` +/// see [`Ontology::from_binary`](`crate::Ontology::from_binary`) /// -/// You can also build it all by yourself (not recommended), in which case you -/// will have to: -/// 1. construct an empty Ontology [`Ontology::default`] -/// 2. Add all terms [`Ontology::insert_term`] -/// 3. Connect terms to their parents [`Ontology::add_parent`] -/// 4. Cache all parent, child and grandparent connections [`Ontology::create_cache`] -/// 5. Add genes and diseases to the ontology -/// - [`Ontology::add_gene`] and [`Ontology::add_omim_disease`] -/// - Connect genes and diseases to the [`HpoTerm`]s using -/// [`Ontology::link_gene_term`] and [`Ontology::link_omim_disease_term`] -/// (this will automatically take care of "inheriting" the connection to all -/// parent terms) -/// - make sure to also add the linked terms to the genes and diseases -/// [`Gene::add_term`] and [`OmimDisease::add_term`] -/// 6. Calculate the information content [`Ontology::calculate_information_content`] +/// ```no_run +/// use hpo::Ontology; /// +/// let ontology = Ontology::from_binary("tests/ontology.hpo").unwrap(); +/// +/// println!("HPO version: {}", ontology.hpo_version()) +/// ``` +/// +/// ## Using the Ontology data provided by JAX +/// +/// HPO is maintained by [Jax HPO](https://hpo.jax.org/) and they provide +/// all masterdata to download.
To construct an Ontology you need the files: +/// +/// - `hp.obo` (Required for [`HpoTerm`]s and their connection to each other) +/// - `phenotype.hpoa` (Required to connect [`OmimDisease`]s to [`HpoTerm`]s) +/// - `genes_to_phenotype.txt` (Required to connect [`Gene`]s to [`HpoTerm`]s) +/// +/// You must download the files into a local folder and then specify the path +/// to the folder, see [`Ontology::from_standard`](`crate::Ontology::from_standard`) +/// +/// ```bash +/// wget https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/hp.obo +/// wget https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/phenotype.hpoa +/// wget https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/genes_to_phenotype.txt +/// ``` +/// +/// ```no_run +/// use hpo::Ontology; +/// +/// let ontology = Ontology::from_standard("/path/to/jax_hpo_data/").unwrap(); +/// +/// println!("HPO version: {}", ontology.hpo_version()) +/// ``` +/// +/// ## Custom creation of an Ontology +/// +/// `hpo` provides an interface to create your own Ontology. This is not +/// really recommended, though, because there are a few footguns along the +/// way. For more information, check the [`Builder`](`crate::builder::Builder`) +/// struct. /// /// # Layout /// /// The [`Ontology`] contains all terms and all associated genes and diseases. /// [`HpoTerm`]s are connected to each other in a directed relationship. Every term /// (except the term `All`) has at least one parent term in an `is_a` relationship. -/// Terms and [`crate::annotations`] ([`Gene`]s, [`OmimDisease`]s) have a many-to-many relationship. The +/// Terms and [`crate::annotations`] ([`Gene`]s, [`OmimDisease`]s, [`OrphaDisease`]s) have a many-to-many relationship. The /// [`Ontology`] does not contain a direct relationship between genes and diseases. This relation /// is only present indirectly via the connected [`HpoTerm`]s.
/// @@ -329,12 +345,15 @@ impl Ontology { /// /// This method can fail for various reasons: /// - /// - Binary file not available: [`HpoError::CannotOpenFile`] - /// - `Ontology::add_genes_from_bytes` failed (TODO) - /// - `Ontology::add_omim_disease_from_bytes` failed (TODO) - /// - `add_terms_from_bytes` failed (TODO) - /// - `add_parent_from_bytes` failed (TODO) - /// - Size of binary data does not match the content: [`HpoError::ParseBinaryError`] + /// - Binary file not available or readable: [`HpoError::CannotOpenFile`] + /// - Invalid data provided: [`HpoError::ParseBinaryError`] + /// - Invalid binary version: [`HpoError::NotImplemented`] + /// - Invalid reference to terms: [`HpoError::DoesNotExist`] + /// + /// # Panics + /// + /// The method can panic if the provided data is incorrectly formatted or + /// contains invalid references between terms, genes or diseases /// /// # Examples /// @@ -390,14 +409,14 @@ impl Ontology { /// /// This method can fail for various reasons: /// - /// - Too few bytes or an invalid version - /// - `Ontology::hpo_version_from_bytes` failed - /// - `Ontology::add_genes_from_bytes` failed - /// - `Ontology::add_omim_disease_from_bytes` failed - /// - `add_terms_from_bytes` failed - /// - `add_parent_from_bytes` failed - /// - Size of binary data does not match the content: [`HpoError::ParseBinaryError`] + /// - Invalid data provided: [`HpoError::ParseBinaryError`] + /// - Invalid binary version: [`HpoError::NotImplemented`] + /// - Invalid reference to terms: [`HpoError::DoesNotExist`] + /// + /// # Panics /// + /// The method can panic if the provided data is incorrectly formatted or + /// contains invalid references between terms, genes or diseases /// /// # Examples /// diff --git a/src/ontology/builder.rs b/src/ontology/builder.rs index 03b8f12..477e92e 100644 --- a/src/ontology/builder.rs +++ b/src/ontology/builder.rs @@ -312,7 +312,8 @@ impl Builder { /// # Panics /// /// This method will panic if the length of bytes does 
not exactly correspond - /// to the contained data + /// to the contained data or if a `parent_id` or `child_id` is not present + /// in the Ontology pub(crate) fn add_parent_from_bytes(&mut self, bytes: &[u8]) { let mut idx: usize = 0; loop { @@ -793,7 +794,7 @@ impl Builder { /// /// # Errors /// - /// If the HPO term is not present, an [`HpoError`] is returned + /// If the HPO term is not present, an [`HpoError::DoesNotExist`] is returned /// fn link_gene_term(&mut self, term_id: HpoTermId, gene_id: GeneId) -> HpoResult<()> { let term = self @@ -971,6 +972,10 @@ impl Builder { } /// Parses `Bytes` into the Jax-Ontology release version + /// + /// # Errors + /// + /// - Wrong HPO-version format: [`HpoError::ParseBinaryError`] pub(crate) fn hpo_version_from_bytes(&mut self, bytes: &Bytes) -> HpoResult { if bytes.version() == BinaryVersion::V1 { self.set_hpo_version((0u16, 0u8, 0u8)); diff --git a/src/parser/binary/ontology.rs b/src/parser/binary/ontology.rs index 4e01d84..5021081 100644 --- a/src/parser/binary/ontology.rs +++ b/src/parser/binary/ontology.rs @@ -2,6 +2,14 @@ use crate::parser::binary::Bytes; use crate::{HpoError, HpoResult}; +/// Identifies the encoding version of the provided binary data +/// +/// # Errors +/// +/// This method can fail for the following reasons: +/// - Incorrect byte encoding: [`HpoError::ParseBinaryError`] +/// - Invalid version: [`HpoError::NotImplemented`] +/// pub(crate) fn version(bytes: &[u8]) -> HpoResult { if bytes.len() < 5 { return Err(HpoError::ParseBinaryError);