diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c8ade8..bedc6fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ All notable changes to this project will be documented in this file. ## Unreleased +## [0.10.0] + ### Feature - Add Orphante diseases (`OrphaDisease`) to Ontology @@ -11,13 +13,19 @@ All notable changes to this project will be documented in this file. - Add binary version 3 - Add new example ontology +### Documentation + +- Change orders of methods in `Ontology` to clean up the documentation. + ### Refactor +- Improve the OBO parser with better error handling - [**breaking**] Add `Disease` trait that is needed to work with `OmimDisease` and `OrphaDisease` - Update example ontology - Update unit- and doctests to align with updated example ontology -## [0.9.0] - 2024-03-27 + +## [0.9.1] - 2024-03-30 ### Bugfix diff --git a/Cargo.toml b/Cargo.toml index 3d2fba8..dde9d34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "hpo" -version = "0.9.1" +version = "0.10.0" edition = "2021" authors = ["Jonas Marcello "] description = "Human Phenotype Ontology Similarity" @@ -15,7 +15,6 @@ categories = ["science", "data-structures", "parser-implementations"] [dependencies] thiserror = "1.0" aquamarine = "0" # used in Docs -statrs = "0.16.0" tracing = "0.1" smallvec = "1" diff --git a/examples/compare_ontologies.rs b/examples/compare_ontologies.rs index 1366adf..2dade3d 100644 --- a/examples/compare_ontologies.rs +++ b/examples/compare_ontologies.rs @@ -17,7 +17,7 @@ fn ontology(path_arg: &str) -> Ontology { } } -fn ontology2(path_arg: &str) -> Ontology { +fn ontology_transitive(path_arg: &str) -> Ontology { let path = Path::new(path_arg); match path.is_file() { @@ -226,17 +226,21 @@ fn print_replacement_diff(replacements: Option<(Option, Option "); + println!("Usage:\ncompare_ontologies []"); println!("e.g.:\ncompare_ontologies tests/ontology.hpo tests/ontology_v2.hpo:\n"); + println!("Alternatively compare transitive vs non-transitive:\ncompare_ontologies tests/ontology.hpo\n"); process::exit(1) } let arg_old = args.nth(1).unwrap(); - let arg_new = args.next().unwrap(); - let lhs = ontology(&arg_old); - let rhs = ontology2(&arg_new); + + let rhs = if let Some(arg_new) = args.next() { + ontology(&arg_new) + } else { + ontology_transitive(&arg_old) + }; let diffs = lhs.compare(&rhs); diff --git a/src/annotations/gene.rs b/src/annotations/gene.rs index 1f8eb7c..d92adcf 100644 --- a/src/annotations/gene.rs +++ b/src/annotations/gene.rs @@ -119,11 +119,6 @@ impl Gene { &self.name } - /// Connect another [HPO term](`crate::HpoTerm`) to the gene - pub fn add_term>(&mut self, term_id: I) -> bool { - self.hpos.insert(term_id) - } - /// The set of connected HPO terms pub fn hpo_terms(&self) -> &HpoGroup { &self.hpos @@ -193,6 +188,16 @@ impl Gene { pub fn to_hpo_set<'a>(&self, ontology: &'a Ontology) -> HpoSet<'a> { HpoSet::new(ontology, self.hpos.clone()) } + + /// Connect another [HPO term](`crate::HpoTerm`) to the gene + /// + /// # Note + /// + /// This method does **not** add the [`Gene`] to the [HPO term](`crate::HpoTerm`). + /// Clients should not use this method, unless they are creating their own Ontology. + pub fn add_term>(&mut self, term_id: I) -> bool { + self.hpos.insert(term_id) + } } impl PartialEq for Gene { diff --git a/src/annotations/omim_disease.rs b/src/annotations/omim_disease.rs index 853bfff..1eec5a6 100644 --- a/src/annotations/omim_disease.rs +++ b/src/annotations/omim_disease.rs @@ -6,8 +6,7 @@ use std::hash::Hash; use crate::annotations::disease::DiseaseIterator; use crate::annotations::{AnnotationId, Disease}; use crate::term::HpoGroup; -use crate::HpoError; -use crate::HpoTermId; +use crate::{HpoError, HpoSet, HpoTermId, Ontology}; /// A set of OMIM diseases /// @@ -93,15 +92,83 @@ impl Disease for OmimDisease { &self.name } - /// Connect another [HPO term](`crate::HpoTerm`) to the disease - fn add_term>(&mut self, term_id: I) -> bool { - self.hpos.insert(term_id) - } - /// The set of connected HPO terms fn hpo_terms(&self) -> &HpoGroup { &self.hpos } + + /// Returns a binary representation of the `OmimDisease` + /// + /// The binary layout is defined as: + /// + /// | Byte offset | Number of bytes | Description | + /// | --- | --- | --- | + /// | 0 | 4 | The total length of the binary data blob as big-endian `u32` | + /// | 4 | 4 | The `OmimDiseaseId` as big-endian `u32` | + /// | 8 | 4 | The length of the `OmimDisease` Name as big-endian `u32` | + /// | 12 | n | The `OmimDisease` name as u8 vector | + /// | 12 + n | 4 | The number of associated HPO terms as big-endian `u32` | + /// | 16 + n | x * 4 | The [`HpoTermId`]s of the associated terms, each encoded as big-endian `u32` | + /// + /// # Examples + /// + /// ``` + /// use hpo::annotations::{Disease, OmimDisease}; + /// + /// let mut disease = OmimDisease::new(123.into(), "FooBar"); + /// let bytes = disease.as_bytes(); + /// + /// assert_eq!(bytes.len(), 4 + 4 + 4 + 6 + 4); + /// assert_eq!(bytes[4..8], [0u8, 0u8, 0u8, 123u8]); // ID of disease => 123 + /// assert_eq!(bytes[8..12], [0u8, 0u8, 0u8, 6u8]); // Length of Name => 6 + /// ``` + fn as_bytes(&self) -> Vec { + fn usize_to_u32(n: usize) -> u32 { + n.try_into().expect("unable to convert {n} to u32") + } + let name = self.name().as_bytes(); + let name_length = name.len(); + let size = 4 + 4 + 4 + name_length + 4 + self.hpos.len() * 4; + + let mut res = Vec::new(); + + // 4 bytes for total length + res.append(&mut usize_to_u32(size).to_be_bytes().to_vec()); + + // 4 bytes for OMIM Disease-ID + res.append(&mut self.id.to_be_bytes().to_vec()); + + // 4 bytes for Length of OMIM Disease Name + res.append(&mut usize_to_u32(name_length).to_be_bytes().to_vec()); + + // OMIM Disease name (n bytes) + for c in name { + res.push(*c); + } + + // 4 bytes for number of HPO terms + res.append(&mut usize_to_u32(self.hpos.len()).to_be_bytes().to_vec()); + + // HPO terms + res.append(&mut self.hpos.as_bytes()); + + res + } + + /// Returns an [`HpoSet`] from the `OmimDisease` + fn to_hpo_set<'a>(&self, ontology: &'a Ontology) -> HpoSet<'a> { + HpoSet::new(ontology, self.hpos.clone()) + } + + /// Connect another [HPO term](`crate::HpoTerm`) to the disease + /// + /// # Note + /// + /// This method does **not** add the [`OmimDisease`] to the [HPO term](`crate::HpoTerm`). + /// Clients should not use this method, unless they are creating their own Ontology. + fn add_term>(&mut self, term_id: I) -> bool { + self.hpos.insert(term_id) + } } impl PartialEq for OmimDisease { diff --git a/src/lib.rs b/src/lib.rs index 00ec872..3d631da 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ pub mod utils; pub use ontology::comparison; pub use ontology::Ontology; pub use set::HpoSet; +#[doc(inline)] pub use term::{HpoTerm, HpoTermId}; const DEFAULT_NUM_PARENTS: usize = 10; @@ -59,6 +60,9 @@ pub enum HpoError { /// Failed to convert an integer to a float #[error("cannot convert int to float")] TryFromIntError(#[from] std::num::TryFromIntError), + /// Failed to parse a line of input data from the JAX obo + #[error("invalid input data: {0}")] + InvalidInput(String), } impl From for HpoError { diff --git a/src/ontology.rs b/src/ontology.rs index 042513c..f891a95 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -1,5 +1,6 @@ use crate::annotations::Disease; use core::fmt::Debug; +use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::Read; @@ -346,7 +347,7 @@ impl Ontology { /// /// - Actual OBO data: [`hp.obo`](https://hpo.jax.org/app/data/ontology) /// - Links between HPO and OMIM diseases: [`phenotype.hpoa`](https://hpo.jax.org/app/data/annotations) - /// - Links between HPO and Genes: [`phenotype_to_genes.txt`](http://purl.obolibrary.org/obo/hp/hpoa/phenotype_to_genes.txt) + /// - Links between HPO and Genes: [`genes_to_phenotype.txt`](http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt) /// /// and then specify the folder where the data is stored. /// @@ -404,7 +405,7 @@ impl Ontology { /// /// - Actual OBO data: [`hp.obo`](https://hpo.jax.org/app/data/ontology) /// - Links between HPO and OMIM diseases: [`phenotype.hpoa`](https://hpo.jax.org/app/data/annotations) - /// - Links between HPO and Genes: [`genes_to_phenotypes.txt`](http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt) + /// - Links between HPO and Genes: [`phenotype_to_genes.txt`](http://purl.obolibrary.org/obo/hp/hpoa/phenotype_to_genes.txt) /// /// and then specify the folder where the data is stored. /// @@ -610,87 +611,6 @@ impl Ontology { } } - /// Returns a binary representation of the Ontology - /// - /// The binary data is separated into sections: - /// - /// - Metadata (HPO and Bindat Version) (see `Ontology::metadata_as_bytes`) - /// - Terms (Names + IDs) (see `HpoTermInternal::as_bytes`) - /// - Term - Parent connection (Child ID - Parent ID) - /// (see `HpoTermInternal::parents_as_byte`) - /// - Genes (Names + IDs + Connected HPO Terms) ([`Gene::as_bytes`]) - /// - OMIM Diseases (Names + IDs + Connected HPO Terms) - /// ([`OmimDisease::as_bytes`]) - /// - /// Every section starts with 4 bytes to indicate its size - /// (big-endian encoded `u32`) - /// - /// This method is only useful if you use are modifying the ontology - /// and want to save data for later re-use. - /// - /// # Panics - /// - /// Panics when the buffer length of any subsegment larger than `u32::MAX` - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// let bytes = ontology.as_bytes(); - /// ``` - pub fn as_bytes(&self) -> Vec { - fn usize_to_u32(n: usize) -> u32 { - n.try_into().expect("unable to convert {n} to u32") - } - let mut res = Vec::new(); - - // Add metadata, version info - res.append(&mut self.metadata_as_bytes()); - - // All HPO Terms - let mut buffer = Vec::new(); - for term in self.hpo_terms.values() { - buffer.append(&mut term.as_bytes()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - // All Term - Parent connections - buffer.clear(); - for term in self.hpo_terms.values() { - buffer.append(&mut term.parents_as_byte()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - // Genes and Gene-Term connections - buffer.clear(); - for gene in self.genes.values() { - buffer.append(&mut gene.as_bytes()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - // OMIM Disease and Disease-Term connections - buffer.clear(); - for omim_disease in self.omim_diseases.values() { - buffer.append(&mut omim_disease.as_bytes()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - // ORPHA Disease and Disease-Term connections - buffer.clear(); - for orpha_disease in self.orpha_diseases.values() { - buffer.append(&mut orpha_disease.as_bytes()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - res - } - /// Returns the number of HPO-Terms in the Ontology /// /// # Examples @@ -734,22 +654,6 @@ impl Ontology { HpoTerm::try_new(self, term_id).ok() } - /// Returns an Iterator of all [`HpoTerm`]s from the Ontology - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// for term in ontology.hpos() { - /// println!("{}", term.name()); - /// } - /// ``` - /// - pub fn hpos(&self) -> Iter<'_> { - self.into_iter() - } - /// Returns a reference to the [`Gene`] of the provided [`GeneId`] /// /// If no such gene is present, `None` is returned @@ -939,11 +843,12 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::GeneId; /// /// let ontology_1 = Ontology::from_binary("tests/example.hpo").unwrap(); /// let mut ontology_2 = Ontology::default(); /// - /// ontology_2.add_gene("FOOBAR", "666666").unwrap(); + /// ontology_2.add_gene("FOOBAR", GeneId::from(666666)); /// /// let compare = ontology_1.compare(&ontology_2); /// assert_eq!(compare.added_hpo_terms().len(), 0); @@ -954,6 +859,154 @@ impl Ontology { Comparison::new(self, other) } + /// Iterates [`HpoTerm`]s + /// + /// # Examples + /// + /// ```rust + /// use hpo::Ontology; + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// let mut ont_iter = ontology.iter(); + /// + /// assert!(ont_iter.next().is_some()); + /// ``` + /// + pub fn iter(&self) -> Iter<'_> { + Iter { + inner: self.hpo_terms.iter(), + ontology: self, + } + } + + /// Returns an Iterator of all [`HpoTerm`]s from the Ontology + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// for term in ontology.hpos() { + /// println!("{}", term.name()); + /// } + /// ``` + /// + /// # Note + /// + /// This method is deprecated and will be removed in the future. + /// Use either [`Ontology::iter`] or iterate the ontology directly: + /// + /// ```rust + /// use hpo::Ontology; + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// for term in &ontology { + /// println!("{}", term.name()); + /// } + /// ``` + /// + pub fn hpos(&self) -> Iter<'_> { + self.into_iter() + } + + /// Returns a reference to the categories of the Ontology + /// + /// Categories are top-level `HpoTermId`s used for + /// categorizing individual `HpoTerm`s. + /// + /// See [`Ontology::set_default_categories()`] for more information + /// + /// # Examples + /// + /// ``` + /// use hpo::{HpoTerm, Ontology}; + /// + /// let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// assert_eq!(ontology.categories().len(), 6); + /// ``` + pub fn categories(&self) -> &HpoGroup { + &self.categories + } + + /// Returns a binary representation of the Ontology + /// + /// The binary data is separated into sections: + /// + /// - Metadata (HPO and Bindat Version) (see `Ontology::metadata_as_bytes`) + /// - Terms (Names + IDs) (see `HpoTermInternal::as_bytes`) + /// - Term - Parent connection (Child ID - Parent ID) + /// (see `HpoTermInternal::parents_as_byte`) + /// - Genes (Names + IDs + Connected HPO Terms) ([`Gene::as_bytes`]) + /// - OMIM Diseases (Names + IDs + Connected HPO Terms) + /// ([`OmimDisease::as_bytes`]) + /// + /// Every section starts with 4 bytes to indicate its size + /// (big-endian encoded `u32`) + /// + /// This method is only useful if you use are modifying the ontology + /// and want to save data for later re-use. + /// + /// # Panics + /// + /// Panics when the buffer length of any subsegment larger than `u32::MAX` + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// let bytes = ontology.as_bytes(); + /// ``` + pub fn as_bytes(&self) -> Vec { + fn usize_to_u32(n: usize) -> u32 { + n.try_into().expect("unable to convert {n} to u32") + } + let mut res = Vec::new(); + + // Add metadata, version info + res.append(&mut self.metadata_as_bytes()); + + // All HPO Terms + let mut buffer = Vec::new(); + for term in self.hpo_terms.values() { + buffer.append(&mut term.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + // All Term - Parent connections + buffer.clear(); + for term in self.hpo_terms.values() { + buffer.append(&mut term.parents_as_byte()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + // Genes and Gene-Term connections + buffer.clear(); + for gene in self.genes.values() { + buffer.append(&mut gene.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + // OMIM Disease and Disease-Term connections + buffer.clear(); + for omim_disease in self.omim_diseases.values() { + buffer.append(&mut omim_disease.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + // ORPHA Disease and Disease-Term connections + buffer.clear(); + for orpha_disease in self.orpha_diseases.values() { + buffer.append(&mut orpha_disease.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + res + } + /// Constructs a smaller ontology that contains only the `leaves` terms and /// all terms needed to connect to each leaf to `root` /// @@ -1029,16 +1082,16 @@ impl Ontology { if (gene.hpo_terms() & &phenotype_ids).is_empty() { continue; } - let gene_id = ont.add_gene( + ont.add_gene( self.gene(gene.id()).ok_or(HpoError::DoesNotExist)?.name(), - &gene.id().as_u32().to_string(), - )?; + *gene.id(), + ); // Link the gene to every term in the new ontology // --> also modifier terms for term in &(gene.hpo_terms() & &ids) { - ont.link_gene_term(term, gene_id)?; - ont.gene_mut(&gene_id) + ont.link_gene_term(term, *gene.id())?; + ont.gene_mut(gene.id()) .ok_or(HpoError::DoesNotExist)? .add_term(term); } @@ -1140,26 +1193,13 @@ impl Ontology { code.push_str("}\n"); code } +} - /// Returns a reference to the categories of the Ontology - /// - /// Categories are top-level `HpoTermId`s used for - /// categorizing individual `HpoTerm`s. - /// - /// See [`Ontology::set_default_categories()`] for more information - /// - /// # Examples - /// - /// ``` - /// use hpo::{HpoTerm, Ontology}; - /// - /// let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// assert_eq!(ontology.categories().len(), 6); - /// ``` - pub fn categories(&self) -> &HpoGroup { - &self.categories - } - +/// Methods to add annotations +/// +/// These methods should rarely (if ever) be used by clients. +/// Calling these functions might disrupt the Ontology and associated terms. +impl Ontology { /// Returns a mutable reference to the categories vector /// /// This is a vector that should contain top-level `HpoTermId`s used for @@ -1253,20 +1293,6 @@ impl Ontology { Ok(()) } - /// Iterates [`HpoTerm`]s - pub fn iter(&self) -> Iter<'_> { - Iter { - inner: self.hpo_terms.iter(), - ontology: self, - } - } -} - -/// Methods to add annotations -/// -/// These methods should rarely (if ever) be used by clients. -/// Calling these functions might disrupt the Ontology and associated terms. -impl Ontology { /// Crates and inserts a new term to the ontology /// /// This method does not link the term to its parents or to any annotations @@ -1356,7 +1382,7 @@ impl Ontology { } } - /// Add a gene to the Ontology. and return the [`GeneId`] + /// Add a gene to the Ontology /// /// If the gene does not yet exist, a new [`Gene`] entity is created /// and stored in the Ontology. @@ -1367,19 +1393,19 @@ impl Ontology { /// Adding a gene does not connect it to any HPO terms. /// Use [`Ontology::link_gene_term`] for creating connections. /// - /// # Errors - /// - /// If the `gene_id` is invalid, an [`HpoError::ParseIntError`] is returned + /// This method was changed to receive the `gene_id` as [`GeneId`] + /// instead of `str` in `0.10` and does not return a `Result` anymore. /// /// # Examples /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::GeneId; /// /// let mut ontology = Ontology::default(); /// assert!(ontology.gene(&1u32.into()).is_none()); /// - /// ontology.add_gene("Foo", "1"); + /// ontology.add_gene("Foo", GeneId::from(1)); /// /// // Genes can be iterated... /// let mut gene_iterator = ontology.genes(); @@ -1390,14 +1416,9 @@ impl Ontology { /// // .. or accessed directly /// assert!(ontology.gene(&1u32.into()).is_some()); /// ``` - pub fn add_gene(&mut self, gene_name: &str, gene_id: &str) -> HpoResult { - let id = GeneId::try_from(gene_id)?; - match self.genes.entry(id) { - std::collections::hash_map::Entry::Occupied(_) => Ok(id), - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(Gene::new(id, gene_name)); - Ok(id) - } + pub fn add_gene(&mut self, gene_name: &str, gene_id: GeneId) { + if let Entry::Vacant(entry) = self.genes.entry(gene_id) { + entry.insert(Gene::new(gene_id, gene_name)); } } @@ -1520,7 +1541,7 @@ impl Ontology { /// /// let mut ontology = Ontology::default(); /// ontology.insert_term("Term-Foo".into(), 1u32); - /// ontology.add_gene("Foo", "5"); + /// ontology.add_gene("Foo", GeneId::from(5)); /// ontology.link_gene_term(1u32, GeneId::from(5u32)).unwrap(); /// /// let term = ontology.hpo(1u32).unwrap(); diff --git a/src/parser.rs b/src/parser.rs index e156075..0b90a4a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -8,9 +8,15 @@ pub(crate) mod binary; /// Module to parse `hp.obo` file pub(crate) mod hp_obo; -/// Module to parse HPO - `Gene` associations from `genes_to_phenotype.txt` file -pub(crate) mod genes_to_phenotype { +/// Module to parse HPO - `Gene` associations +/// +/// It contains functions to parse `genes_to_phenotype.txt` and +/// `phenotype_to_genes.txt` input files +pub(crate) mod gene_to_hpo { + + use crate::annotations::GeneId; use crate::parser::Path; + use crate::HpoError; use crate::HpoResult; use std::fs::File; use std::io::BufRead; @@ -19,67 +25,323 @@ pub(crate) mod genes_to_phenotype { use crate::HpoTermId; use crate::Ontology; - /// Quick and dirty parser for development and debugging - pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { - let file = File::open(file).unwrap(); - let reader = BufReader::new(file); - for line in reader.lines() { - let line = line.unwrap(); - // TODO: Check for the header outside of the `lines` iterator - if line.starts_with('#') || line.starts_with("ncbi_gene_id") { - continue; - } - let cols: Vec<&str> = line.trim().split('\t').collect(); - let gene_id = ontology.add_gene(cols[1], cols[0])?; - let term_id = HpoTermId::try_from(cols[2])?; - ontology.link_gene_term(term_id, gene_id)?; + struct ParsedGene<'a> { + ncbi_id: GeneId, + symbol: &'a str, + hpo: HpoTermId, + } - ontology - .gene_mut(&gene_id) - .expect("Cannot find gene {gene_id}") - .add_term(term_id); + impl<'a> ParsedGene<'a> { + fn try_new(ncbi_id: &'a str, symbol: &'a str, hpo: &'a str) -> HpoResult { + let hpo = HpoTermId::try_from(hpo)?; + let ncbi_id = GeneId::try_from(ncbi_id)?; + Ok(Self { + ncbi_id, + symbol, + hpo, + }) + } + } + + /// Removes the first (header) line. + /// + /// TODO: Update this once is stable + fn remove_header(reader: &mut R) -> HpoResult<()> { + let mut trash = String::with_capacity(80); + reader.read_line(&mut trash).map_err(|_| { + HpoError::InvalidInput("Invalid data in genes_to_phenotype.txt".to_string()) + })?; + if !trash.starts_with('#') + && !trash.starts_with("ncbi_gene_id") + && !trash.starts_with("hpo_id") + { + return Err(HpoError::InvalidInput( + "genes_to_phenotype.txt file must contain a header".to_string(), + )); } Ok(()) } -} -/// Module to parse HPO - `Gene` associations from `phenotype_to_genes.txt` file -pub(crate) mod phenotype_to_genes { - use crate::parser::Path; - use crate::HpoResult; - use std::fs::File; - use std::io::BufRead; - use std::io::BufReader; + /// Parses a single line of `genes_to_phenotype.txt` + /// + /// and returns a `ParsedGene` struct with gene and HPO info + /// + /// ```text + /// 10 NAT2 HP:0000007 Autosomal recessive inheritance - OMIM:243400 + /// ``` + fn genes_to_phenotype_line(line: &str) -> HpoResult> { + let mut cols = line.split('\t'); - use crate::HpoTermId; - use crate::Ontology; + // Column 1 is the NCBI-ID of the gene + let Some(ncbi_id) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + // Column 2 is the gene symbol + let Some(symbol) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + // Column 3 is the Hpo Term ID + let Some(hpo) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + ParsedGene::try_new(ncbi_id, symbol, hpo) + } + + /// Parses a single line of `phenotype_to_genes.txt` + /// + /// ```text + /// HP:0000002 Abnormality of body height 81848 SPRY4 orphadata ORPHA:432 + /// ``` + fn phenotype_to_gene_line(line: &str) -> HpoResult> { + let mut cols = line.split('\t'); + + // Column 1 is the Hpo Term ID + let Some(hpo) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + // Column 2 is the HPO-name, which we don't need + if cols.next().is_none() { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + // Column 3 is the NCBI-ID of the gene + let Some(ncbi_id) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + // Column 4 is the gene symbol + let Some(symbol) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + ParsedGene::try_new(ncbi_id, symbol, hpo) + } + + /// Parse `genes_to_phenotype.txt` file + /// + /// ```text + /// ncbi_gene_id gene_symbol hpo_id hpo_name frequency disease_id + /// 10 NAT2 HP:0000007 Autosomal recessive inheritance - OMIM:243400 + /// 10 NAT2 HP:0001939 Abnormality of metabolism/homeostasis - OMIM:243400 + /// 16 AARS1 HP:0002460 Distal muscle weakness 15/15 OMIM:613287 + /// ``` + pub fn parse_genes_to_phenotype>( + file: P, + ontology: &mut Ontology, + ) -> HpoResult<()> { + parse(file, ontology, genes_to_phenotype_line) + } + + /// Parse `phenotype_to_genes.txt` file + /// + /// ```text + /// #Format: HPO-idHPO labelentrez-gene-identrez-gene-symbolAdditional Info from G-D sourceG-D sourcedisease-ID for link + /// HP:0000002 Abnormality of body height 81848 SPRY4 orphadata ORPHA:432 + /// HP:0000002 Abnormality of body height 204219 CERS3 orphadata ORPHA:79394 + /// HP:0000002 Abnormality of body height 51360 MBTPS2 - mim2gene OMIM:308205 + /// ``` + pub fn parse_phenotype_to_genes>( + file: P, + ontology: &mut Ontology, + ) -> HpoResult<()> { + parse(file, ontology, phenotype_to_gene_line) + } + + /// Parses a file to connect genes to HPO terms + fn parse, F: Fn(&str) -> HpoResult>>( + file: P, + ontology: &mut Ontology, + parse_line: F, + ) -> HpoResult<()> { + let filename = file.as_ref().display().to_string(); + let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; + let mut reader = BufReader::new(file); + + remove_header(&mut reader)?; - /// Quick and dirty parser for development and debugging - pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { - let file = File::open(file).unwrap(); - let reader = BufReader::new(file); for line in reader.lines() { - let line = line.unwrap(); - // TODO: Check for the header outside of the `lines` iterator - if line.starts_with('#') || line.starts_with("hpo_id") { - continue; - } - let cols: Vec<&str> = line.trim().split('\t').collect(); - let gene_id = ontology.add_gene(cols[3], cols[2])?; - let term_id = HpoTermId::try_from(cols[0])?; - ontology.link_gene_term(term_id, gene_id)?; + let line = line.map_err(|_| { + HpoError::InvalidInput("Invalid data in genes_to_phenotype.txt".to_string()) + })?; + + let gene = parse_line(&line)?; + ontology.add_gene(gene.symbol, gene.ncbi_id); + ontology.link_gene_term(gene.hpo, gene.ncbi_id)?; ontology - .gene_mut(&gene_id) - .expect("Cannot find gene {gene_id}") - .add_term(term_id); + .gene_mut(&gene.ncbi_id) + .expect("Gene is present because it was just add_omim_disease") + .add_term(gene.hpo); } Ok(()) } + + #[cfg(test)] + mod test { + use super::*; + #[test] + fn test_remove_header_ncbi_gene() { + let x = "ncbi_gene_id\txyz\n10\tNAT2\n".as_bytes(); + let mut reader = BufReader::new(x); + assert!(remove_header(&mut reader).is_ok()); + + let mut lines = reader.lines(); + assert_eq!(lines.next().unwrap().unwrap(), "10\tNAT2"); + assert!(lines.next().is_none()); + } + + #[test] + fn test_remove_header_hpo_id() { + let x = "hpo_id\txyz\n10\tNAT2\n".as_bytes(); + let mut reader = BufReader::new(x); + assert!(remove_header(&mut reader).is_ok()); + + let mut lines = reader.lines(); + assert_eq!(lines.next().unwrap().unwrap(), "10\tNAT2"); + assert!(lines.next().is_none()); + } + + #[test] + fn test_remove_header_hashtag() { + let x = "#foobar\txyz\n10\tNAT2\n".as_bytes(); + let mut reader = BufReader::new(x); + assert!(remove_header(&mut reader).is_ok()); + + let mut lines = reader.lines(); + assert_eq!(lines.next().unwrap().unwrap(), "10\tNAT2"); + assert!(lines.next().is_none()); + } + + #[test] + fn test_remove_header_fails() { + let x = "foobar\txyz\n10\tNAT2\n".as_bytes(); + let mut reader = BufReader::new(x); + assert!(remove_header(&mut reader).is_err()); + } + } + + #[cfg(test)] + mod test_genes_to_phenotype { + use crate::annotations::AnnotationId; + + use super::*; + + #[test] + fn test_parse_correct_line_with_newline() { + let line = "10\tNAT2\tHP:0000007\tfoobar\n"; + let res = genes_to_phenotype_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id.as_u32(), 10); + assert_eq!(res.symbol, "NAT2"); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_correct_line_without_newline() { + let line = "10\tNAT2\tHP:0000007\tfoobar"; + let res = genes_to_phenotype_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id.as_u32(), 10); + assert_eq!(res.symbol, "NAT2"); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_missing_id() { + let line = "NAT2\tHP:0000007\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_missing_symbol() { + let line = "10\tHP:0000007\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_missing_hpo() { + let line = "10\tNAT2\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_invalid_hpo() { + let line = "10\tNAT2\tHP:000000A\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + } + + #[cfg(test)] + mod test_phenotpye_to_genes { + use super::*; + use crate::annotations::AnnotationId; + + #[test] + fn test_parse_correct_line_with_newline() { + let line = "HP:0000007\tAbnormality of body height\t10\tNAT2\tfoobar\n"; + let res = phenotype_to_gene_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id.as_u32(), 10); + assert_eq!(res.symbol, "NAT2"); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_correct_line_without_newline() { + let line = "HP:0000007\tAbnormality of body height\t10\tNAT2\tfoobar"; + let res = phenotype_to_gene_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id.as_u32(), 10); + assert_eq!(res.symbol, "NAT2"); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_missing_id() { + let line = "HP:0000007\tAbnormality of body height\tNAT2\tfoobar\n"; + let res = phenotype_to_gene_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_empty_symbol() { + let line = "HP:0000007\tAbnormality of body height\t10\t\tfoobar\n"; + let res = phenotype_to_gene_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id.as_u32(), 10); + assert_eq!(res.symbol, ""); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_missing_hpo() { + let line = "Abnormality of body height\t10\tNAT2\tfoobar\n"; + let res = phenotype_to_gene_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_invalid_hpo() { + let line = "HP:0000007A\tAbnormality of body height\t10\tNAT2\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + } } /// Module to parse HPO - `OmimDisease` associations from `phenotype.hpoa` file -pub(crate) mod phenotype_hpoa { +/// +/// # Example line +/// +/// ```text +/// OMIM:619340 Developmental and epileptic encephalopathy 96 HP:0011097 PMID:31675180 PCS 1/2 P HPO:probinson[2021-06-21] +/// OMIM:609153 Pseudohyperkalemia NOT HP:0001878 PMID:2766660 PCS P HPO:lccarmody[2018-10-03] +/// ``` +/// +pub(crate) mod disease_to_hpo { use crate::annotations::Disease; use crate::HpoError; use crate::HpoResult; @@ -89,8 +351,6 @@ pub(crate) mod phenotype_hpoa { use std::io::BufReader; use std::path::Path; - use tracing::error; - use crate::Ontology; enum DiseaseKind<'a> { @@ -104,46 +364,62 @@ pub(crate) mod phenotype_hpoa { hpo_id: HpoTermId, } - fn parse_line(line: &str) -> Option> { + fn parse_line(line: &str) -> HpoResult>> { if line.starts_with("OMIM") { - parse_disease_components(line).map(DiseaseKind::Omim) + Ok(parse_disease_components(line)?.map(DiseaseKind::Omim)) } else if line.starts_with("ORPHA") { - parse_disease_components(line).map(DiseaseKind::Orpha) + Ok(parse_disease_components(line)?.map(DiseaseKind::Orpha)) } else { - None + Ok(None) } } - fn parse_disease_components(line: &str) -> Option> { - let cols: Vec<&str> = line.trim().split('\t').collect(); - if cols[2] == "NOT" { - return None; - } + fn parse_disease_components(line: &str) -> HpoResult>> { + let mut cols = line.trim().splitn(5, '\t'); + + let Some(id_col) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + let Some((_, disease_id)) = id_col.split_once(':') else { + return Err(HpoError::InvalidInput(line.to_string())); + }; - let Some((_, omim_id)) = cols[0].split_once(':') else { - error!("cannot parse Disease ID from {}", cols[0]); - return None; + let Some(disease_name) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); }; - let Ok(hpo_id) = HpoTermId::try_from(cols[3]) else { - error!("invalid HPO ID: {}", cols[3]); - return None; + if let Some("NOT") = cols.next() { + return Ok(None); }; - Some(DiseaseComponents { - id: omim_id, - name: cols[1], + let hpo_id = if let Some(id) = cols.next() { + HpoTermId::try_from(id)? + } else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + Ok(Some(DiseaseComponents { + id: disease_id, + name: disease_name, hpo_id, - }) + })) } /// Quick and dirty parser for development and debugging + /// + /// # Errors + /// + /// - [`HpoError::CannotOpenFile`]: Source file not present or can't be opened + /// - [`HpoError::ParseIntError`]: A line contains an invalid `omim_disease_id` + /// - [`HpoError::DoesNotExist`]: A line contains a non-existing [`HpoTermId`] pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { - let file = File::open(file).unwrap(); + let filename = file.as_ref().display().to_string(); + let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; let reader = BufReader::new(file); for line in reader.lines() { let line = line.unwrap(); - match parse_line(&line) { + match parse_line(&line)? { Some(DiseaseKind::Omim(omim)) => { let omim_disease_id = ontology.add_omim_disease(omim.name, omim.id)?; ontology.link_omim_disease_term(omim.hpo_id, omim_disease_id)?; @@ -172,10 +448,76 @@ pub(crate) mod phenotype_hpoa { mod test_omim_parsing { use super::*; + #[test] + fn test_skip_comment() { + let s = "#OMIM:600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s) + .expect("This line has the correct format") + .is_none()); + } + #[test] fn test_skip_not() { let s = "OMIM:600171\tGonadal agenesis\tNOT\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; - assert!(parse_line(s).is_none()); + assert!(parse_line(s) + .expect("This line has the correct format") + .is_none()); + } + + #[test] + fn test_correct_orpha() { + let s = "ORPHA:600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + let orpha = parse_line(s) + .expect("This line has the correct format") + .expect("Line describes an Omim disease"); + if let DiseaseKind::Orpha(orpha) = orpha { + assert_eq!(orpha.name, "Gonadal agenesis"); + assert_eq!(orpha.id, "600171"); + assert_eq!(orpha.hpo_id, "HP:0000055"); + } else { + panic!("Orpha line should be parsed as Orpha correctly"); + } + } + + #[test] + fn test_skip_orpha_not() { + let s = "ORPHA:600171\tGonadal agenesis\tNOT\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s) + .expect("This line has the correct format") + .is_none()); + } + + #[test] + fn test_correct_omim() { + let s = "OMIM:600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + let omim = parse_line(s) + .expect("This line has the correct format") + .expect("Line describes an Omim disease"); + if let DiseaseKind::Omim(omim) = omim { + assert_eq!(omim.name, "Gonadal agenesis"); + assert_eq!(omim.id, "600171"); + assert_eq!(omim.hpo_id, "HP:0000055"); + } else { + panic!("Omim line should be parsed as Omim correctly"); + } + } + + #[test] + fn test_invalid_omim_id() { + let s = "OMIM_600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s).is_err()); + } + + #[test] + fn test_invalid_hpo_id() { + let s = "OMIM:600171\tGonadal agenesis\t\tH55\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s).is_err()); + } + + #[test] + fn test_invalid_input() { + let s = "OMIM:600171 Gonadal agenesis HP:0000055 OMIM:600171 TAS P HPO:skoehler[2014-11-27]"; + assert!(parse_line(s).is_err()); } } } @@ -187,8 +529,8 @@ pub(crate) fn load_from_jax_files_with_transivitve_genes>( ontology: &mut Ontology, ) -> HpoResult<()> { hp_obo::read_obo_file(obo_file, ontology)?; - phenotype_to_genes::parse(gene_file, ontology)?; - phenotype_hpoa::parse(disease_file, ontology)?; + gene_to_hpo::parse_phenotype_to_genes(gene_file, ontology)?; + disease_to_hpo::parse(disease_file, ontology)?; Ok(()) } @@ -199,7 +541,7 @@ pub(crate) fn load_from_jax_files>( ontology: &mut Ontology, ) -> HpoResult<()> { hp_obo::read_obo_file(obo_file, ontology)?; - genes_to_phenotype::parse(gene_file, ontology)?; - phenotype_hpoa::parse(disease_file, ontology)?; + gene_to_hpo::parse_genes_to_phenotype(gene_file, ontology)?; + disease_to_hpo::parse(disease_file, ontology)?; Ok(()) } diff --git a/src/similarity.rs b/src/similarity.rs index 1be5f13..1598662 100644 --- a/src/similarity.rs +++ b/src/similarity.rs @@ -122,24 +122,34 @@ pub trait SimilarityCombiner { } /// Returns the maximum values of each row + /// + /// # Panics + /// + /// Panics when rows don't contain any values, i.e. when the matrix is empty fn row_maxes(&self, m: &Matrix) -> Vec { m.rows() .map(|row| { // I have no idea why, but I could not get a.max(b) to work // with the borrow checker - row.reduce(|a, b| if a > b { a } else { b }).unwrap() + row.reduce(|a, b| if a > b { a } else { b }) + .expect("A matrix must contain values") }) .copied() .collect() } /// Returns the maximum values of each column + /// + /// # Panics + /// + /// Panics when rows don't contain any values, i.e. when the matrix is empty fn col_maxes(&self, m: &Matrix) -> Vec { m.cols() .map(|col| { // I have no idea why, but I could not get a.max(b) to work // with the borrow checker - col.reduce(|a, b| if a > b { a } else { b }).unwrap() + col.reduce(|a, b| if a > b { a } else { b }) + .expect("A matrix must contain values") }) .copied() .collect() diff --git a/src/stats.rs b/src/stats.rs index 89431f7..df0688d 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -266,6 +266,16 @@ fn f64_from_u64(n: u64) -> f64 { intermediate.into() } +/// We have to frequently do divisions starting with u64 values +/// and need to return f64 values. To ensure some kind of safety +/// we use this method to panic in case of overflows. +fn f64_from_usize(n: usize) -> f64 { + let intermediate: u32 = n + .try_into() + .expect("cannot safely create f64 from large u64"); + intermediate.into() +} + #[cfg(test)] mod test { use super::*; diff --git a/src/stats/hypergeom.rs b/src/stats/hypergeom.rs index 016e02c..620cda6 100644 --- a/src/stats/hypergeom.rs +++ b/src/stats/hypergeom.rs @@ -64,5 +64,6 @@ mod disease; mod gene; +mod statrs; pub use disease::disease_enrichment; pub use gene::gene_enrichment; diff --git a/src/stats/hypergeom/disease.rs b/src/stats/hypergeom/disease.rs index d3e5aa1..14e7d2a 100644 --- a/src/stats/hypergeom/disease.rs +++ b/src/stats/hypergeom/disease.rs @@ -1,7 +1,7 @@ -use statrs::distribution::{DiscreteCDF, Hypergeometric}; use tracing::debug; use crate::annotations::OmimDiseaseId; +use crate::stats::hypergeom::statrs::Hypergeometric; use crate::stats::{f64_from_u64, Enrichment, SampleSet}; use crate::HpoTerm; diff --git a/src/stats/hypergeom/gene.rs b/src/stats/hypergeom/gene.rs index 482d049..4d8c90d 100644 --- a/src/stats/hypergeom/gene.rs +++ b/src/stats/hypergeom/gene.rs @@ -1,7 +1,7 @@ -use statrs::distribution::{DiscreteCDF, Hypergeometric}; use tracing::debug; use crate::annotations::GeneId; +use crate::stats::hypergeom::statrs::Hypergeometric; use crate::stats::{f64_from_u64, Enrichment, SampleSet}; use crate::HpoTerm; diff --git a/src/stats/hypergeom/statrs.rs b/src/stats/hypergeom/statrs.rs new file mode 100644 index 0000000..214c812 --- /dev/null +++ b/src/stats/hypergeom/statrs.rs @@ -0,0 +1,277 @@ +//! This module contains code from +//! +//! The statrs crate contains way more functionality than needed for hpo +//! so it only contains the logic neccessary for the hypergeometric +//! enrichment +#![allow(clippy::excessive_precision)] +#![allow(clippy::unreadable_literal)] + +use std::cmp; + +use crate::stats::f64_from_usize; + +/// Auxiliary variable when evaluating the `gamma_ln` function +const GAMMA_R: f64 = 10.900_511; + +/// Polynomial coefficients for approximating the `gamma_ln` function +const GAMMA_DK: &[f64] = &[ + 2.48574089138753565546e-5, + 1.05142378581721974210, + -3.45687097222016235469, + 4.51227709466894823700, + -2.98285225323576655721, + 1.05639711577126713077, + -1.95428773191645869583e-1, + 1.70970543404441224307e-2, + -5.71926117404305781283e-4, + 4.63399473359905636708e-6, + -2.71994908488607703910e-9, +]; +pub const LN_2_SQRT_E_OVER_PI: f64 = 0.6207822376352452223455184457816472122518527279025978; + +/// Constant value for `ln(pi)` +pub const LN_PI: f64 = 1.1447298858494001741434273513530587116472948129153; + +/// The maximum factorial representable +/// by a 64-bit floating point without +/// overflowing +pub const MAX_FACTORIAL: usize = 170; + +// Initialization for pre-computed cache of 171 factorial +// values 0!...170! +#[allow(clippy::cast_precision_loss)] +const FCACHE: [f64; MAX_FACTORIAL + 1] = { + let mut fcache = [1.0; MAX_FACTORIAL + 1]; + + let mut i = 1; + while i < MAX_FACTORIAL + 1 { + fcache[i] = fcache[i - 1] * i as f64; + + i += 1; + } + fcache +}; + +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct Hypergeometric { + population: u64, + successes: u64, + draws: u64, +} + +impl Hypergeometric { + /// Constructs a new hypergeometric distribution + /// with a population (N) of `population`, number + /// of successes (K) of `successes`, and number of draws + /// (n) of `draws` + /// + /// # Errors + /// + /// If `successes > population` or `draws > population` + pub fn new(population: u64, successes: u64, draws: u64) -> Result { + if successes > population || draws > population { + Err("Invalid params".to_string()) + } else { + Ok(Hypergeometric { + population, + successes, + draws, + }) + } + } + + /// Returns the minimum value in the domain of the + /// hypergeometric distribution representable by a 64-bit + /// integer + /// + /// # Formula + /// + /// ```text + /// max(0, n + K - N) + /// ``` + /// + /// where `N` is population, `K` is successes, and `n` is draws + fn min(&self) -> u64 { + (self.draws + self.successes).saturating_sub(self.population) + } + + /// Returns the maximum value in the domain of the + /// hypergeometric distribution representable by a 64-bit + /// integer + /// + /// # Formula + /// + /// ```text + /// min(K, n) + /// ``` + /// + /// where `K` is successes and `n` is draws + fn max(&self) -> u64 { + cmp::min(self.successes, self.draws) + } + + /// Calculates the survival function for the hypergeometric + /// distribution at `x` + /// + /// # Formula + /// + /// ```text + /// 1 - ((n choose k+1) * (N-n choose K-k-1)) / (N choose K) * 3_F_2(1, + /// k+1-K, k+1-n; k+2, N+k+2-K-n; 1) + /// ``` + /// + /// where `N` is population, `K` is successes, `n` is draws, + /// and `p_F_q` is the [generalized hypergeometric function](https://en.wikipedia.org/wiki/Generalized_hypergeometric_function) + /// + /// Calculated as a discrete integral over the probability mass + /// function evaluated from (k+1)..max + pub fn sf(&self, x: u64) -> f64 { + if x < self.min() { + 1.0 + } else if x >= self.max() { + 0.0 + } else { + let k = x; + let ln_denom = ln_binomial(self.population, self.draws); + ((k + 1)..=self.max()).fold(0.0, |acc, i| { + acc + (ln_binomial(self.successes, i) + + ln_binomial(self.population - self.successes, self.draws - i) + - ln_denom) + .exp() + }) + } + } +} + +/// Computes the logarithm of the gamma function +/// with an accuracy of 16 floating point digits. +/// The implementation is derived from +/// "An Analysis of the Lanczos Gamma Approximation", +/// Glendon Ralph Pugh, 2004 p. 116 +fn ln_gamma(x: f64) -> f64 { + if x < 0.5 { + let s = GAMMA_DK + .iter() + .enumerate() + .skip(1) + .fold(GAMMA_DK[0], |s, t| s + t.1 / (f64_from_usize(t.0) - x)); + + LN_PI + - (std::f64::consts::PI * x).sin().ln() + - s.ln() + - LN_2_SQRT_E_OVER_PI + - (0.5 - x) * ((0.5 - x + GAMMA_R) / std::f64::consts::E).ln() + } else { + let s = GAMMA_DK + .iter() + .enumerate() + .skip(1) + .fold(GAMMA_DK[0], |s, t| { + s + t.1 / (x + f64_from_usize(t.0) - 1.0) + }); + + s.ln() + LN_2_SQRT_E_OVER_PI + (x - 0.5) * ((x - 0.5 + GAMMA_R) / std::f64::consts::E).ln() + } +} + +/// Computes the logarithmic factorial function `x -> ln(x!)` +/// for `x >= 0`. +/// +/// # Remarks +/// +/// Returns `0.0` if `x <= 1` +fn ln_factorial(x: u64) -> f64 { + let x = usize::try_from(x).expect("x must be castable to usize"); + FCACHE + .get(x) + .map_or_else(|| ln_gamma(f64_from_usize(x) + 1.0), |&fac| fac.ln()) +} + +/// Computes the natural logarithm of the binomial coefficient +/// `ln(n choose k)` where `k` and `n` are non-negative values +/// +/// # Remarks +/// +/// Returns `f64::NEG_INFINITY` if `k > n` +pub fn ln_binomial(n: u64, k: u64) -> f64 { + if k > n { + f64::NEG_INFINITY + } else { + ln_factorial(n) - ln_factorial(k) - ln_factorial(n - k) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_fcache() { + assert!((FCACHE[0] - 1.0).abs() < f64::EPSILON); + assert!((FCACHE[1] - 1.0).abs() < f64::EPSILON); + assert!((FCACHE[2] - 2.0).abs() < f64::EPSILON); + assert!((FCACHE[3] - 6.0).abs() < f64::EPSILON); + assert!((FCACHE[4] - 24.0).abs() < f64::EPSILON); + assert!( + ( + FCACHE[70] - + 11978571669969890000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0 + ) + .abs() < f64::EPSILON + ); + assert!( + ( + FCACHE[170] - + 7257415615307994000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0 + ) + .abs() < f64::EPSILON + ); + } + + #[test] + fn test_hypergeom_build() { + let mut result = Hypergeometric::new(2, 2, 2); + assert!(result.is_ok()); + + result = Hypergeometric::new(2, 3, 2); + assert!(result.is_err()); + } + + #[test] + fn test_hypergeom_max() { + let hyper = Hypergeometric::new(50, 25, 13).unwrap(); + assert_eq!(hyper.max(), 13); + + let hyper = Hypergeometric::new(50, 10, 13).unwrap(); + assert_eq!(hyper.max(), 10); + } + + #[test] + fn min() { + let hyper = Hypergeometric::new(50, 25, 30).unwrap(); + assert_eq!(hyper.min(), 5); + + let hyper = Hypergeometric::new(50, 40, 30).unwrap(); + assert_eq!(hyper.min(), 20); + + let hyper = Hypergeometric::new(50, 10, 13).unwrap(); + assert_eq!(hyper.min(), 0); + } + + #[test] + fn test_hypergeom_cdf() { + // Numbers calculated here https://statisticsbyjim.com/probability/hypergeometric-distribution/ + let hyper = Hypergeometric::new(50, 25, 13).unwrap(); + + // more than 1 == 2 or more + assert!((hyper.sf(1) - 0.9996189832542451).abs() < f64::EPSILON); + // more than 3 == 4 or more + assert!((hyper.sf(3) - 0.9746644799047702).abs() < f64::EPSILON); + // more than 7 == 8 or more + assert!((hyper.sf(7) - 0.26009737477738537).abs() < f64::EPSILON); + // more than 12 == 13 or more + assert!((hyper.sf(12) - 0.000014654490222007184).abs() < f64::EPSILON); + + assert!(hyper.sf(13) < f64::EPSILON); + } +} diff --git a/src/term/hpoterm.rs b/src/term/hpoterm.rs index 6be748c..9642d0a 100644 --- a/src/term/hpoterm.rs +++ b/src/term/hpoterm.rs @@ -42,6 +42,9 @@ pub struct HpoTerm<'a> { impl<'a> HpoTerm<'a> { /// Constructs a new [`HpoTerm`] /// + /// Clients should rarely, if ever, use this function. Instead, use the + /// [`Ontology::hpo`] method to get an [`HpoTerm`] + /// /// # Errors /// /// If the given [`HpoTermId`] does not match an existing term @@ -59,6 +62,11 @@ impl<'a> HpoTerm<'a> { /// /// let non_existing_term = HpoTerm::try_new(&ontology, 666666666u32); /// assert!(non_existing_term.is_err()); + /// + /// // better retrieve terms from `Ontology`: + /// + /// let term = ontology.hpo(118u32); + /// assert!(term.is_some()); /// ``` pub fn try_new>(ontology: &'a Ontology, term: I) -> HpoResult> { let term = ontology.get(term).ok_or(HpoError::DoesNotExist)?; diff --git a/src/term/hpotermid.rs b/src/term/hpotermid.rs index 3ad5f10..8f6f9d9 100644 --- a/src/term/hpotermid.rs +++ b/src/term/hpotermid.rs @@ -54,6 +54,17 @@ impl AnnotationId for HpoTermId { impl TryFrom<&str> for HpoTermId { type Error = HpoError; + + /// Parse a str into an `HpoTermId` + /// + /// This method assumes (but does not check!) + /// that the first 3 characters are `HP:`. + /// + /// # Error + /// + /// An empty string or string with less than 4 characters + /// will return an error. + /// The characters after the 4th position must be parsable to `u32`. fn try_from(s: &str) -> HpoResult { if s.len() < 4 { return Err(HpoError::ParseIntError); diff --git a/src/utils.rs b/src/utils.rs index 979dd3a..f825ba2 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -120,4 +120,14 @@ mod test { assert_eq!(c.next(), Some((&4, &4))); assert_eq!(c.next(), None); } + + #[test] + fn combinations_with_none() { + let a = vec![Some(1), Some(2), None, Some(4)]; + let mut c = Combinations::new(&a); + assert_eq!(c.next(), Some((&1, &2))); + assert_eq!(c.next(), Some((&1, &4))); + assert_eq!(c.next(), Some((&2, &4))); + assert_eq!(c.next(), None); + } }