From 2cd007586c2f92d059d3679eaa5929e212ebfb0b Mon Sep 17 00:00:00 2001 From: Tray Torrance Date: Sun, 14 Jul 2024 17:38:51 -0700 Subject: [PATCH] Improve documentation --- Cargo.toml | 2 ++ src/db.rs | 7 +++++-- src/doc.rs | 26 +++++++++++++++++++++++ src/lib.rs | 2 ++ src/query.rs | 2 ++ src/search.rs | 58 +++++++++++++++++++++++++++++++++++++++++++-------- src/term.rs | 15 ++++++++++++- 7 files changed, 100 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index efddbd0..b631072 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,8 @@ version = "0.1.1-dev" edition = "2021" license = "MIT" description = "Rust bindings for the Xapian search engine" +documentation = "https://torrancew.github.io/xapian-rs" +rust-version = "1.70" [dependencies] autocxx = "0.27.0" diff --git a/src/db.rs b/src/db.rs index 6a10e01..4e25d64 100644 --- a/src/db.rs +++ b/src/db.rs @@ -5,6 +5,7 @@ use std::{path::Path, pin::Pin}; use autocxx::{cxx, prelude::*}; use bytes::Bytes; +/// A read-only Xapian database pub struct Database(Pin>); impl Database { @@ -19,11 +20,12 @@ impl Database { self.0.as_mut().close() } + /// Get the number of documents stored in the database pub fn doc_count(&self) -> u32 { self.0.get_doccount().into() } - // Detect whether a given term exists in the database + /// Detect whether a given term exists in the database pub fn term_exists(&self, term: impl AsRef<[u8]>) -> bool { cxx::let_cxx_string!(term = term); self.0.term_exists(&term) @@ -54,10 +56,11 @@ impl From for Database { } } +/// A Xapian database that can be read or written to pub struct WritableDatabase(Pin>); impl Default for WritableDatabase { - /// Open a new, in-memory WritableDatabase + /// Open a new, in-memory [`WritableDatabase`] fn default() -> Self { Self::inmemory() } diff --git a/src/doc.rs b/src/doc.rs index 396a5d5..3383c60 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -8,6 +8,7 @@ use std::{ use autocxx::{cxx, prelude::*}; use bytes::Bytes; +/// A document in a Xapian 
database pub struct Document(Pin>); impl Document { @@ -15,11 +16,13 @@ impl Document { Self(ptr) } + /// Add a boolean term to the document pub fn add_boolean_term(&mut self, term: impl AsRef) { cxx::let_cxx_string!(term = term.as_ref()); self.0.as_mut().add_boolean_term(&term) } + /// Add an occurrence of `term` at the position given by `pos` pub fn add_posting( &mut self, term: impl AsRef, @@ -32,6 +35,7 @@ impl Document { .add_posting(&term, pos, increment.into().unwrap_or(1.into())) } + /// Add a term to the document, without positional information pub fn add_term( &mut self, term: impl AsRef, @@ -43,28 +47,39 @@ impl Document { .add_term(&term, increment.into().unwrap_or(1.into())) } + /// Remove all terms and postings from the document pub fn clear_terms(&mut self) { self.0.as_mut().clear_terms() } + /// Get the data blob stored in this document pub fn data(&self) -> Bytes { ffi::cxx_bytes(&self.0.get_data()) } + /// Get the document ID (if any) associated with this document pub fn id(&self) -> Option { crate::DocId::new(self.0.get_docid()) } + /// Remove `term` and all postings associated with it from this document pub fn remove_term(&mut self, term: impl AsRef) { cxx::let_cxx_string!(term = term.as_ref()); self.0.as_mut().remove_term(&term) } + /// Set the data blob stored alongside this document pub fn set_data(&mut self, data: impl AsRef<[u8]>) { cxx::let_cxx_string!(data = data); self.0.as_mut().set_data(&data); } + /// Set the value stored in the given slot number + /// + /// Xapian values are stored as strings, but are often more useful in some other form. + /// To accommodate this, [`ToValue`][crate::ToValue] is used to serialize data in a + /// Xapian-friendly fashion. This trait is already implemented for most numeric primitives, + /// string types and byte collections. 
pub fn set_value(&mut self, slot: impl Into, value: impl crate::ToValue) { cxx::let_cxx_string!(value = value.serialize()); self.0 @@ -72,6 +87,7 @@ impl Document { .add_value(ffi::valueno::from(slot.into()), &value) } + /// Retrieve an iterator over the terms in this document pub fn terms(&self) -> crate::iter::TermIter { crate::iter::TermIter::new( self.0.termlist_begin().within_box(), @@ -79,6 +95,16 @@ impl Document { ) } + /// Retrieve the value (if any) stored in the given slot number + /// + /// Xapian values are stored as strings, but are often more useful in some other form. + /// To accommodate this, [`FromValue`][crate::FromValue] is used to deserialize data + /// from its Xapian representation. This trait is already implemented for most numeric + /// primitives, string types and byte collections. + /// + /// Returns `None` when there is no value stored in `slot` + /// Returns `Some(Err(T::Error))` when there is a value but deserialization fails + /// Returns `Some(Ok(T))` otherwise pub fn value( &self, slot: impl Into, diff --git a/src/lib.rs b/src/lib.rs index 861548d..6f94075 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +#![doc = include_str!("../README.md")] mod db; use std::num::NonZeroU32; @@ -59,6 +60,7 @@ impl From for u32 { } } +/// A newtype wrapper representing a valid document position #[derive(Debug)] pub struct Position(ffi::termpos); diff --git a/src/query.rs b/src/query.rs index d4cc425..637255d 100644 --- a/src/query.rs +++ b/src/query.rs @@ -95,6 +95,7 @@ impl From for Operator { } } +/// A parsed query, ready for use in a search #[derive(Clone)] pub struct Query(Pin>); @@ -223,6 +224,7 @@ impl Display for Query { } } +/// A type for building [`Query`] objects from strings pub struct QueryParser(Pin>); impl QueryParser { diff --git a/src/search.rs b/src/search.rs index 3c993b7..e970da4 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,4 +1,4 @@ -use crate::ffi; +use crate::{ffi, DocId}; use std::{ cell::{Ref, RefCell}, @@ 
-6,7 +6,6 @@ use std::{ ops::Deref, pin::Pin, rc::Rc, - string::FromUtf8Error, }; use autocxx::{cxx, prelude::*}; @@ -38,11 +37,14 @@ impl DateRangeProcessor { ) } - pub fn upcast(&mut self) -> Pin<&mut ffi::RangeProcessor> { + pub(crate) fn upcast(&mut self) -> Pin<&mut ffi::RangeProcessor> { unsafe { ffi::upcast(self.0.as_mut()) } } } +/// The primary interface to retrieve information from Xapian. +/// +/// Used to perform searches, faceting, term iteration, expansion, sorting, relevancy and more. pub struct Enquire(Pin>); impl Enquire { @@ -50,11 +52,15 @@ impl Enquire { Self(ffi::Enquire::new2(db.as_ref()).within_box()) } + /// Attach a [`MatchSpy`] implementation to this `Enquire` + /// + /// Instances of `MatchSpy` can be used to implement faceting pub fn add_matchspy(&mut self, spy: &T) { let spy = spy.clone().into_ffi(); unsafe { ffi::shim::enquire_add_matchspy(self.0.as_mut(), spy.upcast()) } } + /// Retrieve the [`MSet`] for the current [`Query`][crate::Query] pub fn mset( &self, first: u32, @@ -84,10 +90,12 @@ impl Enquire { ) } + /// Retrieve the query currently associated with this Enquire instance pub fn query(&self) -> crate::Query { crate::Query::from_ffi(ffi::shim::query_clone(self.0.get_query()).within_box()) } + /// Set the query currently associated with this Enquire instance pub fn set_query(&mut self, query: impl AsRef, qlen: impl Into>) { self.0 .as_mut() @@ -101,6 +109,7 @@ impl AsRef for Enquire { } } +/// An individual match item from the iterator yielded by [`MSet::matches`] #[derive(Clone)] pub struct Match { value: ffi::docid, @@ -113,22 +122,27 @@ impl Match { Self { value, ptr } } - pub fn docid(&self) -> u32 { - self.value.into() + /// Retrieve the [`DocId`][crate::DocId] associated with this Match + pub fn docid(&self) -> DocId { + unsafe { DocId::new_unchecked(self.value) } } + /// Retrieve the [`Document`][crate::Document] associated with this Match pub fn document(&self) -> crate::Document { 
crate::Document::new(self.ptr.get_document().within_box()) } + /// Retrieve the weight of this Match, represented as a percentage pub fn percent(&self) -> i32 { self.ptr.get_percent().into() } + /// Retrieve the [`MSet`] rank of this Match pub fn rank(&self) -> u32 { self.ptr.get_rank().into() } + /// Retrieve the weight of this Match pub fn weight(&self) -> f64 { self.ptr.get_weight() } @@ -154,9 +168,12 @@ impl PartialEq for Match { } } +/// A [`MatchDecider`] can be used to reject documents from an [`MSet`] pub trait MatchDecider { + /// Decide whether this document should be included in the `MSet` fn is_match(&self, doc: &crate::Document) -> bool; + #[doc(hidden)] fn into_ffi(self) -> &'static MatchDeciderWrapper where Self: Sized + 'static, @@ -165,6 +182,7 @@ pub trait MatchDecider { } } +#[doc(hidden)] pub struct MatchDeciderWrapper(Rc>); impl MatchDeciderWrapper { @@ -179,9 +197,16 @@ impl From for MatchDeciderWrapper { } } +/// A [`MatchSpy`] can be used to accumulate information seen during the match. 
+/// +/// Useful for faceting and generally profiling matching documents pub trait MatchSpy { + /// Process this [`Document`][crate::Document] + /// + /// Used to collect any desired data/metadata from the document fn observe(&self, doc: &crate::Document, weight: f64); + #[doc(hidden)] fn into_ffi(self) -> &'static mut MatchSpyWrapper where Self: Sized + 'static, @@ -189,11 +214,13 @@ pub trait MatchSpy { Box::leak(Box::new(MatchSpyWrapper::from(self))) } + /// An optional, human-friendly name for the MatchSpy fn name(&self) -> Option { None } } +#[doc(hidden)] pub struct MatchSpyWrapper(Rc>); impl MatchSpyWrapper { @@ -209,6 +236,7 @@ impl From for MatchSpyWrapper { } } +/// A list of search results with associated metadata pub struct MSet(Pin>); impl MSet { @@ -224,14 +252,17 @@ impl MSet { self.0.end().within_box() } + /// Convert a weight to a percentage, taking into account weighted query terms pub fn convert_to_percent(&self, weight: f64) -> i32 { self.0.convert_to_percent(weight).into() } + /// Detects whether this `MSet` is empty pub fn empty(&self) -> bool { self.0.empty() } + /// Retrieve the iterator of [`Match`] objects for this `MSet` pub fn matches(&self) -> crate::iter::MSetIter { crate::iter::MSetIter::new(self) } @@ -240,6 +271,14 @@ impl MSet { self.0.size().into() } + /// Generate a snippet from the provided `text` + /// + /// `length` controls the size of the snippet + /// `stemmer` should be an instance of the same stemming algorithm used to build the query + /// `flags` are used to control specific bits of functionality + /// `hl` is an optional pair of string-likes used to highlight matches within the snippet, for use in markup + /// `omit` is used to indicate any truncated prefix or suffix + /// mid-sen pub fn snippet( &self, text: impl AsRef, @@ -248,7 +287,7 @@ impl MSet { flags: u32, hl: impl Into>, omit: impl Into>, - ) -> Result + ) -> String where T: AsRef + Default, U: AsRef + Default, @@ -269,9 +308,10 @@ impl MSet { &omit, ); - 
String::from_utf8(Vec::from(text.as_bytes())) + text.to_string() } + /// Get the number of documents which `term` occurs in pub fn termfreq(&self, term: impl AsRef) -> u32 { cxx::let_cxx_string!(term = term.as_ref()); self.0.get_termfreq(&term).into() @@ -311,7 +351,7 @@ impl NumberRangeProcessor { ) } - pub fn upcast(&mut self) -> Pin<&mut ffi::RangeProcessor> { + pub(crate) fn upcast(&mut self) -> Pin<&mut ffi::RangeProcessor> { unsafe { ffi::upcast(self.0.as_mut()) } } } @@ -375,7 +415,7 @@ impl RangeProcessor { ) } - pub fn upcast(&mut self) -> Pin<&mut ffi::RangeProcessor> { + pub(crate) fn upcast(&mut self) -> Pin<&mut ffi::RangeProcessor> { self.0.as_mut() } } diff --git a/src/term.rs b/src/term.rs index 898ebf5..829aaf8 100644 --- a/src/term.rs +++ b/src/term.rs @@ -14,6 +14,7 @@ use autocxx::{ prelude::*, }; +/// A strategy to apply to a `Stem` instance pub enum StemStrategy { None, Some, @@ -82,9 +83,11 @@ impl From for ffi::TermGenerator_stem_strategy { } } +/// An instance of a Stemming algorithm pub struct Stem(Pin>); impl Stem { + /// List all languages with an available Stem instance in the underlying Xapian library pub fn languages() -> HashSet { ffi::Stem::get_available_languages() .to_string() @@ -93,15 +96,18 @@ impl Stem { .collect() } + /// Returns true if this Stem instance is a no-op pub fn is_noop(&self) -> bool { self.0.is_none() } + /// Returns a stemmer instance for the given language, if one exists pub fn for_language(lang: impl AsRef) -> Self { cxx::let_cxx_string!(lang = lang.as_ref()); Self(ffi::Stem::new3(&lang).within_box()) } + /// Run the underlying stem algorithm against `word`, returning its stemmed form pub fn stem(&self, word: impl AsRef) -> String { cxx::let_cxx_string!(word = word.as_ref()); ffi::shim::stemmer_stem(&self.0, &word).to_string() @@ -114,9 +120,13 @@ impl AsRef for Stem { } } +/// Determines whether a given term matches a `stopword`. +/// Stopwords are not typically indexed or included in parsed queries. 
pub trait Stopper { + /// Evaluate whether a given word is a stopword. fn is_stopword(&self, word: &str) -> bool; + #[doc(hidden)] fn into_ffi(self) -> &'static StopperWrapper where Self: Sized + 'static, @@ -125,10 +135,11 @@ pub trait Stopper { } } +#[doc(hidden)] pub struct StopperWrapper(Rc>); impl StopperWrapper { - pub fn upcast(&self) -> impl Deref + '_ { + pub(crate) fn upcast(&self) -> impl Deref + '_ { Ref::map(self.0.borrow(), |s| s.as_ref()) } } @@ -139,6 +150,7 @@ impl From for StopperWrapper { } } +/// An individual `term`, with access to position and frequency information pub struct Term { value: UniquePtr, ptr: Pin>, @@ -205,6 +217,7 @@ impl PartialOrd for Term { } } +/// An instance of a Xapian TermGenerator, which can be used to index text with optional stemming pub struct TermGenerator(Pin>); impl TermGenerator {