From 07ee7a1cafd7086934d51070d68db45fd2ef2272 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Mon, 12 Aug 2024 14:55:54 -0700 Subject: [PATCH] docs: bootstrap basic documentation --- .DS_Store | Bin 0 -> 6148 bytes docs/api/sampler.md | 114 ++++++++++++++++++++++++++++++++++++++++++++ docs/index.md | 0 mkdocs.yml | 20 ++++++++ 4 files changed, 134 insertions(+) create mode 100644 .DS_Store create mode 100644 docs/api/sampler.md create mode 100644 docs/index.md create mode 100644 mkdocs.yml diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..ca2b7874ee44cef1857d5487e8c8184223131c55 GIT binary patch literal 6148 zcmeH~F$w}f3`G;&La^D=avBfd4F=H@>;(h`8(BfodXDZ-CJ2t!BJu;tpJXO1`-+{7 zi0JxuSc&u^GJ~7S(n4d3ypw~RWiQwJa2ZeM@rat$Cvn!+@Lrnz*rt#G36KB@kN^q% z5COZlVY7KvMiL+a5_l4@??Zx{=Fn2rKOG1@0zf;I-LUpq0-CG<&7q|#Dlm=dL8DcD z46(YmLsOi~p`~hV7meXV4M3`}dgXhH(h?7~0-B+w9;*1Wg-e+&OK|2Hj6Nq_|Y zjDU8VVY9|d#ohY$dRE^>)z$?L_2URHKLJSWDqg_du%B!J&7q|#Dlq;CI0gn1_$q-1 D?w=C8 literal 0 HcmV?d00001 diff --git a/docs/api/sampler.md b/docs/api/sampler.md new file mode 100644 index 000000000..2ca3a9e10 --- /dev/null +++ b/docs/api/sampler.md @@ -0,0 +1,114 @@ +# Sampler Module + +## Overview + +The `sampler` module provides functionality for generating text samples based on templates and data. It is primarily used for creating datasets for natural language processing tasks in chemistry and related fields. The main class in this module is `TemplateSampler`, which allows for flexible text generation with support for multiple choice questions and class balancing. + +## TemplateSampler + +### Class: TemplateSampler + +The `TemplateSampler` class is responsible for sampling and generating text based on templates and data. + +#### Initialization + +```python +sampler = TemplateSampler(df: pd.DataFrame, meta: Dict, config: Dict, column_datafield_sampler: Optional[Callable] = None) +``` + +- `df`: A pandas DataFrame containing the dataset. 
+- `meta`: A dictionary containing metadata about the dataset, including identifiers and targets. +- `config`: A dictionary containing configuration parameters for the sampler. +- `column_datafield_sampler`: An optional callable for custom sampling from multiple options. + +#### Main Methods + +##### sample + +```python +def sample(self, sample: Optional[pd.Series], template: str) -> str +``` + +Generates a text sample based on a template and a data sample. + +- `sample`: A row from the dataset. If `None`, a random row is drawn from the dataset (from the class-balanced data when class balancing is enabled). +- `template`: The template string to be filled. +- Returns: The completed text sample with all variables replaced by their values. + +##### enable_class_balancing + +```python +def enable_class_balancing(self, column: str) +``` + +Enables class-balanced sampling for a specified column. + +- `column`: The column to use for balancing. + +##### disable_class_balancing + +```python +def disable_class_balancing(self) +``` + +Disables class-balanced sampling and reverts to the original dataset. + +#### Usage Examples + +Basic usage: + +```python +import pandas as pd +from chemnlp.data.sampler import TemplateSampler + +# Prepare your data, metadata, and config +df = pd.DataFrame(...) +meta = {...} +config = {...} + +# Initialize the sampler +sampler = TemplateSampler(df, meta, config) + +# Define a template +template = "The molecule with SMILES {SMILES#} has a {property#} of {value#}." + +# Generate a sample +result = sampler.sample(df.iloc[0], template) +print(result) +``` + +Using class balancing: + +```python +# Enable class balancing +sampler.enable_class_balancing("target_column") + +# Generate balanced samples +balanced_results = [sampler.sample(None, template) for _ in range(100)] + +# Disable class balancing when done +sampler.disable_class_balancing() +``` + +Multiple choice question: + +```python +multiple_choice_template = """ +Question: What is the {property__names__noun} of the molecule with SMILES {SMILES#}? 
+Options: {%multiple_choice_enum%4%aA1} +{value%} +Answer: {%multiple_choice_result} +""" + +mc_result = sampler.sample(df.iloc[0], multiple_choice_template) +print(mc_result) +``` + +## Notes + +- The `TemplateSampler` class supports various types of templates, including those with multiple choice questions. +- Class balancing can be useful for creating balanced datasets for machine learning tasks. +- The sampler can handle both categorical and continuous data types, with proper formatting for continuous values. +- Custom sampling functions can be provided for more control over how values are selected from multiple options. + +For more detailed information on the implementation and advanced usage, please refer to the source code and unit tests. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..e69de29bb diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..2ad61efcd --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,20 @@ +site_name: ChemNLP Documentation +theme: + name: material + palette: + primary: teal +nav: + - Home: index.md + - User Guide: + - Installation: user-guide/installation.md + - Quick Start: user-guide/quickstart.md + - API Reference: + - Sampler Module: api/sampler.md + - Examples: + - Basic Usage: examples/basic-usage.md + - Advanced Techniques: examples/advanced-techniques.md + - Contributing: contributing.md + - Changelog: changelog.md +markdown_extensions: + - pymdownx.highlight + - pymdownx.superfences