From 07ee7a1cafd7086934d51070d68db45fd2ef2272 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <mail@kjablonka.com>
Date: Mon, 12 Aug 2024 14:55:54 -0700
Subject: [PATCH] docs: bootstrap basic documentation

---
 .DS_Store           | Bin 0 -> 6148 bytes
 docs/api/sampler.md | 114 ++++++++++++++++++++++++++++++++++++++++++++
 docs/index.md       |   0
 mkdocs.yml          |  20 ++++++++
 4 files changed, 134 insertions(+)
 create mode 100644 .DS_Store
 create mode 100644 docs/api/sampler.md
 create mode 100644 docs/index.md
 create mode 100644 mkdocs.yml
diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..ca2b7874ee44cef1857d5487e8c8184223131c55
GIT binary patch
literal 6148
zcmeH~F$w}f3`G;&La^D=avBfd4F=H@>;(h`8(BfodXDZ-CJ2t!BJu;tpJXO1`-+{7
zi0JxuSc&u^GJ~7S(n4d3ypw~RWiQwJa2ZeM@rat$Cvn!+@Lrnz*rt#G36KB@kN^q%
z5COZlVY7KvMiL+a5_l4@??Zx{=Fn2rKOG1@0zf;I-LUpq0-CG<&7q|#Dlm=dL8DcD
z46(YmLsOi~p`~hV7meXV<H>4M3`}dgXhH(h?7~0-B+w9;*1Wg-e+&OK|2Hj6Nq_|Y
zjDU8VVY9|d#ohY$dRE^>)z$?L_2URHKLJSWDqg_du%B!J&7q|#Dlq;CI0gn1_$q-1
D?w=C8

literal 0
HcmV?d00001

diff --git a/docs/api/sampler.md b/docs/api/sampler.md
new file mode 100644
index 000000000..2ca3a9e10
--- /dev/null
+++ b/docs/api/sampler.md
@@ -0,0 +1,114 @@
+# Sampler Module
+
+## Overview
+
+The `sampler` module provides functionality for generating text samples based on templates and data. It is primarily used for creating datasets for natural language processing tasks in chemistry and related fields. The main class in this module is `TemplateSampler`, which allows for flexible text generation with support for multiple choice questions and class balancing.
+
+## TemplateSampler
+
+### Class: TemplateSampler
+
+The `TemplateSampler` class is responsible for sampling and generating text based on templates and data.
+
+#### Initialization
+
+```python
+def __init__(self, df: pd.DataFrame, meta: Dict, config: Dict, column_datafield_sampler: Optional[Callable] = None)
+```
+
+- `df`: A pandas DataFrame containing the dataset.
+- `meta`: A dictionary containing metadata about the dataset, including identifiers and targets.
+- `config`: A dictionary containing configuration parameters for the sampler.
+- `column_datafield_sampler`: An optional callable for custom sampling from multiple options.
+
+#### Main Methods
+
+##### sample
+
+```python
+def sample(self, sample: Optional[pd.Series], template: str) -> str
+```
+
+Generates a text sample based on a template and a data sample.
+
+- `sample`: A row from the dataset. If `None`, a row is drawn at random from the dataset.
+- `template`: The template string to be filled.
+- Returns: The completed text sample with all variables replaced by their values.
+
+##### enable_class_balancing
+
+```python
+def enable_class_balancing(self, column: str)
+```
+
+Enables class-balanced sampling for a specified column.
+
+- `column`: The column to use for balancing.
+
+##### disable_class_balancing
+
+```python
+def disable_class_balancing(self)
+```
+
+Disables class-balanced sampling and reverts to the original dataset.
+
+#### Usage Examples
+
+Basic usage:
+
+```python
+import pandas as pd
+from chemnlp.data.sampler import TemplateSampler
+
+# Prepare your data, metadata, and config
+df = pd.DataFrame(...)
+meta = {...}
+config = {...}
+
+# Initialize the sampler
+sampler = TemplateSampler(df, meta, config)
+
+# Define a template
+template = "The molecule with SMILES {SMILES#} has a {property#} of {value#}."
+
+# Generate a sample
+result = sampler.sample(df.iloc[0], template)
+print(result)
+```
+
+Using class balancing:
+
+```python
+# Enable class balancing
+sampler.enable_class_balancing("target_column")
+
+# Generate balanced samples
+balanced_results = [sampler.sample(None, template) for _ in range(100)]
+
+# Disable class balancing when done
+sampler.disable_class_balancing()
+```
+
+Multiple choice question:
+
+```python
+multiple_choice_template = """
+Question: What is the {property__names__noun} of the molecule with SMILES {SMILES#}?
+Options: {%multiple_choice_enum%4%aA1}
+{value%}
+Answer: {%multiple_choice_result}
+"""
+
+mc_result = sampler.sample(df.iloc[0], multiple_choice_template)
+print(mc_result)
+```
+
+## Notes
+
+- The `TemplateSampler` class supports various types of templates, including those with multiple choice questions.
+- Class balancing can be useful for creating balanced datasets for machine learning tasks.
+- The sampler can handle both categorical and continuous data types, with proper formatting for continuous values.
+- Custom sampling functions can be provided for more control over how values are selected from multiple options.
+
+For more detailed information on the implementation and advanced usage, please refer to the source code and unit tests.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 000000000..2ad61efcd
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,20 @@
+site_name: ChemNLP Documentation
+theme:
+  name: material  # provided by the mkdocs-material package
+  palette:
+    primary: teal
+nav:  # commented-out entries point to pages not added yet; restore with the page
+  - Home: index.md
+  # - User Guide:
+  #     - Installation: user-guide/installation.md
+  #     - Quick Start: user-guide/quickstart.md
+  - API Reference:
+      - Sampler Module: api/sampler.md
+  # - Examples:
+  #     - Basic Usage: examples/basic-usage.md
+  #     - Advanced Techniques: examples/advanced-techniques.md
+  # - Contributing: contributing.md
+  # - Changelog: changelog.md
+markdown_extensions:
+  - pymdownx.highlight  # syntax highlighting for fenced code blocks
+  - pymdownx.superfences  # nestable fenced code blocks