From e4eab2f68662a11ee4fb5527290ca736bd9b121a Mon Sep 17 00:00:00 2001
From: GitHub Actions
Date: Thu, 15 Feb 2024 14:33:03 +0000
Subject: [PATCH] Update documentation

---
 docs/config.html          |  6 +++---
 docs/custom_datasets.html | 42 +++++++++++++--------------------------
 2 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/docs/config.html b/docs/config.html
index 40961e0..6b03a64 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -175,7 +175,7 @@

Module mimir.config

"Dump data to cache? Exits program after dumping" load_from_cache: Optional[bool] = False """Load data from cache?""" - load_from_hf: Optional[bool] = False + load_from_hf: Optional[bool] = True """Load data from HuggingFace?""" blackbox_attacks: Optional[List[str]] = field( default_factory=lambda: None @@ -352,7 +352,7 @@

Class variables

 class ExperimentConfig
-(experiment_name: str, base_model: str, dataset_member: str, dataset_nonmember: str, output_name: str = None, dataset_nonmember_other_sources: Optional[List[str]] = <factory>, pretokenized: Optional[bool] = False, revision: Optional[str] = None, presampled_dataset_member: Optional[str] = None, presampled_dataset_nonmember: Optional[str] = None, token_frequency_map: Optional[str] = None, dataset_key: Optional[str] = None, specific_source: Optional[str] = None, full_doc: Optional[bool] = False, max_substrs: Optional[int] = 20, dump_cache: Optional[bool] = False, load_from_cache: Optional[bool] = False, load_from_hf: Optional[bool] = False, blackbox_attacks: Optional[List[str]] = <factory>, tokenization_attack: Optional[bool] = False, quantile_attack: Optional[bool] = False, n_samples: Optional[int] = 200, max_tokens: Optional[int] = 512, max_data: Optional[int] = 5000, min_words: Optional[int] = 100, max_words: Optional[int] = 200, max_words_cutoff: Optional[bool] = True, batch_size: Optional[int] = 50, chunk_size: Optional[int] = 20, scoring_model_name: Optional[str] = None, top_k: Optional[int] = 40, do_top_k: Optional[bool] = False, top_p: Optional[float] = 0.96, do_top_p: Optional[bool] = False, pre_perturb_pct: Optional[float] = 0.0, pre_perturb_span_length: Optional[int] = 5, tok_by_tok: Optional[bool] = False, fpr_list: Optional[List[float]] = <factory>, random_seed: Optional[int] = 0, ref_config: Optional[ReferenceConfig] = None, neighborhood_config: Optional[NeighborhoodConfig] = None, env_config: Optional[EnvironmentConfig] = None, openai_config: Optional[OpenAIConfig] = None)
+(experiment_name: str, base_model: str, dataset_member: str, dataset_nonmember: str, output_name: str = None, dataset_nonmember_other_sources: Optional[List[str]] = <factory>, pretokenized: Optional[bool] = False, revision: Optional[str] = None, presampled_dataset_member: Optional[str] = None, presampled_dataset_nonmember: Optional[str] = None, token_frequency_map: Optional[str] = None, dataset_key: Optional[str] = None, specific_source: Optional[str] = None, full_doc: Optional[bool] = False, max_substrs: Optional[int] = 20, dump_cache: Optional[bool] = False, load_from_cache: Optional[bool] = False, load_from_hf: Optional[bool] = True, blackbox_attacks: Optional[List[str]] = <factory>, tokenization_attack: Optional[bool] = False, quantile_attack: Optional[bool] = False, n_samples: Optional[int] = 200, max_tokens: Optional[int] = 512, max_data: Optional[int] = 5000, min_words: Optional[int] = 100, max_words: Optional[int] = 200, max_words_cutoff: Optional[bool] = True, batch_size: Optional[int] = 50, chunk_size: Optional[int] = 20, scoring_model_name: Optional[str] = None, top_k: Optional[int] = 40, do_top_k: Optional[bool] = False, top_p: Optional[float] = 0.96, do_top_p: Optional[bool] = False, pre_perturb_pct: Optional[float] = 0.0, pre_perturb_span_length: Optional[int] = 5, tok_by_tok: Optional[bool] = False, fpr_list: Optional[List[float]] = <factory>, random_seed: Optional[int] = 0, ref_config: Optional[ReferenceConfig] = None, neighborhood_config: Optional[NeighborhoodConfig] = None, env_config: Optional[EnvironmentConfig] = None, openai_config: Optional[OpenAIConfig] = None)

Config for attacks

@@ -403,7 +403,7 @@

Class variables

"Dump data to cache? Exits program after dumping" load_from_cache: Optional[bool] = False """Load data from cache?""" - load_from_hf: Optional[bool] = False + load_from_hf: Optional[bool] = True """Load data from HuggingFace?""" blackbox_attacks: Optional[List[str]] = field( default_factory=lambda: None diff --git a/docs/custom_datasets.html b/docs/custom_datasets.html index b8fb006..3864f7d 100644 --- a/docs/custom_datasets.html +++ b/docs/custom_datasets.html @@ -41,6 +41,20 @@

Module mimir.custom_datasets

 DATASETS = ['writing', 'english', 'german', 'pubmed']
+SOURCES_UPLOADED = [
+    "arxiv",
+    "dm_mathematics",
+    "github",
+    "hackernews",
+    "pile_cc",
+    "pubmed_central",
+    "wikipedia_(en)",
+    "full_pile",
+    "c4",
+    "temporal_arxiv",
+    "temporal_wiki"
+]
+
 def load_pubmed(cache_dir):
     data = datasets.load_dataset('pubmed_qa', 'pqa_labeled', split='train', cache_dir=cache_dir)
@@ -70,20 +84,6 @@

Module mimir.custom_datasets

     if not filename.startswith("the_pile"):
         raise ValueError(f"HuggingFace data only available for The Pile.")
-    SOURCES_UPLOADED = [
-        "arxiv",
-        "dm_mathematics",
-        "github",
-        "hackernews",
-        "pile_cc",
-        "pubmed_central",
-        "wikipedia_(en)",
-        "full_pile",
-        "c4",
-        "temporal_arxiv",
-        "temporal_wiki"
-    ]
-
     for source in SOURCES_UPLOADED:
         # Got a match
         if source in filename and filename.startswith(f"the_pile_{source}"):
@@ -292,20 +292,6 @@

Functions

     if not filename.startswith("the_pile"):
         raise ValueError(f"HuggingFace data only available for The Pile.")
-    SOURCES_UPLOADED = [
-        "arxiv",
-        "dm_mathematics",
-        "github",
-        "hackernews",
-        "pile_cc",
-        "pubmed_central",
-        "wikipedia_(en)",
-        "full_pile",
-        "c4",
-        "temporal_arxiv",
-        "temporal_wiki"
-    ]
-
     for source in SOURCES_UPLOADED:
         # Got a match
         if source in filename and filename.startswith(f"the_pile_{source}"):
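
Taken together, the patch records one behavioral change (load_from_hf now defaults to True in ExperimentConfig) and one refactor (SOURCES_UPLOADED promoted from a local variable to a module-level constant in mimir.custom_datasets). The Python sketch below only illustrates how those pieces fit together, assuming ExperimentConfig is constructed directly: the experiment name, model, dataset values, and the matches_uploaded_source helper are hypothetical, while the field names and the source list come from the hunks above.

from mimir.config import ExperimentConfig

# load_from_hf now defaults to True, so this config will pull member /
# non-member splits from HuggingFace instead of a local cache directory.
config = ExperimentConfig(
    experiment_name="pile_wikipedia_mia",   # hypothetical experiment name
    base_model="EleutherAI/pythia-160m",    # hypothetical target model
    dataset_member="the_pile",              # illustrative dataset values
    dataset_nonmember="the_pile",
    specific_source="wikipedia_(en)",       # one of the SOURCES_UPLOADED entries
)
assert config.load_from_hf is True  # new default after this patch

# Rough restatement of the filename check shown in docs/custom_datasets.html:
# only names of the form "the_pile_<source>..." for a known source are accepted.
SOURCES_UPLOADED = [
    "arxiv", "dm_mathematics", "github", "hackernews", "pile_cc",
    "pubmed_central", "wikipedia_(en)", "full_pile", "c4",
    "temporal_arxiv", "temporal_wiki",
]

def matches_uploaded_source(filename: str) -> bool:
    # Mirrors the loop in the diff: require the "the_pile" prefix, then look
    # for a per-source prefix "the_pile_<source>".
    if not filename.startswith("the_pile"):
        raise ValueError("HuggingFace data only available for The Pile.")
    return any(
        source in filename and filename.startswith(f"the_pile_{source}")
        for source in SOURCES_UPLOADED
    )

print(matches_uploaded_source("the_pile_wikipedia_(en)"))  # True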