Skip to content

Commit

Permalink
[Doc] Fix dataset docs (#87)
Browse files: browse the repository at this point in the history
fix docs
  • Loading branch information
HIT-cwh authored Sep 1, 2023
1 parent 3e5e801 commit ae8b8d2
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 6 deletions.
6 changes: 4 additions & 2 deletions docs/zh_cn/user_guides/incremental_pretraining.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,16 @@ from datasets import load_dataset
#######################################################################
- data_path = 'timdettmers/openassistant-guanaco'
- prompt_template = PROMPT_TEMPLATE.openassistant
+ data_path = 'path/to/your/data'
+ data_path = 'path/to/your/json/data'
...
#######################################################################
# STEP 3 Dataset & Dataloader #
#######################################################################
train_dataset = dict(
type=process_hf_dataset,
dataset=dict(type=load_dataset, path=data_path),
- dataset=dict(type=load_dataset, path=data_path),
+ dataset=dict(
+ type=load_dataset, path='json', data_files=dict(train=data_path)),
tokenizer=tokenizer,
max_length=max_length,
- dataset_map_fn=oasst1_map_fn,
Expand Down
6 changes: 4 additions & 2 deletions docs/zh_cn/user_guides/multi_turn_conversation.md
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ from datasets import load_dataset
# PART 1 Settings #
#######################################################################
- data_path = 'timdettmers/openassistant-guanaco'
+ data_path = 'path/to/your/data'
+ data_path = 'path/to/your/json/data'

+ prompt_template = PROMPT_TEMPLATE.openassistant
...
Expand All @@ -269,7 +269,9 @@ from datasets import load_dataset
#######################################################################
train_dataset = dict(
type=process_hf_dataset,
dataset=dict(type=load_dataset, path=data_path),
- dataset=dict(type=load_dataset, path=data_path),
+ dataset=dict(
+ type=load_dataset, path='json', data_files=dict(train=data_path)),
tokenizer=tokenizer,
max_length=max_length,
+ dataset_map_fn=None,
Expand Down
6 changes: 4 additions & 2 deletions docs/zh_cn/user_guides/single_turn_conversation.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,15 +228,17 @@ from datasets import load_dataset
#######################################################################
- alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
- alpaca_en_path = 'tatsu-lab/alpaca'
+ data_path = 'path/to/your/data'
+ data_path = 'path/to/your/json/data'

+ prompt_template = PROMPT_TEMPLATE.alpaca
#######################################################################
# STEP 3 Dataset & Dataloader #
#######################################################################
train_dataset = dict(
type=process_hf_dataset,
dataset=dict(type=load_dataset, path=data_path),
- dataset=dict(type=load_dataset, path=data_path),
+ dataset=dict(
+ type=load_dataset, path='json', data_files=dict(train=data_path)),
tokenizer=tokenizer,
max_length=max_length,
+ dataset_map_fn=None,
Expand Down

0 comments on commit ae8b8d2

Please sign in to comment.