diff --git a/docs/zh_cn/user_guides/incremental_pretraining.md b/docs/zh_cn/user_guides/incremental_pretraining.md index 32a331d7e..4964396ea 100644 --- a/docs/zh_cn/user_guides/incremental_pretraining.md +++ b/docs/zh_cn/user_guides/incremental_pretraining.md @@ -191,14 +191,16 @@ from datasets import load_dataset ####################################################################### - data_path = 'timdettmers/openassistant-guanaco' - prompt_template = PROMPT_TEMPLATE.openassistant -+ data_path = 'path/to/your/data' ++ data_path = 'path/to/your/json/data' ... ####################################################################### # STEP 3 Dataset & Dataloader # ####################################################################### train_dataset = dict( type=process_hf_dataset, - dataset=dict(type=load_dataset, path=data_path), +- dataset=dict(type=load_dataset, path=data_path), ++ dataset=dict( ++ type=load_dataset, path='json', data_files=dict(train=data_path)), tokenizer=tokenizer, max_length=max_length, - dataset_map_fn=oasst1_map_fn, diff --git a/docs/zh_cn/user_guides/multi_turn_conversation.md b/docs/zh_cn/user_guides/multi_turn_conversation.md index 7bc626496..a05213aaf 100644 --- a/docs/zh_cn/user_guides/multi_turn_conversation.md +++ b/docs/zh_cn/user_guides/multi_turn_conversation.md @@ -260,7 +260,7 @@ from datasets import load_dataset # PART 1 Settings # ####################################################################### - data_path = 'timdettmers/openassistant-guanaco' -+ data_path = 'path/to/your/data' ++ data_path = 'path/to/your/json/data' + prompt_template = PROMPT_TEMPLATE.openassistant ... @@ -269,7 +269,9 @@ from datasets import load_dataset ####################################################################### train_dataset = dict( type=process_hf_dataset, - dataset=dict(type=load_dataset, path=data_path), +- dataset=dict(type=load_dataset, path=data_path), ++ dataset=dict( ++ type=load_dataset, path='json', data_files=dict(train=data_path)), tokenizer=tokenizer, max_length=max_length, + dataset_map_fn=None, diff --git a/docs/zh_cn/user_guides/single_turn_conversation.md b/docs/zh_cn/user_guides/single_turn_conversation.md index c754105f5..7336a3335 100644 --- a/docs/zh_cn/user_guides/single_turn_conversation.md +++ b/docs/zh_cn/user_guides/single_turn_conversation.md @@ -228,7 +228,7 @@ from datasets import load_dataset ####################################################################### - alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' - alpaca_en_path = 'tatsu-lab/alpaca' -+ data_path = 'path/to/your/data' ++ data_path = 'path/to/your/json/data' + prompt_template = PROMPT_TEMPLATE.alpaca ####################################################################### @@ -236,7 +236,9 @@ from datasets import load_dataset ####################################################################### train_dataset = dict( type=process_hf_dataset, - dataset=dict(type=load_dataset, path=data_path), +- dataset=dict(type=load_dataset, path=data_path), ++ dataset=dict( ++ type=load_dataset, path='json', data_files=dict(train=data_path)), tokenizer=tokenizer, max_length=max_length, + dataset_map_fn=None,