-
Notifications
You must be signed in to change notification settings - Fork 3
/
pre_process_data.py
executable file
·44 lines (33 loc) · 1.3 KB
/
pre_process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
# JSON files are used as the default input file type.
DEFAULT_FILE_TYPE = ".json"
def is_default_file_type():
    """Report whether the default input file type is in use.

    Change this to return ``False`` when a custom file type or a custom
    data source is used instead.
    """
    return True
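# Example implementation:
# def load():
#     # Choose the encoding that matches the file's actual encoding.
#     with open("/user/npl/gpt/data/train.txt", 'r', encoding='gbk') as f:
#         print('reading lines')
#         lines = f.readlines()
#         lines = [line.replace('\n', ' [SEP] ') for line in lines]  # [SEP] stands for a line break; between paragraphs it marks the end of a paragraph
#     return lines
# Load the file using its original encoding.
# This method must be implemented when a custom file type or data source is used.
# It must ultimately return a list; see the example above for details.
# def load():
#     pass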
def load(path="data/train.txt", encoding="utf-8"):
    """Read a text corpus and return its lines with line breaks marked as [SEP].

    Args:
        path: Text file to read. Defaults to ``data/train.txt``, the
            location that used to be hard-coded here.
        encoding: Encoding used to decode the file; choose the one the file
            was actually saved in. Defaults to ``utf-8``.

    Returns:
        A list with one string per line of the file, in file order. The
        newline character of each line is replaced by ``' [SEP] '``. A last
        line that does not end with a newline is returned unchanged, and an
        empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content does not match ``encoding``.
    """
    with open(path, 'r', encoding=encoding) as f:
        print('reading lines')
        # [SEP] stands in for the line break and marks where a paragraph ends.
        return [line.replace('\n', ' [SEP] ') for line in f]
# Runs whenever this module is executed or imported: take the list returned
# by load() and store it as one JSON array in data/train.json.
a = load()
with open('data/train.json', 'w', encoding='utf-8') as json_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of
    # \uXXXX escapes.
    json_file.write(json.dumps(a, ensure_ascii=False))
print("write json file success!")