-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfind_index.py
95 lines (66 loc) · 3.19 KB
/
find_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import re
import pandas as pd
def find_entity(sentence):
    """Strip the entity markers from a sentence and locate both entities.

    The subject entity is expected to be wrapped in ``*`` and the object
    entity in ``&``. Only the first ``*...*`` and the first ``&...&`` span
    are treated as the entities.

    Args:
        sentence: Sentence with entity marking (``*`` : subj entity,
            ``&`` : obj entity).

    Returns:
        A 5-tuple ``(sentence, subj, obj, subj_dict, obj_dict)``:
            sentence  : the sentence with the entity markers removed
            subj      : subject entity text (without markers)
            obj       : object entity text (without markers)
            subj_dict : {'start_idx': int, 'end_idx': int}
            obj_dict  : {'start_idx': int, 'end_idx': int}
        ``start_idx`` is the entity's offset in the sentence at the moment
        its markers are removed and ``end_idx`` is ``start_idx`` plus the
        entity length (i.e. exclusive).

    Raises:
        ValueError: If the subject or object marker pair is missing, or if
            the two marked spans overlap so that one cannot be located
            after the other has been unmarked.

    Example:
        &독일&의 *게오르그 하클*은 이번 대회 루지 싱글 종목에서 은메달을 획득함으로써, 개인종목에서 올림픽 대회 5연속 메달 획득에 성공한 최초의 선수가 되었다.
        sentence : 독일의 게오르그 하클은 이번 대회 루지 싱글 종목에서 은메달을 획득함으로써, 개인종목에서 올림픽 대회 5연속 메달 획득에 성공한 최초의 선수가 되었다.
        subj : 게오르그 하클
        obj : 독일
        subj_dict : {'start_idx': 4, 'end_idx': 11}
        obj_dict : {'start_idx': 0, 'end_idx': 2}
    """
    m_subj = re.search(r"\*[^*]+\*", sentence)
    m_obj = re.search(r"&[^&]+&", sentence)
    if m_subj is None or m_obj is None:
        raise ValueError(
            "sentence must contain a *subject* and an &object& marker pair: "
            f"{sentence!r}"
        )

    subj_with_marker = m_subj.group()
    obj_with_marker = m_obj.group()
    subj = subj_with_marker[1:-1]
    obj = obj_with_marker[1:-1]
    subj_dict = dict()
    obj_dict = dict()

    # Unmark the entity that appears first, then the other one, so that each
    # index is measured after the markers in front of it have been removed.
    steps = [
        (subj_with_marker, subj, subj_dict),
        (obj_with_marker, obj, obj_dict),
    ]
    if m_subj.start() >= m_obj.start():
        steps.reverse()

    for marked, plain, index_dict in steps:
        # Literal search/replace: the entity text is data, not a regex, so
        # characters such as '+', '(' or '\\' must not be interpreted.
        start = sentence.find(marked)
        if start < 0:
            raise ValueError(
                f"entity markers overlap; cannot locate {marked!r}"
            )
        index_dict["start_idx"] = start
        index_dict["end_idx"] = start + len(plain)
        sentence = sentence.replace(marked, plain)

    return sentence, subj, obj, subj_dict, obj_dict
if __name__ == '__main__':
    # Read the marked-up sentences together with their entity type columns.
    source_df = pd.read_csv('./temp.csv', encoding='utf8')
    subj_types = source_df['subj_type']
    obj_types = source_df['obj_type']

    # Each result is (sentence, subj, obj, subj_dict, obj_dict).
    parsed = [find_entity(marked) for marked in source_df["sentence"]]

    # Column order below is the column order of the written CSV.
    result_df = pd.DataFrame({
        'id': range(len(parsed)),
        'sentence': [row[0] for row in parsed],
        'subj_entity': [row[1] for row in parsed],
        'obj_entity': [row[2] for row in parsed],
        'subj_type': subj_types,
        'obj_type': obj_types,
        'subj_index': [row[3] for row in parsed],
        'obj_index': [row[4] for row in parsed],
    })
    result_df.to_csv('append.csv')