-
Notifications
You must be signed in to change notification settings - Fork 2
/
biolu_encode.py
85 lines (61 loc) · 2.95 KB
/
biolu_encode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import argparse
# Command-line interface: two required positional arguments, input path first,
# output path second.
parser = argparse.ArgumentParser(description='Change encoding from BIO to BIOLU')
# Both arguments are positional, so the names shown in the help text must not
# look like option flags. With metavar '-i' / '-o' the help advertised flags
# that argparse does not actually accept (passing "-i file" is an error).
parser.add_argument('input', metavar='INPUT', type=str, help='The path to the original file with BIO encoding')
parser.add_argument('output', metavar='OUTPUT', type=str, help='The name of your BIOLU encoded file')
args = parser.parse_args()
# Module-level names consumed by the read/convert calls at the end of the script.
input_file = args.input
output_file = args.output
def read_file(input_file):
    """Read a column-formatted text file and return its lines.

    The file is decoded as UTF-8. UTF-8 is a superset of ASCII, so files
    that are pure ASCII read exactly as they would with an ASCII decoder,
    while files containing accented or other non-ASCII tokens no longer
    raise. The file is opened in text mode with universal newlines, so
    Windows (CRLF) line endings do not leave a stray carriage return at
    the end of every line.

    Args:
        input_file: Path of the file to read.

    Returns:
        A list of lines without their line terminators. The text is split
        on the newline character, so a file that ends with a newline
        yields a final empty string, and an empty file yields [''].

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Text mode with the default newline handling translates CRLF / CR to LF.
    with open(input_file, encoding='utf-8') as f:
        return f.read().split('\n')
def write_line(new_label: str, prev_label: str, line_content: list, output_file):
    """Write one token line with its tag column replaced.

    Args:
        new_label: Prefix of the new tag (callers pass 'U-', 'B-', 'L-'
            or 'I-').
        prev_label: Text appended after the prefix to form the full tag.
        line_content: The columns of the line. The element at index 3 is
            overwritten in place with the new tag.
        output_file: Object with a ``write`` method; it receives the
            columns joined by single spaces, followed by a newline.
    """
    # The tag lives at index 3; the caller's list is updated in place.
    line_content[3] = new_label + prev_label
    rebuilt = ' '.join(line_content)
    output_file.write('{}\n'.format(rebuilt))
def _iob_tag(line):
    """Return the tag in the fourth column of *line*, or None at a boundary.

    Blank lines, -DOCSTART- markers and lines with fewer than four
    whitespace-separated columns carry no usable tag, so they delimit
    entities instead of taking part in them.
    """
    if '-DOCSTART-' in line:
        return None
    columns = line.split()
    return columns[3] if len(columns) > 3 else None


def _biolu_prefix(prev_tag, tag, next_tag):
    """Choose the BIOLU prefix of an entity token from its neighbours' tags."""
    kind = tag[2:]
    # A chunk starts after a boundary or O token, on an explicit B- tag, or
    # when the entity type changes between adjacent tokens.
    starts = prev_tag in (None, 'O') or tag[:1] == 'B' or prev_tag[2:] != kind
    # It ends symmetrically: before a boundary or O token, before a B- tag,
    # or before a token of a different entity type.
    ends = next_tag in (None, 'O') or next_tag[:1] == 'B' or next_tag[2:] != kind
    if starts:
        return 'U-' if ends else 'B-'
    return 'L-' if ends else 'I-'


def convert(input_file, output_path):
    """Rewrite BIO-tagged lines with BIOLU tags and save them to a file.

    Every line is expected to hold whitespace-separated columns with the
    entity tag in the fourth column. Entity tags are replaced as follows,
    keeping the part of the tag after its two-character prefix:

    * ``U-`` for a single-token entity,
    * ``B-`` for the first token of a longer entity,
    * ``I-`` for a token strictly inside an entity,
    * ``L-`` for the last token of a longer entity.

    Lines tagged ``O``, blank lines, -DOCSTART- lines and lines with fewer
    than four columns are copied to the output unchanged (nothing is
    dropped). The start and end of the input, blank lines, -DOCSTART- lines
    and short lines all act as entity boundaries, so the first line never
    looks at the last one and the last line needs no look-ahead. Two
    adjacent entities (a ``B-`` tag directly after another entity, or a
    change of entity type) are kept as separate entities.

    Args:
        input_file: The lines to convert, without trailing newlines.
            Despite its name this is a sequence of strings, not a path or
            an open file.
        output_path: Path of the file to create or overwrite. It is
            written as UTF-8, which is byte-identical to ASCII for
            ASCII-only content.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    lines = list(input_file)
    # Tags are computed once so that neighbours are simple list lookups.
    tags = [_iob_tag(line) for line in lines]
    last = len(lines) - 1
    # The context manager guarantees the output is flushed and closed.
    with open(output_path, 'w', encoding='utf-8') as output_file:
        for index, (line, tag) in enumerate(zip(lines, tags)):
            if tag is None or tag == 'O':
                # Nothing to re-tag: copy the line exactly as it was read.
                output_file.write(line + '\n')
                continue
            prev_tag = tags[index - 1] if index > 0 else None
            next_tag = tags[index + 1] if index < last else None
            columns = line.split()
            columns[3] = _biolu_prefix(prev_tag, tag, next_tag) + tag[2:]
            output_file.write(' '.join(columns) + '\n')
# Script driver: load the lines of the input file, then write the re-tagged
# copy to the output path given on the command line.
bio = read_file(input_file=input_file)
convert(input_file=bio, output_path=output_file)