-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_transcript_to_tei.py
executable file
·151 lines (131 loc) · 4.44 KB
/
convert_transcript_to_tei.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 29 18:03:31 2021
@author: florian
"""
import re
def read_txt_from_file(file):
"""
"""
with open(file, "r", encoding="utf8") as current_file:
txt = current_file.read()
return txt
def write_txt_to_file(file, txt):
"""
"""
with open(file, "w", encoding="utf8") as current_file:
current_file.write(f"{txt}")
def read_metadata(txt):
"""
"""
metadata = {}
metadata_list = [
("author", "a"),
("title", "t"),
("siglum", "s"),
("transcriber", "tr"),
("transcriber_initials", "tri")]
for item in metadata_list:
metadata[item[0]] = re.search('<'+item[1]+'>(.*?)</'+item[1]+'>', txt).group(1)
txt = re.sub(f'<{item[1]}>(.*?)</{item[1]}>\n','',txt)
return metadata, txt
def create_teiHeader(title, author, transcriber, transcriber_initials, siglum):
teiHeader = ('<teiHeader>' +'\n'+
'<fileDesc>' +'\n'+
'<titleStmt>' +'\n'+
'<title>' + title + '</title>' +'\n'+
'<author>' + author + '</author>' +'\n'+
'<respStmt>' +'\n'+
'<name xml:id="' + transcriber_initials + '">' + transcriber + '</name>' +'\n'+
'<resp>manuscript transcription</resp>' +'\n'+
'</respStmt>' +'\n'+
'</titleStmt>' +'\n'+
'<publicationStmt><p>unpublished</p></publicationStmt>' +'\n'+
'<sourceDesc>' +'\n'+
'<listWit>' +'\n'+
'<witness xml:id="' + siglum + '"></witness>' +'\n'+
'</listWit>' +'\n'+
'</sourceDesc>' +'\n'+
'</fileDesc>' +'\n'+
'</teiHeader>')
return teiHeader
def convert_linebreaks(txt):
"""
"""
txt = re.sub('\n','<lb/> ', txt)
return txt
def convert_pagebreaks(txt, siglum):
"""
"""
txt = re.sub('\|fol\. ([0-9]{1,3}[rv])\|','<pb edRef="#'+siglum+'" n="'+r'\1'+'"/> ', txt)
return txt
def add_paragraphs(txt):
"""
"""
txt = '<p>' + txt + '</p>'
txt = re.sub('//', '</p><p>', txt)
return txt
def convert_punctuation(txt):
"""
"""
punctuation_list = {
"U002E": ".",
"U003A": ":",
"U0700" : "܀",
"U0707" : "܇",
"U0709" : "܆"}
for k,v in punctuation_list.items():
txt = txt.replace(v,'<pc>'+v+'</pc>')
return txt
def convert_rubrics(txt):
"""
"""
txt = re.sub('<r>(.*?)</r>','<rubric>'+r'\1'+'</rubric>', txt)
return txt
def convert_glosses(txt, siglum):
"""
"""
gloss_annotation = re.search('<m(.)>(.*?)</m>', txt)
place = gloss_annotation.group(1)
gloss = gloss_annotation.group(2)
if place == "r":
place = "right"
elif place == "l":
place = "left"
elif place == "t":
place = "top"
elif place == "b":
place = "bottom"
elif place == "a":
place = "above"
txt = re.sub('<m.>.*?</m>','<add edRef="#'+siglum+'" place="'+place+'">'+gloss+'</add>', txt)
return txt
def add_tei_structure(txt, teiHeader):
"""
"""
teiPreamble = ('<?xml version="1.0" encoding="UTF-8"?>' +'\n')
#'<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>' +'\n'+
#'<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?>' +'\n')
teiFrameOpen = '<TEI xmlns="http://www.tei-c.org/ns/1.0">' +'\n'
teiFrameClose = '</TEI>'
txt = teiPreamble + teiFrameOpen + teiHeader + "<text><body>" + txt + "</body></text>" + teiFrameClose
return txt
def process(file):
txt = read_txt_from_file(file)
(metadata, txt) = read_metadata(txt)
header = create_teiHeader(
metadata["title"],
metadata["author"],
metadata["transcriber"],
metadata["transcriber_initials"],
metadata["siglum"]
)
txt = convert_linebreaks(txt)
txt = convert_pagebreaks(txt, metadata["siglum"])
txt = add_paragraphs(txt)
txt = convert_punctuation(txt)
txt = convert_rubrics(txt)
txt = convert_glosses(txt, metadata["siglum"])
txt = add_tei_structure(txt, header)
write_txt_to_file('./output.xml',txt)