-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrs.py
120 lines (94 loc) · 4.79 KB
/
trs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding: utf-8
import gzip
import zipfile
import time
import datetime
from src.marcTransform import MARC21
from src.eadTransform import EAD
from enrich.transform_ocr import write_enriched_graphml, process_tsv
from enrich.merge import merge_all_files
import fire
def _elapsed_report(entity, elapsed_seconds):
    """Build the runtime report line printed after each conversion.

    Parameters
    ----------
    entity : str
        Label of the data set that was just converted.
    elapsed_seconds : int or float
        Wall-clock duration in seconds; fractions are truncated.

    Returns
    -------
    str
        '<entity> files converted in <h> h <mm> min <ss> s' followed by a
        newline. From 24 hours on, the hours field carries the
        'N day(s), ' prefix produced by ``str(datetime.timedelta)``.
    """
    # Render the timedelta once and split it, instead of re-rendering it for
    # every single field of the message.
    hours, minutes, seconds = str(
        datetime.timedelta(seconds=int(elapsed_seconds))
    ).split(':')
    return '{} files converted in {} h {} min {} s\n'.format(
        entity, hours, minutes, seconds
    )


if __name__ == '__main__':
    # NOTE(review): this guarded block sits above the fire.Fire() call at the
    # end of the file, so when the file is run as a script every conversion
    # below finishes before that call is reached -- confirm this is intended.

    # Parameters shared by all MARC21 conversions
    of = 'graphml'
    dataSel = True

    # Data from the GND
    dt = "Authority"
    for filename, entity in [
        ("Personen", "PerName"),
        ("Sachbegriffe", "TopicTerm"),
        ("Werke", "UniTitle"),
        ("Geografikum", "GeoName"),
        ("Koerperschaften", "CorpName"),
        ("Kongresse", "MeetName"),
    ]:
        # NOTE(review): the GND dumps are read from an absolute 'D:/Datendump/'
        # path while every other dump below uses the relative 'Datendump/'
        # directory -- confirm which location is intended.
        fn = 'D:/Datendump/GND_{}_MARC21XML_20190613.mrc.xml.gz'.format(filename)
        print('Process {}'.format(entity))
        # Measure the runtime of this conversion
        startTime = time.time()
        with gzip.open(fn, 'rt', encoding='utf-8') as f:
            gr = MARC21(f, dataType=dt, entityType=entity, outputFormat=of, dataSelection=dataSel)
        print(_elapsed_report(entity, time.time() - startTime))

    # Data from the ZDB and DNB
    dt = "Bibliographic"
    for filename, entity in [
        ("ZDB_MARC21_20190305", "Zdb"),
        ("DNB_MARC21_20190613-1", "Dnb1"),
        ("DNB_MARC21_20190613-2", "Dnb2"),
        ("DNB_MARC21_20190613-3", "Dnb3"),
        ("DNB_MARC21_20190613-4", "Dnb4"),
    ]:
        fn = 'Datendump/{}.mrc.xml.gz'.format(filename)
        print('Process {}'.format(entity))
        # Measure the runtime of this conversion
        startTime = time.time()
        with gzip.open(fn, 'rt', encoding='utf-8') as f:
            gr = MARC21(f, dataType=dt, entityType=entity, outputFormat=of, dataSelection=dataSel)
        print(_elapsed_report(entity, time.time() - startTime))

    # Data from the SBB (still converted with dt == "Bibliographic").
    # The archive is opened in a `with` block so its file handle is released
    # once all members have been processed.
    with zipfile.ZipFile('Datendump/sbb_titel_2019-06-20_marcxml.zip', mode='r') as zfile:
        count = 1
        for fn in zfile.namelist():
            # Only members whose name contains '.xml' are converted, and at
            # most 50 of them.
            # NOTE(review): assumes the counter advances only for converted
            # members, i.e. the first 50 XML members are taken -- confirm.
            if '.xml' in fn and count <= 50:
                entity = "Sbb{}".format(count)
                print('Process {}'.format(fn))
                # Measure the runtime of this conversion
                startTime = time.time()
                with zfile.open(fn) as f:
                    gr = MARC21(f, dataType=dt, entityType=entity, outputFormat=of, dataSelection=dataSel)
                print(_elapsed_report(entity, time.time() - startTime))
                count += 1

    # Data from Kalliope
    print('Process Kpe')
    gr = EAD(filename='Datendump/KPE_EADXML_20190701.zip')
def integrate_ocr(tsv_files, merged_file, ocr_data_path, all_data_path):
    """
    Process the OCR files and merge all files into a single .graphml file.

    The work is done in three steps: the TSV files are processed, the
    enriched OCR .graphml output is written, and finally everything is
    merged into ``merged_file``.

    Parameters
    ----------
    tsv_files : str
        Path to the directory that contains the OCR files in TSV format.
    merged_file : str
        Name and path of the output file; the name needs to end
        in '.graphml'.
    ocr_data_path : str
        Name of the directory that contains the OCR .graphml files.
    all_data_path : str
        Name of the directory that contains all other .graphml files
        (i.e. everything except the OCR .graphml files); should be
        in data/graphml/.

    Returns
    -------
    None.
    """
    # Both of the first two steps are handed the same JSON path.
    entities_json = 'data/entities-dict.json'

    # Step 1: process the TSV input.
    process_tsv(tsv_files, entities_json)

    # Step 2: write the enriched OCR .graphml output.
    # NOTE(review): "data/ocr/" is fixed here, whereas the merge step below
    # uses the caller-supplied ocr_data_path -- confirm both are meant to
    # refer to the same directory.
    write_enriched_graphml(entities_json, "data/ocr/", "graphml")

    # Step 3: merge the OCR files and all remaining files into one output.
    merge_all_files(merged_file, ocr_data_path, all_data_path)
# Example of a direct call, kept commented out for reference:
#integrate_ocr('D:/SoNAR/Enrich/batch3/', 'data/merged/v_27072021.graphml', 'D:/SoNAR/Transformers/data/ocr/', 'D:/SoNAR/Transformers/data/graphml/' )
# Hand over to Python Fire. Called without a component, Fire builds the
# command-line interface from the contents of this module (see the Python
# Fire guide), which should make integrate_ocr callable from the shell.
fire.Fire()