This repository has been archived by the owner on Nov 28, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
frantext_load_script.py
executable file
·166 lines (129 loc) · 5.35 KB
/
frantext_load_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import sys
import os
import errno
import philologic
import re
import socket
from philologic.LoadFilters import *
from philologic.PostFilters import *
from philologic.Parser import Parser
from philologic.Loader import Loader, handle_command_line, setup_db_dir
try:
    import artfl_xpaths
except ImportError:
    # The xpaths defined in artfl_xpaths are read further down in this script,
    # so there is no point in continuing without the module. sys.exit() with a
    # string writes the message to stderr and exits with a non-zero status.
    sys.exit("You need to copy the artfl_xpaths.py script from PhiloLogic4/scripts/ in the same directory as this script")
## Flush buffer output: reopen stdout with a buffer size of 0
stdout_fd = sys.stdout.fileno()
sys.stdout = os.fdopen(stdout_fd, 'w', 0)

## Parse command line
(dbname, files, workers,
 console_output, log, debug) = handle_command_line(sys.argv)
def clean_dates_and_sort(load_metadata, sort_order):
    """Normalize document dates, sort the records, and move undated ones last.

    Each record's "date" value is reduced to the first run of four digits it
    contains (e.g. "circa 1900" becomes "1900"); a value without such a run
    is left as is, and a record with no "date" key gets an empty string.
    The list is then sorted on the fields named in `sort_order`, and every
    record whose date is empty or cannot be converted to an integer is
    moved, keeping its sorted position relative to the others, behind the
    records that have a usable date.

    Args:
        load_metadata: list of metadata dicts, one per file. The list is
            sorted in place and the "date" value of each dict is rewritten.
            Records without a usable date must have a "filename" key, which
            is used in the message printed for them.
        sort_order: sequence of metadata field names making up the sort key;
            a field missing from a record sorts as "".

    Returns:
        A new list containing the same dicts: records with a usable date
        first, then the records with a missing or invalid date.
    """
    year_pattern = re.compile(r'.*?(\d{4}).*')
    for record in load_metadata:
        try:
            record["date"] = year_pattern.sub(r'\1', record["date"])
        except KeyError:
            record["date"] = ""

    def make_sort_key(record):
        return [record.get(field, "") for field in sort_order]

    load_metadata.sort(key=make_sort_key, reverse=False)

    # Split into two lists rather than pop()-ing by index from a copy: once a
    # record has been moved, the indexes of the copy no longer line up with
    # those of the list being iterated, so records would be lost or duplicated.
    dated = []
    undated = []
    for record in load_metadata:
        date = record["date"]
        if not date:
            undated.append(record)
            print("%s has no date and was therefore moved to the end of the load order" % record['filename'])
            continue
        try:
            int(date)
        except ValueError:
            undated.append(record)
            print("%s has no valid date and was therefore moved to the end of the load order" % record['filename'])
        else:
            dated.append(record)
    return dated + undated
##########################
## System Configuration ##
##########################

# Set the filesystem path to the root web directory for your PhiloLogic install.
database_root = "/var/www/html/philologic4/"
# /var/www/html/philologic/ is conventional for linux,
# /Library/WebServer/Documents/philologic for Mac OS.
# Please follow the instructions in INSTALLING before use.

# Set the URL path to the same root directory for your philologic install.
url_root = "http://%s/philologic4/" % socket.getfqdn()
# http://localhost/philologic/ is appropriate if you don't have a DNS hostname.

if database_root is None or url_root is None:
    # sys.exit() with a string writes the message to stderr and exits with a
    # non-zero status, so a misconfigured run is not reported as a success.
    sys.exit("Please configure the loader script before use. See INSTALLING in your PhiloLogic distribution.")

template_dir = "~/PhiloLogic4/www/"
# The load process will fail if you haven't set up the template_dir at the correct location.
# NOTE(review): the leading "~" is handed to setup_db_dir unexpanded -- confirm
# that setup_db_dir expands it, or replace it with an absolute path.

# Define default object level
default_object_level = 'doc'

# Define navigable objects
navigable_objects = ('doc', 'div1', 'div2', 'div3')

# Data tables to store.
tables = ['toms', 'pages']

# Define filters as a list of functions to call, either those in Loader or outside
filters = [normalize_unicode_raw_words, make_word_counts, generate_words_sorted,
           make_object_ancestors(*navigable_objects), make_sorted_toms(*navigable_objects),
           prev_next_obj(*navigable_objects), generate_pages, make_max_id]

post_filters = [word_frequencies, normalized_word_frequencies,
                metadata_frequencies, normalized_metadata_frequencies]

## Define text objects to generate plain text files for various machine learning tasks
## For instance, this could be ['doc', 'div1']
plain_text_obj = []
if plain_text_obj:
    filters.append(store_in_plain_text(*plain_text_obj))

# Extra values handed to Loader.finish() at the end of the load
extra_locals = {"db_url": url_root + dbname}
extra_locals['default_object_level'] = default_object_level
###########################
## Set-up database load ###
###########################

## XPath definitions are taken from the artfl_xpaths module
xpaths, metadata_xpaths = artfl_xpaths.xpaths, artfl_xpaths.metadata_xpaths

## Passed to the Loader as pseudo_empty_tags
pseudo_empty_tags = ["milestone"]

## A list of tags to ignore
suppress_tags = ["teiHeader", ".//head"]

## A token is either a word or a punctuation mark
word_regex = r"([\w]+)"
punct_regex = r"([\.?!])"
token_regex = "|".join((word_regex, punct_regex))

## Saved in db.locals.py for tokenizing at runtime
extra_locals.update(word_regex=word_regex, punct_regex=punct_regex)

## Define the order in which files are sorted
## This will affect the order in which results are displayed
## Supply a list of metadata strings, e.g.:
## ["date", "author", "title"]
sort_order = [
    "date",
    "author",
    "title",
    "filename",
]
################################
## Don't edit unless you know ##
## what you're doing ##
################################

# LC_ALL=C is exceedingly important to get uniform sort order.
for env_name, env_value in (("LC_ALL", "C"), ("PYTHONIOENCODING", "utf-8")):
    os.environ[env_name] = env_value

# Filesystem locations and public URL of the database being built
db_destination = database_root + dbname
data_destination = db_destination + "/data"
db_url = url_root + dbname

setup_db_dir(db_destination, template_dir)
####################
## Load the files ##
####################

## Everything configured above is handed to the Loader as keyword arguments
loader_options = dict(
    load_filters=filters,
    post_filters=post_filters,
    tables=tables,
    xpaths=xpaths,
    metadata_xpaths=metadata_xpaths,
    pseudo_empty_tags=pseudo_empty_tags,
    suppress_tags=suppress_tags,
    token_regex=token_regex,
    default_object_level=default_object_level,
    debug=debug,
)
loader = Loader(data_destination, **loader_options)

loader.add_files(files)
filenames = loader.list_files()

## Sort by metadata, then clean-up dates and move texts with no date at the end
load_metadata = clean_dates_and_sort(loader.sort_by_metadata(*sort_order), sort_order)

## Run the load steps in order
loader.parse_files(workers, load_metadata)
loader.merge_objects()
loader.analyze()
loader.setup_sql_load()
loader.post_processing()
loader.finish(**extra_locals)