-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencyc_loader.py
executable file
·287 lines (246 loc) · 11.5 KB
/
encyc_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
import sys
import os
import errno
import philologic
from optparse import OptionParser
from glob import glob
from philologic.LoadFilters import *
from philologic.PostFilters import *
from philologic.Parser import Parser
from philologic.ParserHelpers import *
from philologic.Loader import Loader
def normalize_divs(*columns):
    """Return a load filter that propagates div-level metadata downward.

    For each named column, the filter tracks the value most recently seen on
    a div1 or div2 record and copies it onto any div3 record that lacks that
    column, so innermost divisions inherit their ancestors' metadata.  The
    filter rewrites text["sortedtoms"] in place.
    """
    def normalize_these_columns(loader_obj,text,depth=5):
        # NOTE(review): *depth* is never used in this body — confirm it is
        # required by the filter-call signature before removing it.
        current_values = {}
        tmp_file = open(text["sortedtoms"] + ".tmp","w")
        for column in columns:
            current_values[column] = ""
        for line in open(text["sortedtoms"]):
            type, word, id, attrib = line.split('\t')
            id = id.split()
            record = Record(type, word, id)
            # attrib is a repr'd dict written by the parser; eval() restores it.
            # NOTE(review): eval on file contents is only safe while this file
            # is produced exclusively by the parser — never feed it untrusted data.
            record.attrib = eval(attrib)
            if type == "div1":
                # div1 resets every tracked column: a column missing on the
                # div1 clears the state instead of leaking from a previous div1.
                for column in columns:
                    if column in record.attrib:
                        current_values[column] = record.attrib[column]
                    else:
                        current_values[column] = ""
            elif type == "div2":
                # div2 only overwrites columns it defines; absent columns keep
                # the enclosing div1's value (note the asymmetry with div1).
                for column in columns:
                    if column in record.attrib:
                        current_values[column] = record.attrib[column]
            elif type == "div3":
                # div3 is fill-only: missing columns inherit the accumulated
                # values, columns it already has are never overwritten.
                for column in columns:
                    if column not in record.attrib:
                        record.attrib[column] = current_values[column]
            print >> tmp_file, record
        tmp_file.close()
        # Swap the normalized copy into place over the original file.
        os.remove(text["sortedtoms"])
        os.rename(text["sortedtoms"] + ".tmp",text["sortedtoms"])
    return normalize_these_columns
def normalize_columns_post(*columns):
    """Return a post-filter that retypes the given metadata columns as "div3".

    Run after normalize_divs has pushed the columns down onto div3 records,
    so the database's metadata_types map reflects where the values now live.
    """
    def normalize_these_columns_post(loader):
        # Iterate the requested columns directly instead of scanning every
        # entry of loader.metadata_types with an unused value variable;
        # only columns the loader actually knows about are retyped.
        for column in columns:
            if column in loader.metadata_types:
                loader.metadata_types[column] = "div3"
    return normalize_these_columns_post
def normalize_unicode_raw_words(loader_obj, text):
    """Lowercase every "word" record in text["raw"], rewriting the file in place.

    Word tokens are decoded from UTF-8 so .lower() handles non-ASCII
    characters (Python 2 bytestrings), then re-encoded.  Non-word records
    pass through unchanged.
    """
    tmp_file = open(text["raw"] + ".tmp","w")
    input_file = open(text["raw"])
    for line in input_file:
        rec_type, word, id, attrib = line.split('\t')
        id = id.split()
        if rec_type == "word":
            # Decode so lowercasing works on the full Unicode repertoire.
            word = word.decode("utf-8").lower().encode("utf-8")
        record = Record(rec_type, word, id)
        # attrib is a repr'd dict written by the parser; eval() restores it.
        record.attrib = eval(attrib)
        tmp_file.write(str(record) + "\n")
    input_file.close()
    # Bug fix: the temp file was previously never closed, so buffered output
    # could be lost when the original file was replaced underneath it.
    tmp_file.close()
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp",text["raw"])
def make_sorted_toms(*types):
    """Return a load filter that extracts and sorts the requested record types.

    The filter greps text["raw"] for records whose type field starts with one
    of *types* and writes them, sorted by object id, to text["sortedtoms"].
    """
    def sorted_toms(loader_obj, text):
        # Anchor each requested type at the start of the line for egrep.
        anchored = ["^" + object_type for object_type in types]
        type_pattern = "|".join(anchored)
        # Shell pipeline: filter the raw records, then sort them by id.
        tomscommand = 'cat %s | egrep "%s" | sort %s > %s' % (text["raw"], type_pattern, loader_obj.sort_by_id, text["sortedtoms"])
        os.system(tomscommand)
    return sorted_toms
#########################
## Command-line parsing #
#########################
usage = "usage: %prog [options] database_name files"
parser = OptionParser(usage=usage)
parser.add_option("-q", "--quiet", action="store_true", dest="quiet", help="suppress all output")
parser.add_option("-l", "--log", default=False, dest="log", help="enable logging and specify file path")
# Bug fix: optparse does NOT run the type="int" conversion on defaults, so the
# old default="2" leaked the *string* "2" into options.workers whenever -c was
# not supplied.  The default must already be an int.
parser.add_option("-c", "--cores", type="int", default=2, dest="workers", help="define the number of cores for parsing")
parser.add_option("-t", "--templates", default=False, dest="template_dir", help="define the path for the templates you want to use")
parser.add_option("-d", "--debug", action="store_true", default=False, dest="debug", help="add debugging to your load")
parser.add_option("--no-template", action="store_true", default=False, dest="no_template", help="build a database without templates for HTML rendering")
##########################
## System Configuration **
##########################
# Set the filesystem path to the root web directory for your PhiloLogic install.
database_root = '/var/www/html/philo4/'
# /var/www/html/philologic/ is conventional for linux,
# /Library/WebServer/Documents/philologic for Mac OS.
# Please follow the instructions in INSTALLING before use.
# Set the URL path to the same root directory for your philologic install.
url_root = 'http://pantagruel.ci.uchicago.edu/philo4/'
# http://localhost/philologic/ is appropriate if you don't have a DNS hostname.
# Guard against an unconfigured script (both values are hard-coded above, so
# this only fires if someone blanks them out to None).
if database_root is None or url_root is None:
    print >> sys.stderr, "Please configure the loader script before use. See INSTALLING in your PhiloLogic distribution."
    # NOTE(review): bare exit() relies on the site module; sys.exit() is the
    # conventional call.
    exit()
# Default template source; can be overridden by -t on the command line below.
template_dir = database_root + "_system_dir/_install_dir/"
# The load process will fail if you haven't set up the template_dir at the correct location.
###########################
## Configuration options ##
###########################
## Parse command-line arguments
(options, args) = parser.parse_args(sys.argv[1:])
try:
    # First positional argument is the database name; the rest are the files
    # to load.
    dbname = args[0]
    args.pop(0)
    files = args[:]
#    if args[-1].endswith('/') or os.path.isdir(args[-1]):
#        files = glob(args[-1] + '/*')
#    else:
#        files = args[:]
except IndexError:
    print >> sys.stderr, "\nError: you did not supply a database name or a path for your file(s) to be loaded\n"
    parser.print_help()
    sys.exit()
## Number of cores used for parsing: you can define your own value on the
## command-line, stay with the default, or define your own value here
# NOTE(review): optparse does not coerce string defaults through type="int",
# so with the option declared as default="2" above, options.workers is the
# *string* "2" when -c is omitted — verify downstream consumers accept that.
workers = options.workers or 2
## This defines which set of templates to use with your database: you can stay with
## the default or specify another path from the command-line. Alternatively, you
## can edit it here.
if not options.no_template:
    template_dir = options.template_dir or template_dir
else:
    template_dir = False
## Define the type of output you want. By default, you get console output for your database
## load. You can however set a quiet option on the command-line, or set console_output
## to False here.
console_output = True
if options.quiet:
    console_output = False
## Define a path for a log of your database load. This option can be defined on the command-line
## or here. It's disabled by default.
# NOTE(review): console_output and log are computed here but never passed to
# Loader below — confirm whether they should be forwarded.
log = options.log or False
## Set debugging if you want to keep all the parsing data, as well as debug the templates
debug = options.debug or False
# Define text objects for ranked relevancy: by default it's ['doc']. Disable by supplying empty list
default_object_level = 'div3'
# Data tables to store.
tables = ['toms', 'pages', 'ranked_relevance']
# Define filters as a list of functions to call, either those in Loader or outside.
# They run in order during the load; the local helpers defined above
# (normalize_unicode_raw_words, make_sorted_toms, normalize_divs) are mixed in
# with the stock filters star-imported from philologic.LoadFilters.
filters = [normalize_unicode_raw_words,make_word_counts, generate_words_sorted,make_token_counts,make_sorted_toms("doc","div1","div2","div3"),
           prev_next_obj, normalize_divs("head","articleAuthor","normalizedClass","volume","pos"),
           word_frequencies_per_obj(),generate_pages, make_max_id]
# Post filters run after the parse/merge phase, via l.finish() below.
post_filters = [normalize_columns_post("articleAuthor","normalizedClass","head","volume","pos"),word_frequencies,
                normalized_word_frequencies,metadata_frequencies,normalized_metadata_frequencies,metadata_relevance_table]
# Define text objects to generate plain text files for various machine learning tasks
plain_text_obj = []
if plain_text_obj:
    # Bug fix: this previously read *plaint_text_obj* (typo), which raised a
    # NameError as soon as plain_text_obj was made non-empty.
    filters.extend([store_in_plain_text(*plain_text_obj)])
# Extra values written into the generated db_locals.py by Loader.finish().
extra_locals = {"db_url": url_root + dbname}
## Define which search reports to enable
## Note that this can still be configured in your database db_locals.py file
search_reports = ['concordance', 'kwic', 'relevance', 'collocation', 'time_series']
# Exposed to the generated db_locals.py via Loader.finish(**extra_locals).
extra_locals['search_reports'] = search_reports
###########################
## Set-up database load ###
###########################
# NOTE(review): Philo_Types is assigned here but never referenced again in this
# script (the Loader below is built from XPaths) — confirm before removing.
Philo_Types = ["doc","div"] # every object type you'll be indexing. pages don't count, yet.
# XPaths map each object type to the elements that open it in the TEI source.
XPaths = [("doc","."),("div",".//div1"),("div",".//div2"),("page",".//pb")]
Metadata_XPaths = [ # metadata per type. '.' is in this case the base element for the type, as specified in XPaths above.
    # MUST MUST MUST BE SPECIFIED IN OUTER TO INNER ORDER--DOC FIRST, WORD LAST
    ("doc","./teiheader//titlestmt/title","title"),
    ("doc","./teiheader//titlestmt/author","author"),
    ("doc","./teiheader//profiledesc/creation/date","date"),
    # The index[@type=...]@value entries pull encyclopedia article metadata
    # (headword, article author, classifications...) from <index> elements.
    ("div","./index[@type='headword']@value","head"),
    ("div","./index[@type='author']@value","articleAuthor"),
    ("div","./index[@type='objecttype']@value","articleType"),
    ("div","./index[@type='class']@value","class"),
    ("div","./index[@type='normclass']@value","normalizedClass"),
    ("div","./index[@type='englishclass']@value","englishClass"),
    ("div","./index[@type='generatedclass']@value","generatedClass"),
    ("div","./index[@type='pos']@value","pos"),
    ("div",".@n","n"),
    ("div",".@id","id"),
    ("div",".@vol","volume"),
    ("page",".@n","n"),
    ("page",".@fac","img")
    ]
# Earlier extractor-style metadata specification, kept for reference:
# "doc" : [(ContentExtractor,"./teiHeader/fileDesc/titleStmt/author","author"),
#          (ContentExtractor,"./teiHeader/fileDesc/titleStmt/title", "title"),
#          (ContentExtractor,"./teiHeader/sourceDesc/biblFull/publicationStmt/date", "date"),
#          (AttributeExtractor,"./text/body/volume@n","volume"),
#          (AttributeExtractor,".@xml:id","id")],
# "div" : [(ContentExtractor,"./head","head"),
#          (ContentExtractor,"./head//*","head"),
#          (AttributeExtractor,".@n","n"),
#          (AttributeExtractor,".@xml:id","id")],
# "para": [(ContentExtractor,"./speaker", "who"),
#          (ContentExtractor,"./head","head")],
# "word": [(AttributeExtractor,".@lemma","lemma"),
#          (ContentExtractor,".","token"),
#          (AttributeExtractor,".@ana","ana")],
# "page": [(AttributeExtractor,".@n","n"),
#          (AttributeExtractor,".@src","img")],
#}
# Tags to treat as empty even when written with open/close pairs.
pseudo_empty_tags = ["milestone"]
# Content skipped entirely during indexing: the TEI header and div heads.
suppress_tags = ["teiheader",".//head"]
# Tokenizer patterns: a word is a run of word characters; punctuation is . ? !
word_regex = r"([\w]+)"
punct_regex = r"([\.?!])"
token_regex = word_regex + "|" + punct_regex
# Exposed to db_locals.py so the runtime tokenizes queries the same way.
extra_locals["word_regex"] = word_regex
extra_locals["punct_regex"] = punct_regex
#############################
# Actual work. Don't edit. #
#############################
os.environ["LC_ALL"] = "C" # Exceedingly important to get uniform sort order.
os.environ["PYTHONIOENCODING"] = "utf-8"
# Filesystem destination for the new database and its data directory.
db_destination = database_root + dbname
data_destination = db_destination + "/data"
# NOTE(review): url_root already ends with '/' (see its default above), so this
# produces a double slash; extra_locals uses url_root + dbname — confirm which
# form is intended.
db_url = url_root + "/" + dbname
try:
    os.mkdir(db_destination)
except OSError:
    # The directory already exists: interactively offer to wipe and recreate it.
    print "The %s database already exists" % dbname
    print "Do you want to delete this database? Yes/No"
    choice = raw_input().lower()
    if choice.startswith('y'):
        # NOTE(review): db_destination is interpolated into a shell command; a
        # dbname containing shell metacharacters would be dangerous here.
        os.system('rm -rf %s' % db_destination)
        os.mkdir(db_destination)
    else:
        sys.exit()
if template_dir:
    # Copy the web templates (and the hidden .htaccess, which the glob above
    # does not match) into the new database directory.
    os.system("cp -r %s* %s" % (template_dir,db_destination))
    os.system("cp %s.htaccess %s" % (template_dir,db_destination))
####################
## Load the files ##
####################
l = Loader(data_destination,
           token_regex,
           XPaths,
           Metadata_XPaths,
           filters,
           pseudo_empty_tags,
           suppress_tags,
           default_object_level=default_object_level,
           debug=debug)
# Loader signature, for reference:
#destination,token_regex=default_token_regex,xpaths=default_xpaths,
#           metadata_xpaths=default_metadata,filters=default_filters,
#           pseudo_empty_tags=[],suppress_tags=[],console_output=True,
#           log=False, debug=False)
l.add_files(files)
filenames = l.list_files()
print filenames
# One metadata dict per file; the parser fills in the rest during the load.
load_metadata = [{"filename":f} for f in sorted(filenames)]
l.parse_files(workers,load_metadata)
l.merge_objects()
l.analyze()
l.make_tables(tables)
# finish() runs the post filters and writes db_locals.py with extra_locals.
l.finish(post_filters,**extra_locals)
print "\nDone indexing."
print "Your database is viewable at " + db_url + "\n"