forked from projectEndings/staticSearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild.xml
412 lines (329 loc) · 18.9 KB
/
build.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
<?xml version="1.0" encoding="UTF-8"?>
<project basedir="." name="BuildStaticSearch" default="all"
xmlns:if="ant:if"
xmlns:unless="ant:unless">
<!--
**********************************************************
* Ant file for building a static search engine for a     *
* specific site. When supplied with a config file path   *
* using -DconfigFile=/home/user/proj/cfg.xml, it will    *
* build the index based on that configuration.           *
* If not supplied with a configuration file, it will     *
* build the test site by default.                        *
*                                                        *
* Note that you can also specify a path relative to      *
* this build file using -Dconfig=../rel/cfg.xml          *
*                                                        *
**********************************************************
-->
<!--****************************************************************
* *
* Task Definitions *
* *
****************************************************************-->
<!-- Load the ant-contrib task library; the <if>/<then> constructs used in the
init and makeSearchPage targets come from it. No classpath is given here, so the
ant-contrib jar has to be discoverable by Ant itself. -->
<taskdef resource="net/sf/antcontrib/antcontrib.properties"/>
<!--****************************************************************
* *
* Properties *
* *
****************************************************************-->
<!-- NOTE: Ant properties are immutable once set, so every default below can be
overridden from the command line with -Dname=value, and the order of these
declarations matters (later ones are derived from earlier ones). -->
<!-- Location of the Saxon jar used as the classpath for every XSLT step.
Saxon moved from 9 to 10 2020-03-20; the old setting is kept for reference. -->
<!--<property name="saxon" value="${basedir}/lib/saxon9he.jar"/>-->
<property name="saxon" value="${basedir}/lib/saxon-he-10.jar"/>
<!--The configuration file path relative to this build file's directory.
Defaults to the test configuration.-->
<property name="config" value="configTest.xml"/>
<!--The full path to the configuration file. You can supply either -Dconfig
(a path relative to basedir) or -DconfigFile (a full path); when only -Dconfig
is given, the full path is derived from it here.-->
<property name="configFile" value="${basedir}/${config}"/>
<!--The directory that contains the configuration file, derived from the
configuration file path-->
<dirname property="configDir" file="${configFile}"/>
<!--Load the configuration file as a set of properties. Because keeproot is true,
the property names include the root element, e.g. config.params.searchFile.-->
<xmlproperty file="${configFile}" keeproot="true"/>
<!--Set the verbose flag to false by default; set to true to see detailed messages.
The value is handed to the XSLT in the config target.-->
<property name="verbose" value="false"/>
<!--The path to the search page, which the configuration gives relative to the
configuration directory; the location attribute resolves it to an absolute path-->
<property name="searchFilePath" location="${configDir}/${config.params.searchFile}"/>
<!--The collection directory, i.e. the directory that contains the search file-->
<dirname property="collectionDirName" file="${searchFilePath}"/>
<!--Output folder name: taken from the configuration file when it specifies one,
otherwise the default "staticSearch". Exactly one of the two declarations applies.-->
<property name="outputFolder" if:set="config.params.outputFolder" value="${config.params.outputFolder}"/>
<property name="outputFolder" unless:set="config.params.outputFolder" value="staticSearch"/>
<!--Stemmer folder name: taken from the configuration file when it specifies one,
otherwise the default "en". It names a subdirectory of stemmers/ (see copyFiles).-->
<property name="stemmerFolder" if:set="config.params.stemmerFolder" value="${config.params.stemmerFolder}"/>
<property name="stemmerFolder" unless:set="config.params.stemmerFolder" value="en"/>
<!--The directory for all of the static search assets (JSON files, Javascripts, et cetera).
Note that the static search directory is placed directly within the collection directory,
and that the value already ends with a slash.-->
<property name="staticSearchDir" value="${collectionDirName}/${outputFolder}/"/>
<!--Flag for whether this build is being run in a headless state (no desktop available).
Defaults to false; when true, the report target does not try to open a browser.-->
<property name="headless" value="false"/>
<!--The file name of the search file (directory stripped, suffix still attached)-->
<basename property="searchPageBasename" file="${searchFilePath}"/>
<!--The temporary search page, which is constructed by prepending ssTemp_ to the search
file name and placing that file in the collection directory-->
<property name="tempSearchPageOutput" value="${collectionDirName}/ssTemp_${searchPageBasename}"/>
<!--****************************************************************
* *
* Targets *
* *
****************************************************************-->
<target name="init">
  <description>
    target: init
    This target initializes the static search indexing process. It first deletes
    any previous results from an earlier build and then recreates the staticSearchDir,
    which is simply the parent directory of the search file specified in the config file.
    In the special case of the test, this target also generates the VERSION file.
  </description>
  <echo>Initializing...</echo>
  <echo>Deleting old products...</echo>
  <!-- Clear out leftovers from any earlier run: the temp directory, the generated
       config stylesheet, a stale temporary search page and the previous output. -->
  <delete dir="temp"/>
  <delete file="${basedir}/xsl/config.xsl"/>
  <delete file="${tempSearchPageOutput}"/>
  <delete dir="${staticSearchDir}"/>
  <!-- Start again from an empty output directory. -->
  <echo>Creating new static search directory: ${staticSearchDir}</echo>
  <mkdir dir="${staticSearchDir}"/>
  <!-- Only the bundled test configuration gets a VERSION file; it records the
       abbreviated hash of the current git HEAD. -->
  <if>
    <equals arg2="${configFile}" arg1="${basedir}/configTest.xml"/>
    <then>
      <delete file="${basedir}/test/VERSION"/>
      <exec executable="git">
        <arg line="rev-parse --short HEAD"/>
        <redirector output="${basedir}/test/VERSION"/>
      </exec>
    </then>
  </if>
</target>
<target name="validate">
  <description>Task to validate that the source XHTML is well-formed XHTML. Note
    that this DOES NOT check whether or not it is valid XHTML: only that it is well-formed.
    This target also checks the config file against the schema. We don't fail if it's invalid,
    though, since we must assume that some people just don't care: but raise the error anyway.
  </description>
  <echo message="Validating ${collectionDirName} for well-formedness"/>
  <!--@lenient='true' since we are only checking well-formedness-->
  <xmlvalidate lenient="true">
    <fileset dir="${collectionDirName}">
      <include name="**/**.*htm*"/>
    </fileset>
  </xmlvalidate>
  <echo message="Validating ${configFile} against the staticSearch schema..."/>
  <!-- Run Jing on the configuration file. failonerror is false on purpose: schema
       errors are reported on the console but do not stop the build.
       Every argument is passed with its own value attribute rather than a
       whitespace-split line attribute, so paths containing spaces stay intact. -->
  <exec executable="java" failonerror="false">
    <arg value="-jar"/>
    <arg value="${basedir}/lib/jing.jar"/>
    <arg value="${basedir}/schema/staticSearch.rng"/>
    <arg value="${configFile}"/>
  </exec>
</target>
<target name="config">
  <description>
    target: config
    This target creates an XSLT module from the XML configuration file, and converts
    the various configuration options into XSLT variables, et cetera, necessary
    throughout in the pipeline.
    The XSLT is run on itself, but loads the configuration file as a parameter.
  </description>
  <echo>Creating configuration file...</echo>
  <!-- Saxon transforms the stylesheet against itself; the real input arrives
       through the configFile parameter, and verbose is forwarded as well.
       A failed transformation aborts the build. -->
  <java failonerror="true"
        classname="net.sf.saxon.Transform"
        classpath="${saxon}">
    <arg value="-xsl:${basedir}/xsl/create_config_xsl.xsl"/>
    <arg value="-s:${basedir}/xsl/create_config_xsl.xsl"/>
    <arg value="configFile=${configFile}"/>
    <arg value="--suppressXsltNamespaceCheck:on"/>
    <arg value="verbose=${verbose}"/>
  </java>
</target>
<target name="tokenize">
  <description>
    target: tokenize
    This target runs the multi-stage tokenization process on the XHTML collection and creates a stash
    of tokenized and cleaned up files from which we generate the individual token JSONs.
    The XSLT is run on itself and uses the information in the config XSLT module (generated in
    the config task) to determine the input files, et cetera.
  </description>
  <echo>Creating tokenized XHTML files...</echo>
  <!-- The stylesheet is both the transformation and its own source document;
       a failed transformation aborts the build. -->
  <java failonerror="true"
        classname="net.sf.saxon.Transform"
        classpath="${saxon}">
    <arg value="-xsl:${basedir}/xsl/tokenize.xsl"/>
    <arg value="-s:${basedir}/xsl/tokenize.xsl"/>
    <arg value="--suppressXsltNamespaceCheck:on"/>
    <!-- Disabled heap-size setting, kept for reference: -->
    <!--<jvmarg value="-Xmx2048m"/>-->
  </java>
</target>
<target name="json">
  <description>
    target: json
    This target creates the individual token JSON files from the tokenized XHTML files created
    by the above tokenize task.
    It is run on itself and uses the information in the configuration XSLT module (generated in the config task)
    to determine input files, et cetera. Note that this task will likely create *a lot* of files and may take some time,
    depending on the size of the resource collection.
  </description>
  <echo>Creating individual JSON files...</echo>
  <!-- Same self-applied Saxon invocation pattern as the other XSLT steps;
       a failed transformation aborts the build. -->
  <java failonerror="true"
        classname="net.sf.saxon.Transform"
        classpath="${saxon}">
    <arg value="-xsl:${basedir}/xsl/json.xsl"/>
    <arg value="-s:${basedir}/xsl/json.xsl"/>
    <arg value="--suppressXsltNamespaceCheck:on"/>
  </java>
</target>
<target name="createStemsFile">
  <description>
    target: createStemsFile
    This target creates the ssStems.json file by first retrieving the length of all the JSONs available,
    and then create a JSON from that document. This is necessary for running wildcard searches.
  </description>
  <echo>Creating ssStems.json file...</echo>
  <echo>First, measure the JSON files...</echo>
  <!-- Collect the size of each JSON file under lower/ and upper/ in the output
       directory into the stemLines property (mode "each" reports per file). -->
  <length mode="each" property="stemLines">
    <fileset dir="${staticSearchDir}">
      <include name="lower/**.json"/>
      <include name="upper/**.json"/>
    </fileset>
  </length>
  <echo>Now create the tokens file...</echo>
  <!-- Hand the collected listing to the stylesheet as the stemLines parameter.
       NOTE(review): the whole listing travels as one command-line argument;
       confirm this stays within platform limits for very large collections. -->
  <java failonerror="true"
        classname="net.sf.saxon.Transform"
        classpath="${saxon}">
    <arg value="-xsl:${basedir}/xsl/create_ssStems.xsl"/>
    <arg value="-s:${basedir}/xsl/create_ssStems.xsl"/>
    <arg value="stemLines=${stemLines}"/>
    <arg value="--suppressXsltNamespaceCheck:on"/>
  </java>
</target>
<target name="makeSearchPage">
  <description>
    target: makeSearchPage
    This target creates the search page at the location specified in the configuration file. If a file already exists,
    then this process modifies the existing search page by replacing the content of div/@id='staticSearch'. If the file
    does not exist, then a generic search page is generated.
    This runs an XSLT on the search page with an output specified as a temp page. That temporary page is then moved over
    and overwrites the original page.
  </description>
  <echo message="Creating search page..."/>
  <!-- If the configured search page does not exist yet, seed it from the bundled
       sample page so that the transformation below has a source document. -->
  <if taskname="copy">
    <not>
      <available file="${searchFilePath}"/>
    </not>
    <then>
      <echo>WARNING: Cannot find search file at ${searchFilePath}. Created a template file from xsl/sample_search.html.</echo>
      <!-- Anchored on basedir, like every other path into xsl/ in this file. -->
      <copy file="${basedir}/xsl/sample_search.html" tofile="${searchFilePath}"/>
    </then>
  </if>
  <echo message="Creating temporary search page: ${tempSearchPageOutput}"/>
  <!--We create a hasFilters property here since the search page needs to know whether or not there are filters to
      inject into the page. It is more efficient to derive the existence of the filters directory here rather than
      in the XSLT.
      The else value guarantees that the property is always defined: without it, a missing filters directory
      would leave the property unset and the literal, unexpanded property reference would be handed to the XSLT.
      A value that is already set (for example on the command line) is left untouched.-->
  <condition property="hasFilters" value="true" else="false">
    <available type="dir" file="${staticSearchDir}/filters"/>
  </condition>
  <!-- Transform the search page into the temporary file; a failed
       transformation aborts the build before the original page is touched. -->
  <java classpath="${saxon}" classname="net.sf.saxon.Transform" failonerror="true">
    <arg value="-xsl:${basedir}/xsl/makeSearchPage.xsl"/>
    <arg value="-s:${searchFilePath}"/>
    <arg value="-o:${tempSearchPageOutput}"/>
    <arg value="hasFilters=${hasFilters}"/>
    <arg value="--suppressXsltNamespaceCheck:on"/>
    <!--<jvmarg value="-Xmx2048m"/>-->
  </java>
  <!-- Replace the original search page with the freshly generated one. -->
  <echo message="Moving ${tempSearchPageOutput} to overwrite ${searchFilePath}"/>
  <move file="${tempSearchPageOutput}" tofile="${searchFilePath}"/>
</target>
<target name="report">
  <description>
    target: report
    This target creates a report page from the various products generated by the static search. It is not necessary for static search
    to work, but it is *highly* recommended, since the result page contains various warnings as well as useful statistics that track
    word use, et cetera.
    The XSLT is run on itself and creates a result document based off of a path specified in the config file; the resulting HTML
    file for the report will be in the staticSearchDir.
    This target also calls the special open task on the condition that the build is not being run in a headless state (i.e. via a CI/CD system
    that does not have access to a desktop).
  </description>
  <!--We create a hasFilters property here since the report stylesheet is told whether or not a filters directory
      exists. It is more efficient to derive the existence of the filters directory here rather than in the XSLT.
      The else value guarantees that the property is always defined: without it, a missing filters directory
      would leave the property unset and the literal, unexpanded property reference would be handed to the XSLT.
      A value that is already set (by an earlier target or on the command line) is left untouched.-->
  <condition property="hasFilters" value="true" else="false">
    <available type="dir" file="${staticSearchDir}/filters"/>
  </condition>
  <java classpath="${saxon}" classname="net.sf.saxon.Transform" failonerror="true">
    <arg value="-xsl:${basedir}/xsl/create_reports.xsl"/>
    <arg value="-s:${basedir}/xsl/create_reports.xsl"/>
    <arg value="--suppressXsltNamespaceCheck:on"/>
    <arg value="hasFilters=${hasFilters}"/>
    <!--<jvmarg value="-Xmx2048m"/>-->
  </java>
  <!-- The report is picked up from basedir and moved in with the rest of the
       static search output. -->
  <move file="${basedir}/staticSearch_report.html" tofile="${staticSearchDir}/staticSearch_report.html"/>
  <!-- Only try to show the report when a desktop is expected to be available. -->
  <echo message="Opening in browser..." unless:true="${headless}"/>
  <open unless:true="${headless}" file="${staticSearchDir}/staticSearch_report.html"/>
</target>
<target name="copyFiles">
  <description>
    target: copyFiles
    This target copies the necessary Javascript files from the staticSearch directory (i.e. this directory)
    and places them in the project's static search directory.
  </description>
  <!-- First the shared scripts: everything in js/ matching ss**.js. -->
  <copy todir="${staticSearchDir}/">
    <fileset dir="${basedir}/js" includes="ss**.js"/>
  </copy>
  <!-- Then the stemmer implementation from the configured stemmer folder. -->
  <copy todir="${staticSearchDir}/">
    <fileset dir="${basedir}/stemmers/${stemmerFolder}" includes="ssStemmer.js"/>
  </copy>
</target>
<target name="clean">
  <description>
    target: clean
    This target deletes the temporary directory, since it's no longer useful.
  </description>
  <echo>Cleaning temporary directory...</echo>
  <!-- Removes the temp folder that sits inside the static search output directory. -->
  <delete dir="${staticSearchDir}/temp"/>
</target>
<!-- Aggregate target with no tasks of its own; the work is done by its dependencies, in the listed order. -->
<target name="all"
        depends="init,validate,config,tokenize,json,createStemsFile,copyFiles,makeSearchPage,report,clean">
  <description>
    target: all
    Default target for running the entire static search creation process.
  </description>
</target>
<!-- Aggregate target with no tasks of its own: the full dependency chain minus the validate step. -->
<target name="allButValidate"
        depends="init,config,tokenize,json,createStemsFile,copyFiles,makeSearchPage,report,clean">
  <description>
    target: allButValidate
    Target to override validation, which is helpful in cases where the HTML output
    has been validated by a stricter set of validation rules (like the VNU validator)
  </description>
</target>
<!-- Aggregate target with no tasks of its own: the full dependency chain minus the final clean step. -->
<target name="test"
        depends="init,validate,config,tokenize,json,createStemsFile,copyFiles,makeSearchPage,report">
  <description>
    target: test
    Target that runs the whole process, but does not clean up the temporary files. Useful for debugging.
  </description>
</target>
<!-- Aggregate target with no tasks of its own: the full dependency chain minus the report step. -->
<target name="basic"
        depends="init,validate,config,tokenize,json,createStemsFile,copyFiles,makeSearchPage,clean">
  <description>
    target: basic
    Target that runs the whole process other than the report creation.
  </description>
</target>
<!--****************************************************************
* *
* Script Definitions *
* *
****************************************************************-->
<!--This scriptdef is a javascript implementation of a task that opens a file
    in the desktop's default browser. The approach is adapted, with thanks, from:
    http://stackoverflow.com/questions/4696176/using-ant-how-do-i-open-a-file-in-a-browser

    Usage: <open file="path/to/page.html"/>
      file (required): the file to open; a relative path is resolved against basedir.

    The file URI is produced by java.io.File.toURI(), which takes care of
    percent-encoding and of platform-specific path forms (Windows drive letters,
    backslashes, spaces), instead of assembling and re-encoding the URL by hand.
    When the Java desktop integration or its browse action is unavailable, the task
    logs a message and does nothing rather than failing the build.-->
<!--NOTE: For some reason, this task returns a warning about the Nashorn Engine, but we're not
    sure how to resolve that (presumably it is the JDK's own deprecation notice for that engine;
    to be confirmed).-->
<scriptdef name="open" language="javascript">
  <attribute name="file"/>
  <![CDATA[
    var requested = attributes.get("file");
    if (requested == null) {
      self.fail("The open task requires a file attribute.");
    }

    // Resolve against the project so relative paths behave like in other Ant tasks.
    var target = project.resolveFile(requested);

    var Desktop = java.awt.Desktop;
    if (Desktop.isDesktopSupported() && Desktop.getDesktop().isSupported(Desktop.Action.BROWSE)) {
      Desktop.getDesktop().browse(target.toURI());
    } else {
      self.log("Desktop browsing is not supported here; open " + target + " manually.");
    }
  ]]>
</scriptdef>
</project>