-
Notifications
You must be signed in to change notification settings - Fork 3
/
cwbdata2vrt-simple.sh
executable file
·413 lines (382 loc) · 13.8 KB
/
cwbdata2vrt-simple.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#! /bin/sh

# A simpler and faster alternative to cwbdata2vrt.py

# Name and directory of this script as invoked. "$0" is quoted so that
# a script path containing whitespace is passed to basename and
# dirname as a single argument (an unquoted $0 would be split, and
# basename would treat the second word as a suffix to strip).
progname=$(basename "$0")
progdir=$(dirname "$0")
# Header of the usage message: invocation synopsis followed by a
# general description of what the script outputs. It is not used
# elsewhere in this file, so it is presumably consumed by the option
# handling code of korp-lib.sh -- TODO confirm.
usage_header="Usage: $progname [options] corpus_id ...
Generate a VRT file from each corpus specified as an argument, based on its
data stored in CWB.
The output is XML-compatible, except for possible crossing elements. By
default, the output contains only the positional attribute 'word' and the
structural attributes 'text' and 'sentence'.
The output has the encoded special characters unencoded; <, > and &
XML-encoded everywhere and \" XML-encoded in structural attribute values.
The corpus ids specified may contain shell wildcards that are expanded."
# Command-line option specifications, presumably parsed by the option
# handler of korp-lib.sh (verify the exact format there). Each entry
# appears to consist of a line with the option names separated by "|",
# optionally "=ARGNAME" for an option argument, optionally a default
# value in double quotes and optionally the name of the shell variable
# receiving the value, followed by the lines of the option
# description. Where no variable name is given, the code below reads
# the value from a variable named after the first option name with
# hyphens replaced by underscores (e.g. $include_xml_declaration,
# $undef_value, $overwrite).
optspecs='
positional-attributes|pos-attrs=ATTRLIST "word" pos_attrs
output the positional attributes listed in ATTRLIST, separated by spaces
structural-attributes|struct-attrs=ATTRLIST "text sentence" struct_attrs
output the structural attributes listed in ATTRLIST, separated by spaces
all-attributes|all all_attrs
output all positional and structural attributes in the corpora
sort-structural-attributes|sort sort_struct_attrs
sort structural attribute annotations ("XML attributes") alphabetically,
instead of using their order of declaration in the registry file
undef-value|replace-undef=REPL
replace all "__UNDEF__" (undefined) values of all positional attributes
with REPL; if the value set of any positional attribute of any corpus
contains both "__UNDEF__" and REPL, no VRT files are generated unless
--force-undef-value is specified
force-undef-value|force-replace-undef force_undef
replace "__UNDEF__" values with the value specified with --undef-value
even if the value set of an attribute contains both "__UNDEF__" and the
replacement value
include-xml-declaration
include XML declaration in the output (omitted by default)
include-corpus-element
include in the output the top-level "corpus" element added by cwb-decode
(omitted by default)
omit-attribute-comment
omit the comment listing the positional attributes shown at the top of
the output VRT
omit-log-comment
omit the comment containing information about the run of the script
vrt-file-name-template|output-file=FILE "{corpid}.vrt" outfile_templ
write the output VRT to file named FILE, where {corpid} is replaced
with the corpus id; FILE may contain a directory part as well; possible
non-existent directories are created; use - to write to standard output
overwrite|force
overwrite output VRT file if it already exists; by default, do not
overwrite
v|verbose
output progress information to standard output (standard error if the
VRT output is written to standard output)
'
# Load the shared shell function library from the directory of this
# script. The path is quoted so that a script directory containing
# whitespace does not get split into several words. The helpers used
# below that are not defined in this file (error, list_corpora and the
# option handler code in $optinfo_opt_handler) presumably come from
# this library -- verify against korp-lib.sh.
. "$progdir/korp-lib.sh"

# Process options by evaluating the option handler code
eval "$optinfo_opt_handler"

# At least one corpus id argument is required
if [ "x$1" = x ]; then
    error "Please specify corpora"
fi

# The corpus ids to process, as output by list_corpora for the
# non-option arguments
corpora=$(list_corpora "$@")
# Work out the attribute options to pass to cwb-decode ($attr_opts)
# and the function post-processing the structure tag lines of its
# output ($process_tags).
if [ "x$all_attrs" != x ]; then
    # All attributes: the explicit attribute lists are not used
    struct_attrs=
    pos_attrs=
    attr_opts=-ALL
    process_tags=process_tags_multi
else
    # One structural attribute per line
    struct_attrs_lines=$(echo $struct_attrs | tr ' ' '\n')
    # Names of the structures occurring more than once in the list
    # (with different annotation attributes or both with and without
    # one). The "_attr" part has to be stripped *before* sorting, as
    # uniq -d only detects adjacent duplicates: sorting the full names
    # could leave another name between entries of the same structure
    # (for example, "text", "text2", "text_id"), in which case the
    # structure would not be recognized as occurring multiple times.
    struct_attrs_multi=$(
        echo "$struct_attrs_lines" | sed -e 's/_.*//' | sort | uniq -d)
    # Filter out structural attributes without values (corresponding to
    # XML tags without attributes) if they also occur with a value (XML
    # tags with attributes), since the tag will be output anyway and so
    # that process_tags_multi needs not take into account attributes
    # without values.
    struct_attrs=$(
        echo "$struct_attrs_lines" |
            perl -e '$r = "^(" . join("|", qw('"$struct_attrs_multi"')) . ")\$";
                     while (<>) { print if ($_ !~ $r); }'
    )
    # $pos_attrs and $struct_attrs are intentionally unquoted: each
    # attribute name needs to be a separate argument to add_prefix
    attr_opts="$(add_prefix '-P ' $pos_attrs) $(add_prefix '-S ' $struct_attrs)"
    if [ "${struct_attrs#*_}" != "$struct_attrs" ]; then
        # At least one structural attribute has an annotation value
        # (its name contains an underscore)
        if [ "x$struct_attrs_multi" != x ]; then
            process_tags=process_tags_multi
        else
            process_tags=process_tags_single
        fi
    else
        # Only structures without annotation values: the tag lines
        # need no processing
        process_tags=cat_noargs
    fi
fi
# Select the commands filtering the beginning ($head_filter) and the
# end ($tail_filter) of the output, depending on whether the XML
# declaration and the top-level <corpus> element are to be kept.
# $head_filter is eval'ed by the caller; $tail_filter is run directly.
case "${include_corpus_element:+corpus}:${include_xml_declaration:+xml}" in
    :)
        # No <corpus>...</corpus>, no <?xml...>: drop the first two
        # lines and the last line
        head_filter="tail -n +3"
        tail_filter="head -n -1"
        ;;
    :xml)
        # No <corpus>...</corpus>, but <?xml...>
        # This needs to be eval'ed because of the space in the regex
        head_filter="grep -Ev '^<(corpus |/corpus>)'"
        tail_filter=cat
        ;;
    corpus:)
        # <corpus>...</corpus>, no <?xml...>: drop the first line only
        head_filter="tail -n +2"
        tail_filter=cat
        ;;
    corpus:xml)
        # <corpus>...</corpus> and <?xml...>: keep everything
        head_filter=cat
        tail_filter=cat
        ;;
esac
# VRT comments are prepended to the output unless both the attribute
# comment and the log comment have been omitted, in which case the
# output is passed through unchanged
if [ -n "$omit_attribute_comment" ] && [ -n "$omit_log_comment" ]; then
    add_vrt_comments=cat_noargs
else
    add_vrt_comments=prepend_vrt_comments
fi
# Copy standard input to standard output unchanged, disregarding any
# arguments given, so that the function can stand in for filters that
# are called with arguments.
cat_noargs () {
    cat -
}
# check_undef_replacement undef_value corpus ...
#
# Go through all the positional attributes of all the corpora given
# and warn about each attribute whose value set contains both
# "__UNDEF__" and the replacement value undef_value. If any such
# attribute is found and --force-undef-value has not been specified
# ($force_undef is empty), call error after all the corpora have been
# checked.
#
# The check covers all the corpora before aborting instead of
# rejecting only the offending corpora: the input corpora of a single
# run are often subcorpora of the same corpus or otherwise related, so
# it is justified to produce output for either all or none of them.
check_undef_replacement () {
    local undef_value corpus attr msg_base abort
    undef_value=$1
    shift
    abort=
    for corpus in "$@"; do
        for attr in $(corpus_list_attrs $corpus p); do
            # Only attributes containing both values are of interest
            corpus_posattr_contains_values $corpus $attr "__UNDEF__" ||
                continue
            corpus_posattr_contains_values $corpus $attr "$undef_value" ||
                continue
            msg_base="corpus $corpus: the value set of positional attribute $attr contains both \"__UNDEF__\" and its replacement \"$undef_value\""
            if [ "x$force_undef" = x ]; then
                warn "$msg_base"
                abort=1
            else
                warn "$msg_base; replacing anyway, as --force-undef-value was specified"
            fi
        done
    done
    [ "x$abort" = x ] ||
        error "Aborting as the value set of at least one positional attribute of at least one corpus contains both \"__UNDEF__\" and its replacement \"$undef_value\"; specify --force-undef-value to replace anyway"
}
# Perl code snippets used in both process_tags_single and
# process_tags_multi
# Encode & and " as XML character references in structural attribute
# values, as cwb-encode -Cx appears not to do that. If cwb-encode is
# ever changed to do that, this will have to be removed. Encoding <
# and > is handled by vrt_decode_special_chars in shlib/vrt.sh.
perl_encode_entities_attrval='
$attrval =~ s/&/&/g;
$attrval =~ s/"/"/g;
'
# Decode " and ' in tokens (positional attributes), as they
# need not be encoded there but they are encoded by cwb-encode -Cx.
perl_decode_entities_token='
s/"/"/g;
s/'/'"'"'/g;
'
# Optionally replace __UNDEF__ values with the string given with
# --undef-value.
perl_replace_undef=
if [ "x$undef_value" != x ]; then
check_undef_replacement "$undef_value" $corpora
perl_replace_undef='
s/(?:\t|^)\K__UNDEF__(?=\t|$)/'"$undef_value"'/g;
'
fi
process_tags_single () {
# This is somewhat faster than using sed, but not significantly
# faster than process_tags_multi below
perl -ne '
if (/^(<[^\/_\s]*)_([^ ]*) ([^>]*)>/) {
# Structure start tag with annotation value
($tag, $attrname, $attrval) = ($1, $2, $3);
'"$perl_encode_entities_attrval"'
print "$tag $attrname=\"$attrval\">\n";
} else {
# Anything else
s/^(<\/[^_]*)_.*>/$1>/;
if (! /^</) {
'"$perl_decode_entities_token"'
'"$perl_replace_undef"'
}
print;
}
'
}
# Perl code snippets for collecting the annotations of a structure in
# process_tags_multi: clearing the collection, appending $attrname and
# $attrval to it and getting the result as a string to be output after
# the tag name. With --sort-structural-attributes, the annotations are
# kept in a list that is sorted by attribute name when output;
# otherwise they are appended to a string in the order of input. Using
# a list instead of a string does not seem to be significantly slower,
# so maybe only perl_attrs_get would need to differ (with or without
# sorting).
if [ -n "$sort_struct_attrs" ]; then
    perl_attrs_clear='@attrs = ()'
    perl_attrs_append='push(@attrs, "$attrname=\"$attrval\"")'
    perl_attrs_get='" " . join (" ", sort { substr($a, 0, index($a, "=")) cmp
substr($b, 0, index($b, "=")) }
@attrs)'
else
    perl_attrs_clear='$attrs = ""'
    perl_attrs_append='$attrs .= " $attrname=\"$attrval\""'
    perl_attrs_get='$attrs'
fi
# process_tags_multi corpus_id
#
# Filter from standard input to standard output combining the
# structure tag lines of structures with possibly several annotation
# attributes:
# - Consecutive start tag lines "<struct_attr value>" (or "<struct>")
#   of the same structure are collected and output as a single start
#   tag "<struct attr="value" ...>" once a line of another kind or the
#   start tag of a different structure is read. The annotations are
#   collected and formatted with the Perl code in $perl_attrs_clear,
#   $perl_attrs_append and $perl_attrs_get, and the values are
#   processed with $perl_encode_entities_attrval.
# - Consecutive end tag lines "</struct_attr>" (or "</struct>") of the
#   same structure are output as a single "</struct>".
# - Lines not beginning with "<" are treated as tokens: they are
#   processed with $perl_decode_entities_token and $perl_replace_undef
#   and counted, so that warnings can refer to a token position.
# - Other lines (XML declaration, <corpus> start tag) are output as
#   they are.
# A warning is written to standard error for each annotation without a
# value (a line of the form "<struct_attr>"); the value is then
# treated as an empty string. The argument corpus_id is used only in
# that warning message.
process_tags_multi () {
local corp
# The corpus id for the warning message
corp=$1
perl -ne '
BEGIN {
$prevtag = $tag = "";
'"$perl_attrs_clear"';
$cpos = 0;
$prevtag_printed = 0;
}
if (/^(<[^\/_\s]*)(?:_([^ ]*)( )?(.*))?>$/) {
# Structure start tag, possibly with an annotation value
$tag = $1;
if ($prevtag && $tag ne $prevtag && ! $prevtag_printed) {
print "$prevtag" . '"$perl_attrs_get"' . ">\n";
'"$perl_attrs_clear"';
}
$prevtag = $tag;
$prevtag_printed = 0;
if ($2) {
$attrname = $2;
# If the annotation is defined but has no value, not
# even an empty string (a line of the form
# <struct_attr>), it is treated as an empty string.
# Alternatively, we could have special value for such
# undefined values, but they probably should not occur
# anyway. One option might be to add (optionally) a
# special VRT comment indicating the issue.
if (! $3) {
$struct = substr($prevtag, 1);
print STDERR ("'"$progname"': Warning: corpus '"$corp"',"
. " position $cpos: structure \"$struct\":"
. " undefined value for attribute"
. " \"$attrname\" treated as empty string\n");
}
$attrval = $4;
'"$perl_encode_entities_attrval"'
'"$perl_attrs_append"';
}
} elsif (/^(<\/[^_]*)(_.*)?>/) {
# Structure end tag
$tag = $1;
if ($tag ne $prevtag) {
print "$tag>\n";
$prevtag_printed = 1;
}
$prevtag = $tag;
} else {
# Token, XML declaration or <corpus> start tag
if ($prevtag && ! $prevtag_printed) {
print "$prevtag" . '"$perl_attrs_get"' . ">\n";
}
$tag = $prevtag = "";
$prevtag_printed = 0;
'"$perl_attrs_clear"';
if (! /^</) {
# Token
'"$perl_decode_entities_token"'
'"$perl_replace_undef"'
$cpos++;
}
print;
}
'
}
# prepend_vrt_comments comment ...
#
# Filter from standard input to standard output adding each non-empty
# argument as a VRT comment line "<!-- #vrt COMMENT -->" at the top of
# the output, in the order of the arguments. If the first input line
# is an XML declaration, the comments are added after it. Nothing is
# output for empty input.
prepend_vrt_comments () {
    # Each comment is an argument of its own; empty arguments are
    # excluded
    awk '
        BEGIN {
            # Keep the comments under consecutive indices so that they
            # can be output in argument order: the order of iteration
            # of "for (i in array)" is unspecified in awk. Compare
            # with the empty string, so that a comment such as "0" is
            # not treated as false and dropped.
            ncomments = 0
            for (i = 1; i < ARGC; i++) {
                if (ARGV[i] != "") {
                    comments[++ncomments] = ARGV[i]
                }
            }
            # The arguments are not input files: read standard input
            ARGC = 1
        }
        NR == 1 {
            is_xml_decl = ($0 ~ /^<\?xml/)
            if (is_xml_decl) { print }
            for (i = 1; i <= ncomments; i++) {
                print "<!-- #vrt " comments[i] " -->"
            }
            if (is_xml_decl) { next }
        }
        { print }
    ' "$@"
}
# Output the current date and time in the format
# "YYYY-MM-DD hh:mm:ss +zzzz" (ISO-like, with a numeric time zone
# offset).
get_isodate () {
    date "+%Y-%m-%d %H:%M:%S %z"
}
# make_log_info corpus_id
#
# Output three lines of processing log information for the VRT
# generated for corpus_id: a "process-log: BEGIN" line, a line with
# the time, description, script name, arguments ($cmdline_args_orig),
# version and user information separated by " | ", and a
# "process-log: END" line.
make_log_info () {
    # This imitates a proposal for the VRT Tools log comment format,
    # which is still subject to change (2019-09-20)
    local corp timestamp user fullname userinfo version descr script args
    corp=$1
    timestamp="Time: $(get_isodate)"
    # USER may be unset (in a container or a cron job, for example);
    # without a key, getent would list all the accounts and the log
    # line would be split onto several lines, so fall back to id.
    user=${USER:-$(id -un)}
    # The full name is the first comma-separated part of the GECOS
    # field: https://stackoverflow.com/a/833256
    fullname="$(getent passwd "$user" | cut -d: -f5 | cut -d, -f1)"
    userinfo="User: $user@$(hostname -f) ($fullname)"
    # What should be the version, if any?
    version="Version: FIN-CLARIN corpus processing scripts (undefined version)"
    descr="Description: Generated VRT from CWB data for corpus \"$corp\""
    script="Script: $(basename "$0")"
    args="Arguments: $cmdline_args_orig"
    echo "process-log: BEGIN"
    printf "%s\n" "process-log: $timestamp | $descr | $script | $args | $version | $userinfo"
    echo "process-log: END"
}
# Filter from standard input to standard output wrapping each input
# line in a VRT comment: "<!-- #vrt LINE -->".
make_vrt_comments () {
    awk '{ printf "<!-- #vrt %s -->\n", $0 }'
}
# extract_vrt corpus_id
#
# Write the VRT of the corpus corpus_id, based on the output of
# cwb-decode -Cx, to the file whose name is obtained by replacing
# {corpid} with the corpus id in $outfile_templ, or to standard output
# if $outfile_templ is "-". Missing directories of the output file are
# created. The corpus is skipped with a warning if the output file
# exists and $overwrite is empty, or if the output file or its
# directory cannot be created. Unless omitted by options, VRT comments
# are added to the top of the output and a processing log comment to
# its end.
extract_vrt () {
    local corp verbose_msg outfile outdir existed
    local head_comment head_comment2 attr_comment
    corp=$1
    head_comment=
    head_comment2=
    attr_comment=
    verbose_msg="Writing VRT output of corpus $corp to"
    if [ "x$outfile_templ" != "x-" ]; then
        outfile=$(echo "$outfile_templ" | sed -e "s/{corpid}/$corp/g")
        # Record whether the output file exists *before* it is touched
        # below: checking after touch would always find the file and
        # report overwriting even when the file is new.
        existed=
        if [ -e "$outfile" ]; then
            if [ "x$overwrite" = x ]; then
                warn "Skipping corpus $corp: output file $outfile already exists"
                return
            fi
            existed=1
        else
            outdir=$(dirname "$outfile")
            if ! mkdir -p "$outdir" 2> "$tmp_prefix.err"; then
                warn "Skipping corpus $corp: $(sed -e 's/^mkdir: //' "$tmp_prefix.err")"
                return
            fi
        fi
        # Test that the output file can be written to
        if ! touch "$outfile" 2> "$tmp_prefix.err"; then
            warn "Skipping corpus $corp: cannot write to output file $outfile"
            return
        fi
        echo_verb "$verbose_msg file $outfile"
        if [ "x$existed" != x ]; then
            verbose warn "Overwriting existing file $outfile as requested"
        fi
    else
        outfile=/dev/stdout
        # Progress information must not get mixed with the VRT output
        echo_verb "$verbose_msg standard output" >&2
    fi
    if [ "x$all_attrs" != x ]; then
        # Use echo to get the attribute names on the same line,
        # separated by spaces
        pos_attrs=$(echo $(corpus_list_attrs --feature-set-slash $corp p))
    fi
    if [ "x$omit_log_comment" = x ]; then
        head_comment="info: VRT generated from CWB data for corpus \"$corp\" ($(get_isodate))"
        head_comment2="info: A processing log at the end of file"
    fi
    if [ "x$omit_attribute_comment" = x ]; then
        attr_comment="positional-attributes: $pos_attrs"
    fi
    # $process_tags needs to precede vrt_decode_special_chars;
    # otherwise it would encode the & in the &lt; and &gt; produced by
    # the latter. $attr_opts and $tail_filter are intentionally
    # unquoted, as they contain several words; the output file name is
    # quoted, as it may contain spaces.
    "$cwb_bindir/cwb-decode" -Cx "$corp" $attr_opts |
        $process_tags "$corp" |
        vrt_decode_special_chars --xml-entities |
        eval "$head_filter" |
        $tail_filter |
        $add_vrt_comments "$attr_comment" "$head_comment" "$head_comment2" \
            > "$outfile"
    if [ "x$omit_log_comment" = x ]; then
        make_log_info "$corp" | make_vrt_comments >> "$outfile"
    fi
}
# Generate the VRT for each of the corpora in turn ($corpora is
# intentionally unquoted: one word for each corpus id)
for corp in $corpora; do extract_vrt $corp; done