-
Notifications
You must be signed in to change notification settings - Fork 3
/
korp-make
executable file
·1684 lines (1526 loc) · 58.1 KB
/
korp-make
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /bin/bash
# -*- coding: utf-8 -*-
# TODO:
# - Allow specifying input via options (config file).
# - Allow multiple instances of extra file options to pass to
# korp-make-corpus-package
# - --add-structure-ids: Specify by structure type if existing values
# should be overwritten or not.
# - Allow specifying structure id value format.
# - Generate a VRT file containing all the added information: in
# particular, add the name attributes to the VRT before
# cwb-encoding.
# - Support different (or multiple) lemma attributes for generating
# lemmas without compound boundaries, lemgrams and word picture
# data.
# - Run stages based on checksums of previous stage's output.
# - Make multiple (related) corpora in the same package.
# - Make parallel corpora.
# - (?) Omit positional attributes with a full stop in their names.
# Name and directory of this script. $0 is quoted so that a script
# path containing whitespace or glob characters is handled correctly.
progname=$(basename -- "$0")
progdir=$(dirname -- "$0")
# Directories for mapping files and VRT tools, relative to the script
# directory
mapdir=$progdir/../corp
vrttoolsdir=$progdir/../vrt-tools
# Subdirectories for VRT and TSV files; the TSV subdirectory is the
# same as the VRT one
vrt_subdir=vrt/CORPUS
tsv_subdir=$vrt_subdir
# Default mapping files and compound boundary markers, with separate
# defaults for UD (Universal Dependencies) annotations
lemgram_posmap_default=$mapdir/lemgram_posmap_tdt.tsv
lemgram_posmap_ud_default=$mapdir/lemgram_posmap_ud2_universal.tsv
wordpict_relmap_default=$mapdir/wordpict_relmap_tdt.tsv
wordpict_relmap_ud_default=$mapdir/wordpict_relmap_ud_fi.tsv
compound_boundary_marker_default="|"
compound_boundary_marker_ud_default="#"
# Designates that the corpus id should be used as the base seed
random_seed_default="{CORPUS}"
# Header of the usage message: the synopsis line followed by a general
# description of the script. The string is double-quoted, so $progname
# is expanded at assignment time.
# NOTE(review): presumably output by the usage-message code in
# korp-lib.sh (sourced later in this file) -- confirm there.
usage_header="Usage: $progname [options] [corpus] [input_file ...]
Process a VRT file to make a Korp corpus package containing CWB data files and
MySQL database import files.
The corpus id must be specified either as the first non-option argument or via
the option --corpus-id.
The input files may be either (possibly compressed) VRT files containing
dependency parse information and name tags, or ZIP or (possibly compressed)
tar archives containing such VRT files. If no input files are specified, read
from the standard input or use a VRT file stored on a previous run for the
same corpus.
If rerun on a corpus and processed data for the corpus exists, try to infer
which processing stages needs to be rerun. However, run all processing stages
if --force is specified or if any of the input files is newer than the VRT
file stored on a previous run."
# Option specifications together with their usage descriptions, all in
# a single string.
# NOTE(review): the string is presumably parsed by the option-processing
# code in korp-lib.sh (sourced later in this file); the exact format of
# the specifications is defined there and not visible here -- confirm.
# The value is single-quoted, so variable references in it are not
# expanded at assignment time, except at the places where the single
# quotes are closed and reopened around a double-quoted variable
# reference to splice in the value of a default defined earlier in this
# file.
optspecs='
@ General options
corpus-id=CORPUS corpus
make corpus with id CORPUS; this is alternative to specifying the
corpus id as the first non-option argument
force
force all stages of processing by first removing all the output
files if they exist; existing output files are also removed if any
of the input files is newer than the VRT file stored on a previous
run for the same corpus (unless --augment-data is specified)
config-file|configuration-file=FILE
read FILE as an INI-style configuration file (without sections).
Configuration keys correspond to option names without the leading
dashes; internal dashes may be replaced with underscores.
@ Diagnostic output
v|verbose "1"
output some progress information (the default)
quiet { verbose= }
do not output progress information (except for some subprocesses)
times show_times
output the amount of CPU time used for each stage
log-file=FILE logfile
log script output (standard output and standard error) to FILE
instead of the default
$corpus_root/log/${progname}_CORPUS_TIMESTAMP.log where CORPUS is
the corpus id and TIMESTAMP the start time of the script
no-logging !logging
do not copy script output to a log file
@ Directories
c|corpus-root=DIR "$corpus_root" { set_corpus_root "$1" }
use DIR as the root directory of corpus files
tsv-dir=DIR "CORPUS_ROOT/$tsv_subdir" tsvdir
output database tables as TSV files to DIR
@ Corpus licence information
licence-type=LIC auth_opts { add_auth_opts licence_type $optname $1 }
set the corpus licence type to LIC, where LIC is one of PUB, ACA,
ACA-Fi or RES
lbr-id=URN { add_auth_opts lbr_id $optname $1 }
set the LBR id of the corpus to URN, which is of the form
[urn:nbn:fi:lb-]YYYYMMNNN[@LBR], where YYYYMM is year and month
and NNN 3 to 5 digits; the bracketed parts are added if left out
@ Input attributes
input-attrs|input-attributes|input-fields=ATTRS \
"word ref lemma pos msd dephead deprel nertag" initial_input_attrs
specify the names of the positional attributes in the input,
separated by spaces; if "word" (token) is not included in the
list, add it as the first attribute, unless --no-word-attribute is
specified;
if the input VRT contains a positional attributes comment, it
overrides this option, unless --override-vrt-attributes is
specified;
attributes named "_" or with names beginning with a "-" are
skipped in the input;
if ATTRS contains attribute names suffixed with "_ud", "_ud1" or
"_ud2" and no corresponding attributes without the suffix, the
suffix is stripped
override-vrt-attrs|override-vrt-attributes
use the positional attributes specified with --input-attributes
even if the input VRT contains a positional attributes comment
omit-attributes|skip-attributes=ATTRS omit_attrs
omit the positional input attributes listed in ATTRS (seprated by
spaces)
no-word-attribute no_word_attr
the input does not contain a "word" attribute; implied by
--augment-data
keep-attr-order|keep-attribute-order
do not reorder positional attributes even if "word" is not the
first attribute; implied by --augment-data
augment-data|augment-existing-data
augment existing corpus data with the data in the input, for
example, to add parse annotations to a corpus already encoded; if
the input contains values for existing attributes, they override
existing values; you cannot use this option with --force
generate-input-from-data generate_input
use the existing (CWB) corpus data as the input, which is to be
augmented if needed with lemgrams, lemmas without compound
boundaries and the appropriate database data (implies
--augment-data); the list of input attribute names is read from
the data (overrides --input-attrs); you cannot use this option
with --force
@ Annotation mappings
lemgram-posmap|posmap=POSMAP_FILE "'"$lemgram_posmap_default"'"
use POSMAP_FILE as the mapping file from the corpus parts of
speech to those used in Korp lemgrams; the file should contain
lines with corpus POS and lemgram POS separated by a tab;
if the positional attributes contain UD annotations and no non-UD
annotations, file "'"$lemgram_posmap_ud_default"'" is used unless
a different file is specified explicitly
wordpict-relmap|wordpicture-relation-map=RELMAP_FILE \
"'"$wordpict_relmap_default"'"
use RELMAP_FILE as the mapping file from corpus dependency
relation codes to those used in the Korp word picture; the file
should contain lines with corpus dependency relation code and word
picture dependency relation code separated by a tab;
if the positional attributes contain UD annotations and no non-UD
annotations, file "'"$wordpict_relmap_ud_default"'" is used unless
a different file is specified explicitly
@ Compound boundaries
compound-boundary-marker=MARKER "'"$compound_boundary_marker_default"'"
the string MARKER marks compound boundaries in lemmas and will be
removed from lemmas without compound boundaries;
if the positional attributes contain UD annotations and no non-UD
annotations, "'"$compound_boundary_marker_ud_default"'" is used
remove-compound-boundary-algorithm=ALGORITHM "omorfi" compound_boundary_alg
use ALGORITHM for adding lemmas without compound boundaries, one
of "omorfi", "old" (alias "simple-omorfi") and "naive": "omorfi"
handles some idiosyncrasies of Omorfi, "old" produces results
(mostly) compatible with the algorithm used previously (handling
hyphens replaced with compound boundary markers), and "naive"
simply removes compound boundary markers
@ Lemgrams
add-lowercase-lemgrams { add_lemgram_opt --add-lowercase-variants }
add all-lower-case variants of lemgrams for lemmas containing
upper-case letters
add-lemgrams-without-diacritics { add_lemgram_opt --add-non-diacritic-variants }
add variants of lemgrams without diacritics for lemmas containing
letters with diacritics
lemgrams-keep-letters-with-diacritics=CHARS \
{ add_lemgram_opt --keep-letters-with-diacritics "$1" }
Keep the letters with diacritics in CHARS intact even in lemgram
variants otherwise without diacritics. CHARS is a string of
characters that can be used inside a set of characters in a
regular expression (as [^CHARS]). CHARS are retained regardless of
their case.
@ Random-number generator seed
random-seed=SEED "'"$random_seed_default"'"
use the string SEED as the base seed for the random-number
generator, to be used for generating structure id attributes and
scrambling data; the actual seed is generated using SEED and seed
for scrambling data; "" for a random seed (non-reproducible
results) (default: corpus id)
@ Structure ids
add-structure-ids|add-element-ids=STRUCTLIST \
"text paragraph sentence" add_struct_ids
add id attributes to the structures listed in STRUCTLIST
(separated by spaces); id values are based on unique random
integers; unless --overwrite-structure-ids is specified, rename
possible existing id attributes to id_N, where N is the smallest
positive integer for which attribute id_N does not already exist
in the structure; if STRUCTLIST is an empty string, do not add id
attributes
structure-id-format|element-id-format=STRUCT:FORMAT * \
{ add_struct_id_format "$1" }
format the value of the id attribute for structure STRUCT with
FORMAT; run "vrt-add-id -h" for more information on FORMAT (in the
usage description of option --format) but note that FORMAT may not
contain spaces here
overwrite-structure-ids|overwrite-element-ids !keep_struct_ids
overwrite possible existing id attribute values in the structures
listed with --add-structure-ids
@ Sorting text structures
text-sort-attribute=ATTRLIST * { add_text_sort_opt --key "$1" }
Sort text elements in the corpus by the attributes listed in
ATTRLIST, separated by spaces or commas. Sort primarily by the
first attribute, secondarily by the second and so on, by byte
values, without taking the locale into account. Multiple keys can
also be specified by repeating the option. Each attribute name may
be followed by a colon and sort ordering option characters
recognized by the "sort" command: often one or more of the
following: b (ignore leading blanks), d (dictionary order), f
(ignore case), g (general numeric sort), i (ignore nonprinting), M
(month sort), h (human numeric sort), R (random sort), r
(reverse), V (version sort).
text-sort-transform=ATTR:TRANSFORM '"'"' * { add_text_sort_opt --transform "$1" }
Transform the value of the attribute attrname using TRANSFORM
before using it as a sort key. ATTR is one of the attributes
listed in the argument of --text-sort-attribute. (ATTR and the
colon may be omitted if only one key attribute is specified.)
TRANSFORM may be one of the following: (1) a Perl-style
substitution "s/regexp/subst/[flags]", where regexp and subst
follow Python regular expression syntax and flags is zero or more
of the following letters: a (make \\w, \\W, \\b, \\B, \\d, \\D
match ASCII characters only instead of whole Unicode), g (replace
all matches and not only the first one), i (match
case-insensitively), l (make \\w, \\W, \\b, \\B dependent on the
current locale), x (ignore whitespace and comments); (2) a single
Python expression; or (3) the body of a Python function. In (2)
and (3), the variable "val" refers to the value of the attribute
(str), and they return the result of the transformation (converted
to str). If (3) has no return statement, the value of "val" is
returned. On an error depending on the value of "val", an empty
string is returned. The option may be repeated to specify
transformations for different attributes and/or multiple
transformations for a single attribute. Multiple transformations
for an attribute are processed in the order they are specified.
@ Omitting structures
omit-structures=STRUCTS
omit structures listed in STRUCTS, separated by spaces; you cannot
omit text or sentence structures; this can be used to remove
paragraphs from corpora whose sentences should be scrambled within
whole texts
@ Scrambling structures
scramble=STRUCTS
scramble structures listed in STRUCTS, separated by spaces;
typical structures are sentence and paragraph (and link for
parallel corpus parts); they are scrambled within the immediately
containing structure, typically within paragraph and text,
respectively; "sentence paragraph" scrambles both ways
@ Copying, renaming and omitting structural attributes
copy-struct-attr|copy-structure-attribute=TARGET:SOURCELIST *
copy structural attributes from a preceding (enclosing) structure.
TARGET is the name of the structure to which attributes are to be
copied and SOURCELIST is a semicolon-separated list of items of
the form SOURCESTRUCT/ATTRLIST, where SOURCESTRUCT is the source
structure and ATTRLIST is a comma-separated list of the names of
attributes in SOURCESTRUCT to be copied, or "*" for all
attributes. For example, the value
"sentence:paragraph/type,speaker" specifies that the values of the
attributes type and speaker of the preceding (enclosing) paragraph
structure are added to the attributes of a sentence structure,
named paragraph_type and paragraph_speaker. Copying attributes
takes place before omitting structures, so for example, paragraph
attributes may be copied to sentences before removing paragraphs.
Multiple attribute copy operations may be specified either by
listing them in the argument separated by spaces or by specifying
this option multiple times.
rename-struct-attr|rename-structure-attribute=STRUCT/SOURCE:TARGET *
rename in structure STRUCT attributes matching the (Perl) regular
expression SOURCE as TARGET. SOURCE needs to be mathced in full.
SOURCE may contain capture groups (...) and TARGET may reference
them as \$1, \$2 and so on. Attributes are renamed after copying
(see above), so you can rename copied attributes. Multiple
attribute rename operations may be specified either by listing
them in the argument separated by spaces or by specifying this
option multiple times.
omit-struct-attr|omit-structure-attribute=[STRUCT/][OMITLIST][![KEEPLIST]] *
Omit in structures STRUCT attributes whose names fully match a
(Python) regular expression in OMITLIST and do not match one in
KEEPLIST. OMITLIST and KEEPLIST may contain multiple regular
expressions separated by commas. This option can be specified
multiple times with different STRUCT values, for different
structures. If STRUCT/ is omitted, omit matching attributes from
all structures with no structure-specific value specified. A value
without OMITLIST but with !KEEPLIST adds a structure-specific list
of expressions to keep, overriding a non-structure-specific value.
A value with an "!" but no KEEPLIST keeps nothing in STRUCT. A
value with only STRUCT/ omits nothing in STRUCT. This option can
be used, for example, to omit from scrambled corpora attributes
that would reveal the original structure order.
@ Date information
corpus-date=DATE
use DATE as the date of all texts in the corpus; "unknown" if not
known
corpus-date-pattern=PATTERN
recognize corpus date information based on PATTERN of the form
"ELEM ATTR REGEX": extract date information from the attribute
ATTR of element (structural attribute) ELEM using the regular
expression REGEX. ELEM and ATTR may be "*" (any element or
attribute) or they may contain several attribute or element names
separated with vertical bars. REGEX may contain named groups
(subpatterns) in Python'"'"'s regular expressions Y, M and D,
which extract year, month and day; for example, "(?P<Y>[0-9]{4})"
(without the quotation marks) would recognize a year (although
this particular case is also covered by the default pattern).
REGEX may also cover both the start and end date, in which case
the subpatterns for the start date are Y1, M1 and D1, and those
for the end date, Y2, M2 and D2. If REGEX does not contain named
subpatterns, recognize the first group as the start date and the
possible second group as the end date.
corpus-date-full-order=ORDER
recognize full dates in the order ORDER (one of "ymd", "dmy",
"mdy")
corpus-date-ranges
make the patterns recognize date ranges with different start and
end days
@ Output data
no-lemmas-without-boundaries|skip-lemmas-without-boundaries \
!lemmas_without_boundaries
do not add lemmas without compound boundaries
no-lemgrams|skip-lemgrams !lemgrams
do not add lemgrams
no-wordpicture|skip-wordpicture !wordpicture
do not extract word picture relations database tables
no-name-attrs|no-name-attributes|skip-name-attrs|skip-name-attributes \
!name_attrs
do not add named-entity information based on a NER tag as the last
positional attribute
remake-wordpicture-data
force remaking word picture relations database tables; this option
is needed only if recreating word picture data that has been left
incomplete on a previous run
@ Packaging
no-package !make_package
do not create a corpus package
korp-frontend-dir=DIR "$korp_frontend_dir"
read Korp configuration files from DIR, to be included in corpus
package
package-readme-file|readme-file=FILE
include FILE as a top-level read-me file in the corpus package;
FILE may contain shell wildcards (but braces are not expanded)
package-doc-dir|doc-dir=DIR
include DIR as a documentation directory "doc" in the corpus
package
package-doc-file|doc-file=FILE
include FILE as a documentation file in directory "doc" in the
corpus package; FILE may contain shell wildcards
package-script-dir|script-dir=DIR
include DIR as a (conversion) script directory "scripts" of
the corpus package
package-script-file|script-file=FILE
include FILE as a (conversion) script file in directory "scripts"
of the corpus package; FILE may contain shell wildcards
package-extra-dir|extra-dir=SRCDIR[:DSTDIR]
include directory SRCDIR in the corpus package; if :DSTDIR is
specified, the directory is renamed as DSTDIR in the package
package-extra-file|extra-file=SRCFILE[:DSTFILE]
include file SRCFILE in the corpus package; if :DSTFILE is
specified, the file is renamed as DSTFILE in the package; if
DSTFILE ends in a slash or if SRCFILE contains wildcards, DSTFILE
is considered a directory name and SRCFILE is placed in that
directory in the package
@ Database import
import-database
import the database TSV files into the Korp MySQL database
'
# Option name for specifying a configuration file; set before sourcing
# korp-lib.sh (presumably used by its option processing -- TODO
# confirm)
config_file_optname=config-file
# Load the shared function library from the directory of this script
. $progdir/korp-lib.sh
# cleanup_on_exit=
# Paths of the helper scripts and tools used by the processing stages:
# scripts in $progdir (the directory of this script) and VRT tools in
# $vrttoolsdir
vrt_rename_struct_attrs=$progdir/vrt-rename-struct-attrs.pl
vrt_fix_attrs=$progdir/vrt-fix-attrs.py
vrt_add_lemma_nobound=$vrttoolsdir/vrt-add-lemma-nobound
vrt_add_lemgrams=$progdir/vrt-add-lemgrams.py
vrt_sort=$vrttoolsdir/vrt-sort
vrt_add_id=$vrttoolsdir/vrt-add-id
vrt_convert_chars=$vrttoolsdir/vrt-convert-chars
vrt_extract_timespans=$progdir/vrt-extract-timespans.py
vrt_extract_seed=$vrttoolsdir/vrt-extract-seed
vrt_list_struct_attrs=$progdir/vrt-list-struct-attrs.py
korp_convert_timedata=$progdir/korp-convert-timedata.sh
vrt_drop_attrs=$vrttoolsdir/vrt-drop-attrs
vrt_scramble=$vrttoolsdir/vrt-scramble
cwbdata_extract_info=$progdir/cwbdata-extract-info.sh
vrt_extract_lemgrams=$progdir/vrt-extract-lemgrams.sh
run_extract_rels=$progdir/run-extract-rels.sh
vrt_add_name_attrs=$progdir/vrt-add-name-attrs.sh
korp_make_corpus_package=$progdir/korp-make-corpus-package.sh
korp_mysql_import=$progdir/korp-mysql-import.sh
# Command (with fixed options) for converting CWB data to VRT written
# to standard output
cwbdata2vrt="$progdir/cwbdata2vrt-simple.sh --all-attributes --output-file=-"
# CWB programs in $cwb_bindir and $cwb_perl_bindir
cwb_encode=$cwb_bindir/cwb-encode
cwb_describe_corpus=$cwb_bindir/cwb-describe-corpus
cwb_make=$cwb_perl_bindir/cwb-make
# The VRT file to be processed; if still empty after option
# processing, a default name is assigned further below
vrt_file=
# Complete, unscrambled VRT file with no structures removed
vrt_file_full=
# Options for sorting text elements, accumulated by add_text_sort_opt
text_sort_opts=
# Non-empty if text elements should be sorted (set by
# add_text_sort_opt when a --key option is added)
sort_texts=
# Options for adding lemgrams, accumulated by add_lemgram_opt
add_lemgrams_opts=
# Default structural attribute id formats, keyed by structure name.
# Individual entries can be replaced or added with
# add_struct_id_format. NOTE(review): the placeholders in braces are
# presumably interpreted by the tool adding the ids (vrt-add-id) --
# confirm.
declare -A struct_id_format=(
# Standard structures
[text]="t-{hash:.8}-{id}"
[paragraph]="p-{hash:.8}-{idnum[text]}-{id}"
[sentence]="s-{hash:.8}-{idnum[text]}-{id}"
# Structures used in some corpora
[link]="l-{hash:.8}-{idnum[text]}-{id}"
[clause]="c-{hash:.8}-{idnum[text]}-{idnum[sentence]}-{id}"
[chapter]="ch-{hash:.8}-{idnum[text]}-{id}"
[utterance]="u-{hash:.8}-{idnum[text]}-{id}"
)
# Append an option with a converted value to $auth_opts.
# Arguments: $1 - value type: the value is converted by calling the
#                 function make_<type>
#            $2 - name of the option to append
#            $3 - the unconverted option value
add_auth_opts () {
    local type=$1 opt=$2 val=$3
    # eval is needed as the name of the converter function depends on
    # $type
    val=$(eval "make_$type \$val")
    # Pass on the exit status of the converter function
    exit_if_error $?
    auth_opts+=" $opt $val"
}
# Append option $1 with argument $2 (quoted with quote_args_safe) to
# $text_sort_opts. Adding the option --key also turns on the sorting
# of texts by setting $sort_texts.
add_text_sort_opt () {
    local name=$1 arg=$2
    case $name in
        --key )
            sort_texts=1
            ;;
    esac
    text_sort_opts+=" $name $(quote_args_safe "$arg")"
}
# Append option $1 with argument $2 to $add_lemgrams_opts.
add_lemgram_opt () {
    local name=$1 arg=$2
    # The argument is added unquoted, so it may not contain spaces,
    # but they should not occur in these arguments
    add_lemgrams_opts+=" $name $arg"
}
# Set the id format of a structure in the associative array
# struct_id_format. The argument $1 is of the form STRUCT:FORMAT: the
# structure name is the part before the first colon and the format
# everything after it.
add_struct_id_format () {
    local spec=$1
    local name=${spec%%:*} fmt=${spec#*:}
    struct_id_format[$name]=$fmt
}
# Process options
eval "$optinfo_opt_handler"
# If the corpus name was not set by the options, take it from the
# first non-option argument
if [ "x$corpus" = "x" ]; then
if [ "x$1" = "x" ]; then
error "No corpus name specified"
fi
corpus=$1
shift
fi
# Set in check_input_attrs
preprocess_posattrs=
initial_vrt_posattrs=
# Choose the first processing stage: either generate the input VRT
# from existing CWB data (which implies augmenting data and excludes
# --force) or combine the input files
if [ "x$generate_input" != x ]; then
if [ "x$force" != x ]; then
error "You cannot specify both --force and --generate-input-from-data"
fi
augment_data=1
stage1_fn=generate_input
stage1_descr="Generating input VRT file from CWB data"
else
stage1_fn=combine_input
stage1_descr="Combining input files"
fi
# Augmenting data requires that the corpus already exists and it
# cannot be combined with --force
if [ "x$augment_data" != x ]; then
if [ "$(list_corpora --on-error : "$corpus")" != "$corpus" ]; then
error "Corpus $corpus not found; cannot augment corpus data"
fi
if [ "x$force" != x ]; then
error "You cannot specify both --force and --augment-data"
fi
fi
# If logging is enabled, copy both standard output and standard error
# to the log file in addition to their usual destinations
if [ "x$logging" != x ]; then
# By default, the log file is created in the directory "log" under
# the corpus root, with the program name, corpus name and a timestamp
# in the file name
if [ "x$logfile" = x ]; then
if [ ! -e "$corpus_root/log" ]; then
mkdir_perms $corpus_root/log
fi
logfile=$corpus_root/log/${progname}_${corpus}_$(date +'%Y%m%d%H%M%S').log
fi
# http://stackoverflow.com/questions/3173131/redirect-copy-of-stdout-to-log-file-from-within-bash-script-itself
# Create the log file as empty (or truncate an existing one)
cat < /dev/null > $logfile
ensure_perms $logfile
# tee -a appends to the log file and -i makes tee ignore interrupt
# signals
exec > >(tee -ia $logfile)
exec 2> >(tee -ia $logfile >&2)
echo_verb "Logging output to $logfile"
fi
# Check the structures to be omitted and convert the list of their
# names to a regular expression matching their start and end tags
if [ "x$omit_structures" != x ]; then
if word_in "text" "$omit_structures" ||
word_in "sentence" "$omit_structures"
then
error "You cannot omit text or sentence structures"
fi
# Use echo to get single spaces between structures to be omitted
omit_structures=$(echo $omit_structures)
# Convert to a regexp for grep -Ev
# The regexp matches a start or end tag of any of the structures at
# the beginning of a line, followed by a space or ">"
omit_structures="^</?(${omit_structures// /|})[ >]"
fi
# Normalize and check the compound boundary removal algorithm name:
# convert it to lowercase and map the alias names to the actual
# algorithm name.
compound_boundary_alg=${compound_boundary_alg,,}
case $compound_boundary_alg in
    old | legacy )
        # "legacy" is the name advertised in the error message below,
        # so accept it in addition to "old" as an alias for
        # "simple-omorfi"
        compound_boundary_alg=simple-omorfi
        ;;
    omorfi | naive | simple-omorfi )
        # Use the value as is
        :
        ;;
    * )
        error 'Invalid algorithm in --remove-compound-boundary-algorithm: allowed values are "omorfi", "legacy" and "naive".'
        ;;
esac
echo_verb "Running: $cmdline_orig"
echo_verb "Processed arguments: $cmdline_args_processed"
# The remaining command-line arguments are the input files
input_files=( "$@" )
# Replace the placeholder CORPUS with the corpus name in the VRT and
# TSV directory names and use subdirectories of the corpus root if the
# directories have not been specified
vrt_subdir=${vrt_subdir//CORPUS/$corpus}
tsv_subdir=${tsv_subdir//CORPUS/$corpus}
vrtdir=${vrtdir:-$corpus_root/$vrt_subdir}
vrtdir=${vrtdir//CORPUS/$corpus}
tsvdir=${tsvdir:-$corpus_root/$tsv_subdir}
# In $tsvdir, also the first CORPUS_ROOT is replaced with the corpus
# root directory; this needs to be done before replacing CORPUS
tsvdir=${tsvdir/CORPUS_ROOT/$corpus_root}
tsvdir=${tsvdir//CORPUS/$corpus}
datadir=$corpus_root/data/$corpus
# Error messages from creating the directories are discarded
mkdir_perms $vrtdir $tsvdir 2> /dev/null
if [ "x$vrt_file" = "x" ]; then
# If augmenting data and not generating input from CWB data, do not
# use a possible existing VRT file but a temporary one. Another
# option might be to have an option for ignoring an existing VRT file
# and to write the augmented VRT file to $corpus.augm.vrt, for
# example.
if [ "x$augment_data" != x ] && [ "x$generate_input" = x ]; then
vrt_file=$tmp_prefix.$corpus.vrt
else
vrt_file=$vrtdir/$corpus.vrt
vrt_file_full=$vrtdir/$corpus-complete.vrt
fi
fi
# Files for keeping track of the stages run (see run_stage) and the
# options used (see check_completed) across runs, and the random seed
# file
stages_file=$vrtdir/$corpus.stages
opts_file=$vrtdir/$corpus.opts
seed_file=$vrtdir/$corpus.seed
# Should the information from a previous run be checked
check_prev_run=
# Option to be passed on for verbose output
verbose_opt=
if [ "x$verbose" != x ]; then
verbose_opt=--verbose
fi
# Token counts of the input VRT and the existing corpus data; set in
# combine_input
input_token_count=0
existing_token_count=0
# Remove the data of the corpus $corpus produced on previous runs: the
# CWB data files and registry file, the database TSV files, the
# (possibly compressed) VRT file and the files recording the completed
# stages, the options used and the random seed.
remove_existing_data () {
    # The braces in ${corpus}_ are essential: $corpus_ would refer to
    # the (unset) variable "corpus_", making the glob match the TSV
    # files of all the corpora in $tsvdir.
    rm -f -- "$datadir"/* "$cwb_regdir/$corpus" \
       "$tsvdir/${corpus}"_*.tsv.gz \
       "$vrtdir/$corpus.vrt" "$vrtdir/$corpus.vrt.gz" \
       "$stages_file" "$opts_file" "$seed_file"
}
# Run the command given as the arguments. Before running, the command
# line is passed to "verbose" for printing, with the output directed
# to the file descriptor $top_stdout (presumably printed only in
# verbose mode -- TODO confirm from korp-lib.sh). The return status is
# that of the command run.
run_cmd () {
verbose printf " Running: " >&$top_stdout
verbose echo_quoted "$@" >&$top_stdout
"$@"
}
# Filter the VRT file $vrt_file through the command given as the
# arguments: run the command (via run_cmd) with $vrt_file as its
# standard input and $vrt_file.new as its standard output. If the
# command succeeds, replace $vrt_file with the new file using
# replace_file; otherwise return the status of the failed command and
# leave $vrt_file intact.
process_vrt () {
    run_cmd "$@" < $vrt_file > $vrt_file.new || return
    replace_file $vrt_file $vrt_file.new
}
# Run the command given as the arguments via time_cmd, passing it an
# output format for reporting the CPU time used. NOTE(review): the
# meaning of the format specifiers %U and %R is defined by time_cmd,
# which is not defined in this part of the file -- confirm there.
time_stage () {
time_cmd --format "- CPU time used: %U %R" "$@"
}
# Abort with an error message if the log file $logfile contains lines
# beginning with "[Errno N", as written by subprocesses for system
# errors. The matching lines are saved to the file
# $tmp_prefix.subproc_error and included in the error message.
check_errors_from_log () {
    local errfile=$tmp_prefix.subproc_error
    # FIXME: Searching the log file for system error messages is a
    # somewhat kludgy way of catching "Disk quota exceeded" (and
    # possibly other similar) system errors. In particular, the error
    # itself might prevent the process from writing the message to the
    # log file. The subprocesses should notice the errors and exit
    # with an error status. Or can they do that?
    grep '^\[Errno [0-9]' "$logfile" > $errfile 2> /dev/null || return 0
    error "Aborting because of an error: $(cat $errfile)"
}
# Decide what to do with the data possibly left by a previous run:
# either remove it (remove_existing_data) or mark it to be used by
# setting $check_prev_run.
check_for_existing_data () {
    # With --force, always start from scratch
    if [ "x$force" != x ]; then
        remove_existing_data
        return
    fi
    # Uncompress a readable, non-empty compressed VRT file
    if [ -r $vrt_file.gz ] && [ -s $vrt_file.gz ]; then
        gunzip $vrt_file.gz
    fi
    # The existing data is used unless all of the following hold: a
    # readable, non-empty VRT file exists, data is not being
    # augmented, input files have been specified and file_newer fails
    # for the VRT file with respect to the input files (that is, some
    # of the specified input files is newer than the existing VRT
    # file). In that case, the existing data is removed.
    if [ ! -r $vrt_file ] || [ ! -s $vrt_file ] ||
           [ "x$augment_data" != x ] ||
           [ "${#input_files[@]}" -le 0 ] ||
           file_newer $vrt_file "${input_files[@]}";
    then
        check_prev_run=1
        echo_verb "Using existing data from previous runs"
    else
        remove_existing_data
    fi
}
# Check if korp-make has previously been run with the same effective
# options, which allows skipping some or all of the stages. Return 0
# if all the stages were completed on the previous run, 1 otherwise.
# Unless returning 0, (re)write the effective options to $opts_file.
check_completed () {
    local effective_args=$(get_effective_cmdline_args)
    # CHECK: Should the data from the previous runs be handled
    # differently if using one of the options --augment-data,
    # --generate-input or --remake-wordpicture-data?
    if [ "x$check_prev_run" != x ] &&
           [ -e "$opts_file" ] && [ -e "$stages_file" ]
    then
        if [ "$(cat "$opts_file")" != "$effective_args" ]; then
            # The options differ from those of the previous run, so
            # forget the previous run.
            # TODO: Check by option and stage if the option affects
            # the stage
            rm "$opts_file" "$stages_file"
        elif [ "$(tail -n1 "$stages_file")" = "Completed" ]; then
            # Same options and all the stages completed
            return 0
        fi
        # Otherwise the options are the same but the previous run was
        # left incomplete, so the script can continue from where it
        # was left
    fi
    safe_echo "$effective_args" > "$opts_file"
    return 1
}
# Output the command-line arguments that affect the processing of the
# data: the processed arguments $cmdline_args_processed with the first
# occurrence of each option not affecting the result removed.
get_effective_cmdline_args () {
    local -a ignored_opts=(
        --force --times --quiet --verbose --no-logging
        --remake-wordpicture-data
    )
    local opt
    # A space is prepended so that an option at the very beginning is
    # also preceded by a space
    local args=" $cmdline_args_processed"
    for opt in "${ignored_opts[@]}"; do
        args=${args/ $opt/}
    done
    # Output without the leading space
    safe_echo "${args# }"
}
# Run a single stage function (name, $1) after printing the
# description (descr, the rest of the arguments), and record the stage
# in $stages_file. An empty name does nothing. A leading "*" in the
# name marks the stage to be run even if $stages_file shows it as
# processed on a previous run; otherwise such a stage is skipped when
# $check_prev_run is set. If function test_skip_$name is defined and
# its output is non-empty, skip the stage and record the output as the
# reason. If test_skip_$name writes anything to standard error, exit
# the script with a non-zero status.
run_stage () {
    local name=$1
    if [ "x$name" = x ]; then
        return
    fi
    shift
    local descr="$@"
    local run_always prev_run prev_msg msg exitstat
    if [ "${name#\*}" != "$name" ]; then
        run_always=1
        name=${name#\*}
    fi
    if [ "x$check_prev_run" != x ] && [ "x$run_always" = x ]; then
        # -s: no error message if the stages file does not exist
        prev_run="$(grep -s "^$name:" $stages_file)"
        if [ "x$prev_run" != x ]; then
            prev_msg=${prev_run#*: }
            echo_verb "(Skipping ${descr,}: $prev_msg on a previous run)"
            return
        fi
    fi
    if type -t "test_skip_$name" > /dev/null; then
        # Note that the test function is run in a subshell, so it
        # cannot modify the variables of this shell
        msg=$(test_skip_$name 2> $tmp_prefix.errmsg)
        exitstat=$?
        # Exit with error if the test function (or the programs it
        # runs) outputs something to stderr. An alternative would be [
        # $? != 0 ], but that would require adding "return 0" to many
        # of the test_skip_ functions.
        if [ -s $tmp_prefix.errmsg ]; then
            cat $tmp_prefix.errmsg >&2
            # The test function may have returned 0 even though it
            # wrote an error message, so make sure that the exit
            # status indicates an error
            if [ "$exitstat" = 0 ]; then
                exitstat=1
            fi
            exit $exitstat
        fi
        if [ "x$msg" != "x" ]; then
            echo_verb "(Skipping ${descr,}: $msg)"
            echo "$name: $msg" >> $stages_file
            return
        fi
    fi
    echo_verb "$descr"
    time_stage exit_on_error $name
    check_errors_from_log
    echo "$name: done" >> $stages_file
}
# Run all the stages in the array $stages sequentially, unless
# check_completed tells that all of them were already completed on the
# previous run. The array contains alternating stage function names
# and stage descriptions. After the last stage, mark the processing as
# completed in $stages_file.
run_stages () {
    # The number of elements does not change even though a stage may
    # fill in the empty elements of the array
    local stagecnt=${#stages[*]}
    local i
    if check_completed; then
        echo_verb "All processing stages completed on the previous run; use --force to force rerunning"
        return
    fi
    for ((i = 0; i < stagecnt; i += 2)); do
        # The stage name must be quoted: a leading "*" marks a stage
        # to be run always and it must not be expanded as a file name
        # pattern. An empty name is passed as an empty argument, for
        # which run_stage does nothing.
        run_stage "${stages[i]}" "${stages[i + 1]}"
    done
    echo "Completed" >> "$stages_file"
}
# Stage functions and their descriptions: alternating elements with
# the name of a stage function followed by the description to be
# shown when running the stage. The stages are run in this order by
# run_stages.
stages=(
# Stage 1 is either generate_input or combine_input, depending on
# the options
"$stage1_fn" "$stage1_descr"
# A leading "*" marks the stage to be run always, even if it had
# been completed on a previous run
"*check_input_attrs" "Checking input attributes"
# Stage 3 information set in check_input_attrs if needed
# (check_input_attrs fills in the first empty element and the one
# following it)
"" ""
extract_random_seed "Extracting random seed"
copy_struct_attrs "Copying structural attributes"
rename_struct_attrs "Renaming structural attributes"
add_struct_ids "Adding structure ids"
add_lemmas_without_boundaries "Adding lemmas without compound boundaries"
add_lemgrams "Adding lemgrams"
add_datefromto "Adding datefrom and dateto"
sort_texts "Sorting text elements"
save_full_vrt "Saving complete VRT file"
omit_structures "Omitting structures"
omit_struct_attrs "Omitting structural attributes"
scramble_structs "Scrambling structures"
cwb_encode "Encoding the attributes for CWB"
cwb_make "Indexing and compressing the CWB data"
convert_timedata "Converting and augmenting time data"
extract_info "Extracting information for the .info file"
extract_lemgrams "Extracting lemgrams for the database"
extract_wordpict_rels
"Extracting word picture relations for the database"
add_name_attrs "Adding name attributes"
adjust_posattrs_comment "Adjusting or adding VRT positional-attributes comment"
make_corpus_package "Creating corpus package"
import_database "Importing data to the MySQL database"
)
# Stage function: combine the input files $input_files to the single
# VRT file $vrt_file and set $input_token_count to its token count.
# It is an error if the input contains no tokens or, when augmenting
# data, if the token count differs from that of the existing corpus
# data (set to $existing_token_count).
combine_input () {
    # Empty lines in the input VRT are dropped, so that the number of
    # tokens does not differ from that of the already encoded
    # attributes (assuming that cwb-encode was told to skip empty
    # lines).
    comprcat "${input_files[@]}" | grep -v '^$' > $vrt_file
    input_token_count=$(vrt_get_token_count "$vrt_file")
    case $input_token_count in
        0 )
            error "No tokens in the input"
            ;;
    esac
    echo_verb " $input_token_count tokens in the input VRT"
    if [ "x$augment_data" = x ]; then
        return
    fi
    # When augmenting data, the input must have exactly as many tokens
    # as the existing corpus
    existing_token_count=$(get_corpus_token_count $corpus)
    if [ "$input_token_count" != "$existing_token_count" ]; then
        error "The number of tokens in the input ($input_token_count) differs from that in the existing corpus data ($existing_token_count)"
    fi
}
# Test if combining input files can be skipped: output the reason for
# skipping if a readable, non-empty VRT file $vrt_file already
# exists, and set $input_token_count from it.
# NOTE(review): run_stage calls the test_skip_ functions in a command
# substitution, so the value assigned to input_token_count here does
# not seem to be visible to the caller -- confirm.
test_skip_combine_input () {
    # If the other conditions for not combining the input have been
    # met, check_for_existing_data has already removed $vrt_file
    if [ ! -r $vrt_file ] || [ ! -s $vrt_file ]; then
        return 0
    fi
    echo "using existing VRT file $vrt_file"
    input_token_count=$(vrt_get_token_count "$vrt_file")
}
# Stage function: determine the names of the positional attributes of
# the input VRT ($initial_input_attrs, copied to $input_attrs at the
# end), based on the positional attributes comment in the VRT file
# and the options, and check that their number matches that in the
# VRT. If attributes need to be filtered out or reordered, set
# filter_and_reorder_posattrs as the next stage by filling in the
# first empty element pair of the array $stages and set
# $preprocess_posattrs accordingly.
check_input_attrs () {
local next_stage_fn=
local next_stage_descr=
local next_stage_descr2=
local next_stage_idx=
local attrnum_word=
local attrcount=
local vrt_attrcount=
# Positional attribute names listed in the VRT file, empty if none
initial_vrt_posattrs=$(vrt_get_posattr_names $vrt_file)
if [ "x$initial_vrt_posattrs" != x ]; then
if [ "x$override_vrt_attrs" = x ]; then
initial_input_attrs=$initial_vrt_posattrs
# The lex attribute in the positional attributes comment
# might lack the final slash, but lex (lemgram) is always
# a feature-set attribute, so add it if needed.
initial_input_attrs=$(suffix_word "$initial_input_attrs" lex /)
verbose safe_echo "Using positional attributes named in the input VRT: $initial_input_attrs"
# When overriding, warn only if the attributes in the VRT differ
# from the specified ones, disregarding a possible initial "word" in
# the former
elif [ "$initial_vrt_posattrs" != "$initial_input_attrs" ] &&
[ "$initial_vrt_posattrs" != "word $initial_input_attrs" ];
then
warn "Overriding positional attributes \"$initial_vrt_posattrs\" in the input VRT with \"$initial_input_attrs\""
fi
else
verbose safe_echo "No positional attributes named in the input VRT; using those listed with --input-attributes: $initial_input_attrs"
fi
# FIXME: Testing for UD attributes (and setting lemgramp_posmap
# and wordpict_relmap accordingly) does not work if using an
# existing VRT file with the attributes already renamed, for
# example, if later adding lemgram or word picture data
if has_only_ud_attrs $initial_input_attrs; then
initial_input_attrs=$(rename_ud_attrs $initial_input_attrs)
verbose safe_echo "Removing suffix _ud[12]? from positional attribute names, as no corresponding attributes without the suffix; modified input attributes: $initial_input_attrs"
# Switch to the UD-specific default mapping files and compound
# boundary marker, but the mapping files only if they have their
# (non-UD) default values
if word_in "pos" "$initial_input_attrs" &&
[ "$lemgram_posmap" = "$lemgram_posmap_default" ]
then
lemgram_posmap=$lemgram_posmap_ud_default
verbose safe_echo "Using the default UD lemgram part-of-speech mapping file $lemgram_posmap_ud_default"
fi
if word_in "deprel" "$initial_input_attrs" &&
[ "$wordpict_relmap" = "$wordpict_relmap_default" ]
then
wordpict_relmap=$wordpict_relmap_ud_default
verbose safe_echo "Using the default UD word picture relation mapping file $wordpict_relmap_ud_default"
fi
if [ "x$lemmas_without_boundaries" != x ] &&
word_in "lemma" "$initial_input_attrs"
then
compound_boundary_marker=$compound_boundary_marker_ud_default
verbose safe_echo "Using the default UD compound boundary marker \"$compound_boundary_marker_ud_default\""
fi
fi
# When augmenting data, the attributes are not reordered and "word"
# is not added
if [ "x$augment_data" != x ]; then
keep_attr_order=1
no_word_attr=1
fi
# The value is compared with -1 below to test if "word" is missing
attrnum_word="$(word_index word $initial_input_attrs)"
if [ "x$no_word_attr" = x ] && [ "$attrnum_word" = "-1" ]; then
initial_input_attrs="word $initial_input_attrs"
fi
attrcount=$(count_words $initial_input_attrs)
vrt_attrcount=$(vrt_get_posattr_count $vrt_file)
if [ "$attrcount" != "$vrt_attrcount" ]; then
error "Error: the input VRT has $vrt_attrcount positional attributes, but $attrcount were specified"
fi
# Filtering is needed if an attribute is named "_", if the attribute
# list contains a hyphen or if attributes to be omitted have been
# specified
if word_in _ "$initial_input_attrs" ||
[ "${initial_input_attrs#*-}" != "$initial_input_attrs" ] ||
[ "x$omit_attrs" != x ]
then
next_stage_fn=filter_and_reorder_posattrs
next_stage_descr="Filtering out positional attributes with name \"_\" or starting with \"-\" or specified with --omit-attributes"
preprocess_posattrs=filter
fi
# Reordering is needed if "word" is present but not the first
# attribute, unless the attribute order is to be kept
if [ "x$keep_attr_order" = x ] &&
[ "$attrnum_word" != "-1" ] && [ "$attrnum_word" != "1" ]
then
next_stage_fn=filter_and_reorder_posattrs
next_stage_descr2='moving "word" to be the first positional attribute'
if [ "x$next_stage_descr" != x ]; then
next_stage_descr="$next_stage_descr, and $next_stage_descr2"
else
# Capitalize the first letter
next_stage_descr=${next_stage_descr2^}
fi
preprocess_posattrs="$preprocess_posattrs reorder"
fi
# filter_and_reorder_posattrs adds a positional attributes comment
# if it is missing, so if it will be run, do not do it here.
if [ "x$next_stage_fn" = x ] && [ "x$initial_vrt_posattrs" = x ]; then
process_vrt vrt_replace_posattr_names "$initial_input_attrs"
initial_vrt_posattrs=$initial_input_attrs
fi
# Fill in the empty stage slot with the next stage function and its
# description (both empty if no filtering or reordering is needed)
next_stage_idx=$(first_empty_elem_index "${stages[@]}")
stages[$next_stage_idx]=$next_stage_fn
stages[$(($next_stage_idx + 1))]=$next_stage_descr
input_attrs=$initial_input_attrs
}
# Return 0 if the attribute names given as arguments contain at least
# one name with suffix _ud, _ud1 or _ud2 and none of the suffixed
# names has a corresponding name without the suffix among the
# arguments; otherwise return 1.
# TODO: Also return false if multiple different _ud attributes,
# e.g. _ud1, _ud2
has_only_ud_attrs () {
    local attrs attr found_ud
    found_ud=
    attrs="$@"
    for attr in $attrs; do
        if ! str_hassuffix "$attr" "_ud[12]" &&
               ! str_hassuffix "$attr" "_ud"
        then
            continue
        fi
        found_ud=1
        # The base name is the part preceding the last _ud
        if word_in "${attr%_ud*}" "$attrs"; then
            return 1
        fi
    done
    [ "x$found_ud" != x ]
}
# Output the attribute names given as arguments, separated by spaces,
# with a possible suffix _ud, _ud1 or _ud2 removed from each name.
rename_ud_attrs () {
    local result=
    local name
    for name in "$@"; do
        name=${name%_ud[12]}
        name=${name%_ud}
        # Add a separating space only if the result is non-empty
        result="${result:+$result }$name"
    done
    echo "$result"
}
# Output the zero-based index of the first empty argument. The
# function is meant to be called with the elements of an array as its
# arguments. If none of the arguments is empty, output nothing.
first_empty_elem_index () {
    local idx=0
    local elem
    for elem in "$@"; do
        if [ -z "$elem" ]; then
            echo $idx
            return
        fi
        idx=$((idx + 1))
    done
}
filter_and_reorder_posattrs () {
local attrs_names=