-
Notifications
You must be signed in to change notification settings - Fork 3
/
korp-make-corpus-package.sh
executable file
·878 lines (781 loc) · 25.7 KB
/
korp-make-corpus-package.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
#! /bin/bash
# -*- coding: utf-8 -*-
# Make a Korp corpus installation and archive package from CWB corpus
# files and Korp MySQL database information
# TODO:
# - Include possible fulltext files, and modify korp-install-corpora
# to install them.
# - Check that the CWB data files include all the necessary indices
# produced by cwb-make. Possibly also other sanity checks for the
# corpus.
# - Working --verbose (or --quiet).
# - Update MySQL dumps if older than the database files (or if an
# option is specified).
# - An option to generate MySQL dumps from the database, complementary
# to the --export-database.
# - --export-database: Generate TSV files only if they do not exist or
# if they are older than corpus data files.
# - An option to generate both a CWB data package and a VRT package
# with a single command.
#
# FIXME:
# - Finding the most recent database files from either SQL or TSV
# files does not work correctly; see FIXME comments in the code.
# - {corpid} does not work in the filename part of an extra VRT file.
# - Directory name transformation does not seem to work for ../dir.
# - If the TSV or SQL directory contains files for the same table
# compressed with different programs, they are all included, instead
# of only the newest ones.
progname=`basename $0`
progdir=`dirname $0`
usage_header="Usage: $progname [options] corpus_name [corpus_id ...]
Make an archive package for corpus corpus_name, containing the Korp
corpora corpus_id ... (or corpus_name if corpus_id not specified).
corpus_id may contain shell wildcards, in which case all matching
corpora in the corpus registry are included."
optspecs='
@ Directory options
c|corpus-root=DIR "$corpus_root" { set_corpus_root "$1" }
use DIR as the root directory of corpus files for the source files
(CORPUS_ROOT)
target-corpus-root=DIR "CORPUS_ROOT"
use DIR as the root directory of corpus files for the target files
(to adjust paths in the corpus registry files)
p|package-dir=DIR "CORPUS_ROOT/$pkgsubdir" pkgdir
put the resulting package to a subdirectory CORPUS_NAME under the
directory DIR
r|registry=DIR "CORPUS_ROOT/$regsubdir" { set_corpus_registry "$1" }
use DIR as the CWB registry
s|sql-dir=DIRTEMPL "CORPUS_ROOT/$sqlsubdir/{corpid}" sqldir
use DIRTEMPL as the source directory template for Korp MySQL
dumps; DIRTEMPL is a directory name possibly containing
placeholder {corpname} for corpus name or {corpid} for corpus id
t|tsv-dir=DIRTEMPL "CORPUS_ROOT/$tsvsubdir/{corpid}" tsvdir
use DIRTEMPL as the source directory template for Korp MySQL TSV
data files
korp-frontend-dir=DIR "$korp_frontend_dir" { set_korp_frontend_dir "$1" }
read Korp configuration files from DIR
@ Options for VRT files
vrt-dir=DIRTEMPL "CORPUS_ROOT/$vrtsubdir/{corpid}" \
{ include_vrtdir=1; vrtdir=$1 }
use DIRTEMPL as the source directory template for VRT files
(*.vrt, *.vrt.*)
include-vrt-dir include_vrtdir
include the files in the (default) VRT directory in the package;
this option needs to be specified only if using the default VRT
directory
vrt-file=FILE * { include_vrt=1; add_extra_file "$1" vrt/{corpid}/ }
include FILE as a VRT file in directory "vrt/{corpname}" (or
("vrt/{corpid}" if the directory component of FILE contains
{corpid}) in the package; this option may be specified multiple
times, and FILE may contain shell wildcards
generate-vrt
generate a single VRT file for each corpus from the CWB corpus
data; include the file in the package but do not add it to the VRT
directory
update-vrt|generate-missing-vrt { update_vrt=1; include_vrtdir=1; }
update or generate a single VRT file from the CWB corpus data for
each corpus for which it is missing from the VRT directory or is
older than the CWB data of the corpus, and include the file in the
package
no-cwb-data omit_cwb_data
omit CWB data files from the package; this option requires
that VRT files are being included in the package
@ Options for corpus information
licence-type=LIC auth_opts { add_auth_opts licence_type $optname $1 }
set the corpus licence type to LIC, where LIC is one of PUB, ACA,
ACA-Fi or RES
lbr-id=URN { add_auth_opts lbr_id $optname $1 }
set the LBR id of the corpus to URN, which is of the form
[urn:nbn:fi:lb-]YYYYMMNNN[@LBR], where YYYYMM is year and month
and NNN 3 to 5 digits; the bracketed parts are added if left out
set-info=KEY:VALUE * { printf "%s\n" "$1" >> "$extra_info_file" }
set the corpus information item KEY (in the file .info) to the
value VALUE, where KEY is of the form [SECTION_]SUBITEM, where
SECTION can be "Metadata", "Licence" or "Compiler" and SUBITEM
"URL", "URN", "Name" or "Description"; this option can be repeated
multiple times
info-from-file=FILENAME { cat "$1" >> "$extra_info_file" }
read information items to be added from file FILENAME
@ Options for additional files
readme-file=FILE * { add_extra_file --prepend "$1" /; has_readme=1 }
include FILE as a top-level read-me file; the option may be
specified multiple times to include multiple files, and FILE may
contain shell wildcards (but braces are not expanded)
doc-dir=DIR { add_extra_dir "$1" doc/; has_docs=1 }
include DIR as a documentation directory "doc" in the package
doc-file=FILE * { add_extra_file "$1" doc/; has_docs=1 }
include FILE as a documentation file in directory "doc"; may be
specified multiple times, and FILE may contain shell wildcards
script-dir=DIR { add_extra_dir "$1" scripts/; has_scripts=1 }
include DIR as a (conversion) script directory "scripts" in the
package
script-file=FILE * { add_extra_file "$1" scripts/; has_scripts=1}
include FILE as a (conversion) script file in directory "scripts";
may be specified multiple times, and FILE may contain shell
wildcards
extra-dir=SRCDIR[:DSTDIR] * { add_extra_dir "$1" }
include directory SRCDIR in the package; if :DSTDIR is specified,
the directory is renamed as DSTDIR in the package; the option may
be specified multiple times
extra-file=SRCFILE[:DSTFILE] * { add_extra_file "$1" }
include file SRCFILE in the package; if :DSTFILE is specified, the
file is renamed as DSTFILE in the package; if DSTFILE ends in a
slash or if SRCFILE contains wildcards, DSTFILE is considered a
directory name and SRCFILE is placed in that directory in the
package; the option may be specified multiple times
@ Options controlling the output
f|database-format=FMT "auto" dbformat { set_db_format "$1" }
include database files in format FMT: either sql (SQL), tsv (TSV),
auto (SQL or TSV, whichever files are newer) or none (do not
include database files)
export-database export_db { export_db=1; set_db_format "tsv" }
export database data into TSV files to be packaged; implies
--database-format=tsv
z|compress=PROG "gzip" { set_compress "$1" }
compress files with PROG; "none" for no compression
'
usage_footer="Environment variables:
Default values for the various directories can also be specified via
the following environment variables: CORPUS_ROOT, TARGET_CORPUS_ROOT,
CORPUS_PKGDIR, CORPUS_REGISTRY, CORPUS_SQLDIR, CORPUS_TSVDIR, CORPUS_VRTDIR,
KORP_FRONTEND_DIR."
. $progdir/korp-lib.sh
# Uncomment to enable some debug output to stderr:
# debug=1
# These will be set later based on $corpus_root, which may be modified
# by options
target_corpus_root=$TARGET_CORPUS_ROOT
regdir=$CORPUS_REGISTRY
datadir=$CORPUS_DATADIR
sqldir=$CORPUS_SQLDIR
pkgdir=$CORPUS_PKGDIR
tsvdir=$CORPUS_TSVDIR
vrtdir=$CORPUS_VRTDIR
cwbdata_extract_info=$progdir/cwbdata-extract-info.sh
cwbdata2vrt=$progdir/cwbdata2vrt-simple.sh
korp_mysql_export="$progdir/korp-mysql-export.sh"
korp_make_auth_info=$progdir/korp-make-auth-info.sh
regsubdir=registry
datasubdir=data
sqlsubdir=sql
tsvsubdir=vrt
pkgsubdir=pkgs
vrtsubdir=vrt
include_vrt=
exclude_files="backup *~ *.bak *.bak[0-9] *.old *.old[0-9] *.prev *.prev[0-9]"
has_readme=
has_docs=
has_scripts=
archive_ext_none=tar
archive_ext_gzip=tgz
archive_ext_bzip2=tbz
archive_ext_xz=txz
archive_type_name=korp
sql_file_types="lemgrams rels timespans timedata timedata_date"
sql_file_types_multicorpus="lemgrams timespans timedata timedata_date"
sql_table_name_lemgrams=lemgram_index
rels_tables_basenames="@ rel head_rel dep_rel strings sentences"
frontend_config_files="config.js $(echo modes/{other_languages,parallel,swedish}_mode.js) translations/corpora-*.json"
extra_info_file=$tmp_prefix.info
touch $extra_info_file
corpus_files=
corpus_files_prepend=
extra_dir_and_file_transforms=
# Functions used in the option handler (directly or indirectly)
remove_leading_slash () {
printf '%s\n' "$1" | sed -e 's,^/*,,'
}
remove_trailing_slash () {
printf '%s\n' "$1" | sed -e 's,/*$,,'
}
dirname_slash () {
# dirname drops the last component even with a trailing slash, so
# add a @ to mark the file name after a slash. If the argument has
# no trailing slash, this works like dirname.
dirname "$1@"
}
is_dirname () {
case "$1" in
*/ )
return 0
;;
esac
return 1
}
add_transform () {
echo_dbg add_transform "$1" "$2"
extra_dir_and_file_transforms="$extra_dir_and_file_transforms
$1 $2"
}
add_corpus_files () {
local prepend any any_added _fname
# If --prepend is specified, prepend the file name to
# $corpus_files instead of appending.
prepend=
if [ "x$1" = "x--prepend" ]; then
prepend=1
shift
fi
# If --any is specified, warn on non-existent files only if none
# of the files specified as arguments is found.
if [ "x$1" = "x--any" ]; then
any=1
shift
fi
for _fname in "$@"; do
if ls $_fname > /dev/null 2>&1; then
if [ "x$prepend" != x ]; then
corpus_files_prepend="$corpus_files_prepend $_fname"
else
corpus_files="$corpus_files $_fname"
fi
any_added=1
elif [ "x$any" = x ]; then
warn "File or directory not found: $_fname"
fi
done
if [ "x$any" != x ] && [ "x$any_added" = x ]; then
warn "None of the files or directories found: $*"
fi
}
add_corpus_files_expand () {
# Add corpus files with "{corp}" in arguments expanded to each
# corpus id in turn
local fname corpus_id
for fname in "$@"; do
if [[ $fname = *{corp*}* ]]; then
for corpus_id in $corpus_ids; do
add_corpus_files "$(echo $(fill_dirtempl $fname $corpus_id))"
done
else
add_corpus_files "$(echo $fname)"
fi
done
}
has_wildcards () {
# FIXME: This does not take into account backslash-protected
# wildcards, which should not be counted as wildcards.
case "$1" in
*\** | *\?* | *\[* )
return 0
;;
esac
return 1
}
wildcards_to_regex () {
echo "$1" |
sed -e 's,\.,\\.,g; s,?,[^/],g; s,\*,[^/]*,g;'
}
set_korp_frontend_dir () {
if test_file -r $1/config.js warn \
"config.js not found or not accessible in directory $1; using the default Korp frontend directory $korp_frontend_dir"
then
korp_frontend_dir=$1
fi
}
set_db_format () {
case "$1" in
sql | SQL )
dbformat=sql
;;
tsv | TSV )
dbformat=tsv
;;
auto | automatic )
dbformat=auto
;;
none )
dbformat=none
;;
* )
warn "Invalid database format '$1'; using $dbformat"
;;
esac
}
set_compress () {
if [ "x$1" = "xnone" ] || which $1 &> /dev/null; then
if [ "x$1" = "xcat" ]; then
compress=none
else
compress=$1
fi
else
warn "Compression program $1 not found; using $compress"
fi
}
add_extra_dir_or_file () {
# Target ends in / => dir, transform the dir part of source;
# otherwise transform the whole source; source ends in / => dir
local prepend=
if [ "x$1" = "x--prepend" ]; then
prepend=$1
shift
fi
local source=$1
local target=$2
echo_dbg "** add_extra:param" "$source" "$target"
if [ "x$target" = x ]; then
case "$source" in
*:* )
target=$(echo "$source" | sed -e 's/^.*://')
source=$(echo "$source" | sed -e 's/:[^:]*$//')
# If the source contains wildcards, then the target
# should always be a directory.
if has_wildcards "$source"; then
target=$target/
fi
;;
* )
if has_wildcards "$source"; then
target=$(dirname_slash "$source")/
else
target=$source
fi
;;
esac
fi
local sourcedir=$(remove_leading_slash $(dirname_slash "$source"))
local targetdir=$(remove_leading_slash $(dirname_slash "$target"))
echo_dbg add_extra:dirs "$sourcedir" "$targetdir"
add_corpus_files $prepend $(remove_trailing_slash "$source")
source=$(remove_leading_slash "$source")
target=$(remove_leading_slash "$target")
echo_dbg add_extra:mods "$source" "$target"
if is_dirname "$target" || [ "x$targetdir" = x ]; then
local targetdir_slash=
if [ "x$targetdir" != x ]; then
targetdir_slash="$targetdir/"
fi
# Originally $targetdir = /
if [ "x$sourcedir" = x. ]; then
# Extra backslashes to protect them through echos
add_transform "\\\\($source\\\\)" "$targetdir_slash\\\\1"
elif is_dirname "$source"; then
add_transform "$sourcedir/" "$targetdir_slash"
# The following is now added in make_tar_transforms:
# add_transform "$sourcedir\$" "$targetdir"
else
local sourcefile=$(wildcards_to_regex $(basename "$source"))
add_transform \
"$sourcedir/\\\\($sourcefile\\\\)" "$targetdir_slash\\\\1"
fi
else
add_transform "$source" "$target"
fi
}
add_extra_file () {
add_extra_dir_or_file "$@"
}
add_extra_dir () {
local prepend=
if [ "x$1" = "x--prepend" ]; then
prepend=$1
shift
fi
local target=$2
if [ "x$target" != x ]; then
target="$target/"
fi
add_extra_dir_or_file \
$prepend "$(echo "$1" | sed -e 's,:,/:,; s,$,/,')" "$target"
}
add_auth_opts () {
local type opt val
type=$1
opt=$2
val=$3
val=$(eval "make_$type \$val")
exit_if_error $?
auth_opts="$auth_opts $opt $val"
}
# Process options
eval "$optinfo_opt_handler"
if [ "x$1" = "x" ]; then
error "No corpus name specified"
fi
target_corpus_root=${target_corpus_root:-$corpus_root}
pkgdir=${pkgdir:-$corpus_root/$pkgsubdir}
regdir=$(remove_trailing_slash $cwb_regdir)
datadir=$(remove_trailing_slash ${datadir:-$corpus_root/$datasubdir})
sqldir=$(remove_trailing_slash ${sqldir:-"$corpus_root/$sqlsubdir/{corpid}"})
tsvdir=$(remove_trailing_slash ${tsvdir:-"$corpus_root/$tsvsubdir/{corpid}"})
vrtdir=$(remove_trailing_slash ${vrtdir:-"$corpus_root/$vrtsubdir/{corpid}"})
if [ "x$include_vrtdir$generate_vrt" != "x" ]; then
include_vrt=1
fi
corpus_name=$1
shift
if [ ! -d "$regdir" ]; then
error "Cannot access registry directory $regdir"
fi
if [ "x$omit_cwb_data" != x ]; then
if [ "x$include_vrt" = x ]; then
error "You need to include VRT data when omitting CWB data."
fi
archive_type_name=vrt
fi
if [ "x$generate_vrt" != x ] && [ "x$update_vrt" != x ]; then
warn "Both --generate-vrt and --update-vrt specified; assuming --update-vrt"
generate_vrt=
fi
eval archive_ext=\$archive_ext_$compress
if [ "x$archive_ext" = x ]; then
archive_ext=tar.$compress
warn "Unrecognized compression program $compress: using package file name extension .$archive_ext"
fi
if [ "x$1" = "x" ]; then
corpus_ids=$corpus_name
else
corpus_ids="$(list_corpora "$@")"
fi
if [ "x$korp_frontend_dir" = "x" ]; then
warn "Korp frontend directory not found"
elif test_file -r $korp_frontend_dir/config.js warn \
"$korp_frontend_dir/config.js not found or not accessible; not including Korp configuration files";
then
for fname_patt in $frontend_config_files; do
add_corpus_files $(echo $korp_frontend_dir/$fname_patt)
done
fi
if [ "x$has_readme" = x ]; then
warn "No readme file included"
fi
if [ "x$has_docs" = x ]; then
warn "No documentation included"
fi
if [ "x$has_scripts" = x ]; then
warn "No conversion scripts included"
fi
generate_vrt () {
local corpus_id=$1
$cwbdata2vrt --all-attributes --output-file=- $corpus_id
}
if [ "x$generate_vrt" != x ]; then
mkdir -p $tmp_prefix.vrt
for corpus_id in $corpus_ids; do
vrt_file=$tmp_prefix.vrt/$corpus_id.vrt
generate_vrt $corpus_id > "$vrt_file"
add_corpus_files "$vrt_file"
add_transform "$vrt_file" vrt/$corpus_id/$corpus_id.vrt
done
fi
fill_dirtempl () {
dirtempl=$1
corpus_id=$2
echo "$dirtempl" |
sed -e "s,{corpname},$corpus_name,g; s,{corpid},$corpus_id,g"
}
vrt_file_is_uptodate () {
local corpus_id=$1
local vrt_file=$2
local newer_corpus_files
vrt_file=$(ls -t "$vrt_file" "$vrt_file".* 2> /dev/null | head -1)
if [ "x$vrt_file" = x ]; then
return 1
fi
newer_corpus_files=$(
find "$datadir"/$corpus_id -newer "$vrt_file" -name '[a-z]*')
[ "x$newer_corpus_files" = x ]
}
if [ "x$update_vrt" != x ]; then
for corpus_id in $corpus_ids; do
corpus_vrtdir=$(fill_dirtempl "$vrtdir" $corpus_id)
vrt_file=$corpus_vrtdir/$corpus_id.vrt
mkdir -p "$corpus_vrtdir"
if ! vrt_file_is_uptodate $corpus_id "$vrt_file"; then
# Would the following be safe?
# rm -f "$vrt_file" "vrt_file".*
generate_vrt $corpus_id > "$vrt_file"
if [ "x$compress" != "xnone" ]; then
$compress -f "$vrt_file"
fi
fi
done
fi
make_rels_table_names () {
corp_id_upper=`echo $1 | sed -e 's/\(.*\)/\U\1\E/'`
for base in $rels_tables_basenames; do
echo relations_${corp_id_upper}_$base |
sed -e 's/_@//'
done
}
run_mysqldump () {
extra_opts=
while true; do
case "$1" in
-* )
extra_opts="$extra_opts $1"
shift
;;
* )
break
;;
esac
done
mysqldump --no-autocommit $mysql_opts $extra_opts $korpdb "$@" 2> /dev/null
}
compress_or_rm_sqlfile () {
sqlfile=$1
if grep -q 'INSERT INTO' $sqlfile; then
if [ "x$compress" != "xnone" ]; then
$compress $sqlfile
fi
else
rm $sqlfile
fi
}
make_sql_table_part () {
corp_id=$1
corp_id_upper=`echo $corp_id | sed -e 's/\(.*\)/\U\1\E/'`
filetype=$2
eval tablename=\$sql_table_name_$filetype
if [ "x$tablename" = "x" ]; then
tablename=$filetype
fi
sqlfile=`fill_dirtempl $sqldir $corp_id`/${corp_id}_$filetype.sql
#
{
# Add a CREATE TABLE IF NOT EXISTS statement for the table, so
# that the package can be installed even on an empty database.
# By default, mysqldump (without --no-create-info) would first
# drop the database and then recreate it, but we need to
# retain the data for the other corpora.
run_mysqldump --no-data --compact $tablename |
sed -e 's/CREATE TABLE/& IF NOT EXISTS/'
echo
# Instruct to delete existing data for the corpus first
echo "DELETE FROM $tablename WHERE corpus='$corp_id_upper';"
echo
# The actual data dump
run_mysqldump --no-create-info --where="corpus='$corp_id_upper'" \
$tablename
} > $sqlfile
compress_or_rm_sqlfile $sqlfile
}
dump_database () {
corp_id=$1
rels_tables=`make_rels_table_names $corp_id`
sqldir_real=`fill_dirtempl $sqldir $corp_id`
run_mysqldump $rels_tables > $sqldir_real/${corp_id}_rels.sql
compress_or_rm_sqlfile $sqldir_real/${corp_id}_rels.sql
for filetype in $sql_file_types_multicorpus; do
make_sql_table_part $corp_id $filetype
done
}
get_corpus_date () {
(
cd $datadir
ls -lt --time-style=long-iso "$@" |
grep -A1 '^total' |
grep -E -v '^(total|--)' |
awk '{print $6}' |
sort -r |
head -1 |
tr -d '-'
)
}
# FIXME: list_existing_db_files_by_type, list_existing_db_files and
# list_db_files probably do not work as they were intended to. There
# should probably be a loop somewhere iterating over the different
# types of tables.
list_existing_db_files_by_type () {
# FIXME: Check the arguments and their use!
corp_id=$1
type=$2
db_filetype=$3
if [ "x$type" = "xtsv" ]; then
dir=$tsvdir
ext=.tsv
else
dir=$sqldir
ext=.sql
fi
dir=`fill_dirtempl $dir $corp_id`
# FIXME: Should the line below have $db_filetype instead of
# $filetype? Now this lists all files beginning with $corp_id and
# ending in $ext, which is probably why all this seems to work.
basename="$dir/${corp_id}_$filetype*$ext"
ls -t $basename $basename.gz $basename.bz2 $basename.xz 2> /dev/null
}
get_first_word () {
echo "$1"
}
list_existing_db_files () {
# FIXME: Check the arguments and their use!
corp_id=$1
type=$2
filenames_sql=`list_existing_db_files_by_type $corp_id $type sql`
filenames_tsv=`list_existing_db_files_by_type $corp_id $type tsv`
if [ "x$filenames_sql" != x ]; then
if [ "x$filenames_tsv" != x ]; then
firstfile_sql=`get_first_word $filenames_sql`
firstfile_tsv=`get_first_word $filenames_tsv`
if [ "$firstfile_sql" -nt "$firstfile_tsv" ]; then
echo "$filenames_sql"
else
echo "$filenames_tsv"
fi
else
echo "$filenames_sql"
fi
else
echo "$filenames_tsv"
fi
}
list_db_files () {
if [ "$dbformat" != auto ]; then
list_existing_db_files_by_type $corpus_id $dbformat
else
dbfiles=`list_existing_db_files $corpus_id`
if [ "x$dbfiles" = "x" ]; then
dump_database $corpus_id
list_existing_db_files $corpus_id
else
echo "$dbfiles"
fi
fi
}
if [ "$corpus_root" = "$target_corpus_root" ]; then
target_regdir=$regdir
else
target_regdir=$tmp_prefix/$regsubdir
mkdir -p $target_regdir
for corpus_id in $corpus_ids; do
sed -e "s,^\(HOME\|INFO\) .*\($corpus_id\),\1 $target_corpus_root/$datasubdir/\2," $regdir/$corpus_id > $target_regdir/$corpus_id
touch --reference=$regdir/$corpus_id $target_regdir/$corpus_id
done
fi
if [ "x$auth_opts" != "x" ]; then
$korp_make_auth_info --tsv-dir "$tsvdir" $auth_opts $corpus_ids
fi
echo_dbg extra_files "$corpus_files"
# Add the extra files to the end, except for those to be prepended
extra_corpus_files=$corpus_files
corpus_files=
add_corpus_files_expand $corpus_files_prepend
for corpus_id in $corpus_ids; do
# Include the CWB registry file as documentation of the VRT fields
# even if omitting CWB data
add_corpus_files "$target_regdir/$corpus_id"
if [ "x$omit_cwb_data" = x ]; then
add_corpus_files "$datadir/$corpus_id"
fi
if [ "x$export_db" != x ]; then
$korp_mysql_export --corpus-root "$corpus_root" \
--output-dir "$tsvdir" --compress "$compress" $corpus_id
fi
if [ "x$dbformat" != xnone ]; then
add_corpus_files $(list_db_files $corpus_id)
fi
if [ "x$include_vrtdir" != x ]; then
filepatt=$(fill_dirtempl "$vrtdir/*.vrt $vrtdir/*.vrt.*" $corpus_id)
# --any to warn only if neither *.vrt nor *.vrt.* is found
add_corpus_files --any $(remove_trailing_slash \
"$(fill_dirtempl "$vrtdir/*.vrt $vrtdir/*.vrt.*" $corpus_id)")
fi
done
add_corpus_files_expand $extra_corpus_files
echo_dbg corpus_files "$corpus_files"
for corpus_id in $corpus_ids; do
$cwbdata_extract_info --update --registry "$regdir" \
--data-root-dir "$datadir" \
--tsv-dir $(fill_dirtempl "$tsvdir" $corpus_id) \
--info-from-file "$extra_info_file" $corpus_id
done
corpus_date=`get_corpus_date $corpus_ids`
mkdir_perms $pkgdir/$corpus_name
archive_basename=${corpus_name}_${archive_type_name}_$corpus_date
archive_name=$pkgdir/$corpus_name/$archive_basename.$archive_ext
archive_num=0
while [ -e $archive_name ]; do
archive_num=`expr $archive_num + 1`
archive_name=$pkgdir/$corpus_name/$archive_basename-`printf %02d $archive_num`.$archive_ext
done
tar_compress_opt=
if [ "x$compress" != "xnone" ]; then
tar_compress_opt=--use-compress-program=$compress
fi
transform_dirtempl () {
# Multiple backslashes are needed in the sed expression because of
# multiple echos, through which the output goes. (?)
remove_leading_slash "$1" |
sed -e 's,{corp.*},\\([^/]*\\),'
}
transform_dirtempl_pair () {
local srcdir=$1
local dstdir=$2
local repl=
case $dstdir in
*{corp*}* )
dstdir=${dstdir/"{corpname}"/$corpus_name}
case $srcdir in
*{corpid}* )
# This assumes that \1 is in the filename part,
# thus later than {corpid}
dstdir=${dstdir/'\1'/'\2'}
repl='\1'
;;
* )
repl=$corpus_name
;;
esac
dstdir=${dstdir/"{corpid}"/$repl}
;;
esac
srcdir=$(transform_dirtempl "$srcdir")
echo "$srcdir $dstdir"
}
make_tar_transform () {
echo --transform "s,^$1,$archive_basename/$2,"
echo_dbg tar_transform:to "s,^$1,$archive_basename/$2,"
}
make_tar_transforms () {
local source=
local target=
echo_dbg tar_transforms:param "$1"
echo "$1" |
sort -u |
while read source target; do
echo_dbg tar_transform:from "$source" "$target"
local pair=$(transform_dirtempl_pair $source $target)
source=${pair% *}
target=${pair#* }
# Tar removes leading "../" before transformations, so remove
# it to make the transformation work
source=${source#../}
make_tar_transform "$source" "$target"
if [[ "$source" = */ && "$target" = */ ]]; then
make_tar_transform "$(remove_trailing_slash $source)\$" \
"$(remove_trailing_slash $target)"
fi
done
}
make_tar_excludes () {
for patt in "$@"; do
printf '%s\n' --exclude "$patt"
done
}
dir_transforms=\
"$datadir/ data/
$target_regdir/ registry/
$sqldir/\\\\([^/]*\\\\.sql[^/]*\\\\) sql/{corpid}/\\\\1
$tsvdir/\\\\([^/]*\\\\.tsv[^/]*\\\\) sql/{corpid}/\\\\1
$vrtdir/\\\\([^/]*\\\\.vrt[^/]*\\\\) vrt/{corpid}/\\\\1"
if [ "x$korp_frontend_dir" != "x" ]; then
dir_transforms="$dir_transforms
$korp_frontend_dir/ korp_config/"
fi
if [ "x$extra_dir_and_file_transforms" != x ]; then
dir_transforms="$dir_transforms$extra_dir_and_file_transforms"
fi
echo_dbg "$dir_transforms"
tar cvp --group=$filegroup --mode=g+rwX,o+rX $tar_compress_opt \
-f $archive_name --exclude-backups $(make_tar_excludes $exclude_files) \
$(make_tar_transforms "$dir_transforms") \
--ignore-failed-read \
--show-transformed-names $corpus_files
chgrp $filegroup $archive_name
chmod 444 $archive_name
echo "
Created corpus package $archive_name"