-
Notifications
You must be signed in to change notification settings - Fork 3
/
korp-make-timedata-tables.sh
executable file
·162 lines (140 loc) · 4.34 KB
/
korp-make-timedata-tables.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#! /bin/sh
# -*- coding: utf-8 -*-
# Usage: korp-make-timedata-tables.sh [options] corpus ...
#
# For more information, run korp-make-timedata-tables.sh --help
# Name and directory of this script. "$0" is quoted so that a script
# path containing whitespace is handled correctly.
progname=$(basename "$0")
progdir=$(dirname "$0")
# Option specifications. They are not referenced by the hand-written
# option loop in this file; presumably they are read by korp-lib.sh
# (TODO: confirm).
shortopts="hc:t:v"
longopts="help,corpus-root:,tsv-dir:,verbose,import-database"
# Directory (template) for the TSV output files; taken from
# $CORPUS_TSVDIR, which is not set in this file (presumably an
# environment variable). May be overridden with --tsv-dir.
tsvdir=$CORPUS_TSVDIR
# Subdirectory of the corpus root used when no TSV directory is given
tsvsubdir=sql
# Non-empty values enable verbose output and database import
verbose=
import=
# NOTE(review): dbname is not referenced elsewhere in this file;
# presumably it is used by korp-lib.sh or its helpers -- confirm.
dbname="korp"
# Shared library located next to this script; presumably provides the
# functions and variables used below that are not defined in this file
# (e.g. warn, error, echo_verb, set_corpus_root, list_corpora).
. "$progdir/korp-lib.sh"
# usage
#
# Write the help text to standard output and terminate the script with
# exit status 0. The text interpolates $progname, $corpus_root and
# $tsvsubdir; the here-document lines must stay unindented, as they are
# printed verbatim.
usage()
{
    cat << EOF
Usage: $progname [options] corpus ...
Generate Korp timedata database tables based on text attributes datefrom,
dateto, timefrom and timeto, and import them into the Korp MySQL database.
Corpus names are specified in lower case, and they may contain shell
wildcards.
Options:
-h, --help show this help
-c, --corpus-root DIR
use DIR as the root directory of corpus files for the
source files (CORPUS_ROOT) (default: $corpus_root)
-t, --tsv-dir DIRTEMPL
use DIRTEMPL as the directory template to which to write
Korp MySQL TSV data files; DIRTEMPL is a directory name
possibly containing the placeholder {corpid} for corpus id
(default: CORPUS_ROOT/$tsvsubdir)
--import-database
import data into the Korp MySQL database
-v, --verbose verbose output
EOF
    exit 0
}
# Process options
#
# The command line is parsed by hand: each iteration inspects "$1" and
# the "shift" at the end of the loop body consumes it. Options taking
# an argument do an extra "shift" first to reach the argument. The loop
# ends at "--", at the first argument not handled as an option (this
# includes unknown short options, which are not matched by "--*") or at
# an empty argument. An unrecognized long option only produces a
# warning and is skipped.
while [ "x$1" != "x" ] ; do
    case "$1" in
        -h | --help )
            usage
            ;;
        -c | --corpus-root )
            # Without this check, the shifts below would run past the
            # end of the argument list when the option is given last.
            if [ $# -lt 2 ]; then
                echo "$progname: option $1 requires an argument" >&2
                exit 1
            fi
            shift
            set_corpus_root "$1"
            ;;
        -t | --tsv-dir )
            if [ $# -lt 2 ]; then
                echo "$progname: option $1 requires an argument" >&2
                exit 1
            fi
            shift
            tsvdir=$1
            ;;
        -v | --verbose )
            verbose=1
            ;;
        --import-database )
            import=1
            ;;
        -- )
            # Explicit end of options: drop the "--" itself
            shift
            break
            ;;
        --* )
            warn "Unrecognized option: $1"
            ;;
        * )
            # First non-option argument: leave it in "$@"
            break
            ;;
    esac
    shift
done
# Use the default TSV directory under the corpus root unless a
# non-empty directory (template) has already been set.
: "${tsvdir:=$corpus_root/$tsvsubdir}"
# Corpus ids to process, computed by list_corpora from the remaining
# command-line arguments.
corpora=$(list_corpora "$@")
# --verbose if verbose output was requested, otherwise empty.
# NOTE(review): not referenced later in this file.
verbose_opt=${verbose:+--verbose}
# CWB tools used by generate_timedata
descr_corpus=$cwb_bindir/cwb-describe-corpus
s_decode=$cwb_bindir/cwb-s-decode
# NOTE(review): not referenced later in this file; generate_timedata
# runs $progdir/korp-mysql-import.sh directly.
mysql_import="/v/korp/scripts/korp-mysql-import.sh --prepare-tables"
# generate_timedata corpus
#
# Generate the gzipped timedata TSV files for a single corpus and, if
# $import is non-empty, import them with korp-mysql-import.sh.
#
# Arguments:
#   $1 - corpus id (the usage message asks for lower case)
# Globals read:
#   tsvdir (directory template, {corpid} is replaced with the corpus
#   id), s_decode, descr_corpus, progdir, import; also tmp_prefix and
#   tab, which are not defined in this file (presumably a temporary
#   file name prefix and a tab character from korp-lib.sh -- confirm).
# Output files:
#   <tsvdir>/<corpus>_timedata.tsv.gz       (granularity: second)
#   <tsvdir>/<corpus>_timedata_date.tsv.gz  (granularity: day)
# Exit status:
#   Exits the whole script with status 1 if the token counts summed
#   from either output file differ from the corpus size reported by
#   cwb-describe-corpus. Returns early (without output files) if the
#   corpus has no structural attribute "text".
generate_timedata () {
    # TODO: This is largely copied from korp-convert-timedata.sh; how
    # to have the code in one place only?
    _corpus=$1
    # Upper-case corpus id for the first column of the tables
    # (\U...\E is a GNU sed extension)
    _corpus_u=$(echo "$_corpus" | sed -e 's/.*/\U&\E/')
    tsvdir_real=$(echo "$tsvdir" | sed -e "s/{corpid}/$_corpus/g")
    if ! mkdir -p "$tsvdir_real" 2> /dev/null; then
        error "Cannot create TSV directory $tsvdir_real"
    fi
    timedata_tsv=$tsvdir_real/${_corpus}_timedata.tsv.gz
    timedata_date_tsv=$tsvdir_real/${_corpus}_timedata_date.tsv.gz
    # Regions of the structural attribute "text"; the error output is
    # kept in a file to detect a missing attribute.
    "$s_decode" "$_corpus" -S text \
        > "$tmp_prefix.text.tsv" 2> "$tmp_prefix.text.err"
    if grep -q "Can't access s-attribute" "$tmp_prefix.text.err"; then
        echo_verb "  No structural attribute 'text' in corpus $_corpus; skipping"
        return
    fi
    # Extract the third output column (presumably the attribute value)
    # of each date/time attribute; a missing attribute results in an
    # empty file.
    for attrname in datefrom timefrom dateto timeto; do
        _fname=$tmp_prefix.$attrname.tsv
        "$s_decode" "$_corpus" -S "text_$attrname" 2> "$_fname.err" |
            cut -d"$tab" -f3 > "$_fname"
        if grep -q "Can't access s-attribute" "$_fname.err"; then
            cat /dev/null > "$_fname"
        fi
    done
    # Concatenate date and time line by line into a single value, for
    # both the "from" and the "to" end.
    for fromto in from to; do
        paste "$tmp_prefix.date$fromto.tsv" "$tmp_prefix.time$fromto.tsv" |
            tr -d '\t' > "$tmp_prefix.$fromto.tsv"
    done
    # Output columns: corpus id, from, to, field 2 - field 1 + 1 of
    # the "text" regions (presumably end and start corpus positions,
    # i.e. the token count of the text). The corpus id is passed with
    # -v instead of being spliced into the awk program text.
    paste "$tmp_prefix.text.tsv" "$tmp_prefix.from.tsv" "$tmp_prefix.to.tsv" |
        gawk -F"$tab" -v corpus="$_corpus_u" \
            '{print corpus "\t" $3 "\t" $4 "\t" $2 - $1 + 1}' |
        "$progdir/timespans-adjust-granularity.py" \
            --granularity=second --from-field=2 --to-field=3 --count-field=4 |
        sort | gzip > "$timedata_tsv"
    # The day-granularity table is derived from the second-granularity
    # one.
    zcat "$timedata_tsv" |
        "$progdir/timespans-adjust-granularity.py" \
            --granularity=day --from-field=2 --to-field=3 --count-field=4 |
        sort | gzip > "$timedata_date_tsv"
    # Sanity check: the counts in each table must add up to the corpus
    # size reported by cwb-describe-corpus.
    tokencnt=$("$descr_corpus" "$_corpus" | gawk '/^size / {print $NF}')
    for file in "$timedata_tsv" "$timedata_date_tsv"; do
        timedata_tokencnt=$(
            zcat "$file" |
                gawk -F"$tab" '{s+=$4} END {print s}'
        )
        if [ "x$timedata_tokencnt" != "x$tokencnt" ]; then
            # Diagnostics belong on standard error
            echo "Error: Corpus has $tokencnt tokens but database table in $file $timedata_tokencnt tokens" >&2
            exit 1
        fi
    done
    if [ "x$import" != x ]; then
        echo_verb "  Importing data into the Korp MySQL database"
        "$progdir/korp-mysql-import.sh" --prepare-tables \
            "$timedata_tsv" "$timedata_date_tsv" |
            cat_verb |
            indent_input 4
    fi
}
# Generate (and, with --import-database, import) the time data of each
# corpus in turn. $corpora is deliberately left unquoted in the "for"
# list so that it is split into individual corpus ids; the id itself is
# then passed on quoted so that it is not split or globbed again.
for corpus in $corpora; do
    echo_verb "Generating time data for corpus $corpus"
    generate_timedata "$corpus"
done