-
Notifications
You must be signed in to change notification settings - Fork 3
/
korp-fix-dateto.sh
executable file
·56 lines (47 loc) · 1.53 KB
/
korp-fix-dateto.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#! /bin/sh
# Fix invalid text_dateto dates (day 31 in months with fewer days) in
# Korp CWB data (and MySQL timedata tables)
#
# Usage: korp-fix-dateto.sh corpus_id ...
#
# This might be generalized to handle more invalid cases. Or would
# this functionality be more appropriate in korp-convert-timedata.sh?
# Name of this script and the directory it resides in. The directory is
# used for locating the shared library below and the sibling script
# korp-convert-timedata.sh. "$0" is quoted so that a path containing
# whitespace or glob characters is handled correctly.
progname=$(basename -- "$0")
progdir=$(dirname -- "$0")
# Load the shared Korp shell library.
# NOTE(review): presumably this defines cwb_regdir, cwb_bindir,
# corpus_root, tmp_prefix and ensure_perms used by fix_dateto -- confirm.
. "$progdir/korp-lib.sh"
fix_dateto () {
corpus=$1
if [ ! -e $cwb_regdir/$corpus ]; then
printf "Warning: Corpus $corpus not found in the CWB corpus registry\n"
return
elif ! grep -q text_dateto $cwb_regdir/$corpus; then
printf "Warning: Corpus $corpus has no text_dateto attribute\n"
return
fi
printf "$corpus: "
origfile=$tmp_prefix.text_dateto_orig.pos
corrfile=$tmp_prefix.text_dateto_corr.pos
$cwb_bindir/cwb-s-decode $corpus -S text_dateto > $origfile
if egrep -q '(0[469]|11)31$' $origfile; then
printf "Fixing... "
perl -pe '
s/(0[469]|11)31$/${1}30/;
($y) = /\t(\d{4})\d{4}$/;
$d = ($y % 4 == 0 && ($y % 100 != 0 || $y % 400 == 0)) ? "29" : "28";
s/0231$/02$d/' $origfile > $corrfile
datadir=$corpus_root/data/$corpus
for suff in avs avx rng; do
fname=$datadir/text_dateto.$suff
cp -p --backup=numbered $fname $fname.bak
ensure_perms $fname.bak*
done
$cwb_bindir/cwb-s-encode -d $datadir -B -V text_dateto < $corrfile
$progdir/korp-convert-timedata.sh --convert mysql,info $corpus
printf "Done.\n"
else
printf "Ok.\n"
fi
}
# Process each corpus id given on the command line in turn. The id is
# quoted so that it reaches fix_dateto as a single, unmodified argument.
for corpus in "$@"; do
    fix_dateto "$corpus"
done