scripts/korp-make

#! /bin/bash
# -*- coding: utf-8 -*-


# TODO:
# - Allow specifying input via options (config file).
# - Allow multiple instances of extra file options to pass to
#   korp-make-corpus-package
# - --add-structure-ids: Specify by structure type if existing values
#   should be overwritten or not.
# - Allow specifying structure id value format.
# - Generate a VRT file containing all the added information: in
#   particular, add the name attributes to the VRT before
#   cwb-encoding.
# - Support different (or multiple) lemma attributes for generating
#   lemmas without compound boundaries, lemgrams and word picture
#   data.
# - Run stages based on checksums of previous stage's output.
# - Make multiple (related) corpora in the same package.
# - Make parallel corpora.
# - (?) Omit positional attributes with a full stop in their names.


# Name and directory of this script; the directory is used for
# locating the companion scripts and data files. $0 is quoted so that
# a path containing spaces or glob characters is handled correctly.
progname=$(basename "$0")
progdir=$(dirname "$0")

# Directory containing the default mapping files
mapdir=$progdir/../corp

# Directory containing the VRT tools
vrttoolsdir=$progdir/../vrt-tools

# Default VRT and TSV subdirectories; they contain the placeholder
# "CORPUS"
vrt_subdir=vrt/CORPUS
tsv_subdir=$vrt_subdir

# Default mapping files and compound boundary markers, with separate
# defaults for UD annotations
lemgram_posmap_default=$mapdir/lemgram_posmap_tdt.tsv
lemgram_posmap_ud_default=$mapdir/lemgram_posmap_ud2_universal.tsv
wordpict_relmap_default=$mapdir/wordpict_relmap_tdt.tsv
wordpict_relmap_ud_default=$mapdir/wordpict_relmap_ud_fi.tsv
compound_boundary_marker_default="|"
compound_boundary_marker_ud_default="#"

# Designates that the corpus id should be used as the base seed
random_seed_default="{CORPUS}"

# Header text for the usage message
usage_header="Usage: $progname [options] [corpus] [input_file ...]

Process a VRT file to make a Korp corpus package containing CWB data files and
MySQL database import files.

The corpus id must be specified either as the first non-option argument or via
the option --corpus-id.

The input files may be either (possibly compressed) VRT files containing
dependency parse information and name tags, or ZIP or (possibly compressed)
tar archives containing such VRT files. If no input files are specified, read
from the standard input or use a VRT file stored on a previous run for the
same corpus.

If rerun on a corpus and processed data for the corpus exists, try to infer
which processing stages need to be rerun. However, run all processing stages
if --force is specified or if any of the input files is newer than the VRT
file stored on a previous run."

optspecs='
@ General options

corpus-id=CORPUS corpus
    make corpus with id CORPUS; this is alternative to specifying the
    corpus id as the first non-option argument
force
    force all stages of processing by first removing all the output
    files if they exist; existing output files are also removed if any
    of the input files is newer than the VRT file stored on a previous
    run for the same corpus (unless --augment-data is specified)
config-file|configuration-file=FILE
    read FILE as an INI-style configuration file (without sections).
    Configuration keys correspond to option names without the leading
    dashes; internal dashes may be replaced with underscores.

@ Diagnostic output

v|verbose "1"
    output some progress information (the default)
quiet { verbose= }
    do not output progress information (except for some subprocesses)
times show_times
    output the amount of CPU time used for each stage
log-file=FILE logfile
    log script output (standard output and standard error) to FILE
    instead of the default
    $corpus_root/log/${progname}_CORPUS_TIMESTAMP.log where CORPUS is
    the corpus id and TIMESTAMP the start time of the script
no-logging !logging
    do not copy script output to a log file

@ Directories

c|corpus-root=DIR "$corpus_root" { set_corpus_root "$1" }
    use DIR as the root directory of corpus files
tsv-dir=DIR "CORPUS_ROOT/$tsv_subdir" tsvdir
    output database tables as TSV files to DIR

@ Corpus licence information

licence-type=LIC auth_opts { add_auth_opts licence_type $optname $1 }
    set the corpus licence type to LIC, where LIC is one of PUB, ACA,
    ACA-Fi or RES
lbr-id=URN { add_auth_opts lbr_id $optname $1 }
    set the LBR id of the corpus to URN, which is of the form
    [urn:nbn:fi:lb-]YYYYMMNNN[@LBR], where YYYYMM is year and month
    and NNN 3 to 5 digits; the bracketed parts are added if left out

@ Input attributes

input-attrs|input-attributes|input-fields=ATTRS \
  "word ref lemma pos msd dephead deprel nertag" initial_input_attrs
    specify the names of the positional attributes in the input,
    separated by spaces; if "word" (token) is not included in the
    list, add it as the first attribute, unless --no-word-attribute is
    specified;
    if the input VRT contains a positional attributes comment, it
    overrides this option, unless --override-vrt-attributes is
    specified;
    attributes named "_" or with names beginning with a "-" are
    skipped in the input;
    if ATTRS contains attribute names suffixed with "_ud", "_ud1" or
    "_ud2" and no corresponding attributes without the suffix, the
    suffix is stripped
override-vrt-attrs|override-vrt-attributes
    use the positional attributes specified with --input-attributes
    even if the input VRT contains a positional attributes comment
omit-attributes|skip-attributes=ATTRS omit_attrs
    omit the positional input attributes listed in ATTRS (separated by
    spaces)
no-word-attribute no_word_attr
    the input does not contain a "word" attribute; implied by
    --augment-data
keep-attr-order|keep-attribute-order
    do not reorder positional attributes even if "word" is not the
    first attribute; implied by --augment-data
augment-data|augment-existing-data
    augment existing corpus data with the data in the input, for
    example, to add parse annotations to a corpus already encoded; if
    the input contains values for existing attributes, they override
    existing values; you cannot use this option with --force
generate-input-from-data generate_input
    use the existing (CWB) corpus data as the input, which is to be
    augmented if needed with lemgrams, lemmas without compound
    boundaries and the appropriate database data (implies
    --augment-data); the list of input attribute names is read from
    the data (overrides --input-attrs); you cannot use this option
    with --force

@ Annotation mappings

lemgram-posmap|posmap=POSMAP_FILE "'"$lemgram_posmap_default"'"
    use POSMAP_FILE as the mapping file from the corpus parts of
    speech to those used in Korp lemgrams; the file should contain
    lines with corpus POS and lemgram POS separated by a tab;
    if the positional attributes contain UD annotations and no non-UD
    annotations, file "'"$lemgram_posmap_ud_default"'" is used unless
    a different file is specified explicitly
wordpict-relmap|wordpicture-relation-map=RELMAP_FILE \
  "'"$wordpict_relmap_default"'"
    use RELMAP_FILE as the mapping file from corpus dependency
    relation codes to those used in the Korp word picture; the file
    should contain lines with corpus dependency relation code and word
    picture dependency relation code separated by a tab;
    if the positional attributes contain UD annotations and no non-UD
    annotations, file "'"$wordpict_relmap_ud_default"'" is used unless
    a different file is specified explicitly

@ Compound boundaries

compound-boundary-marker=MARKER "'"$compound_boundary_marker_default"'"
    the string MARKER marks compound boundaries in lemmas and will be
    removed from lemmas without compound boundaries;
    if the positional attributes contain UD annotations and no non-UD
    annotations, "'"$compound_boundary_marker_ud_default"'" is used
remove-compound-boundary-algorithm=ALGORITHM "omorfi" compound_boundary_alg
    use ALGORITHM for adding lemmas without compound boundaries, one
    of "omorfi", "old" (alias "simple-omorfi") and "naive": "omorfi"
    handles some idiosyncrasies of Omorfi, "old" produces results
    (mostly) compatible with the algorithm used previously (handling
    hyphens replaced with compound boundary markers), and "naive"
    simply removes compound boundary markers

@ Lemgrams

add-lowercase-lemgrams { add_lemgram_opt --add-lowercase-variants }
    add all-lower-case variants of lemgrams for lemmas containing
    upper-case letters
add-lemgrams-without-diacritics { add_lemgram_opt --add-non-diacritic-variants }
    add variants of lemgrams without diacritics for lemmas containing
    letters with diacritics
lemgrams-keep-letters-with-diacritics=CHARS \
  { add_lemgram_opt --keep-letters-with-diacritics "$1" }
    Keep the letters with diacritics in CHARS intact even in lemgram
    variants otherwise without diacritics. CHARS is a string of
    characters that can be used inside a set of characters in a
    regular expression (as [^CHARS]). CHARS are retained regardless of
    their case.

@ Random-number generator seed

random-seed=SEED "'"$random_seed_default"'"
    use the string SEED as the base seed for the random-number
    generator, to be used for generating structure id attributes and
    scrambling data; the actual seed is generated using SEED and seed
    for scrambling data; "" for a random seed (non-reproducible
    results) (default: corpus id)

@ Structure ids

add-structure-ids|add-element-ids=STRUCTLIST \
  "text paragraph sentence" add_struct_ids
    add id attributes to the structures listed in STRUCTLIST
    (separated by spaces); id values are based on unique random
    integers; unless --overwrite-structure-ids is specified, rename
    possible existing id attributes to id_N, where N is the smallest
    positive integer for which attribute id_N does not already exist
    in the structure; if STRUCTLIST is an empty string, do not add id
    attributes
structure-id-format|element-id-format=STRUCT:FORMAT * \
  { add_struct_id_format "$1" }
    format the value of the id attribute for structure STRUCT with
    FORMAT; run "vrt-add-id -h" for more information on FORMAT (in the
    usage description of option --format) but note that FORMAT may not
    contain spaces here
overwrite-structure-ids|overwrite-element-ids !keep_struct_ids
    overwrite possible existing id attribute values in the structures
    listed with --add-structure-ids

@ Sorting text structures

text-sort-attribute=ATTRLIST * { add_text_sort_opt --key "$1" }
    Sort text elements in the corpus by the attributes listed in
    ATTRLIST, separated by spaces or commas. Sort primarily by the
    first attribute, secondarily by the second and so on, by byte
    values, without taking the locale into account. Multiple keys can
    also be specified by repeating the option. Each attribute name may
    be followed by a colon and sort ordering option characters
    recognized by the "sort" command: often one or more of the
    following: b (ignore leading blanks), d (dictionary order), f
    (ignore case), g (general numeric sort), i (ignore nonprinting), M
    (month sort), h (human numeric sort), R (random sort), r
    (reverse), V (version sort).
text-sort-transform=ATTR:TRANSFORM '"'"' * { add_text_sort_opt --transform "$1" }
    Transform the value of the attribute ATTR using TRANSFORM
    before using it as a sort key. ATTR is one of the attributes
    listed in the argument of --text-sort-attribute. (ATTR and the
    colon may be omitted if only one key attribute is specified.)
    TRANSFORM may be one of the following: (1) a Perl-style
    substitution "s/regexp/subst/[flags]", where regexp and subst
    follow Python regular expression syntax and flags is zero or more
    of the following letters: a (make \\w, \\W, \\b, \\B, \\d, \\D
    match ASCII characters only instead of whole Unicode), g (replace
    all matches and not only the first one), i (match
    case-insensitively), l (make \\w, \\W, \\b, \\B dependent on the
    current locale), x (ignore whitespace and comments); (2) a single
    Python expression; or (3) the body of a Python function. In (2)
    and (3), the variable "val" refers to the value of the attribute
    (str), and they return the result of the transformation (converted
    to str). If (3) has no return statement, the value of "val" is
    returned. On an error depending on the value of "val", an empty
    string is returned. The option may be repeated to specify
    transformations for different attributes and/or multiple
    transformations for a single attribute. Multiple transformations
    for an attribute are processed in the order they are specified.

@ Omitting structures

omit-structures=STRUCTS
    omit structures listed in STRUCTS, separated by spaces; you cannot
    omit text or sentence structures; this can be used to remove
    paragraphs from corpora whose sentences should be scrambled within
    whole texts

@ Scrambling structures

scramble=STRUCTS
    scramble structures listed in STRUCTS, separated by spaces;
    typical structures are sentence and paragraph (and link for
    parallel corpus parts); they are scrambled within the immediately
    containing structure, typically within paragraph and text,
    respectively; "sentence paragraph" scrambles both ways

@ Copying, renaming and omitting structural attributes

copy-struct-attr|copy-structure-attribute=TARGET:SOURCELIST *
    copy structural attributes from a preceding (enclosing) structure.
    TARGET is the name of the structure to which attributes are to be
    copied and SOURCELIST is a semicolon-separated list of items of
    the form SOURCESTRUCT/ATTRLIST, where SOURCESTRUCT is the source
    structure and ATTRLIST is a comma-separated list of the names of
    attributes in SOURCESTRUCT to be copied, or "*" for all
    attributes. For example, the value
    "sentence:paragraph/type,speaker" specifies that the values of the
    attributes type and speaker of the preceding (enclosing) paragraph
    structure are added to the attributes of a sentence structure,
    named paragraph_type and paragraph_speaker. Copying attributes
    takes place before omitting structures, so for example, paragraph
    attributes may be copied to sentences before removing paragraphs.
    Multiple attribute copy operations may be specified either by
    listing them in the argument separated by spaces or by specifying
    this option multiple times.
rename-struct-attr|rename-structure-attribute=STRUCT/SOURCE:TARGET *
    rename in structure STRUCT attributes matching the (Perl) regular
    expression SOURCE as TARGET. SOURCE needs to be matched in full.
    SOURCE may contain capture groups (...) and TARGET may reference
    them as \$1, \$2 and so on. Attributes are renamed after copying
    (see above), so you can rename copied attributes. Multiple
    attribute rename operations may be specified either by listing
    them in the argument separated by spaces or by specifying this
    option multiple times.
omit-struct-attr|omit-structure-attribute=[STRUCT/][OMITLIST][![KEEPLIST]] *
    Omit in structures STRUCT attributes whose names fully match a
    (Python) regular expression in OMITLIST and do not match one in
    KEEPLIST. OMITLIST and KEEPLIST may contain multiple regular
    expressions separated by commas. This option can be specified
    multiple times with different STRUCT values, for different
    structures. If STRUCT/ is omitted, omit matching attributes from
    all structures with no structure-specific value specified. A value
    without OMITLIST but with !KEEPLIST adds a structure-specific list
    of expressions to keep, overriding a non-structure-specific value.
    A value with an "!" but no KEEPLIST keeps nothing in STRUCT. A
    value with only STRUCT/ omits nothing in STRUCT. This option can
    be used, for example, to omit from scrambled corpora attributes
    that would reveal the original structure order.

@ Date information

corpus-date=DATE
    use DATE as the date of all texts in the corpus; "unknown" if not
    known
corpus-date-pattern=PATTERN
    recognize corpus date information based on PATTERN of the form
    "ELEM ATTR REGEX": extract date information from the attribute
    ATTR of element (structural attribute) ELEM using the regular
    expression REGEX. ELEM and ATTR may be "*" (any element or
    attribute) or they may contain several attribute or element names
    separated with vertical bars. REGEX may contain named groups
    (subpatterns) in Python'"'"'s regular expressions Y, M and D,
    which extract year, month and day; for example, "(?P<Y>[0-9]{4})"
    (without the quotation marks) would recognize a year (although
    this particular case is also covered by the default pattern).
    REGEX may also cover both the start and end date, in which case
    the subpatterns for the start date are Y1, M1 and D1, and those
    for the end date, Y2, M2 and D2. If REGEX does not contain named
    subpatterns, recognize the first group as the start date and the
    possible second group as the end date.
corpus-date-full-order=ORDER
    recognize full dates in the order ORDER (one of "ymd", "dmy",
    "mdy")
corpus-date-ranges
    make the patterns recognize date ranges with different start and
    end days

@ Output data

no-lemmas-without-boundaries|skip-lemmas-without-boundaries \
  !lemmas_without_boundaries
    do not add lemmas without compound boundaries
no-lemgrams|skip-lemgrams !lemgrams
    do not add lemgrams
no-wordpicture|skip-wordpicture !wordpicture
    do not extract word picture relations database tables
no-name-attrs|no-name-attributes|skip-name-attrs|skip-name-attributes \
  !name_attrs
    do not add named-entity information based on a NER tag as the last
    positional attribute
remake-wordpicture-data
    force remaking word picture relations database tables; this option
    is needed only if recreating word picture data that has been left
    incomplete on a previous run

@ Packaging

no-package !make_package
    do not create a corpus package
korp-frontend-dir=DIR "$korp_frontend_dir"
    read Korp configuration files from DIR, to be included in corpus
    package
package-readme-file|readme-file=FILE
    include FILE as a top-level read-me file in the corpus package;
    FILE may contain shell wildcards (but braces are not expanded)
package-doc-dir|doc-dir=DIR
    include DIR as a documentation directory "doc" in the corpus
    package
package-doc-file|doc-file=FILE
    include FILE as a documentation file in directory "doc" in the
    corpus package; FILE may contain shell wildcards
package-script-dir|script-dir=DIR
    include DIR as a (conversion) script directory "scripts" of
    the corpus package
package-script-file|script-file=FILE
    include FILE as a (conversion) script file in directory "scripts"
    of the corpus package; FILE may contain shell wildcards
package-extra-dir|extra-dir=SRCDIR[:DSTDIR]
    include directory SRCDIR in the corpus package; if :DSTDIR is
    specified, the directory is renamed as DSTDIR in the package
package-extra-file|extra-file=SRCFILE[:DSTFILE]
    include file SRCFILE in the corpus package; if :DSTFILE is
    specified, the file is renamed as DSTFILE in the package; if
    DSTFILE ends in a slash or if SRCFILE contains wildcards, DSTFILE
    is considered a directory name and SRCFILE is placed in that
    directory in the package

@ Database import

import-database
    import the database TSV files into the Korp MySQL database
'

# Name of the option for specifying a configuration file
config_file_optname=config-file


# Load the shared function library from the directory of this script.
# The path is quoted so that a script directory containing spaces
# does not break sourcing.
. "$progdir/korp-lib.sh"

# cleanup_on_exit=


# Helper scripts in the directory of this script
vrt_rename_struct_attrs="$progdir/vrt-rename-struct-attrs.pl"
vrt_fix_attrs="$progdir/vrt-fix-attrs.py"
vrt_add_lemgrams="$progdir/vrt-add-lemgrams.py"
vrt_extract_timespans="$progdir/vrt-extract-timespans.py"
vrt_list_struct_attrs="$progdir/vrt-list-struct-attrs.py"
korp_convert_timedata="$progdir/korp-convert-timedata.sh"
cwbdata_extract_info="$progdir/cwbdata-extract-info.sh"
vrt_extract_lemgrams="$progdir/vrt-extract-lemgrams.sh"
run_extract_rels="$progdir/run-extract-rels.sh"
vrt_add_name_attrs="$progdir/vrt-add-name-attrs.sh"
korp_make_corpus_package="$progdir/korp-make-corpus-package.sh"
korp_mysql_import="$progdir/korp-mysql-import.sh"
# A complete command with options, not only a program path
cwbdata2vrt="$progdir/cwbdata2vrt-simple.sh --all-attributes --output-file=-"

# Tools in the VRT tools directory
vrt_add_lemma_nobound="$vrttoolsdir/vrt-add-lemma-nobound"
vrt_sort="$vrttoolsdir/vrt-sort"
vrt_add_id="$vrttoolsdir/vrt-add-id"
vrt_convert_chars="$vrttoolsdir/vrt-convert-chars"
vrt_extract_seed="$vrttoolsdir/vrt-extract-seed"
vrt_drop_attrs="$vrttoolsdir/vrt-drop-attrs"
vrt_scramble="$vrttoolsdir/vrt-scramble"

# CWB programs
cwb_encode="$cwb_bindir/cwb-encode"
cwb_describe_corpus="$cwb_bindir/cwb-describe-corpus"
cwb_make="$cwb_perl_bindir/cwb-make"

# The VRT file being processed
vrt_file=""
# Complete, unscrambled VRT file with no structures removed
vrt_file_full=""


# Options accumulated by add_text_sort_opt and add_lemgram_opt;
# sort_texts is set to 1 when a sort key has been specified
text_sort_opts=""
sort_texts=""
add_lemgrams_opts=""


# Default formats for the values of structure id attributes, keyed by
# structure name
declare -A struct_id_format=()
# Standard structures
struct_id_format[text]="t-{hash:.8}-{id}"
struct_id_format[paragraph]="p-{hash:.8}-{idnum[text]}-{id}"
struct_id_format[sentence]="s-{hash:.8}-{idnum[text]}-{id}"
# Structures used in some corpora
struct_id_format[link]="l-{hash:.8}-{idnum[text]}-{id}"
struct_id_format[clause]="c-{hash:.8}-{idnum[text]}-{idnum[sentence]}-{id}"
struct_id_format[chapter]="ch-{hash:.8}-{idnum[text]}-{id}"
struct_id_format[utterance]="u-{hash:.8}-{idnum[text]}-{id}"

# Append an authorization option and its normalized value to
# $auth_opts.
#
# Arguments:
#   $1: value type; the value is passed through the function
#       make_$1 (presumably defined in korp-lib.sh), whose output is
#       used as the option value
#   $2: option name to append
#   $3: option value as specified by the user
#
# The status of make_$1 is passed to exit_if_error.
add_auth_opts () {
    local type opt val
    type=$1
    opt=$2
    val=$3
    # Call the function directly instead of via eval, so that the
    # value is passed as a single argument without word splitting or
    # filename expansion
    val=$("make_$type" "$val")
    exit_if_error $?
    auth_opts="$auth_opts $opt $val"
}

# Append a text sorting option with its argument to $text_sort_opts,
# the argument quoted with quote_args_safe.
#
# Arguments:
#   $1: option name; "--key" also sets sort_texts=1
#   $2: option argument
add_text_sort_opt () {
    local sortopt_name=$1 sortopt_arg=$2
    case $sortopt_name in
        --key )
            sort_texts=1
            ;;
    esac
    text_sort_opts+=" $sortopt_name $(quote_args_safe "$sortopt_arg")"
}

# Append an option and its possible argument to $add_lemgrams_opts.
#
# Arguments:
#   $1: option name
#   $2: option argument (may be omitted)
add_lemgram_opt () {
    local lemgram_optname=$1 lemgram_optarg=$2
    # The argument is appended unquoted, so it may not contain
    # spaces, but in this case they should not occur
    add_lemgrams_opts+=" $lemgram_optname $lemgram_optarg"
}

# Set the id format of a structure in the associative array
# struct_id_format.
#
# Arguments:
#   $1: STRUCT:FORMAT; the structure name is the part before the
#       first colon and the format everything after it
add_struct_id_format () {
    local spec=$1
    local struct_name format_value
    struct_name=${spec%%:*}
    format_value=${spec#*:}
    struct_id_format[$struct_name]=$format_value
}


# Process the command-line options by evaluating the option handler
# code in $optinfo_opt_handler
eval "$optinfo_opt_handler"

# Unless the corpus id was already set, take it from the first
# remaining argument
if [ -z "$corpus" ]; then
    [ -n "$1" ] || error "No corpus name specified"
    corpus=$1
    shift
fi

preprocess_posattrs=
initial_vrt_posattrs=

# Choose the first processing stage: either generate the input VRT
# from existing CWB data (which implies augmenting the data) or
# combine the input files
if [ -n "$generate_input" ]; then
    [ -z "$force" ] ||
        error "You cannot specify both --force and --generate-input-from-data"
    augment_data=1
    stage1_fn=generate_input
    stage1_descr="Generating input VRT file from CWB data"
else
    stage1_fn=combine_input
    stage1_descr="Combining input files"
fi

# Augmenting requires the corpus to be listed by list_corpora and is
# incompatible with --force
if [ -n "$augment_data" ]; then
    [ "$(list_corpora --on-error : "$corpus")" = "$corpus" ] ||
        error "Corpus $corpus not found; cannot augment corpus data"
    [ -z "$force" ] ||
        error "You cannot specify both --force and --augment-data"
fi

# If logging is enabled, copy the standard output and standard error
# of the script to a log file
if [ -n "$logging" ]; then
    if [ -z "$logfile" ]; then
        # Default log file: in the log directory under the corpus
        # root, with the corpus id and a timestamp in the name
        [ -e "$corpus_root/log" ] || mkdir_perms $corpus_root/log
        logfile=$corpus_root/log/${progname}_${corpus}_$(date +'%Y%m%d%H%M%S').log
    fi
    # Create the log file or empty an existing one
    : > $logfile
    ensure_perms $logfile
    # http://stackoverflow.com/questions/3173131/redirect-copy-of-stdout-to-log-file-from-within-bash-script-itself
    exec > >(tee -ia $logfile)
    exec 2> >(tee -ia $logfile >&2)
    echo_verb "Logging output to $logfile"
fi

# Validate the list of structures to omit and convert it to a regular
# expression matching their start and end tags
if [ -n "$omit_structures" ]; then
    if word_in "text" "$omit_structures" ||
            word_in "sentence" "$omit_structures"; then
        error "You cannot omit text or sentence structures"
    fi
    # The unquoted expansion makes echo output the structure names
    # separated by single spaces
    omit_structures=$(echo $omit_structures)
    # Regexp for grep -Ev: the names as alternatives
    omit_structures="^</?(${omit_structures// /|})[ >]"
fi

# Normalize and validate the compound boundary removal algorithm:
# convert the value to lowercase and map the alias "old" to
# "simple-omorfi"
compound_boundary_alg=${compound_boundary_alg,,}
case $compound_boundary_alg in
    old )
        compound_boundary_alg=simple-omorfi
        ;;
    omorfi | naive | simple-omorfi )
        # Use the value as is
        :
        ;;
    * )
        # List the values actually accepted above
        error 'Invalid algorithm in --remove-compound-boundary-algorithm: allowed values are "omorfi", "old" (alias "simple-omorfi") and "naive".'
        ;;
esac

echo_verb "Running: $cmdline_orig"
echo_verb "Processed arguments: $cmdline_args_processed"

# The remaining command-line arguments are the input files
input_files=( "$@" )

# Fill in the corpus id for the placeholder "CORPUS" in the
# subdirectory and directory names; in the TSV directory,
# "CORPUS_ROOT" is first replaced with the corpus root directory
vrt_subdir="${vrt_subdir//CORPUS/$corpus}"
tsv_subdir="${tsv_subdir//CORPUS/$corpus}"
vrtdir="${vrtdir:-$corpus_root/$vrt_subdir}"
vrtdir="${vrtdir//CORPUS/$corpus}"
tsvdir="${tsvdir:-$corpus_root/$tsv_subdir}"
tsvdir="${tsvdir/CORPUS_ROOT/$corpus_root}"
tsvdir="${tsvdir//CORPUS/$corpus}"
datadir="$corpus_root/data/$corpus"

# Error messages from creating the directories are discarded
mkdir_perms $vrtdir $tsvdir 2> /dev/null

# Choose the VRT file to work on unless already set
if [ -z "$vrt_file" ]; then
    if [ -z "$augment_data" ] || [ -n "$generate_input" ]; then
        # Use the VRT files in the VRT directory of the corpus
        vrt_file=$vrtdir/$corpus.vrt
        vrt_file_full=$vrtdir/$corpus-complete.vrt
    else
        # If augmenting data and not generating input from VRT, do
        # not use possible existing VRT file. Another option might be
        # to have an option for ignoring an existing VRT file and to
        # write the augmented VRT file to $corpus.augm.vrt, for
        # example.
        vrt_file=$tmp_prefix.$corpus.vrt
    fi
fi

# Files recording the handled stages, the effective options and the
# random seed of the corpus
stages_file="$vrtdir/$corpus.stages"
opts_file="$vrtdir/$corpus.opts"
seed_file="$vrtdir/$corpus.seed"
# Should the information from a previous run be checked
check_prev_run=""

# --verbose if verbose output is enabled, otherwise empty
verbose_opt=${verbose:+--verbose}

input_token_count=0
existing_token_count=0


# Remove the output of previous runs for the corpus: the files in the
# CWB data directory, the registry file, the TSV files of the corpus,
# the stored VRT file (plain and gzipped) and the stages, options and
# seed files.
remove_existing_data () {
    # ${corpus}_ needs the braces: $corpus_ would refer to the
    # (unset) variable "corpus_", making the pattern match the
    # *.tsv.gz files of all corpora in $tsvdir
    rm -f "$datadir"/* "$cwb_regdir/$corpus" \
       "$tsvdir/${corpus}_"*.tsv.gz \
       "$vrtdir/$corpus.vrt" "$vrtdir/$corpus.vrt.gz" \
       "$stages_file" "$opts_file" "$seed_file"
}

# Run the command given as the arguments, first passing a message
# showing the command through "verbose", with the output directed to
# the file descriptor $top_stdout. Return the status of the command.
run_cmd () {
    {
        verbose printf "  Running: "
        verbose echo_quoted "$@"
    } >&$top_stdout
    "$@"
}

# Process $vrt_file with the command given as the arguments: run the
# command via run_cmd with $vrt_file as its standard input, write its
# standard output to $vrt_file.new and, if the command succeeded,
# call replace_file with $vrt_file and $vrt_file.new. Return the
# status of the command if it failed, otherwise that of replace_file.
process_vrt () {
    # The file names are quoted so that the redirections also work
    # if the names contain spaces
    run_cmd "$@" < "$vrt_file" > "$vrt_file.new" &&
        replace_file "$vrt_file" "$vrt_file.new"
}

# Run the command given as the arguments via time_cmd, with a format
# for reporting the CPU time used.
time_stage () {
    time_cmd \
        --format "- CPU time used: %U %R" \
        "$@"
}

# Call "error" if $logfile contains lines beginning with "[Errno N",
# with the matching lines included in the error message. The matching
# lines are saved to $tmp_prefix.subproc_error.
check_errors_from_log () {
    # FIXME: Grepping the log file for system error messages is a
    # somewhat kludgy way to catch "Disk quota exceeded" (and
    # possibly other similar) system errors. In particular, the error
    # might prevent the process from writing the message to the log
    # file. The subprocesses should notice the errors and exit with
    # an error status. Or can they do that?
    grep '^\[Errno [0-9]' "$logfile" \
         > $tmp_prefix.subproc_error 2> /dev/null ||
        return 0
    error "Aborting because of an error: $(cat $tmp_prefix.subproc_error)"
}

# Decide what to do with possible data from a previous run: either
# remove it with remove_existing_data or mark it to be checked by
# setting check_prev_run=1.
check_for_existing_data () {
    # With --force, always start from scratch
    if [ -n "$force" ]; then
        remove_existing_data
        return
    fi
    # Uncompress a non-empty compressed VRT file
    if [ -r $vrt_file.gz ] && [ -s $vrt_file.gz ]; then
        gunzip $vrt_file.gz
    fi
    # If input files have been specified, file_newer fails for the
    # existing non-empty VRT file and the input files, and data is
    # not being augmented, remove existing data
    if [ -r $vrt_file ] && [ -s $vrt_file ] &&
           [ -z "$augment_data" ] &&
           [ "${#input_files[@]}" -gt 0 ] &&
           ! file_newer $vrt_file "${input_files[@]}";
    then
        remove_existing_data
        return
    fi
    check_prev_run=1
    echo_verb "Using existing data from previous runs"
}

# Check from $opts_file and $stages_file if korp-make has been run
# with the same effective options previously, allowing skipping some
# or all stages. Return 0 if all stages have already been completed,
# otherwise save the current effective options to $opts_file and
# return 1.
check_completed () {
    local effective_args=$(get_effective_cmdline_args)
    # CHECK: Should the data from the previous runs be handled
    # differently if using one of the options --augment-data,
    # --generate-input or --remake-wordpicture-data?
    if [ -n "$check_prev_run" ] &&
           [ -e "$opts_file" ] && [ -e "$stages_file" ]; then
        if [ "$(cat "$opts_file")" != "$effective_args" ]; then
            # TODO: If the options differ, check by option and stage
            # if the option affects the stage
            rm "$opts_file" "$stages_file"
        elif [ "$(tail -n1 "$stages_file")" = "Completed" ]; then
            # Same options and all stages completed
            return 0
        fi
        # With the same options but without the final "Completed",
        # the script can continue from where it was left
    fi
    safe_echo "$effective_args" > "$opts_file"
    return 1
}

# Output the processed command-line arguments
# ($cmdline_args_processed) with the first occurrence of each option
# not affecting the processing of the data removed.
get_effective_cmdline_args () {
    # The leading space makes it possible to recognize an option at
    # the beginning, too
    local effective=" $cmdline_args_processed"
    local ignored_opt
    for ignored_opt in --force --times --quiet --verbose --no-logging \
                       --remake-wordpicture-data; do
        effective=${effective/ $ignored_opt/}
    done
    # Output without the leading space
    safe_echo "${effective# }"
}

# Run a single processing stage.
#
# Arguments:
#   $1: name of the stage function; a leading "*" marks that the
#       stage is to be run even if $stages_file has a line for it; if
#       empty, do nothing
#   $2...: description of the stage, output before running it
#
# The stage is skipped if $check_prev_run is non-empty and
# $stages_file has a line for the stage (unless marked with "*"), or
# if function test_skip_NAME is defined and its output is non-empty;
# in the latter case, "NAME: OUTPUT" is appended to $stages_file. If
# test_skip_NAME (or a program it runs) writes to standard error,
# pass that on to standard error and exit with a non-zero status.
# Otherwise, run the stage function via time_stage and
# exit_on_error, call check_errors_from_log and append "NAME: done"
# to $stages_file.
run_stage () {
    local name=$1
    if [ "x$name" = x ]; then
        return
    fi
    shift
    local descr="$*"
    local run_always prev_run prev_msg msg exitstat
    if [ "${name#\*}" != "$name" ]; then
        run_always=1
        name=${name#\*}
    fi
    if [ "x$check_prev_run" != x ] && [ "x$run_always" = x ]; then
        prev_run="$(grep -s "^$name:" "$stages_file")"
        if [ "x$prev_run" != x ]; then
            prev_msg=${prev_run#*: }
            echo_verb "(Skipping ${descr,}: $prev_msg on a previous run)"
            return
        fi
    fi
    if type -t "test_skip_$name" > /dev/null; then
        # The test function is run in a subshell, so variable
        # assignments in it do not affect this shell
        msg=$("test_skip_$name" 2> "$tmp_prefix.errmsg")
        exitstat=$?
        # Exit with error if the test function (or the programs it
        # runs) outputs something to stderr. An alternative would be
        # [ $? != 0 ], but that would require adding "return 0" to
        # many of the test_skip_ functions.
        if [ -s "$tmp_prefix.errmsg" ]; then
            cat "$tmp_prefix.errmsg" >&2
            # The test function may have returned 0 even though it
            # wrote an error message, but an error exit must not
            # have status 0
            if [ "$exitstat" -eq 0 ]; then
                exitstat=1
            fi
            exit "$exitstat"
        fi
        if [ "x$msg" != "x" ]; then
            echo_verb "(Skipping ${descr,}: $msg)"
            echo "$name: $msg" >> "$stages_file"
            return
        fi
    fi
    echo_verb "$descr"
    time_stage exit_on_error "$name"
    check_errors_from_log
    echo "$name: done" >> "$stages_file"
}

# Run all the stages in the array $stages sequentially with
# run_stage, unless check_completed tells that all the stages have
# already been completed. The array contains alternating stage
# function names and descriptions. Finally, append "Completed" to
# $stages_file.
run_stages () {
    local stagecnt=${#stages[@]}
    local i=0
    if check_completed; then
        echo_verb "All processing stages completed on the previous run; use --force to force rerunning"
        return
    fi
    while [ "$i" -lt "$stagecnt" ]; do
        # The stage name needs to be quoted: otherwise a name with a
        # leading "*" would be subject to filename expansion and an
        # empty name would be dropped, making the description the
        # first argument of run_stage
        run_stage "${stages[$i]}" "${stages[$((i + 1))]}"
        i=$((i + 2))
    done
    echo "Completed" >> "$stages_file"
}


# The processing stages in the order in which they are run: each
# stage has the name of the stage function followed by the
# description of the stage
stages=(
    # Getting and checking the input
    "$stage1_fn" "$stage1_descr"
    # A leading "*" marks the stage to be run always, even if it had
    # been completed on a previous run
    "*check_input_attrs" "Checking input attributes"
    # Stage 3 information set in check_input_attrs if needed
    "" ""

    # Modifying the VRT
    extract_random_seed "Extracting random seed"
    copy_struct_attrs "Copying structural attributes"
    rename_struct_attrs "Renaming structural attributes"
    add_struct_ids "Adding structure ids"
    add_lemmas_without_boundaries "Adding lemmas without compound boundaries"
    add_lemgrams "Adding lemgrams"
    add_datefromto "Adding datefrom and dateto"
    sort_texts "Sorting text elements"
    save_full_vrt "Saving complete VRT file"
    omit_structures "Omitting structures"
    omit_struct_attrs "Omitting structural attributes"
    scramble_structs "Scrambling structures"

    # Making CWB data and database data
    cwb_encode "Encoding the attributes for CWB"
    cwb_make "Indexing and compressing the CWB data"
    convert_timedata "Converting and augmenting time data"
    extract_info "Extracting information for the .info file"
    extract_lemgrams "Extracting lemgrams for the database"
    extract_wordpict_rels "Extracting word picture relations for the database"
    add_name_attrs "Adding name attributes"
    adjust_posattrs_comment "Adjusting or adding VRT positional-attributes comment"

    # Packaging and database import
    make_corpus_package "Creating corpus package"
    import_database "Importing data to the MySQL database"
)


# Concatenate the input files to $vrt_file and check the token count.
#
# Sets the global input_token_count (and existing_token_count when
# augmenting existing data). Calls error if the input has no tokens
# or if, when augmenting, the token count differs from that of the
# existing corpus data.
combine_input () {
    # Empty lines are dropped from the input VRT so that the number of
    # tokens does not differ from that of the already encoded
    # attributes (assuming that cwb-encode was told to skip empty
    # lines).
    comprcat "${input_files[@]}" | grep -v '^$' > $vrt_file
    input_token_count=$(vrt_get_token_count "$vrt_file")
    [ "$input_token_count" != 0 ] || error "No tokens in the input"
    echo_verb "  $input_token_count tokens in the input VRT"
    if [ -z "$augment_data" ]; then
        return 0
    fi
    existing_token_count=$(get_corpus_token_count $corpus)
    [ "$input_token_count" = "$existing_token_count" ] ||
        error "The number of tokens in the input ($input_token_count) differs from that in the existing corpus data ($existing_token_count)"
}

# Output a reason for skipping combine_input if a readable, non-empty
# $vrt_file already exists; in that case, also set input_token_count
# from the existing file.
test_skip_combine_input () {
    # If other conditions for not combining input have been met,
    # $vrt_file has already been removed in check_for_existing_data
    if ! { [ -r $vrt_file ] && [ -s $vrt_file ]; }; then
        return 0
    fi
    echo "using existing VRT file $vrt_file"
    input_token_count=$(vrt_get_token_count "$vrt_file")
}

# Determine the positional attributes of the input VRT and set up the
# optional following stage.
#
# Sets the globals initial_vrt_posattrs, initial_input_attrs and
# input_attrs, and possibly lemgram_posmap, wordpict_relmap,
# compound_boundary_marker, keep_attr_order, no_word_attr and
# preprocess_posattrs. If positional attributes need to be filtered
# out or "word" moved to be the first one, the first empty slot in
# $stages is filled with filter_and_reorder_posattrs and its
# description. Calls error if the number of attributes differs from
# that reported by vrt_get_posattr_count for $vrt_file.
check_input_attrs () {
    local next_stage_fn=
    local next_stage_descr=
    local next_stage_descr2=
    local next_stage_idx=
    local attrnum_word=
    local attrcount=
    local vrt_attrcount=
    # Positional attribute names as returned by vrt_get_posattr_names
    # for the VRT file; empty if it outputs none
    initial_vrt_posattrs=$(vrt_get_posattr_names $vrt_file)
    if [ "x$initial_vrt_posattrs" != x ]; then
	if [ "x$override_vrt_attrs" = x ]; then
	    initial_input_attrs=$initial_vrt_posattrs
	    # The lex attribute in the positional attributes comment
	    # might lack the final slash, but lex (lemgram) is always
	    # a feature-set attribute, so add it if needed.
	    initial_input_attrs=$(suffix_word "$initial_input_attrs" lex /)
	    verbose safe_echo "Using positional attributes named in the input VRT: $initial_input_attrs"
	elif [ "$initial_vrt_posattrs" != "$initial_input_attrs" ] &&
		 [ "$initial_vrt_posattrs" != "word $initial_input_attrs" ];
	then
	    warn "Overriding positional attributes \"$initial_vrt_posattrs\" in the input VRT with \"$initial_input_attrs\""
	fi
    else
	verbose safe_echo "No positional attributes named in the input VRT; using those listed with --input-attributes: $initial_input_attrs"
    fi
    # FIXME: Testing for UD attributes (and setting lemgram_posmap
    # and wordpict_relmap accordingly) does not work if using an
    # existing VRT file with the attributes already renamed, for
    # example, if later adding lemgram or word picture data
    if has_only_ud_attrs $initial_input_attrs; then
        initial_input_attrs=$(rename_ud_attrs $initial_input_attrs)
        verbose safe_echo "Removing suffix _ud[12]? from positional attribute names, as no corresponding attributes without the suffix; modified input attributes: $initial_input_attrs"
        # The UD defaults replace only values that still equal the
        # non-UD defaults
        if word_in "pos" "$initial_input_attrs" &&
                [ "$lemgram_posmap" = "$lemgram_posmap_default" ]
        then
            lemgram_posmap=$lemgram_posmap_ud_default
            verbose safe_echo "Using the default UD lemgram part-of-speech mapping file $lemgram_posmap_ud_default"
        fi
        if word_in "deprel" "$initial_input_attrs" &&
                [ "$wordpict_relmap" = "$wordpict_relmap_default" ]
        then
            wordpict_relmap=$wordpict_relmap_ud_default
            verbose safe_echo "Using the default UD word picture relation mapping file $wordpict_relmap_ud_default"
        fi
        # NOTE(review): unlike the two cases above, this does not
        # check if compound_boundary_marker still has its default
        # value -- confirm that overriding it is intended
        if [ "x$lemmas_without_boundaries" != x ] &&
               word_in "lemma" "$initial_input_attrs"
        then
            compound_boundary_marker=$compound_boundary_marker_ud_default
            verbose safe_echo "Using the default UD compound boundary marker \"$compound_boundary_marker_ud_default\""
        fi
    fi
    # When augmenting existing data, neither reorder the attributes
    # nor add "word"
    if [ "x$augment_data" != x ]; then
	keep_attr_order=1
	no_word_attr=1
    fi
    # The comparisons below treat -1 as "word" not being among the
    # attributes (presumably what word_index outputs in that case)
    attrnum_word="$(word_index word $initial_input_attrs)"
    if [ "x$no_word_attr" = x ] && [ "$attrnum_word" = "-1" ]; then
	initial_input_attrs="word $initial_input_attrs"
    fi
    attrcount=$(count_words $initial_input_attrs)
    vrt_attrcount=$(vrt_get_posattr_count $vrt_file)
    if [ "$attrcount" != "$vrt_attrcount" ]; then
	error "Error: the input VRT has $vrt_attrcount positional attributes, but $attrcount were specified"
    fi
    # Filtering is needed if an attribute is named "_", if the
    # attribute list contains a "-" or if --omit-attributes was used
    if word_in _ "$initial_input_attrs" ||
	    [ "${initial_input_attrs#*-}" != "$initial_input_attrs" ] ||
            [ "x$omit_attrs" != x ]
    then
	next_stage_fn=filter_and_reorder_posattrs
	next_stage_descr="Filtering out positional attributes with name \"_\" or starting with \"-\" or specified with --omit-attributes"
	preprocess_posattrs=filter
    fi
    # Reordering is needed if "word" is present but not the first
    # attribute (unless the attribute order is to be kept)
    if [ "x$keep_attr_order" = x ] &&
	   [ "$attrnum_word" != "-1" ] && [ "$attrnum_word" != "1" ]
    then
	next_stage_fn=filter_and_reorder_posattrs
	next_stage_descr2='moving "word" to be the first positional attribute'
	if [ "x$next_stage_descr" != x ]; then
	    next_stage_descr="$next_stage_descr, and $next_stage_descr2"
	else
	    # Capitalize the first letter when used alone
	    next_stage_descr=${next_stage_descr2^}
	fi
	preprocess_posattrs="$preprocess_posattrs reorder"
    fi
    # filter_and_reorder_posattrs adds a positional attributes comment
    # if it is missing, so if it will be run, do not do it here.
    if [ "x$next_stage_fn" = x ] && [ "x$initial_vrt_posattrs" = x ]; then
	process_vrt vrt_replace_posattr_names "$initial_input_attrs"
	initial_vrt_posattrs=$initial_input_attrs
    fi
    # Fill in the placeholder stage (the first empty element pair) in
    # $stages; both values are empty if no extra stage is needed
    next_stage_idx=$(first_empty_elem_index "${stages[@]}")
    stages[$next_stage_idx]=$next_stage_fn
    stages[$(($next_stage_idx + 1))]=$next_stage_descr
    input_attrs=$initial_input_attrs
}

# has_only_ud_attrs attrname ...
#
# Return true if at least one of the attribute names has the suffix
# _ud, _ud1 or _ud2 and none of the suffixed names has a
# corresponding non-suffixed name among the arguments.
# TODO: Also return false if multiple different _ud attributes,
# e.g. _ud1, _ud2
has_only_ud_attrs () {
    local names name
    local found_ud=
    names="$@"
    for name in $names; do
        if ! str_hassuffix "$name" "_ud[12]" &&
                ! str_hassuffix "$name" "_ud"
        then
            continue
        fi
        found_ud=1
        # A suffixed name with a non-suffixed counterpart
        if word_in "${name%_ud*}" "$names"; then
            return 1
        fi
    done
    [ -n "$found_ud" ]
}

# rename_ud_attrs attrname ...
#
# Output the attribute names given as arguments, separated by spaces,
# with a trailing _ud, _ud1 or _ud2 removed from each name.
rename_ud_attrs () {
    local result=
    local name
    for name; do
        name=${name%_ud[12]}
        name=${name%_ud}
        # Add a separating space only after the first name
        result=${result:+$result }$name
    done
    echo "$result"
}

# first_empty_elem_index elem ...
#
# Output the zero-based index of the first empty argument; output
# nothing if no argument is empty.
first_empty_elem_index () {
    local idx=0
    local elem
    for elem in "$@"; do
        if [ -z "$elem" ]; then
            echo $idx
            return
        fi
        idx=$((idx + 1))
    done
}

# Filter out positional attributes and/or move "word" to be the first
# positional attribute in the VRT file.
#
# An attribute is dropped if its name is "_", begins with "-" or is
# listed in $omit_attrs. If $preprocess_posattrs contains "reorder",
# "word" is output first. The work is done by an awk script, run via
# process_vrt, printing the kept fields of each token line; lines
# beginning with "<" are passed through intact. Afterwards,
# initial_input_attrs and input_attrs are set to the kept names, and
# the positional-attributes comment is adjusted if the names differ
# from $initial_vrt_posattrs.
filter_and_reorder_posattrs () {
    # Names of the attributes to keep, in the output order
    local attrs_names=
    # The corresponding awk field references ("$N"), comma-separated
    local attrs_nums=
    # Shell code evaluated for each attribute; false for "word" when
    # reordering, as "word" has then already been added first
    local word_filter=true
    local skip_attrs=
    # set -vx
    if word_in reorder "$preprocess_posattrs"; then
	attrs_names=word
	attrs_nums="\$$(word_index word $initial_input_attrs)"
	word_filter='[ "$attrname" != "word" ]'
    fi
    local attrname
    local attrnum=1
    for attrname in $initial_input_attrs; do
	if [ "$attrname" != "_" ] &&
	       [ "${attrname#-}" = "$attrname" ] &&
               ! word_in "$attrname" "$omit_attrs" &&
	       eval "$word_filter"
	then
	    attrs_names="$attrs_names $attrname"
	    attrs_nums="$attrs_nums, \$$attrnum"
        elif [ "$attrname" != "word" ]; then
            skip_attrs="$skip_attrs $attrname"
	fi
	attrnum=$(($attrnum + 1))
    done
    # Remove the leading separators left if "word" was not added first
    attrs_names=${attrs_names# }
    attrs_nums=${attrs_nums#,}
    if [ "x$skip_attrs" != x ]; then
        echo_verb "  Filtering out positional attributes:$skip_attrs"
    fi
    # set +vx
    process_vrt awk -F"$tab" '
		BEGIN {
		    OFS = "\t"
	        }
		/^</ { print }
		/^[^<]/ { print '"$attrs_nums"' }'
    initial_input_attrs=$attrs_names
    input_attrs=$attrs_names
    if [ "$initial_vrt_posattrs" != "$attrs_names" ]; then
        initial_vrt_posattrs=$attrs_names
        adjust_posattrs_comment
    fi
}

# Generate $vrt_file from the existing data of $corpus (with
# $cwbdata2vrt) and set existing_token_count, input_attrs and
# initial_input_attrs based on the existing corpus.
generate_input () {
    # TODO: Set initial_vrt_posattrs here too
    local attr_lines
    existing_token_count=$(get_corpus_token_count $corpus)
    echo_verb "  $existing_token_count tokens in the existing data"
    run_cmd $cwbdata2vrt $corpus > $vrt_file
    attr_lines=$(corpus_list_attrs $corpus POS)
    # An unquoted expansion passed to echo converts the newlines in
    # the output of corpus_list_attrs to spaces
    input_attrs=$(echo $attr_lines)
    # lex becomes lex/, as lemgram has feature-set values
    input_attrs=$(suffix_word "$input_attrs" lex /)
    initial_input_attrs=$input_attrs
}

# Extract a random seed from $vrt_file to $seed_file with
# $vrt_extract_seed.
#
# The base seed is $random_seed, or the corpus id if $random_seed has
# the default value. The value of --count is 256, or
# input_token_count / 4 + 1 if the input has fewer than 1024 tokens.
extract_random_seed () {
    local seed_count=256
    # With adding 1, the count cannot be 0
    [ $input_token_count -lt $((seed_count * 4)) ] &&
        seed_count=$((input_token_count / 4 + 1))
    [ "$random_seed" = "$random_seed_default" ] &&
        random_seed=$corpus
    run_cmd $vrt_extract_seed --separator=" " --baseseed="$random_seed" \
            --count=$seed_count --distance=random --last=$input_token_count \
            $vrt_file > $seed_file
}

# Output a reason for skipping extract_random_seed: the seed is needed
# only for adding structure ids or scrambling, and it is not extracted
# again if $seed_file already exists.
test_skip_extract_random_seed () {
    if [ -n "$add_struct_ids$scramble" ]; then
        if [ -e "$seed_file" ]; then
            echo "already extracted"
        fi
    else
        echo "not required"
    fi
}

# Run $vrt_fix_attrs on the VRT file via process_vrt, with the items
# in $copy_struct_attr passed through add_prefix to make
# --copy-struct-attribute options.
copy_struct_attrs () {
    local copy_opts
    copy_opts=$(add_prefix "--copy-struct-attribute " $copy_struct_attr)
    process_vrt $vrt_fix_attrs $copy_opts
}

# Output a reason for skipping copy_struct_attrs if
# $copy_struct_attr is empty; otherwise return false.
test_skip_copy_struct_attrs () {
    if [ -z "$copy_struct_attr" ]; then
        echo "not requested"
    else
        return 1
    fi
}

# Run $vrt_rename_struct_attrs on the VRT file via process_vrt, with
# the items in $rename_struct_attr as its arguments.
rename_struct_attrs () {
    local -a rename_cmd=($vrt_rename_struct_attrs $rename_struct_attr)
    process_vrt "${rename_cmd[@]}"
}

# Output a reason for skipping rename_struct_attrs if
# $rename_struct_attr is empty; otherwise return false.
test_skip_rename_struct_attrs () {
    if [ -z "$rename_struct_attr" ]; then
        echo "not requested"
    else
        return 1
    fi
}

# Remove the lines matching the extended regular expression
# $omit_structures from the VRT file (grep -Ev via process_vrt).
omit_structures () {
    local -a filter_cmd=(grep -Ev "$omit_structures")
    process_vrt "${filter_cmd[@]}"
}

# Output a reason for skipping omit_structures if $omit_structures is
# empty; otherwise return false.
test_skip_omit_structures () {
    if [ -z "$omit_structures" ]; then
        echo "not requested"
    else
        return 1
    fi
}

# Add positional attribute lemmacomp after lemma in the VRT file with
# $vrt_add_lemma_nobound, and add lemmacomp after lemma in
# $input_attrs.
add_lemmas_without_boundaries () {
    # TODO: Recognize lemma attributes suffixed with annotation scheme
    # (lemma_ud1) and add the corresponding lemmas without compound
    # boundaries
    local -a add_cmd=(
        $vrt_add_lemma_nobound
        --mode=$compound_boundary_alg
        --compound-boundary-marker="$compound_boundary_marker"
        --add-name=lemmacomp
        --add-type=boundaries
        --add-after-name=lemma
    )
    process_vrt "${add_cmd[@]}"
    input_attrs=$(suffix_word "$input_attrs" lemma " lemmacomp")
}

# Output a reason for skipping add_lemmas_without_boundaries: adding
# not requested, no lemma attribute in the input, or lemmacomp already
# present in the existing corpus or in the input.
test_skip_add_lemmas_without_boundaries () {
    if [ -z "$lemmas_without_boundaries" ]; then
        echo "requested not to add"
        return
    fi
    if ! word_in lemma "$initial_input_attrs"; then
        echo "no attribute lemma"
        return
    fi
    if corpus_exists $corpus && corpus_has_attr $corpus p lemmacomp; then
        echo "already present"
    elif word_in lemmacomp "$initial_input_attrs"; then
        echo "already present"
    fi
}

# Add lemgrams to the VRT file with $vrt_add_lemgrams, based on the
# lemma and pos attributes, and append lex/ to $input_attrs.
add_lemgrams () {
    local lemma_field pos_field
    lemma_field=$(get_attr_num lemma "$input_attrs")
    pos_field=$(get_attr_num pos "$input_attrs")
    process_vrt $vrt_add_lemgrams --pos-map-file "$lemgram_posmap" \
        --lemma-field=$lemma_field --pos-field=$pos_field \
        $add_lemgrams_opts
    [ $? = 0 ] || exit_on_error false
    input_attrs="$input_attrs lex/"
    # vrt-add-lemgrams does not handle the positional-attributes
    # comment, so the comment is brought in sync with the content
    # separately, even though it is some extra work
    adjust_posattrs_comment
}

# Output a reason for skipping add_lemgrams: adding not requested,
# lemgrams (lex) already present in the existing corpus or in the
# input, or the input lacks the lemma or pos attribute.
test_skip_add_lemgrams () {
    local attr
    if [ -z "$lemgrams" ]; then
        echo "requested not to add"
        return
    fi
    if { corpus_exists $corpus && corpus_has_attr $corpus p lex; } ||
            word_in lex/ "$initial_input_attrs"
    then
        echo "already present"
        return
    fi
    # Both attributes are accepted also as feature-set attributes
    for attr in lemma pos; do
        if word_in $attr "$initial_input_attrs" ||
                word_in $attr/ "$initial_input_attrs"
        then
            continue
        fi
        echo "no attribute $attr"
        return
    done
}

# Add date information to the VRT file with $vrt_extract_timespans
# (--mode=add), with options based on $corpus_date,
# $corpus_date_ranges, $corpus_date_full_order and
# $corpus_date_pattern.
add_datefromto () {
    local opts=
    case "$corpus_date" in
        unknown )
            opts=--unknown
            ;;
        "" )
            ;;
        * )
            opts=--fixed=$corpus_date
            ;;
    esac
    [ -z "$corpus_date_ranges" ] ||
        opts="$opts --ranges"
    [ -z "$corpus_date_full_order" ] ||
        opts="$opts --full-dates --full-date-order=$corpus_date_full_order"
    [ -z "$corpus_date_pattern" ] ||
        opts="$opts --pattern $(quote_args_safe "$corpus_date_pattern")"
    # eval is needed for the quoted argument of --pattern to be
    # passed on correctly
    eval process_vrt $vrt_extract_timespans --mode=add \
        --output-full-dates=always $opts
}

# Output a reason for skipping add_datefromto: $vrt_file has no text
# start tags with attributes, or a text start tag already has
# attribute datefrom.
test_skip_add_datefromto () {
    if grep -q -s '^<text ' $vrt_file; then
        if grep -q -s '^<text.* datefrom="' $vrt_file; then
            echo "already present"
        fi
    else
        echo "no text structures"
    fi
}

# Sort the text elements in the VRT file by running $vrt_sort with
# the options in $text_sort_opts via process_vrt.
sort_texts () {
    # Use eval to get the separate options in $text_sort_opts
    # correctly
    # NOTE(review): eval re-parses $text_sort_opts as shell code, so
    # the value presumably contains shell-quoted options -- confirm
    # where it is constructed
    eval process_vrt $vrt_sort $text_sort_opts
}

# Output a reason for skipping sort_texts if $sort_texts is empty;
# otherwise return false.
test_skip_sort_texts () {
    if [ -z "$sort_texts" ]; then
        echo "not requested"
    else
        return 1
    fi
}

# Copy $vrt_file to $vrt_file_full, preserving the file attributes
# (cp -p).
save_full_vrt () {
    local -a copy_cmd=(cp -p $vrt_file $vrt_file_full)
    run_cmd "${copy_cmd[@]}"
}

# Output a reason for skipping save_full_vrt if $augment_data is
# non-empty; otherwise return false.
test_skip_save_full_vrt () {
    if [ -z "$augment_data" ]; then
        return 1
    fi
    echo "not done when augmenting data"
}

# find_containing_struct struct vrt_fname
#
# Print the structure immediately containing struct in the VRT file
# vrt_fname, an empty string if struct is the top-level structure. If
# struct is not found in vrt_fname, return error (the non-zero exit
# status of grep).
#
# Start tags both with and without attributes ("<text id=...>",
# "<paragraph>") are recognized.
#
# This assumes that if the text has paragraphs, no sentence is outside
# a paragraph. Likewise, if the text has chapters between text and
# paragraph, all paragraphs should be inside a chapter. This also does
# not work with crossing structures, like lines or pages interleaved
# with others. It is unclear how or if scrambling should be done in
# such cases.
find_containing_struct () {
    local struct vrt_fname exitcode
    struct=$1
    vrt_fname=$2
    # Require a complete tag name, so that a name that is a prefix of
    # another structure name is not considered found
    grep -q -s "^<$struct[ >]" "$vrt_fname"
    exitcode=$?
    if [ $exitcode != 0 ]; then
        return $exitcode
    fi
    grep '^<' "$vrt_fname" |
    awk -v target="$struct" '
        BEGIN { top = 0 }
        # Start tags (not end tags nor comments)
        /^<[^\/!]/ {
            name = substr($1, 2)
            # A tag without attributes has the closing ">" in $1
            sub(/>.*$/, "", name)
            if (name == target) {
                print (top > 0 ? stack[top] : "")
                exit 0
            }
            stack[++top] = name
        }
        /^<\// { if (top > 0) { top-- } }
    '
    return 0
}

# Drop structural attributes from the VRT file with $vrt_drop_attrs,
# as specified in $omit_struct_attr.
#
# Each item in $omit_struct_attr is either "attrlist", applying to
# all structures, or "struct/attrlist", applying to struct only; the
# attrlist is converted to --drop and --keep options by
# _make_vrt_drop_attrs_opts.
omit_struct_attrs () {
    # NOTE(review): drop and keep are not used in this function
    local opt attrlist drop keep struct drop_attrs_opts
    local global_opts=
    local struct_opts=
    for opt in $omit_struct_attr; do
        # The part following the first slash, or all of $opt if it
        # contains no slash
        attrlist=${opt#*/}
        drop_attrs_opts="$(_make_vrt_drop_attrs_opts "$attrlist")"
        # echo "$attrlist:$drop_attrs_opts"
        if [ "$attrlist" = "$opt" ]; then
            # No structure specified; applies to all
            # NOTE(review): a later item without a structure replaces
            # the options of an earlier one -- confirm that this is
            # intended
            global_opts="$drop_attrs_opts"
        else
            struct=${opt%%/*}
            struct_opts="$struct_opts --structure $struct $drop_attrs_opts"
        fi
    done
    # Use eval to preserve the quotation marks in $global_opts and
    # $struct_opts
    eval process_vrt $vrt_drop_attrs $global_opts $struct_opts
}

# _make_vrt_drop_attrs_opts attrlist
#
# Output options --drop and --keep (with single-quoted arguments)
# for attrlist of the form "drop", "drop!keep" or "!keep".
_make_vrt_drop_attrs_opts () {
    local spec=$1
    local result=
    local drop_part=${spec%%!*}
    # "!attr" keeps attr, overriding global --keep, but does not add
    # --drop "" (not dropping anything); a completely empty attrlist
    # does add it
    if [ -n "$drop_part" ] || [ -z "$spec" ]; then
        result="--drop '$drop_part'"
    fi
    # The keep part is present only if attrlist contains an "!"
    case $spec in
        *!* )
            result="$result --keep '${spec#*!}'"
            ;;
    esac
    echo "$result"
}

# Output a reason for skipping omit_struct_attrs if
# $omit_struct_attr is empty; otherwise return false.
test_skip_omit_struct_attrs () {
    if [ -z "$omit_struct_attr" ]; then
        echo "not requested"
    else
        return 1
    fi
}

# Scramble the structures listed in $scramble within their immediately
# containing structures with $vrt_scramble, using the seed in
# $seed_file.
#
# Warn about structures not in the VRT file, top-level structures
# (neither of which is scrambled) and structures other than sentence,
# paragraph and link (which are scrambled nevertheless).
scramble_structs () {
    local unit within status
    for unit in $scramble; do
        within=$(find_containing_struct $unit "$vrt_file")
        status=$?
        if [ $status -ne 0 ]; then
            warn "Input VRT has no $unit structures specified with --scramble"
        elif [ -z "$within" ]; then
            warn "Cannot scramble top-level structures $unit"
        elif ! word_in "$unit" "sentence paragraph link"; then
            warn "Scrambling $unit structures instead of the usual sentence, paragraph or link"
        fi
        [ -z "$within" ] ||
            process_vrt $vrt_scramble --seed "<$seed_file" \
                --unit $unit --within $within
    done
}

# Output a reason for skipping scramble_structs if $scramble is empty;
# otherwise return false.
test_skip_scramble_structs () {
    if [ -z "$scramble" ]; then
        echo "not requested"
    else
        return 1
    fi
}

# Add id attributes to the structures listed in $add_struct_ids with
# $vrt_add_id, using the corpus id as the hash and the seed in
# $seed_file.
#
# Existing ids are overwritten (--force) unless $keep_struct_ids is
# non-empty, in which case --rename is used if any of the structures
# already has an id. --element (and --format from struct_id_format)
# options are added unless the structures are exactly text, paragraph
# and sentence.
add_struct_ids () {
    local struct
    local opts=--force
    local non_default=
    if [ -n "$keep_struct_ids" ]; then
        opts=
        # If the VRT data already has id attributes, rename them
        for struct in $add_struct_ids; do
            if vrt_has_struct_id $struct; then
                opts=--rename
                break
            fi
        done
    fi
    # The defaults of vrt-add-id can be used only if $add_struct_ids
    # contains "text", "paragraph" and "sentence" (the default) ...
    for struct in text paragraph sentence; do
        word_in "$struct" "$add_struct_ids" || non_default=1
    done
    # ... and nothing else
    for struct in $add_struct_ids; do
        word_in $struct "text paragraph sentence" || non_default=1
    done
    if [ -n "$non_default" ]; then
        for struct in $add_struct_ids; do
            opts="$opts --element=$struct"
            if [ -n "${struct_id_format[$struct]}" ]; then
                opts="$opts --format=${struct_id_format[$struct]}"
            fi
        done
    fi
    process_vrt $vrt_add_id --hash=$corpus --seed="<$seed_file" $opts
}

# vrt_has_struct_id struct
#
# Return true if the first start tag of struct in $vrt_file has
# attribute "id". If called without an argument, the value of
# variable struct of the caller is used (the previous behaviour).
vrt_has_struct_id () {
    # Note that this also returns true if an attribute value ends in
    # " id=".
    # If $vrt_file does not contain $struct structures, this greps
    # through the whole file. An alternative could be to take only a
    # head of $vrt_file, but it would be more complex, as it would
    # have to handle SIGPIPE somehow.
    #
    # The argument used to be ignored, so that the function worked
    # only if the caller happened to have the structure name in a
    # variable named struct.
    local struct=${1-$struct}
    grep -m1 "^<$struct[ >]" "$vrt_file" |
        grep -sq ' id="'
}

# Output a reason for skipping add_struct_ids: adding not requested
# ($add_struct_ids empty) or $vrt_file contains none of the
# structures listed in $add_struct_ids.
test_skip_add_struct_ids () {
    local struct
    if [ -z "$add_struct_ids" ]; then
        echo "requested not to add"
        return
    fi
    for struct in $add_struct_ids; do
        # One structure present is enough for not skipping
        if grep -q -s "^<$struct[ >]" "$vrt_file"; then
            return 0
        fi
    done
    echo "no structures ($(delimit ", " $add_struct_ids)) present"
}

# filter_new_attrs attrname ...
#
# Output (one per line, in the order of the arguments) the
# attribute names that are not positional attributes of $corpus
# according to "$cwb_describe_corpus -s". In addition, lemma is
# output if the corpus does not have lemmacomp. If
# $cwb_describe_corpus fails, output all the arguments on one line.
#
# A single argument may also contain several names separated by
# spaces. The output of $cwb_describe_corpus is saved to
# $tmp_prefix.corpusattrs.
filter_new_attrs () {
    $cwb_describe_corpus -s $corpus > $tmp_prefix.corpusattrs 2> /dev/null
    if [ $? != 0 ]; then
	echo "$@"
    else
	# The attributes are output in the argument order; iterating
	# with "for (i in attrs)" would give an unspecified order.
	# $@ is intentionally unquoted to split arguments containing
	# several names.
	awk '
	    BEGIN {
		nattrs = ARGC - 1
		for (i = 1; i <= nattrs; i++) { attrs[i] = ARGV[i] }
		# Make awk read the standard input instead of treating
		# the attribute names as input file names
		ARGC = 1
	    }
	    /^p-ATT/ { old_attrs[$2] = 1 }
	    END {
		for (i = 1; i <= nattrs; i++) {
		    # Remove the feature-set attribute marker
		    attrname_bare = attrs[i]
		    gsub (/\//, "", attrname_bare)
		    # Lemma needs to be recoded as lemmacomp if lemmacomp is
		    # not already present
		    if (! (attrname_bare in old_attrs) \
			|| (attrname_bare == "lemma" \
			    && ! ("lemmacomp" in old_attrs))) {
			print attrs[i]
		    }
		}
	    }' $@ < $tmp_prefix.corpusattrs
    fi
}

# add_attrs_to_registry structspecs
#
# Add the positional attributes in $input_attrs and the structural
# attributes in structspecs to the registry file of $corpus.
# structspecs is a space-separated list of specifications in which
# the structure name (possibly followed by :N) and its attribute
# names are separated by "+".
add_attrs_to_registry () {
    local spec
    # As cwb_registry_add_posattr adds only non-existing attributes,
    # $input_attrs may also contain attributes already existing in the
    # corpus.
    cwb_registry_add_posattr $corpus ${input_attrs///}
    for spec in $1; do
        # Drop the nesting depth
        spec=${spec/:[0-9]/}
        # Split to the structure name followed by attribute names
        cwb_registry_add_structattr $corpus ${spec//+/ }
    done
    # Move lemmacomp to immediately after lemma if needed. This is
    # relevant when augmenting existing data: without this, lemmacomp
    # would come after all the other positional attributes that the
    # data has after lemma.
    cwb_registry_reorder_posattrs $corpus lemma lemmacomp
}

# Encode the VRT file to CWB data files in $datadir with $cwb_encode.
#
# The structural attributes are inferred from the tag lines of
# $vrt_file with $vrt_list_struct_attrs, and the VRT is passed
# through $vrt_convert_chars --encode before encoding. For a new
# corpus, cwb-encode also generates the registry file; when
# augmenting existing data, the attributes are added to the existing
# registry file with add_attrs_to_registry.
cwb_encode () {
    local structnames reg_opt
    # NOTE(review): this writes the attribute list to the standard
    # output unconditionally; looks like debugging output -- confirm
    echo cwb_encode: "$input_attrs"
    echo_verb "  Inferring structural attributes from the VRT file"
    structnames=$(grep '^<' $vrt_file | run_cmd $vrt_list_struct_attrs)
    mkdir_perms $datadir
    reg_opt=
    # For new corpora, let cwb-encode generate the registry file
    if [ "x$augment_data" = x ]; then
	reg_opt="-R $cwb_regdir/$corpus"
    fi
    # $input_attrs contains "word" (unless --no-word-attribute), so
    # use "cwb-encode -p -" to avoid adding "word" twice
    #
    # NOTE(review): tee saves a copy of the converted VRT to
    # $tmp_prefix.tee; the copy is not used in this function
    run_cmd $vrt_convert_chars \
	--encode --feature-set-struct-attributes "$structnames" < $vrt_file |
    tee $tmp_prefix.tee |
    run_cmd $cwb_encode -d $datadir $reg_opt -p - \
	-xsB -c utf8 $(add_prefix "-P " $input_attrs) \
	$(add_prefix "-S " $structnames)
    # For existing corpora, add the new attributes to the registry
    if [ "x$augment_data" != x ]; then
	add_attrs_to_registry "$structnames"
    fi
}

# Output a reason for skipping cwb_encode (and return true) if not
# augmenting data, the input has no attributes missing from the
# existing corpus and the registry file of the corpus is non-empty;
# otherwise return false.
test_skip_cwb_encode () {
    # For practical reasons, the global variable new_attrs is
    # initialized here.
    new_attrs=$(filter_new_attrs "${input_attrs///}")
    [ -z "$augment_data" ] || return
    [ -z "$new_attrs" ] || return
    [ -s $cwb_regdir/$corpus ] || return
    echo "already encoded"
}

# Run $cwb_make on $corpus with registry directory $cwb_regdir,
# file group $filegroup, permissions 664 and option -M 2000.
cwb_make () {
    local -a make_cmd=(
        $cwb_make -r $cwb_regdir -g $filegroup -p 664 -M 2000 $corpus
    )
    run_cmd "${make_cmd[@]}"
}

test_skip_cwb_make () {
    [ "x$augment_data" = x ] &&
    [ "x$new_attrs" = x ] &&
    datadir_has_all_index_files &&
    # cwb-make removes *.corpus, *.corpus.rev, *.corpus.rdx after
    # indexing and compressing
    ! ls $datadir/*.corpus{,.rev,.rdx} &> /dev/null &&
    echo "already done"
}

# Check if the data directory has all the files it should contain for
# all positional attributes after running cwb-make
#
# Return true if $datadir has a non-empty file ATTR.SUFFIX for each
# positional attribute ATTR in $input_attrs (with the feature-set
# markers "/" removed) and each SUFFIX listed below.
datadir_has_all_index_files () {
    local attr suffix
    # The files remaining for an indexed and compressed attribute.
    # The suffixes are listed explicitly, since a brace expansion
    # directly following "$attr" would be made before expanding the
    # variable, thus changing the variable name.
    local suffixes="corpus.cnt crc crx hcd huf huf.syn lexicon lexicon.idx lexicon.srt"
    for attr in ${input_attrs///}; do
        for suffix in $suffixes; do
            if [ ! -s "$datadir/$attr.$suffix" ]; then
                return 1
            fi
        done
    done
    return 0
}

# Run $korp_convert_timedata on $corpus, with $tsvdir as the TSV
# directory and $corpus_root as the corpus root.
convert_timedata () {
    local -a convert_cmd=(
        $korp_convert_timedata
        --tsv-dir "$tsvdir"
        $verbose_opt
        --corpus-root "$corpus_root"
        $corpus
    )
    run_cmd "${convert_cmd[@]}"
}

# Output a reason for skipping convert_timedata (and return true) if
# the corpus has structural attribute text_timefrom and the time data
# TSV file is non-empty; otherwise return false.
test_skip_convert_timedata () {
    corpus_has_attr $corpus s text_timefrom || return
    [ -s $tsvdir/${corpus}_timedata.tsv.gz ] || return
    echo "already converted"
}

# Write the output of $cwbdata_extract_info for $corpus to the file
# .info in $datadir.
extract_info () {
    local -a info_cmd=($cwbdata_extract_info --tsv-dir "$tsvdir" $corpus)
    # No --verbose here: it would add the corpus id to the .info file,
    # which is not desired.
    run_cmd "${info_cmd[@]}" > $datadir/.info
}

# Extract lemgrams from $vrt_file with $vrt_extract_lemgrams to the
# gzipped file ${corpus}_lemgrams.tsv.gz in $tsvdir. The lemgram
# field is the one that word_index outputs for lex/ in $input_attrs.
extract_lemgrams () {
    local lemgram_field
    lemgram_field=$(word_index lex/ $input_attrs)
    run_cmd $vrt_extract_lemgrams \
            --corpus-id $corpus --lemgram-field $lemgram_field $vrt_file |
        gzip > $tsvdir/${corpus}_lemgrams.tsv.gz
}

# Output a reason for skipping extract_lemgrams: $input_attrs has no
# lex/, or the lemgram TSV file is already non-empty.
test_skip_extract_lemgrams () {
    if word_in lex/ "$input_attrs"; then
        if [ -s $tsvdir/${corpus}_lemgrams.tsv.gz ]; then
            echo "already extracted"
        fi
    else
        echo "lemgrams not present"
    fi
}

# Extract word picture relations from $vrt_file with
# $run_extract_rels to files in $tsvdir, using the relation map
# $wordpict_relmap.
extract_wordpict_rels () {
    # Remove the feature-set markers "/" from all the attribute names
    # (as is done elsewhere with ${input_attrs///}), not only from
    # the last one
    run_cmd $run_extract_rels --corpus-name $corpus \
	--input-fields "${input_attrs///}" \
	--output-dir "$tsvdir" --relation-map "$wordpict_relmap" \
	--optimize-memory --no-tar \
	< $vrt_file
}

# Output a reason for skipping extract_wordpict_rels: a required
# positional attribute is missing, extracting is not requested or all
# the relation files already exist with content. If remaking the data
# is requested, remove the existing relation files and do not skip.
test_skip_extract_wordpict_rels () {
    local attr suff fname
    local missing_attrs=
    local pl=
    # Suffixes in word picture relation database file names
    local relfile_suffs=("" _dep_rel _head_rel _rel _sentences _strings)
    for attr in lemma pos deprel dephead ref; do
        if word_in $attr "$input_attrs" ||
                word_in $attr/ "$input_attrs"
        then
            continue
        fi
        missing_attrs="$missing_attrs $attr"
    done
    if [ -n "$missing_attrs" ]; then
        [ "$(count_words $missing_attrs)" -gt 1 ] && pl=s
        echo "no positional attribute$pl $(delimit ", " $missing_attrs)"
        return
    fi
    if [ -z "$wordpicture" ]; then
        echo "requested not to extract"
        return
    fi
    # Requested to remake
    if [ -n "$remake_wordpicture_data" ]; then
        rm -f $tsvdir/${corpus}_rels*.tsv.gz
        return
    fi
    # All the files are remade if any of them does not exist or is
    # empty (when uncompressed)
    for suff in "${relfile_suffs[@]}"; do
        fname=$tsvdir/${corpus}_rels$suff.tsv.gz
        if ! { [ -r $fname ] && [ -s $fname ]; }; then
            return 0
        fi
        # Only a small compressed file can be empty when uncompressed
        if [ "$(wc -c < $fname)" -lt 80 ] &&
               [ "$(zcat $fname 2> /dev/null | wc -c)" = "0" ]
        then
            return 0
        fi
    done
    echo "already extracted"
}

# Run $vrt_add_name_attrs on $corpus with the arguments @data @data.
add_name_attrs () {
    local -a name_cmd=($vrt_add_name_attrs $corpus @data @data)
    run_cmd "${name_cmd[@]}"
}

# Output a reason for skipping add_name_attrs: adding not requested,
# no positional attribute nertag in $input_attrs, or the corpus
# already has structural attribute ne_ex.
test_skip_add_name_attrs () {
    if [ -z "$name_attrs" ]; then
        echo "requested not to add"
        return
    fi
    if ! word_in "nertag" "$input_attrs"; then
        echo "NER tags not present (positional attribute nertag)"
        return
    fi
    if corpus_has_attr $corpus s ne_ex; then
        echo "already present"
    fi
}

# Run vrt_replace_posattr_names with $input_attrs on the VRT file via
# process_vrt, to make the positional-attributes comment correspond
# to the current attributes.
adjust_posattrs_comment () {
    local -a replace_cmd=(vrt_replace_posattr_names "$input_attrs")
    process_vrt "${replace_cmd[@]}"
}

# Output a reason for skipping adjust_posattrs_comment if $input_attrs
# is the same as $initial_vrt_posattrs; otherwise return false.
test_skip_adjust_posattrs_comment () {
    if [ "$input_attrs" != "$initial_vrt_posattrs" ]; then
        return 1
    fi
    echo "positional attributes not changed"
}

# Compress the VRT files and create a corpus package with
# $korp_make_corpus_package.
#
# Options for extra files and directories to be included in the
# package are added based on the variables korp_frontend_dir and
# package_*. If $seed_file exists, it is included as an extra file
# under $vrt_subdir. For a new corpus, the compressed $vrt_file and
# $vrt_file_full are included; when augmenting existing data,
# --update-vrt is used instead.
make_corpus_package () {
    local extra_opts= vrt_opt= seed_file_opt=
    local spec optname varname
    # Pairs of option name and the variable containing its value. All
    # the options are appended to extra_opts: --readme-file used to
    # replace the value, which dropped a preceding
    # --korp-frontend-dir.
    for spec in \
        korp-frontend-dir:korp_frontend_dir \
        readme-file:package_readme_file \
        doc-dir:package_doc_dir \
        doc-file:package_doc_file \
        script-dir:package_script_dir \
        script-file:package_script_file \
        extra-dir:package_extra_dir \
        extra-file:package_extra_file
    do
        optname=${spec%%:*}
        varname=${spec#*:}
        if [ -n "${!varname}" ]; then
            extra_opts="$extra_opts --$optname=${!varname}"
        fi
    done
    if [ -e "$seed_file" ]; then
        seed_file_opt="--extra-file=$seed_file:$vrt_subdir/"
    fi
    run_cmd gzip --no-name --force $vrt_file $vrt_file_full
    if [ "x$augment_data" = x ]; then
        vrt_opt="--vrt-file $vrt_file.gz --vrt-file $vrt_file_full.gz"
    else
        vrt_opt="--update-vrt"
    fi
    run_cmd $korp_make_corpus_package --target-corpus-root /v/corpora \
        --corpus-root "$corpus_root" \
        --tsv-dir "$tsvdir" --database-format tsv --compress gzip \
        $vrt_opt $auth_opts $seed_file_opt $extra_opts $corpus
}

# Output a reason for skipping make_corpus_package if $make_package
# is empty; otherwise return false.
test_skip_make_corpus_package () {
    if [ -z "$make_package" ]; then
        echo "requested not to create"
    else
        return 1
    fi
}

# Import the lemgram and time data TSV files of the corpus (and the
# word picture relation files, if $wordpicture is non-empty) in
# $tsvdir to the database with $korp_mysql_import. The file names are
# left in the global variable tsv_files.
import_database () {
    local -a basenames=(lemgrams.tsv.gz timedata.tsv.gz timedata_date.tsv.gz)
    tsv_files="$(add_prefix $tsvdir/${corpus}_ "${basenames[@]}")"
    [ -z "$wordpicture" ] ||
        tsv_files="$tsv_files $(echo $tsvdir/${corpus}_rels*.tsv.gz)"
    run_cmd $korp_mysql_import --prepare-tables --relations-format new \
        $tsv_files
}

# Output a reason for skipping import_database if variable
# $import_database is empty; otherwise return false.
test_skip_import_database () {
    if [ -z "$import_database" ]; then
        echo "not requested"
    else
        return 1
    fi
}


# Make the Korp corpus $corpus: check for existing data, run all the
# processing stages and finally ensure the permissions of the
# resulting files.
main () {
    echo_verb "Making Korp corpus $corpus:"
    # $top_stdout is used by run_cmd to output the command to the top
    # stdout even if the command is run in a pipeline.
    top_stdout=3
    # File descriptor 3 is made to refer to the standard output
    exec 3> /dev/stdout
    # Make a pipeline fail if any of its commands fails
    set -o pipefail
    check_for_existing_data
    run_stages
    # Close file descriptor 3
    exec 3>&-
    # The error messages of ensure_perms are discarded
    ensure_perms $tsvdir/* $vrtdir/* $datadir/* $cwb_regdir/$corpus 2> /dev/null
    echo_verb "Completed."
}


# Output the start time, run main via time_cmd and output the end
# time.
echo_verb $(date +'[%F %T]')
# FIXME: The format is not effective, since the formats used in inner
# time_cmd calls overwrite the format (TIMEFORMAT environment
# variable).
time_cmd --format "- Total CPU time used: %U %R" main "$@"
echo_verb $(date +'[%F %T]')