-
Notifications
You must be signed in to change notification settings - Fork 12
/
train_test_mwe.sh
executable file
·68 lines (48 loc) · 2.89 KB
/
train_test_mwe.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/bin/bash
# Train and test the MWE identification system
# This requires the CMWE corpus; see README.md to download
# and follow the instructions to restore the full text from the LDC data.
# (To predict with an existing model, see mwe_identify.sh.)
# Strict mode: abort on any failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Directory of the ARK Tweet NLP distribution; runTagger.sh is invoked from it.
# Defaults to the placeholder below. Override it without editing this script
# by setting ARK_TWEET_NLP in the environment, e.g.
#   ARK_TWEET_NLP="$HOME/tools/ark-tweet-nlp-0.3.2" ./train_test_mwe.sh
ark=${ARK_TWEET_NLP:-/path/to/ark-tweet-nlp-0.3.2}
# - prepare train and test data (automatic POS tags)

# Inline Python helper that materializes one data split.
#   argv[1]: corpus file, one entry per line: <sentence id> TAB <rest>
#   argv[2]: split file, one sentence id per line
# For every id in the split file it prints "<id> TAB <rest>" to stdout, in
# split-file order. It exits with an error message on a duplicated corpus id
# or on a split id that does not occur in the corpus.
# The shell string is single-quoted so that bash expands nothing inside the
# Python code; the Python code therefore uses double-quoted literals only.
SCRIPT='from __future__ import print_function
import sys

corpusFP, splitFP = sys.argv[1:]

entries = {}
with open(corpusFP) as corpus:
    for ln in corpus:
        sentid, rest = ln.split("\t", 1)
        if sentid in entries:
            sys.exit("duplicate sentence id in " + corpusFP + ": " + sentid)
        entries[sentid] = rest.strip()

with open(splitFP) as split:
    for ln in split:
        sentid = ln.strip()
        if sentid not in entries:
            sys.exit("sentence id not found in " + corpusFP + ": " + sentid)
        print(sentid, entries[sentid], sep="\t")
'

# look up entries by sentence id to populate the train and test splits
for split in train test; do
  python2.7 -c "$SCRIPT" cmwe/corpus.mwe "${split}.sentids" > "cmwe/${split}.mwe"
done
# convert to tags: run mwe2tags.py on each split's .mwe file and store its
# output next to it as a .tags file
for split in train test; do
  python2.7 src/mwe2tags.py "cmwe/${split}.mwe" > "cmwe/${split}.tags"
done
# To train the POS tagger (disabled; $in is not defined in this script and
# presumably names the tagger training data -- TODO confirm):
#java -XX:ParallelGCThreads=2 -Xmx8g -cp $ark/ark-tweet-nlp-0.3.2.jar cmu.arktweetnlp.Train $in ewtb_pos.model

for split in train test; do
  # prepare POS tagger input: columns 2, 4, 5 and 9 of the .tags file
  cut -f2,4,5,9 "cmwe/${split}.tags" > "cmwe/${split}.wdposid"

  # run POS tagger and keep only the first two columns of its output.
  # "$ark" is quoted so an install path containing spaces is not word-split.
  "$ark/runTagger.sh" --input-format conll --output-format conll --model ewtb_pos.model "cmwe/${split}.wdposid" \
    | cut -f1-2 > "cmwe/${split}.syspos.wdpos"

  # append columns 3-4 of the tagger input to the tagger output
  paste "cmwe/${split}.syspos.wdpos" <(cut -f3-4 "cmwe/${split}.wdposid") > "cmwe/${split}.syspos.wdposid"
done
# incorporate system POS tags: keep columns 1-3 and 5+ of each .tags file and
# put column 2 of the tagger output (.syspos.wdpos) in between
for split in train test; do
  paste \
    <(cut -f1-3 "cmwe/${split}.tags") \
    <(cut -f2 "cmwe/${split}.syspos.wdpos") \
    <(cut -f5- "cmwe/${split}.tags") \
    > "cmwe/${split}.syspos.tags"
done
# - learning on training set, predict on test set

# Arguments passed after --lex. The trailing /dev/null was supposed to be
# ignored, but it does have a small effect, so it stays in the list.
lex_args=(
  mwelex/{semcor_mwes,wordnet_mwes,said,phrases_dot_net,wikimwe,enwikt}.json
  /dev/null
)

# Predictions written by main.py on stdout are captured in
# testpredictions.syspos.tags.
python2.7 src/main.py \
  --mwe \
  --YY tagsets/bio2g \
  --defaultY O \
  --train cmwe/train.syspos.tags \
  --test-predict cmwe/test.syspos.tags \
  --iters 3 \
  --debug \
  --save mwe.model \
  --bio NO_SINGLETON_B \
  --clusters \
  --cluster-file mwelex/yelpac-c1000-m25.gz \
  --lex "${lex_args[@]}" \
  --includeLossTerm \
  --costAug 100 \
  > testpredictions.syspos.tags
# - convert predictions to .mwe, and do the same for the test split's .tags
#   file so that both inputs of the evaluation are in .mwe form
predicted_mwe=testpredictions.syspos.mwe
test_mwe=cmwe/test.withtags.mwe
python2.7 src/tags2mwe.py testpredictions.syspos.tags > "$predicted_mwe"
python2.7 src/tags2mwe.py cmwe/test.tags > "$test_mwe"

# - evaluate: mweval.py receives the converted test file first and the
#   converted predictions second; its report goes to testpredictions.eval
python2.7 src/mweval.py --default-strength strong "$test_mwe" "$predicted_mwe" > testpredictions.eval