-
Notifications
You must be signed in to change notification settings - Fork 3
/
ftb2vrt.pl
executable file
·128 lines (115 loc) · 2.32 KB
/
ftb2vrt.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#! /usr/bin/perl -w
use strict;
use warnings;
use Getopt::Long;
# Parse the command-line options from @ARGV and return a reference to
# a hash of option values.
#
# Hash keys, their options and defaults:
#   lemgrams      --lemgrams / --no-lemgrams                  (default 1)
#   msd_sep       --msd-separator=STR, --morpho-tag-separator=STR
#                                                             (default '|')
#   fix_msd_tags  --fix-msd-tags / --no-fix-msd-tags,
#                 --fix-morpho-tags / --no-fix-morpho-tags    (default 1)
#   word_nums     --word-numbers / --no-word-numbers,
#                 --word-nums / --no-word-nums                (default 1)
#
# GetOptions removes the options it recognizes from @ARGV, so whatever
# remains (typically input file names) is left for the caller.
#
# Dies if the command line contains an invalid option, instead of
# silently continuing with the default values.
sub getopts
{
    my %opts = (lemgrams     => 1,
                msd_sep      => '|',
                fix_msd_tags => 1,
                word_nums    => 1);
    # GetOptions returns false (after warning about the offending
    # option) when the command line is invalid; treat that as fatal.
    GetOptions ('lemgrams!' => \$opts{lemgrams},
                'msd-separator|morpho-tag-separator=s' => \$opts{msd_sep},
                'fix-msd-tags|fix-morpho-tags!' => \$opts{fix_msd_tags},
                'word-numbers|word-nums!' => \$opts{word_nums})
        or die "$0: invalid command-line options\n";
    return \%opts;
}
# Build a lemgram string of the form "|LEMMA..POS.1|" from a lemma and
# a source part-of-speech tag.
#
# Arguments:
#   $lemma - the lemma (base form), used verbatim
#   $pos   - the source PoS tag, mapped to a two-letter code via %posmap
#
# Returns the lemgram string.  A PoS tag that is undefined or not in
# the mapping gets the code "xx" (the code also used for unspecified),
# so that the result is always well-formed.
sub make_lemgram
{
    my ($lemma, $pos) = @_;
    my %posmap = ("_" => "xx", # Unspecified
                  A => "jj",
                  # AN in the target is not PoS category but a feature,
                  # but we treat it as a category here.
                  Abbr => "an",
                  Adp => "pp",
                  Adv => "ab",
                  CC => "kn",
                  Con => "kn",
                  CS => "sn",
                  Interj => "in",
                  INTERJ => "in",
                  Noun => "nn",
                  N => "nn",
                  Num => "rg",
                  POST => "pp",
                  Pron => "pn",
                  Pun => "xx",
                  V => "vb");
    # An unmapped tag would otherwise interpolate undef, producing
    # "|lemma...1|" and an "uninitialized value" warning.
    my $target_pos = (defined $pos && exists $posmap{$pos})
        ? $posmap{$pos}
        : "xx";
    return "|$lemma..$target_pos.1|";
}
# Read token lines from the files named in @ARGV (or from standard
# input) via <> and print them to the currently selected output handle
# with <subcorpus> and <sentence> tags around them.
#
# Argument:
#   $opts_r - reference to an option hash with the keys fix_msd_tags,
#             msd_sep, word_nums and lemgrams (as returned by getopts)
#
# Input: tab-separated lines with at least 10 fields (presumably
# CoNLL-X-style; verify against the actual data).  Lines beginning
# with "#", whitespace-only lines and lines with fewer than 10 fields
# are skipped.  A token whose field 0 is "1" starts a new sentence.
#
# Output: each input file becomes a <subcorpus> named after the file
# (directory part and "_tab.txt" removed); sentence ids run across all
# files.  Each token line contains input fields 1, 2, 3, 4, 6 and 7,
# optionally followed by field 0 (word number) and a lemgram.
sub process_input
{
    my ($opts_r) = @_;
    my $prevname = "";
    my $subcorpname = "";
    # 1 means that no sentence has yet been opened in the current
    # subcorpus; a larger value means that a sentence is open.
    my $subcorp_sent_nr = 1;
    my $sent_id = 1;
    while (my $line = <>)
    {
        if ($ARGV ne $prevname)
        {
            # First line of a new input file: derive the subcorpus
            # name and close the previous subcorpus if it had content.
            $subcorpname = $ARGV;
            $prevname = $ARGV;
            $subcorpname =~ s/_tab\.txt//;
            $subcorpname =~ s/^.*\///;
            if ($subcorp_sent_nr > 1)
            {
                print "</sentence>\n</subcorpus>\n";
            }
            $subcorp_sent_nr = 1;
        }
        chomp ($line);
        next if $line =~ /^\#/ || $line =~ /^\s*$/;
        # LIMIT -1 keeps trailing empty fields; without it a line
        # whose last columns are empty would appear to have too few
        # fields and be dropped silently.
        my @fields = split (/\t/, $line, -1);
        next if $#fields < 9;
        # Keep only the part of the PoS field before the first "|".
        $fields[3] =~ s/\|.*//;
        for my $field (@fields)
        {
            $field =~ s/^\s+//;
            $field =~ s/\s+$//;
            # /g: collapse every run of whitespace, not only the first.
            $field =~ s/\s{2,}/ /g;
        }
        # Test the token number only after trimming, so that stray
        # whitespace around it does not hide a sentence start.
        if ($fields[0] eq '1')
        {
            if ($subcorp_sent_nr == 1)
            {
                print "<subcorpus name=\"$subcorpname\">\n";
            }
            else
            {
                print "</sentence>\n";
            }
            print "<sentence id=\"$sent_id\">\n";
            $sent_id++;
            $subcorp_sent_nr++;
        }
        if ($opts_r->{fix_msd_tags})
        {
            # Replace angle brackets and make the separator between
            # the tags in field 4 explicit.
            $fields[4] =~ tr/<>/[]/;
            $fields[4] =~ s/\s+/$opts_r->{msd_sep}/g;
        }
        print join ("\t", @fields[1, 2, 3, 4, 6, 7]);
        if ($opts_r->{word_nums})
        {
            print "\t$fields[0]";
        }
        if ($opts_r->{lemgrams})
        {
            print "\t" . make_lemgram (@fields[2, 3]);
        }
        print "\n";
    }
    # Close the last sentence and subcorpus only if one is open;
    # otherwise the output would contain unbalanced end tags.
    if ($subcorp_sent_nr > 1)
    {
        print "</sentence>\n</subcorpus>\n";
    }
    return;
}
# Entry point of the script: obtain the option hash from the command
# line and hand it over to the input conversion.
sub main
{
    process_input (getopts ());
}
# Run the script: main parses the command-line options and processes
# the input.
main ();