-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert2cwb
executable file
·108 lines (93 loc) · 2.07 KB
/
convert2cwb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/perl -w
use strict;
# convert2cwb --tags=<tag_file>
# Perl script for converting CG2 output to a format
# which is required for creating an IMS Corpus Workbench binary corpus.
# In the output, each word is in its own line followed by base form,
# and the list of tags, separated by tab.
# The script reads STDIN and takes one command line argument --tags
# which is the name of the file where the names of the tag classes
# and tag names are specified.
#
# $Id$
# Use the local character class.
# It means that variable \w contains Latin-1 alphabet.
use locale;
# permit named arguments
use Getopt::Long;
my %tags;
my @classes;
my $tags_file;
GetOptions ("tags=s" => \$tags_file);
# Read from lex-file and write to abbr file.
open TAGS, "< $tags_file" or die "Cant open the file: $!\n";
#read from the beginning of the file.
my $current;
my $tag_class;
while (<TAGS>) {
chomp;
s/\s+//g;
if (/^%/) {
next;
}
if (/^$/) {
next;
}
if (s/#//) {
$tag_class = $_;
push @classes, $tag_class;
next;
}
$tags{$tag_class}{$_} = 1;
}
close TAGS;
#foreach my $key (keys %tags) {
# print "JOO $key JES\n";
# for my $tag ( keys %{ $tags{$key} } ) {
# print "$tag\n";
# }
#}
#print @classes;
my $word;
my $correct="";
my $base;
my @tag_list;
while (<>) {
chomp;
#If at the start of a cohort, read the word and go to next line.
if (/^\"</) {
$word = $_;
$correct = "";
next;
}
#Read next line if the start of the cohort is not yet read.
# or if the analysis was not correct.
if(!/<Correct!>/ || $correct ) {
next;
}
$correct = $_;
$word =~ s/^\"<(.*)?>\".*$/$1/;
$correct =~ s/^\s+//;
$correct =~ s/\s*<Correct!>//;
($base, @tag_list) = split(/\s/, $correct);
$base =~ s/\"//g;
my %curr_tag;
for my $tag (@tag_list) {
foreach my $class (keys %tags) {
if ( exists $tags{$class}{$tag} ) {
$curr_tag{$class} = $tag;
}
}
}
my @list;
for my $class (@classes) {
if ( exists $curr_tag{$class} ) {
push @list, $curr_tag{$class};
}
else {
push @list, "";
}
}
my $line = join "\t", $word, $base, @list, "\n";
print $line;
}