-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathth_gen_idx.pl
61 lines (53 loc) · 1.65 KB
/
th_gen_idx.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/perl
# Taken from Kevin B. Hendricks' MyThes, see LICENSE_th_gen_idx.txt
# perl program to take a thesaurus structured text data file
# and create the proper sorted index file (.idx)
#
# typically invoked as follows:
# cat th_en_US_new.dat | ./th_gen_idx.pl > th_en_US_new.idx
#
# Sort comparator for index records of the form "entry|offset".
#
# Used as `sort by_entry @list`, so the two records being compared arrive in
# the sort globals $a and $b. Only the text before the first '|' takes part
# in the comparison; the offset after it is ignored. The comparison is a
# plain string `cmp`.
#
# Returns -1, 0 or 1, as sort expects.
sub by_entry {
    # Limit of 2: only the leading field is needed, the rest is discarded.
    my ($left_word)  = split('\|', $a, 2);
    my ($right_word) = split('\|', $b, 2);
    return $left_word cmp $right_word;
}
# main routine
# Reads a thesaurus data file from STDIN and writes its index to STDOUT.
#
# Input, as consumed below:
#   line 1          encoding name
#   then, repeated: a header line "entry|count" followed by <count> meaning
#                   lines, which are skipped (only their lengths are used)
#
# Output:
#   line 1          the encoding name, copied from the input
#   line 2          number of entries
#   then            one "entry|offset" line per entry, sorted by entry text
#
# "offset" is the sum of length() of every input line that precedes the
# entry's header line, newlines included. That is a byte offset as long as
# STDIN carries no decoding layer (the default).
#
# Strictures are enabled from here on; they are lexically scoped, so the
# by_entry comparator defined above is not affected.
use strict;
use warnings;

my @tindex  = ();   # the index itself, one "entry|offset" string per entry
my $foffset = 0;    # file position offset into thesaurus

# top line of thesaurus provides encoding
my $encoding = <STDIN>;
# Completely empty input: fall back to an empty encoding name so the script
# still emits a well-formed (empty) index instead of operating on undef.
$encoding = "" unless defined $encoding;
$foffset += length($encoding);    # measured before chomp: newline counts
chomp($encoding);

# read thesaurus line by line
# first line of every block is an entry and meaning count
while (defined(my $rec = <STDIN>)) {
    my $rl = length($rec);        # running length of this entry's whole block
    chomp($rec);

    my ($entry, $nm) = split('\|', $rec);
    # A blank line or a line without "|count" leaves these undefined; treat
    # them as an empty entry with no meanings, which is what the implicit
    # undef handling amounted to before.
    $entry = "" unless defined $entry;
    $nm    = 0  unless defined $nm;

    # Skip the meaning lines that belong to this entry, accumulating their
    # lengths so the next entry's offset is correct.
    for (my $p = 0; $p < $nm; $p++) {
        my $meaning = <STDIN>;
        # Input ended before the announced number of meanings was read:
        # stop here rather than keep looping on undef reads.
        last unless defined $meaning;
        $rl += length($meaning);
    }

    push(@tindex, "$entry|$foffset");
    $foffset += $rl;
}

# now we have all of the information
# so sort it and then output the encoding, count and index data
@tindex = sort by_entry @tindex;
my $ne = scalar(@tindex);         # number of entries in index
print STDOUT "$encoding\n";
print STDOUT "$ne\n";
foreach my $one (@tindex) {
    print STDOUT "$one\n";
}