-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrap.pl
72 lines (61 loc) · 1.58 KB
/
scrap.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/perl
use warnings;
use strict;
use URI;
use Web::Scraper;
use Encode;
use Data::Dumper;
# Base URL of the yearly index pages; lex_scrap() appends "/<year>/" to it.
my $lexuri = "https://static.slov-lex.sk/static/SK/ZZ";
# Scrape results keyed by year; each value is the hashref returned by
# lex_scrap(), whose 'lexs' entry is the list of scraped records.
my %lexdump;
# Years the main loop at the bottom of the script iterates over.
my @years = (2020..2024);
# Per-year statistics; print_stats() stores the record count under 'lexno'.
my %stats;
# lex_scrap($year)
#
# Fetches the index page "$lexuri/$year/" and extracts one record per row
# of the table whose id is "YearTable".  Each record is a hash with:
#   index    - text of the first cell, reduced to the "<number>/<year>" part
#   uri      - href of the link in the second cell
#   fullname - text of the link in the second cell
#   type     - added by get_lextype()
#
# Returns whatever get_lextype() returns for the scrape result (a hashref
# with the records under the 'lexs' key).
# Dies with "Wrong year: ..." when $year is not exactly four ASCII digits.
sub lex_scrap {
    my $year = shift;

    # The check is anchored on both ends: an unanchored /\d{4}/ would also
    # accept "12345" or "x2020/../y", and the value is interpolated into
    # the request URL below.
    if ( !defined $year || $year !~ m/\A[0-9]{4}\z/ ) {
        my $shown = defined $year ? $year : '(undef)';
        die "Wrong year: $shown";
    }

    my $lexs = scraper {
        # One record per table row ...
        process '//table[@id="YearTable"]/tbody/tr', "lexs[]" => scraper {
            # ... and, in each row, pick the fields out of its cells.
            process '//td[1]', index => [ 'TEXT', qr/(\d+\/\d{4})/ ];
            process '//td[2]/a', uri => '@href';
            process '//td[2]/a', fullname => 'TEXT';
        };
    };

    my $res = $lexs->scrape( URI->new("$lexuri/$year/") );
    return get_lextype($res);
}
# get_lextype($result)
#
# Takes a hashref whose 'lexs' entry is an arrayref of record hashes and
# sets each record's 'type' to the leading word of its 'fullname'.
# When 'fullname' is undefined or does not start with a word character,
# 'type' is set to undef.
#
# The records are modified in place; the same hashref is returned.
sub get_lextype {
    my $x = shift;

    foreach my $lex ( @{ $x->{'lexs'} } ) {
        my $fullname = $lex->{'fullname'};

        # Take the capture from the match's list-context return value
        # instead of reading $1 afterwards: $1 keeps the value of an
        # earlier successful match when this one fails, which would give
        # the record somebody else's type.
        my ($type) = defined $fullname ? $fullname =~ /^(\w+)/ : ();
        $lex->{'type'} = $type;
    }

    return $x;
}
# print_lex($year)
#
# Prints every record stored in %lexdump for $year to STDOUT, one per
# line, as the UTF-8 encoded fields index, type, fullname and uri joined
# with "%".  Undefined fields are printed as empty strings.
# Prints nothing when no records are stored for $year.
#
# Dies with "Wrong year: ..." when $year is not exactly four ASCII digits.
sub print_lex {
    my $year = shift;

    # Anchored so that only a plain four-digit year is accepted.
    if ( !defined $year || $year !~ m/\A[0-9]{4}\z/ ) {
        my $shown = defined $year ? $year : '(undef)';
        die "Wrong year: $shown";
    }

    # Look the year up step by step.  Dereferencing
    # @{ $lexdump{$year}->{'lexs'} } directly in a for loop would create an
    # empty entry in %lexdump for a year that was never scraped.
    my $dump = $lexdump{$year};
    my $lexs = $dump ? $dump->{'lexs'} : undef;
    return unless $lexs;

    for my $lex ( @{$lexs} ) {
        # A field can be undef (e.g. 'index' when the cell text does not
        # contain "<number>/<year>"); substitute '' so join() stays quiet.
        my @fields =
          map { defined $lex->{$_} ? $lex->{$_} : '' }
          qw(index type fullname uri);

        print Encode::encode( "utf8", join "%", @fields );
        print "\n";
    }

    return;
}
# print_stats()
#
# Prints a summary of %lexdump to STDOUT: the number of years stored and,
# for each year, the number of records in its 'lexs' list.  The per-year
# count is also stored in $stats{$year}{'lexno'}.
sub print_stats {
    print "Years processed: ";
    print scalar keys %lexdump;
    print "\n";
    print "Lexs per year:\n";

    # Sorted so the report comes out in the same order on every run; hash
    # key order is not stable between runs.
    foreach my $y ( sort keys %lexdump ) {
        my $entry = $lexdump{$y};
        my $lexs  = ref $entry eq 'HASH' ? $entry->{'lexs'} : undef;

        # An entry without a record list counts as zero instead of dying
        # on an undefined array reference.
        $stats{$y}{'lexno'} = $lexs ? scalar @{$lexs} : 0;
        print " $y $stats{$y}{'lexno'} \n";
    }

    return;
}
# Driver: scrape each configured year, keep the result in %lexdump and
# print its records.  Progress messages go to STDERR so that STDOUT
# carries only the record lines.
foreach my $year (@years) {
    print {*STDERR} "Processing: $year";
    $lexdump{$year} = lex_scrap($year);
    print {*STDERR} ". Done\n";
    print_lex($year);
}

# Debugging aids, disabled by default:
#print_stats;
#print Dumper %lexdump;
1;