Skip to content

Commit

Permalink
Merge pull request #922 from shawnlaffan/indices_2024
Browse files Browse the repository at this point in the history
A set of optimisations to several index and related calculations.

Summary of main changes:

  * Hierarchical calculations are now supported. This allows cluster tree calculations to build on the child results rather than rebuilding everything for each node.
  * Several endemism calculations now re-use results where the central and whole variants will be the same.
  * Significance assessments are faster.
  • Loading branch information
shawnlaffan authored Feb 24, 2024
2 parents 124569f + 62fd8a1 commit 3c590de
Show file tree
Hide file tree
Showing 11 changed files with 440 additions and 111 deletions.
20 changes: 17 additions & 3 deletions lib/Biodiverse/Cluster.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2777,7 +2777,9 @@ sub sp_calc {
);

# drop out if we have none to do
return if $indices_object->get_valid_calculation_count == 0;
return if $indices_object->get_valid_calculation_count == 0;

$indices_object->set_hierarchical_mode(!$args{no_hierarchical_mode});

delete $args{calculations}; # saves passing it onwards when we call the calculations
delete $args{analyses}; # for backwards compat
Expand Down Expand Up @@ -2809,16 +2811,26 @@ sub sp_calc {
$count / $to_do,
);

# needs a better name
my $current_node_details = {
name => $node->get_name,
child_names => [map {$_->get_name} $node->get_children],
};

my %sp_calc_values = $indices_object->run_calculations(
%args,
element_list1 => [keys %{$node->get_terminal_elements}]
element_list1 => [ keys %{$node->get_terminal_elements} ],
current_node_details => $current_node_details,
);

foreach my $key (keys %sp_calc_values) {
if (is_arrayref($sp_calc_values{$key})
|| is_hashref($sp_calc_values{$key})) {

$node->add_to_lists ($key => $sp_calc_values{$key});
$node->add_to_lists (
$key => $sp_calc_values{$key},
use_ref => 1,
);
delete $sp_calc_values{$key};
}
}
Expand All @@ -2830,6 +2842,8 @@ sub sp_calc {

$self->delete_cached_metadata;

$indices_object->set_hierarchical_mode(0);

return 1;
}

Expand Down
74 changes: 40 additions & 34 deletions lib/Biodiverse/Common.pm
Original file line number Diff line number Diff line change
Expand Up @@ -266,23 +266,26 @@ sub load_yaml_file {
croak 'Loading from a YAML file is no longer supported';
}

# Orig should never have used a hash. Oh well.
sub set_basedata_ref_aa {
my ($self, $ref) = @_;
$self->set_basedata_ref(BASEDATA_REF => $ref);
}

sub set_basedata_ref {
my $self = shift;
my %args = @_;

$self->set_param (BASEDATA_REF => $args{BASEDATA_REF});
$self->weaken_basedata_ref;
$self->weaken_basedata_ref if defined $args{BASEDATA_REF};

return;
}

sub get_basedata_ref {
my $self = shift;

my $bd = $self->get_param ('BASEDATA_REF')
|| Biodiverse::MissingBasedataRef->throw (
message => 'Parameter BASEDATA_REF not set'
);
my $bd = $self->get_param ('BASEDATA_REF');

return $bd;
}
Expand Down Expand Up @@ -2415,18 +2418,19 @@ sub compare_lists_by_item {
\my %base_ref = $args{base_list_ref};
\my %comp_ref = $args{comp_list_ref};
\my %results = $args{results_list_ref};
my ($diff, $increment);

COMP_BY_ITEM:
foreach my $index (keys %base_ref) {

next COMP_BY_ITEM
if !(defined $base_ref{$index} && defined $comp_ref{$index});
if !(defined $comp_ref{$index} && defined $base_ref{$index});

# compare at 10 decimal place precision
# this also allows for serialisation which
# rounds the numbers to 15 decimals
my $diff = $base_ref{$index} - $comp_ref{$index};
my $increment = $diff > DEFAULT_PRECISION_SMALL ? 1 : 0;
$diff = $base_ref{$index} - $comp_ref{$index};
$increment = $diff > DEFAULT_PRECISION_SMALL ? 1 : 0;

# for debug, but leave just in case
#carp "$element, $op\n$comp\n$base " . ($comp - $base) if $increment;
Expand All @@ -2438,18 +2442,18 @@ sub compare_lists_by_item {
# SUMX is the sum of compared values
# SUMXX is the sum of squared compared values
# The latter two are used in z-score calcs
$results{"C_$index"} += $increment;
$results{"Q_$index"} ++;
$results{"P_$index"} = $results{"C_$index"}
/ $results{"Q_$index"};
# obfuscated to squeeze as much speed as we can
# $results{"C_$index"} += $increment;
# $results{"Q_$index"} ++;
$results{"P_$index"} = ($results{"C_$index"} += $increment)
/ (++$results{"Q_$index"});
# use original vals for sums
$results{"SUMX_$index"} += $comp_ref{$index};
$results{"SUMXX_$index"} += ($comp_ref{$index}**2);
$results{"SUMX_$index"} += $comp_ref{$index};
$results{"SUMXX_$index"} += ($comp_ref{$index}**2);
# track the number of ties
if (abs($diff) <= DEFAULT_PRECISION_SMALL) {
$results{"T_$index"} ++;
}
$results{"T_$index"} ++
if (abs($diff) <= DEFAULT_PRECISION_SMALL);
}
return;
Expand Down Expand Up @@ -2513,35 +2517,33 @@ sub get_zscore_from_comp_results {
my %args = @_;

# could alias this
my $comp_list_ref = $args{comp_list_ref}
\my %comp_list_ref = $args{comp_list_ref}
// croak "comp_list_ref argument not specified\n";
# need the observed values
my $base_list_ref = $args{base_list_ref}
\my %base_list_ref = $args{base_list_ref}
// croak "base_list_ref argument not specified\n";

my $results_list_ref = $args{results_list_ref} // {};

KEY:
foreach my $q_key (grep {$_ =~ /^Q_/} keys %$comp_list_ref) {
my $index_name = substr $q_key, 2;
foreach my $index_name (keys %base_list_ref) {

my $n = $comp_list_ref->{$q_key};
my $n = $comp_list_ref{'Q_' . $index_name};
next KEY if !$n;

my $x_key = 'SUMX_' . $index_name;
my $xx_key = 'SUMXX_' . $index_name;

# sum of x vals and x vals squared
my $sumx = $comp_list_ref->{$x_key};
my $sumxx = $comp_list_ref->{$xx_key};
my $sumx = $comp_list_ref{$x_key};
my $sumxx = $comp_list_ref{$xx_key};

my $z_key = $index_name;
# n better be large, as we do not use n-1
my $variance = max (0, ($sumxx - ($sumx**2) / $n) / $n);
my $obs = $base_list_ref->{$index_name};
$results_list_ref->{$z_key}
= $variance
? ($obs - ($sumx / $n)) / sqrt ($variance)
my $variance = ($sumxx - ($sumx**2) / $n) / $n;

$results_list_ref->{$index_name}
= $variance > 0
? ($base_list_ref{$index_name} - ($sumx / $n)) / sqrt ($variance)
: 0;
}

Expand Down Expand Up @@ -2666,15 +2668,19 @@ sub get_sig_rank_from_comp_results {
my $self = shift;
my %args = @_;
# could alias this
\my %comp_list_ref = $args{comp_list_ref}
// croak "comp_list_ref argument not specified\n";

\my %results_list_ref = $args{results_list_ref} // {};

foreach my $c_key (grep {$_ =~ /^C_/} keys %comp_list_ref) {

my $index_name = substr $c_key, 2;
# base_list_ref will usually be shorter so fewer comparisons will be needed
my @keys = $args{base_list_ref}
? grep {exists $comp_list_ref{'C_' . $_}} keys %{$args{base_list_ref}}
: map {substr $_, 2} grep {$_ =~ /^C_/} keys %comp_list_ref;

foreach my $index_name (@keys) {

my $c_key = 'C_' . $index_name;

if (!defined $comp_list_ref{$c_key}) {
$results_list_ref{$index_name} = undef;
Expand Down
47 changes: 34 additions & 13 deletions lib/Biodiverse/Indices.pm
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use warnings;
#use Data::Dumper;
use Scalar::Util qw /blessed weaken/;
use List::MoreUtils qw /uniq/;
use List::Util qw /sum/;
use List::Util qw /sum any/;
use English ( -no_match_vars );
use Ref::Util qw { :all };
use JSON::MaybeXS;
Expand Down Expand Up @@ -794,18 +794,10 @@ sub parse_dependencies_for_calc {
$self->_convert_to_array( input => $required_args );

foreach my $required_arg ( sort @$reqd_args_a ) {
my $re = qr /^($required_arg)$/
; # match is used in the grep? Was used in now-removed code.
my $is_defined;
CALC_ARG:
foreach
my $calc_arg ( sort grep { $_ =~ $re } keys %$calc_args )
{
if ( defined $calc_args->{$calc_arg} ) {
$is_defined++;
last CALC_ARG;
}
}
my $re = qr /^($required_arg)$/;
my $is_defined
= any { $_ =~ $re && defined $calc_args->{$_}}
sort keys %$calc_args;

if ( !$is_defined ) {
Biodiverse::Indices::MissingRequiredArguments->throw(
Expand Down Expand Up @@ -1533,11 +1525,16 @@ sub run_dependencies {
my $tmp = $self->get_param('AS_RESULTS_FROM_GLOBAL') || {};
my %as_results_from_global = %$tmp; # make a copy

state $cache_name_local_results = 'AS_RESULTS_FROM_LOCAL';

# Now we run the calculations at this level.
# We also keep track of what has been run
# to avoid repetition through multiple dependencies.
my %results;
my %as_results_from;
# make sure this is new each iteration
$self->set_cached_value ($cache_name_local_results => \%as_results_from);

foreach my $calc (@$calc_list) {
my $calc_results;

Expand Down Expand Up @@ -1574,6 +1571,9 @@ sub run_dependencies {
$results{$calc} = $calc_results;
}

# We refresh each call above, but this ensures last one is cleaned up.
$self->delete_cached_value($cache_name_local_results);

if ( $type eq 'pre_calc_global' ) {
$self->set_param( AS_RESULTS_FROM_GLOBAL => \%as_results_from_global );
}
Expand Down Expand Up @@ -1663,6 +1663,9 @@ sub set_pairwise_mode {

$self->{pairwise_mode} = $mode;

croak "Cannot have both pairwise and hierarchical modes on at the same time"
if $mode && $self->get_hierarchical_mode;

return $mode;
}

Expand All @@ -1671,6 +1674,24 @@ sub get_pairwise_mode {
$_[0]->{pairwise_mode};
}

sub set_hierarchical_mode {
my ( $self, $mode ) = @_;

$self->{hierarchical_mode} = $mode;

croak "Cannot have both pairwise and hierarchical modes on at the same time"
if $mode && $self->get_pairwise_mode;

return $mode;
}

# potential hot path so optimise to avoid arg handling
sub get_hierarchical_mode {
$_[0]->{hierarchical_mode};
}



1;

__END__
Expand Down
Loading

0 comments on commit 3c590de

Please sign in to comment.