diff --git a/CMakeLists.txt b/CMakeLists.txt
index e927b59..bc97d7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,7 +138,7 @@ include( "${CMAKE_CURRENT_LIST_DIR}/tools/cmake/DownloadDependency.cmake" )
# These are replaced by tools/cmake/update_dependencies.sh to the hashes that are currently checked out.
# Thus, do not replace the hashes manually!
SET( CLI11_COMMIT_HASH "13becaddb657eacd090537719a669d66d393b8b2" ) #CLI11_COMMIT_HASH#
-SET( genesis_COMMIT_HASH "9d65c7b1ee13d15034bfa0944830c3c4d8e5e723" ) #genesis_COMMIT_HASH#
+SET( genesis_COMMIT_HASH "277292d4a4b2a081ef7ca4a8e813fdeb8b19e272" ) #genesis_COMMIT_HASH#
SET( sparsepp_COMMIT_HASH "6bfe3b4bdb364993e612d6bb729d680cf4c77649" ) #sparsepp_COMMIT_HASH#
# Call the github download function, which takes four arguments:
diff --git a/libs/genesis b/libs/genesis
index 9d65c7b..277292d 160000
--- a/libs/genesis
+++ b/libs/genesis
@@ -1 +1 @@
-Subproject commit 9d65c7b1ee13d15034bfa0944830c3c4d8e5e723
+Subproject commit 277292d4a4b2a081ef7ca4a8e813fdeb8b19e272
diff --git a/scripts/plot-pf-glm-coeffs.R b/scripts/plot-pf-glm-coeffs.R
new file mode 100644
index 0000000..9fcdf8a
--- /dev/null
+++ b/scripts/plot-pf-glm-coeffs.R
@@ -0,0 +1,109 @@
+#!/usr/bin/env Rscript
+
+# gappa - Genesis Applications for Phylogenetic Placement Analysis
+# Copyright (C) 2017-2024 Lucas Czech
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#
+# Contact:
+# Lucas Czech
+# Department of Plant Biology, Carnegie Institution For Science
+# 260 Panama Street, Stanford, CA 94305, USA
+
+# Load packages
+library(tidyr)
+library(ggplot2)
+library(cowplot)
+theme_set(theme_cowplot())
+
+# Get input and output files from command line, or use defaults.
+meta_file = "meta.csv"
+balances_file = "factor_balances.csv"
+coeffs_file = "factor_glm_coefficients.csv"
+out_pref = "pf-glm-coeffs-"
+args = commandArgs(trailingOnly=TRUE)
+if (length(args) >= 1) {
+ meta_file = args[1]
+}
+if (length(args) >= 2) {
+ balances_file = args[2]
+}
+if (length(args) >= 3) {
+ coeffs_file = args[3]
+}
+if (length(args) <= 4) {
+ out_pref = args[4]
+} else {
+ stop( "Usage: plot-pf-glm-coeffs.R [meta-data] [balances] [glm-coefficients] [out-prefix]")
+}
+
+print(paste("balances: ", balances_file))
+print(paste("glm-coefficients: ", coeffs_file))
+print(paste("meta-data: ", meta_file))
+print(paste("out-prefix: ", out_pref))
+
+Balances <- read.table(balances_file, sep="\t", header=TRUE)
+Coefficients <- read.table(coeffs_file, sep="\t", header=TRUE)
+Meta <- read.table(meta_file, sep="\t", header=TRUE)
+
+# Match the Samples between Meta and Balances
+# The Meta table might not have the first column named Samples,
+# so we just take whatever is the first column here.
+matched_samples <- match(Meta[, colnames(Meta)[1]], Balances$Sample)
+
+# Loop through each row in the Coefficients table
+for(i in 1:nrow(Coefficients)) {
+ print(paste("Factor", i))
+
+ # Extract the current row's coefficients and intercept
+ # Exclude Factor and Intercept columns
+ current_coefficients <- as.numeric(Coefficients[i, -c(1, 2)])
+ current_intercept <- Coefficients$Intercept[i]
+
+ # Compute the dot product for the current coefficients against all Meta rows
+ # Then add the current intercept
+ result <- as.matrix(Meta[, -1]) %*% current_coefficients + current_intercept
+
+ # Find the corresponding column in the Balances table
+ balance_column_name <- paste("Factor", i, sep = "_")
+ if(!(balance_column_name %in% names(Balances))) {
+ print(paste("Invalid column", balance_column_name))
+ next
+ }
+
+ # Create a data frame for plotting.
+ # Add the result with the corresponding balance value.
+ # Note: Ensure that the matched_samples indices are valid and there are no NAs
+ df <- data.frame(
+ Result = result,
+ Balance = Balances[matched_samples, balance_column_name]
+ )
+
+ # Generate the scatter plot.
+ p <- ggplot(df, aes(x = Balance, y = Result)) +
+ geom_point() +
+ # geom_smooth(method = "lm", color = "red", se = FALSE, formula="y~x", orientation = "y") +
+ labs(
+ title = balance_column_name,
+ x = "Balance Value",
+ y = "GLM Prediction"
+ ) +
+ geom_abline(slope=1, intercept=0, color='#6666FF') +
+ coord_fixed() +
+ theme(legend.title=element_blank())
+
+ # Display the plot
+ # print(p)
+ ggsave(paste0(out_pref, i,".png"), width=12, height=8)
+}
diff --git a/src/commands/analyze/placement_factorization.cpp b/src/commands/analyze/placement_factorization.cpp
index 78c0a5e..a1715a1 100644
--- a/src/commands/analyze/placement_factorization.cpp
+++ b/src/commands/analyze/placement_factorization.cpp
@@ -43,7 +43,9 @@
#include "genesis/utils/color/color.hpp"
#include "genesis/utils/color/list_sequential.hpp"
+#include
#include
+#include
#include
#include
#include
@@ -162,8 +164,17 @@ void setup_placement_factorization( CLI::App& app )
// Input Reading
// =================================================================================================
-genesis::utils::Matrix read_meta_data( PlacementFactorizationOptions const& options )
+struct MetaMatrix
{
+ genesis::utils::Matrix matrix;
+ std::vector column_names;
+ std::vector row_names;
+};
+
+MetaMatrix read_meta_data( PlacementFactorizationOptions const& options )
+{
+ MetaMatrix result;
+
// Get the metadata.
// options.metadata_input.print();
auto df = options.metadata_input.read_string_dataframe();
@@ -180,18 +191,21 @@ genesis::utils::Matrix read_meta_data( PlacementFactorizationOptions con
// Convert as needed for phylo factorization.
std::string report;
auto meta = glm_prepare_dataframe( df, report );
+ result.column_names = meta.col_names();
LOG_MSG1 << report;
// TODO use glm_convert_dataframe instead?!
// Copy the meta data in the correct sample order.
auto const jplace_count = options.jplace_input.file_count();
- auto result = genesis::utils::Matrix( jplace_count, meta.cols() );
+ result.matrix = genesis::utils::Matrix( jplace_count, meta.cols() );
+ result.row_names.resize( jplace_count );
assert( meta.rows() == jplace_count );
for( size_t i = 0; i < jplace_count; ++i ) {
for( size_t c = 0; c < meta.cols(); ++c ) {
- result( i, c ) = meta[ c ].as()[ options.jplace_input.base_file_name(i) ];
+ result.matrix( i, c ) = meta[ c ].as()[ options.jplace_input.base_file_name(i) ];
}
+ result.row_names[i] = options.jplace_input.base_file_name(i);
}
return result;
@@ -254,6 +268,23 @@ genesis::tree::BalanceData read_balance_data( PlacementFactorizationOptions cons
return mass_balance_data( mass_trees, settings );
}
+struct GlmCoefficients
+{
+ using Coefficients = std::vector;
+ std::vector edge_coefficients;
+};
+
+std::vector prepare_glm_coefficients(
+ PlacementFactorizationOptions const& options,
+ genesis::tree::BalanceData const& balances
+) {
+ auto result = std::vector( options.factors.value() );
+ for( auto& factor : result ) {
+ factor.edge_coefficients.resize( balances.tree.edge_count() );
+ }
+ return result;
+}
+
// =================================================================================================
// Output Writing
// =================================================================================================
@@ -337,8 +368,13 @@ void write_factor_taxa(
using namespace genesis::tree;
using namespace genesis::utils;
- auto factor_taxa_of = options.file_output.get_output_target( "factor_taxa", "txt" );
- auto write_taxa_list = [&]( std::unordered_set indices ) {
+ // Open the file
+ auto factor_taxa_of = options.file_output.get_output_target( "factor_taxa", "csv" );
+
+ // Write a line to the file. Factor index, then taxon name, then indicator of which side
+ auto write_taxa_list = [&](
+ size_t factor, std::unordered_set indices, std::string const& side
+ ) {
std::unordered_set edge_names;
for( auto const ei : indices ) {
auto const& ed = tree.edge_at( ei ).secondary_link().node().data();
@@ -347,19 +383,15 @@ void write_factor_taxa(
}
}
for( auto const& en : edge_names ) {
- (*factor_taxa_of) << en << "\n";
+ (*factor_taxa_of) << factor << "\t" << en << "\t" << side << "\n";
}
- (*factor_taxa_of) << "\n";
};
+ // Write the table, for each factor, and for each side of it the
+ (*factor_taxa_of) << "Factor\tTaxon\tRootSide\n";
for( size_t i = 0; i < factors.size(); ++i ) {
- auto const& factor = factors[i];
-
- (*factor_taxa_of) << "Factor " << (i+1) << ", root side:\n";
- write_taxa_list( factor.edge_indices_primary );
-
- (*factor_taxa_of) << "Factor " << (i+1) << ", non-root side:\n";
- write_taxa_list( factor.edge_indices_secondary );
+ write_taxa_list( i+1, factors[i].edge_indices_primary, "1" );
+ write_taxa_list( i+1, factors[i].edge_indices_secondary, "0" );
}
}
@@ -390,6 +422,35 @@ void write_balances_table(
);
}
+void write_glm_coefficients(
+ PlacementFactorizationOptions const& options,
+ std::vector const& factors,
+ MetaMatrix const& meta,
+ std::vector const& glm_coeffs
+) {
+ auto target = options.file_output.get_output_target( "factor_glm_coefficients", "csv" );
+
+ // Write the header. One column per meta
+ (*target) << "Factor\tIntercept";
+ for( auto const& col_name : meta.column_names ) {
+ (*target) << "\t" << col_name;
+ }
+ (*target) << "\n";
+
+ // Write the coefficients of the winning edge.
+ assert( glm_coeffs.size() == factors.size() );
+ for( size_t i = 0; i < factors.size(); ++i ) {
+ auto const winning_edge_idx = factors[i].edge_index;
+ assert( glm_coeffs[i].edge_coefficients.size() == factors[i].all_objective_values.size() );
+ auto const& edge_vals = glm_coeffs[i].edge_coefficients[winning_edge_idx];
+ (*target) << (i+1);
+ for( auto v : edge_vals ) {
+ (*target) << "\t" << v;
+ }
+ (*target) << "\n";
+ }
+}
+
// =================================================================================================
// Run
// =================================================================================================
@@ -397,6 +458,8 @@ void write_balances_table(
void run_placement_factorization( PlacementFactorizationOptions const& options )
{
using namespace genesis;
+ using namespace genesis::tree;
+ using namespace genesis::utils;
// -------------------------------------------------------------------------
// Preparations
@@ -411,8 +474,9 @@ void run_placement_factorization( PlacementFactorizationOptions const& options )
files_to_check.push_back({ "objective_values_" + std::to_string( i+1 ), e });
}
}
- files_to_check.push_back({ "factor_taxa", "txt" });
+ files_to_check.push_back({ "factor_taxa", "csv" });
files_to_check.push_back({ "factor_balances", "csv" });
+ files_to_check.push_back({ "factor_glm_coefficients", "csv" });
options.file_output.check_output_files_nonexistence( files_to_check );
// Print some user output.
@@ -432,27 +496,27 @@ void run_placement_factorization( PlacementFactorizationOptions const& options )
// Calculations and Output
// -------------------------------------------------------------------------
- auto const factors = tree::phylogenetic_factorization(
+ // We capture the GLM coefficients of all factors and edges. The outer vector has elements
+ // per factor (iteration), and the inner per edge index.
+ auto glm_coeffs = prepare_glm_coefficients( options, balances );
+
+ // Ruuuuuun!
+ auto const factors = phylogenetic_factorization(
balances,
- [&]( std::vector const& balances ){
- auto const fit = glm_fit( meta, balances, utils::glm_family_gaussian() );
-
- // TODO return nan in the follwing cases. but make sure that this works with
- // the phylofactor implementation first!
-
- // if( !fit.converged ) {
- // LOG_DBG << "did not converge";
- // }
- // if( ! std::isfinite(fit.null_deviance) ) {
- // LOG_DBG << "fit.null_deviance " << fit.null_deviance;
- // }
- // if( ! std::isfinite(fit.deviance) ) {
- // LOG_DBG << "fit.deviance " << fit.deviance;
- // }
- // if( ! std::isfinite(fit.null_deviance) || ! std::isfinite(fit.deviance) ) {
- // LOG_DBG << "data.meta_vals_mat " << join( data.meta_vals_mat );
- // LOG_DBG << "balances " << join( balances );
- // }
+ [&]( size_t iteration, size_t edge_index, std::vector const& balances ){
+ auto const fit = glm_fit( meta.matrix, balances, glm_family_gaussian() );
+
+ // Store the coefficients computed from the fitting
+ assert( iteration < glm_coeffs.size() );
+ assert( edge_index < glm_coeffs[iteration].size() );
+ auto& coeff = glm_coeffs[iteration].edge_coefficients[edge_index];
+ coeff = glm_coefficients( meta.matrix, balances, fit );
+
+ // If something did not work in the GLM, we return a nan,
+ // so that this edge is not considered downstream.
+ if( !fit.converged || !std::isfinite(fit.null_deviance) || !std::isfinite(fit.deviance) ) {
+ return std::numeric_limits::quiet_NaN();
+ }
return fit.null_deviance - fit.deviance;
},
@@ -467,4 +531,5 @@ void run_placement_factorization( PlacementFactorizationOptions const& options )
write_factor_objective_values( options, factors, balances.tree );
write_factor_taxa( options, factors, balances.tree );
write_balances_table( options, factors );
+ write_glm_coefficients( options, factors, meta, glm_coeffs );
}