diff --git a/CMakeLists.txt b/CMakeLists.txt index e927b59..bc97d7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,7 +138,7 @@ include( "${CMAKE_CURRENT_LIST_DIR}/tools/cmake/DownloadDependency.cmake" ) # These are replaced by tools/cmake/update_dependencies.sh to the hashes that are currently checked out. # Thus, do not replace the hashes manually! SET( CLI11_COMMIT_HASH "13becaddb657eacd090537719a669d66d393b8b2" ) #CLI11_COMMIT_HASH# -SET( genesis_COMMIT_HASH "9d65c7b1ee13d15034bfa0944830c3c4d8e5e723" ) #genesis_COMMIT_HASH# +SET( genesis_COMMIT_HASH "277292d4a4b2a081ef7ca4a8e813fdeb8b19e272" ) #genesis_COMMIT_HASH# SET( sparsepp_COMMIT_HASH "6bfe3b4bdb364993e612d6bb729d680cf4c77649" ) #sparsepp_COMMIT_HASH# # Call the github download function, which takes four arguments: diff --git a/libs/genesis b/libs/genesis index 9d65c7b..277292d 160000 --- a/libs/genesis +++ b/libs/genesis @@ -1 +1 @@ -Subproject commit 9d65c7b1ee13d15034bfa0944830c3c4d8e5e723 +Subproject commit 277292d4a4b2a081ef7ca4a8e813fdeb8b19e272 diff --git a/scripts/plot-pf-glm-coeffs.R b/scripts/plot-pf-glm-coeffs.R new file mode 100644 index 0000000..9fcdf8a --- /dev/null +++ b/scripts/plot-pf-glm-coeffs.R @@ -0,0 +1,109 @@ +#!/usr/bin/env Rscript + +# gappa - Genesis Applications for Phylogenetic Placement Analysis +# Copyright (C) 2017-2024 Lucas Czech +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Contact: +# Lucas Czech +# Department of Plant Biology, Carnegie Institution For Science +# 260 Panama Street, Stanford, CA 94305, USA + +# Load packages +library(tidyr) +library(ggplot2) +library(cowplot) +theme_set(theme_cowplot()) + +# Get input and output files from command line, or use defaults. +meta_file = "meta.csv" +balances_file = "factor_balances.csv" +coeffs_file = "factor_glm_coefficients.csv" +out_pref = "pf-glm-coeffs-" +args = commandArgs(trailingOnly=TRUE) +if (length(args) >= 1) { + meta_file = args[1] +} +if (length(args) >= 2) { + balances_file = args[2] +} +if (length(args) >= 3) { + coeffs_file = args[3] +} +if (length(args) <= 4) { + out_pref = args[4] +} else { + stop( "Usage: plot-pf-glm-coeffs.R [meta-data] [balances] [glm-coefficients] [out-prefix]") +} + +print(paste("balances: ", balances_file)) +print(paste("glm-coefficients: ", coeffs_file)) +print(paste("meta-data: ", meta_file)) +print(paste("out-prefix: ", out_pref)) + +Balances <- read.table(balances_file, sep="\t", header=TRUE) +Coefficients <- read.table(coeffs_file, sep="\t", header=TRUE) +Meta <- read.table(meta_file, sep="\t", header=TRUE) + +# Match the Samples between Meta and Balances +# The Meta table might not have the first column named Samples, +# so we just take whatever is the first column here. +matched_samples <- match(Meta[, colnames(Meta)[1]], Balances$Sample) + +# Loop through each row in the Coefficients table +for(i in 1:nrow(Coefficients)) { + print(paste("Factor", i)) + + # Extract the current row's coefficients and intercept + # Exclude Factor and Intercept columns + current_coefficients <- as.numeric(Coefficients[i, -c(1, 2)]) + current_intercept <- Coefficients$Intercept[i] + + # Compute the dot product for the current coefficients against all Meta rows + # Then add the current intercept + result <- as.matrix(Meta[, -1]) %*% current_coefficients + current_intercept + + # Find the corresponding column in the Balances table + balance_column_name <- paste("Factor", i, sep = "_") + if(!(balance_column_name %in% names(Balances))) { + print(paste("Invalid column", balance_column_name)) + next + } + + # Create a data frame for plotting. + # Add the result with the corresponding balance value. + # Note: Ensure that the matched_samples indices are valid and there are no NAs + df <- data.frame( + Result = result, + Balance = Balances[matched_samples, balance_column_name] + ) + + # Generate the scatter plot. + p <- ggplot(df, aes(x = Balance, y = Result)) + + geom_point() + + # geom_smooth(method = "lm", color = "red", se = FALSE, formula="y~x", orientation = "y") + + labs( + title = balance_column_name, + x = "Balance Value", + y = "GLM Prediction" + ) + + geom_abline(slope=1, intercept=0, color='#6666FF') + + coord_fixed() + + theme(legend.title=element_blank()) + + # Display the plot + # print(p) + ggsave(paste0(out_pref, i,".png"), width=12, height=8) +} diff --git a/src/commands/analyze/placement_factorization.cpp b/src/commands/analyze/placement_factorization.cpp index 78c0a5e..a1715a1 100644 --- a/src/commands/analyze/placement_factorization.cpp +++ b/src/commands/analyze/placement_factorization.cpp @@ -43,7 +43,9 @@ #include "genesis/utils/color/color.hpp" #include "genesis/utils/color/list_sequential.hpp" +#include #include +#include #include #include #include @@ -162,8 +164,17 @@ void setup_placement_factorization( CLI::App& app ) // Input Reading // ================================================================================================= -genesis::utils::Matrix read_meta_data( PlacementFactorizationOptions const& options ) +struct MetaMatrix { + genesis::utils::Matrix matrix; + std::vector column_names; + std::vector row_names; +}; + +MetaMatrix read_meta_data( PlacementFactorizationOptions const& options ) +{ + MetaMatrix result; + // Get the metadata. // options.metadata_input.print(); auto df = options.metadata_input.read_string_dataframe(); @@ -180,18 +191,21 @@ genesis::utils::Matrix read_meta_data( PlacementFactorizationOptions con // Convert as needed for phylo factorization. std::string report; auto meta = glm_prepare_dataframe( df, report ); + result.column_names = meta.col_names(); LOG_MSG1 << report; // TODO use glm_convert_dataframe instead?! // Copy the meta data in the correct sample order. auto const jplace_count = options.jplace_input.file_count(); - auto result = genesis::utils::Matrix( jplace_count, meta.cols() ); + result.matrix = genesis::utils::Matrix( jplace_count, meta.cols() ); + result.row_names.resize( jplace_count ); assert( meta.rows() == jplace_count ); for( size_t i = 0; i < jplace_count; ++i ) { for( size_t c = 0; c < meta.cols(); ++c ) { - result( i, c ) = meta[ c ].as()[ options.jplace_input.base_file_name(i) ]; + result.matrix( i, c ) = meta[ c ].as()[ options.jplace_input.base_file_name(i) ]; } + result.row_names[i] = options.jplace_input.base_file_name(i); } return result; @@ -254,6 +268,23 @@ genesis::tree::BalanceData read_balance_data( PlacementFactorizationOptions cons return mass_balance_data( mass_trees, settings ); } +struct GlmCoefficients +{ + using Coefficients = std::vector; + std::vector edge_coefficients; +}; + +std::vector prepare_glm_coefficients( + PlacementFactorizationOptions const& options, + genesis::tree::BalanceData const& balances +) { + auto result = std::vector( options.factors.value() ); + for( auto& factor : result ) { + factor.edge_coefficients.resize( balances.tree.edge_count() ); + } + return result; +} + // ================================================================================================= // Output Writing // ================================================================================================= @@ -337,8 +368,13 @@ void write_factor_taxa( using namespace genesis::tree; using namespace genesis::utils; - auto factor_taxa_of = options.file_output.get_output_target( "factor_taxa", "txt" ); - auto write_taxa_list = [&]( std::unordered_set indices ) { + // Open the file + auto factor_taxa_of = options.file_output.get_output_target( "factor_taxa", "csv" ); + + // Write a line to the file. Factor index, then taxon name, then indicator of which side + auto write_taxa_list = [&]( + size_t factor, std::unordered_set indices, std::string const& side + ) { std::unordered_set edge_names; for( auto const ei : indices ) { auto const& ed = tree.edge_at( ei ).secondary_link().node().data(); @@ -347,19 +383,15 @@ void write_factor_taxa( } } for( auto const& en : edge_names ) { - (*factor_taxa_of) << en << "\n"; + (*factor_taxa_of) << factor << "\t" << en << "\t" << side << "\n"; } - (*factor_taxa_of) << "\n"; }; + // Write the table, for each factor, and for each side of it the + (*factor_taxa_of) << "Factor\tTaxon\tRootSide\n"; for( size_t i = 0; i < factors.size(); ++i ) { - auto const& factor = factors[i]; - - (*factor_taxa_of) << "Factor " << (i+1) << ", root side:\n"; - write_taxa_list( factor.edge_indices_primary ); - - (*factor_taxa_of) << "Factor " << (i+1) << ", non-root side:\n"; - write_taxa_list( factor.edge_indices_secondary ); + write_taxa_list( i+1, factors[i].edge_indices_primary, "1" ); + write_taxa_list( i+1, factors[i].edge_indices_secondary, "0" ); } } @@ -390,6 +422,35 @@ void write_balances_table( ); } +void write_glm_coefficients( + PlacementFactorizationOptions const& options, + std::vector const& factors, + MetaMatrix const& meta, + std::vector const& glm_coeffs +) { + auto target = options.file_output.get_output_target( "factor_glm_coefficients", "csv" ); + + // Write the header. One column per meta + (*target) << "Factor\tIntercept"; + for( auto const& col_name : meta.column_names ) { + (*target) << "\t" << col_name; + } + (*target) << "\n"; + + // Write the coefficients of the winning edge. + assert( glm_coeffs.size() == factors.size() ); + for( size_t i = 0; i < factors.size(); ++i ) { + auto const winning_edge_idx = factors[i].edge_index; + assert( glm_coeffs[i].edge_coefficients.size() == factors[i].all_objective_values.size() ); + auto const& edge_vals = glm_coeffs[i].edge_coefficients[winning_edge_idx]; + (*target) << (i+1); + for( auto v : edge_vals ) { + (*target) << "\t" << v; + } + (*target) << "\n"; + } +} + // ================================================================================================= // Run // ================================================================================================= @@ -397,6 +458,8 @@ void write_balances_table( void run_placement_factorization( PlacementFactorizationOptions const& options ) { using namespace genesis; + using namespace genesis::tree; + using namespace genesis::utils; // ------------------------------------------------------------------------- // Preparations @@ -411,8 +474,9 @@ void run_placement_factorization( PlacementFactorizationOptions const& options ) files_to_check.push_back({ "objective_values_" + std::to_string( i+1 ), e }); } } - files_to_check.push_back({ "factor_taxa", "txt" }); + files_to_check.push_back({ "factor_taxa", "csv" }); files_to_check.push_back({ "factor_balances", "csv" }); + files_to_check.push_back({ "factor_glm_coefficients", "csv" }); options.file_output.check_output_files_nonexistence( files_to_check ); // Print some user output. @@ -432,27 +496,27 @@ void run_placement_factorization( PlacementFactorizationOptions const& options ) // Calculations and Output // ------------------------------------------------------------------------- - auto const factors = tree::phylogenetic_factorization( + // We capture the GLM coefficients of all factors and edges. The outer vector has elements + // per factor (iteration), and the inner per edge index. + auto glm_coeffs = prepare_glm_coefficients( options, balances ); + + // Ruuuuuun! + auto const factors = phylogenetic_factorization( balances, - [&]( std::vector const& balances ){ - auto const fit = glm_fit( meta, balances, utils::glm_family_gaussian() ); - - // TODO return nan in the follwing cases. but make sure that this works with - // the phylofactor implementation first! - - // if( !fit.converged ) { - // LOG_DBG << "did not converge"; - // } - // if( ! std::isfinite(fit.null_deviance) ) { - // LOG_DBG << "fit.null_deviance " << fit.null_deviance; - // } - // if( ! std::isfinite(fit.deviance) ) { - // LOG_DBG << "fit.deviance " << fit.deviance; - // } - // if( ! std::isfinite(fit.null_deviance) || ! std::isfinite(fit.deviance) ) { - // LOG_DBG << "data.meta_vals_mat " << join( data.meta_vals_mat ); - // LOG_DBG << "balances " << join( balances ); - // } + [&]( size_t iteration, size_t edge_index, std::vector const& balances ){ + auto const fit = glm_fit( meta.matrix, balances, glm_family_gaussian() ); + + // Store the coefficients computed from the fitting + assert( iteration < glm_coeffs.size() ); + assert( edge_index < glm_coeffs[iteration].size() ); + auto& coeff = glm_coeffs[iteration].edge_coefficients[edge_index]; + coeff = glm_coefficients( meta.matrix, balances, fit ); + + // If something did not work in the GLM, we return a nan, + // so that this edge is not considered downstream. + if( !fit.converged || !std::isfinite(fit.null_deviance) || !std::isfinite(fit.deviance) ) { + return std::numeric_limits::quiet_NaN(); + } return fit.null_deviance - fit.deviance; }, @@ -467,4 +531,5 @@ void run_placement_factorization( PlacementFactorizationOptions const& options ) write_factor_objective_values( options, factors, balances.tree ); write_factor_taxa( options, factors, balances.tree ); write_balances_table( options, factors ); + write_glm_coefficients( options, factors, meta, glm_coeffs ); }