diff --git a/.github/workflows/check-full.yaml b/.github/workflows/check-full.yaml
index 10095679..1c44678f 100644
--- a/.github/workflows/check-full.yaml
+++ b/.github/workflows/check-full.yaml
@@ -25,6 +25,7 @@ jobs:
     strategy:
       fail-fast: false
+      max-parallel: 2
       matrix:
         config:
           - {os: macos-latest, r: 'release'}
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index bc18334f..a86396d0 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -48,7 +48,8 @@ jobs:
       - uses: codecov/codecov-action@v4
         with:
-          fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }}
+          # Fail if error if not on PR, or if on PR and token is given
+          fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
           file: ./cobertura.xml
           plugin: noop
           disable_search: true
diff --git a/.lintr b/.lintr
index 5e1afb31..4f17fb0d 100644
--- a/.lintr
+++ b/.lintr
@@ -2,15 +2,16 @@ linters: linters_with_defaults(
     line_length_linter = NULL,
     commented_code_linter = NULL,
     indentation_linter = NULL,
-    trailing_whitespace_linter = NULL,
     infix_spaces_linter = NULL,
     quotes_linter = NULL,
-    trailing_blank_lines_linter = NULL,
     brace_linter = NULL,
     commas_linter = NULL,
     whitespace_linter = NULL,
     object_name_linter = NULL,
     assignment_linter = NULL,
-    cyclocomp_linter = NULL
+    cyclocomp_linter = NULL,
+    object_usage_linter = NULL,
+    spaces_left_parentheses_linter = NULL,
+    object_length_linter = NULL
   )
 encoding: "UTF-8"
diff --git a/DESCRIPTION b/DESCRIPTION
index 0c9cdd85..a8b7794e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TwoSampleMR
 Title: Two Sample MR Functions and Interface to MR Base Database
-Version: 0.6.7
+Version: 0.6.8
 Authors@R: c(
     person("Gibran", "Hemani", , "g.hemani@bristol.ac.uk", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-0920-1055")),
diff --git a/NEWS.md b/NEWS.md
index 4ad7e7f3..a586affb 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# TwoSampleMR v0.6.8
+
+(Release date 2024-09-06)
+
+* Replaced some `unique()` calls in `power_prune()` with `mean()` to ensure scalar `iv.se` values (thanks @phageghost)
+* Slightly improved formatting of code base
+
 # TwoSampleMR v0.6.7
 
 (Release date 2024-08-21)
diff --git a/R/add_rsq.r b/R/add_rsq.r
index 79676d3e..eb08d886 100644
--- a/R/add_rsq.r
+++ b/R/add_rsq.r
@@ -2,7 +2,7 @@
 #'
 #' Can be applied to exposure_dat, outcome_dat or harmonised_data.
 #' Note that it will be beneficial in some circumstances to add the meta data to
-#' the data object using [add_metadata()] before running this function. 
+#' the data object using [add_metadata()] before running this function.
 #' Also adds effective sample size for case control data.
 #'
 #' @param dat exposure_dat, outcome_dat or harmonised_data
@@ -76,7 +76,7 @@ add_rsq_one <- function(dat, what="exposure")
 	ind1 <- !is.na(dat[[paste0("pval.", what)]]) & !is.na(dat[[paste0("samplesize.", what)]])
 	dat[[paste0("rsq.", what)]] <- NA
 	if(sum(ind1) > 0)
-	{ 
+	{
 		dat[[paste0("rsq.", what)]][ind1] <- get_r_from_bsen(
 			dat[[paste0("beta.", what)]][ind1],
 			dat[[paste0("se.", what)]][ind1],
@@ -114,7 +114,7 @@ test_r_from_pn <- function()
 			"Package \"tidyr\" must be installed to use this function.",
 			call. = FALSE
 		)
-	} 
+	}
 
 	param <- expand.grid(
 		n = c(10, 100, 1000, 10000, 100000),
@@ -228,7 +228,7 @@ compareNA <- function(v1,v2) {
 #' Estimate proportion of variance of liability explained by SNP in general population
 #'
-#' This uses equation 10 in Lee et al. A Better Coefficient of Determination for Genetic Profile Analysis.
+#' This uses equation 10 in Lee et al. A Better Coefficient of Determination for Genetic Profile Analysis.
 #' Genetic Epidemiology 36: 214–224 (2012) \doi{10.1002/gepi.21614}.
 #'
 #' @param lor Vector of Log odds ratio.
@@ -309,7 +309,7 @@ contingency <- function(af, prop, odds_ratio, eps=1e-15)
 		z <- -c_ / b
 	} else {
 		d <- b^2 - 4*a*c_
-		if (d < eps*eps) 
+		if (d < eps*eps)
 		{
 			s <- 0
 		} else {
@@ -327,7 +327,7 @@
 #' @param g Vector of 0/1/2
 #'
 #' @export
-#' @return Allele frequency 
+#' @return Allele frequency
 allele_frequency <- function(g)
 {
 	(sum(g == 1) + 2 * sum(g == 2)) / (2 * sum(!is.na(g)))
diff --git a/R/enrichment.R b/R/enrichment.R
index 714ad17c..1f203a28 100644
--- a/R/enrichment.R
+++ b/R/enrichment.R
@@ -42,7 +42,7 @@ enrichment_method_list <- function()
 	a <- lapply(a, as.data.frame)
 	a <- plyr::rbind.fill(a)
 	a <- as.data.frame(lapply(a, as.character), stringsAsFactors=FALSE)
-	return(a) 
+	return(a)
 }
diff --git a/R/eve.R b/R/eve.R
index 054f9220..7caf1116 100644
--- a/R/eve.R
+++ b/R/eve.R
@@ -114,8 +114,8 @@ mr_mean_egger <- function(d)
 	y2 <- ratios * weights2
 	egger2 <- summary(stats::lm(y2 ~ weights2))
 	eggeroutliers <- dplyr::tibble(
-		SNP=d$SNP, 
-		Qj = weights2^2 * (ratios - stats::coefficients(egger2)[1,1] / weights2 - stats::coefficients(egger2)[2,1])^2, 
+		SNP=d$SNP,
+		Qj = weights2^2 * (ratios - stats::coefficients(egger2)[1,1] / weights2 - stats::coefficients(egger2)[2,1])^2,
 		Qpval=stats::pchisq(Qj,1,lower.tail=FALSE)
 	)
@@ -209,7 +209,7 @@ mr_all <- function(dat, parameters=default_parameters())
 		m1$estimates <- dplyr::bind_rows(m1$estimates, m2, m3)
 	}
 	m1$info <- c(list(
-		id.exposure = dat$id.exposure[1], id.outcome = dat$id.outcome[1]), 
+		id.exposure = dat$id.exposure[1], id.outcome = dat$id.outcome[1]),
 		system_metrics(dat)
 	) %>% dplyr::as_tibble()
 	return(m1)
diff --git a/R/forest_plot.R b/R/forest_plot.R
index e57eb018..ff8e17f0 100644
--- a/R/forest_plot.R
+++ b/R/forest_plot.R
@@ -43,7 +43,7 @@
 #' @param decrease (logical) sort the studies by decreasing effect sizes `TRUE`/`FALSE`?
 #' @param se_Col (character) name of the column giving the standard error of the effect sizes.
 #' @param returnRobj (logical) return the graph as an internal R object `TRUE`/`FALSE`?
-#' +#' #' @keywords internal #' @return grid object giving the forest plot (or plot as pdf) mr_forest_plot_grouped <- @@ -117,7 +117,7 @@ mr_forest_plot_grouped <- idx <- idx + 2 + n_std } # returns data frame giving spacings, primary annotation (content_list), typeface attributes for the primary annotation (attr_list), and mapping between rows in forest plot and rows in meta-analytic data frame - return( data.frame( spacing_vec = (1 + length(spacing_vec) - spacing_vec), content_list = unlist(content_list), row_list = unlist(row_list), attr_list = unlist(attr_list) ) ) + return(data.frame(spacing_vec = (1 + length(spacing_vec) - spacing_vec), content_list = unlist(content_list), row_list = unlist(row_list), attr_list = unlist(attr_list))) } @@ -162,7 +162,7 @@ mr_forest_plot_grouped <- data_Fm$eff_col <- log(as.numeric(data_Fm[,eff_col])) } # ggplot code to generate the forest plot using geom_segments and geom_points and to make a relatively minimal theme - raw_forest <- ggplot2::ggplot(data = data_Fm, ggplot2::aes( y = space_col, yend = space_col, x = as.numeric(lb_col), xend = as.numeric(ub_col) )) + ggplot2::geom_segment() + ggplot2::geom_point(ggplot2::aes( y = space_col, x = as.numeric(eff_col), size = 4 )) + ggplot2::theme_bw() + ggplot2::theme( axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(), axis.title = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), rect = ggplot2::element_blank(), title = ggplot2::element_text(size = 23), legend.position = 'none') + ggplot2::expand_limits(y = c(data_Fm[,space_col] - 1, data_Fm[,space_col] + 2)) + ggplot2::labs(title = title_text) # returns ggplot2 object with the (un-annotated) forest plot + raw_forest <- ggplot2::ggplot(data = data_Fm, ggplot2::aes(y = space_col, yend = space_col, x = as.numeric(lb_col), xend = as.numeric(ub_col))) + ggplot2::geom_segment() + ggplot2::geom_point(ggplot2::aes(y = space_col, x = as.numeric(eff_col), size = 4)) + ggplot2::theme_bw() + ggplot2::theme(axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(), axis.title = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), rect = ggplot2::element_blank(), title = ggplot2::element_text(size = 23), legend.position = 'none') + ggplot2::expand_limits(y = c(data_Fm[,space_col] - 1, data_Fm[,space_col] + 2)) + ggplot2::labs(title = title_text) # returns ggplot2 object with the (un-annotated) forest plot return(raw_forest) } @@ -180,10 +180,10 @@ mr_forest_plot_grouped <- data_Fm$text_col <- data_Fm[,text_col] # A hard rule to set the width of the annotation column, which sometimes truncates very wide columns (complex disease names, numbers with 16 digits, etc) - text_widths <- c(-1, max(10,0.5 * max(sapply( as.character(data_Fm[,text_col]),nchar )))) + text_widths <- c(-1, max(10,0.5 * max(sapply(as.character(data_Fm[,text_col]), nchar)))) # GGplot rendering of the annotation column - lefttext <- ggplot2::ggplot(data = data_Fm, ggplot2::aes( y = space_col, x = 0, label = text_col, fontface = attr_list )) + ggplot2::geom_text(hjust = 0) + ggplot2::theme_bw() + ggplot2::theme( axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(),axis.text.x = ggplot2::element_text(colour = "white"),axis.ticks.x = ggplot2::element_line(colour = "white"), axis.title = ggplot2::element_blank(), rect = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), title = ggplot2::element_text(size = 23) ) + ggplot2::expand_limits(x = text_widths, y = c(data_Fm[,space_col] - 1, 
data_Fm[,space_col] + 2)) + ggplot2::labs(title = title_text, size = 40) # returns two-item list with left_text, the GGplot annotations, and text_widths, the x-axis limits of the plot + lefttext <- ggplot2::ggplot(data = data_Fm, ggplot2::aes(y = space_col, x = 0, label = text_col, fontface = attr_list)) + ggplot2::geom_text(hjust = 0) + ggplot2::theme_bw() + ggplot2::theme(axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(), axis.text.x = ggplot2::element_text(colour = "white"), axis.ticks.x = ggplot2::element_line(colour = "white"), axis.title = ggplot2::element_blank(), rect = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), title = ggplot2::element_text(size = 23)) + ggplot2::expand_limits(x = text_widths, y = c(data_Fm[, space_col] - 1, data_Fm[, space_col] + 2)) + ggplot2::labs(title = title_text, size = 40) # returns two-item list with left_text, the GGplot annotations, and text_widths, the x-axis limits of the plot return(list(left_text = lefttext, text_widths = text_widths)) } @@ -203,7 +203,7 @@ mr_forest_plot_grouped <- for (i in seq_along(col_names)) { # loop to get the widths of each annotation column and to group the annotation objects together - col <- anot_col( data_Fm = data_Fm, text_col = col_names[i], space_col = space_col, title_text = title_list[[i]] ) + col <- anot_col(data_Fm = data_Fm, text_col = col_names[i], space_col = space_col, title_text = title_list[[i]]) relative_widths[i] <- col$text_widths[2] - col$text_widths[1] output[[col_names[i]]] <- ggplotGrob(col$left_text) } @@ -246,7 +246,7 @@ mr_forest_plot_grouped <- width_vec <- c(left_RW,0.34,right_RW) width_vec <- width_vec / sum(width_vec) # convert the grid objects (now grouped) into a table of grid objects that can be plotted using grid.draw - grp_FP <- gtable::gtable_matrix( name = "groupplot", grobs = matrix(grob_Bag, nrow = 1), widths = unit(width_vec, "npc"), heights = unit(1,"npc") ) + grp_FP <- gtable::gtable_matrix(name = "groupplot", grobs = matrix(grob_Bag, nrow = 1), widths = unit(width_vec, "npc"), heights = unit(1,"npc")) # return the grid object table, to be plotted return(grp_FP) @@ -263,18 +263,18 @@ mr_forest_plot_grouped <- ## order and structure effect sizes and CIs for forest plot - space1 <- spacer( exposure = exposure_Name, eff_col = eff_Col, outcome = outcome_Name, Data_Fm = data, decrease = decrease) + space1 <- spacer(exposure = exposure_Name, eff_col = eff_Col, outcome = outcome_Name, Data_Fm = data, decrease = decrease) expand_data <- space_Out(data_Fm = data, space_Fm = space1) ## Make the forest plot - fo1 <- ggforest( data_Fm = expand_data, space_col = 'spacing_vec', eff_col = eff_Col, lb_col = "lb" ,ub_col = "ub", log_ES = log_ES, title_text = forest_Title ) + fo1 <- ggforest(data_Fm = expand_data, space_col = 'spacing_vec', eff_col = eff_Col, lb_col = "lb" ,ub_col = "ub", log_ES = log_ES, title_text = forest_Title) ## Construct left-hand-side annotations - left <- anot_side( data_Fm = expand_data, space_col = 'spacing_vec', col_names = left_Col_Names, title_list = left_Col_Titles ) + left <- anot_side(data_Fm = expand_data, space_col = 'spacing_vec', col_names = left_Col_Names, title_list = left_Col_Titles) ## Construct right-hand-side annotations - right <- anot_side( data_Fm = expand_data, space_col = 'spacing_vec', col_names = right_Col_Names, title_list = right_Col_Titles ) + right <- anot_side(data_Fm = expand_data, space_col = 'spacing_vec', col_names = right_Col_Names, title_list = right_Col_Titles) ## group all plots 
together group <- group_Plots(forst_Pt = fo1, left_Hs = left, right_Hs = right) diff --git a/R/forest_plot2.R b/R/forest_plot2.R index 93846c6c..155fdc9f 100644 --- a/R/forest_plot2.R +++ b/R/forest_plot2.R @@ -1,7 +1,7 @@ #' Format MR results for forest plot #' #' This function takes the results from [mr()] and is particularly useful -#' if the MR has been applied using multiple exposures and multiple outcomes. +#' if the MR has been applied using multiple exposures and multiple outcomes. #' It creates a new data frame with the following: #' \itemize{ #' \item Variables: exposure, outcome, category, outcome sample size, effect, upper ci, lower ci, pval, nsnp @@ -9,18 +9,18 @@ #' \item exponentiated effects if required #' } #' -#' By default it uses the [available_outcomes()] function to retrieve the study level characteristics for the outcome trait, -#' including sample size and outcome category. +#' By default it uses the [available_outcomes()] function to retrieve the study level characteristics for the outcome trait, +#' including sample size and outcome category. #' This assumes the MR analysis was performed using outcome GWAS(s) contained in MR-Base. -#' -#' If \code{ao_slc} is set to \code{TRUE} then the user must supply their own study level characteristics. -#' This is useful when the user has supplied their own outcome GWAS results (i.e. they are not in MR-Base). -#' +#' +#' If \code{ao_slc} is set to \code{TRUE} then the user must supply their own study level characteristics. +#' This is useful when the user has supplied their own outcome GWAS results (i.e. they are not in MR-Base). +#' #' @param mr_res Results from [mr()]. #' @param exponentiate Convert effects to OR? The default is `FALSE`. #' @param single_snp_method Which of the single SNP methods to use when only 1 SNP was used to estimate the causal effect? The default is `"Wald ratio"`. #' @param multi_snp_method Which of the multi-SNP methods to use when there was more than 1 SNPs used to estimate the causal effect? The default is `"Inverse variance weighted"`. -#' @param ao_slc Logical; retrieve sample size and subcategory using [available_outcomes()]. If set to `FALSE` `mr_res` must contain the following additional columns: `subcategory` and `sample_size`. +#' @param ao_slc Logical; retrieve sample size and subcategory using [available_outcomes()]. If set to `FALSE` `mr_res` must contain the following additional columns: `subcategory` and `sample_size`. #' @param priority Name of category to prioritise at the top of the forest plot. The default is `"Cardiometabolic"`. 
#' #' @export @@ -28,8 +28,8 @@ format_mr_results <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald ratio", multi_snp_method="Inverse variance weighted", ao_slc=TRUE, priority="Cardiometabolic") { # Get extra info on outcomes - if(ao_slc) - { + if(ao_slc) + { ao <- available_outcomes() ao$subcategory[ao$subcategory == "Cardiovascular"] <- "Cardiometabolic" ao$subcategory[ao$trait == "Type 2 diabetes"] <- "Cardiometabolic" @@ -38,9 +38,9 @@ format_mr_results <- function(mr_res, exponentiate=FALSE, single_snp_method="Wal dat <- subset(mr_res, (nsnp==1 & method==single_snp_method) | (nsnp > 1 & method == multi_snp_method)) dat$index <- seq_len(nrow(dat)) - + if(ao_slc) - { + { dat <- merge(dat, ao, by.x="id.outcome", by.y="id") } dat <- dat[order(dat$b), ] @@ -56,11 +56,11 @@ format_mr_results <- function(mr_res, exponentiate=FALSE, single_snp_method="Wal dat$up_ci <- exp(dat$up_ci) dat$lo_ci <- exp(dat$lo_ci) } - + # Organise cats dat$subcategory <- as.factor(dat$subcategory) - - if(!ao_slc) #generate a simple trait column. this contains only the outcome name (ie excludes consortium and year from the outcome column generated by mr()). This step caters to the possibility that a user's results contain a mixture of results obtained via MR-Base and correspondence. The later won't be present in the MR-Base database. However, still need to split the outcome name into trait, year and consortium. + + if(!ao_slc) #generate a simple trait column. this contains only the outcome name (ie excludes consortium and year from the outcome column generated by mr()). This step caters to the possibility that a user's results contain a mixture of results obtained via MR-Base and correspondence. The later won't be present in the MR-Base database. However, still need to split the outcome name into trait, year and consortium. 
{ dat$trait<-dat$outcome @@ -142,7 +142,7 @@ format_mr_results <- function(mr_res, exponentiate=FALSE, single_snp_method="Wal temp1 <- subset(dat, category==priority) temp2 <- subset(dat, category=="Other") dat <- rbind( - subset(dat, category==priority), + subset(dat, category==priority), subset(dat, !category %in% c(priority,"Other")), subset(dat, category=="Other") ) @@ -160,7 +160,7 @@ format_mr_results <- function(mr_res, exponentiate=FALSE, single_snp_method="Wal #' @keywords internal #' @return Character or array of character simple_cap <- function(x) { - sapply(x, function(x){ + sapply(x, function(x) { x <- tolower(x) s <- strsplit(x, " ")[[1]] paste(toupper(substring(s, 1,1)), substring(s, 2), sep="", collapse=" ") @@ -314,11 +314,11 @@ forest_plot_basic <- function(dat, section=NULL, colour_group=NULL, colour_group ggplot2::scale_fill_manual(values=c("#eeeeee", "#ffffff"), guide=FALSE) + ggplot2::theme( axis.line=ggplot2::element_blank(), - axis.text.y=ggplot2::element_blank(), - axis.ticks.y=ggplot2::element_blank(), - axis.text.x=text_colour, - axis.ticks.x=tick_colour, - # strip.text.y=ggplot2::element_text(angle=360, hjust=0), + axis.text.y=ggplot2::element_blank(), + axis.ticks.y=ggplot2::element_blank(), + axis.text.x=text_colour, + axis.ticks.x=tick_colour, + # strip.text.y=ggplot2::element_text(angle=360, hjust=0), strip.background=ggplot2::element_rect(fill="white", colour="white"), strip.text=ggplot2::element_text(family="Courier New", face="bold", size=9), legend.position="none", @@ -379,9 +379,9 @@ forest_plot_names <- function(dat, section=NULL, bottom=TRUE) point_plot <- ggplot2::geom_point(ggplot2::aes(colour=exposure), size=2) outcome_labels <- ggplot2::geom_text( - ggplot2::aes(label=outcome), - x=lo, - y=mean(c(1, length(unique(dat$exposure)))), + ggplot2::aes(label=outcome), + x=lo, + y=mean(c(1, length(unique(dat$exposure)))), hjust=0, vjust=0.5, size=3.5 ) main_title <- section @@ -404,11 +404,11 @@ forest_plot_names <- function(dat, section=NULL, bottom=TRUE) ggplot2::scale_fill_manual(values=c("#eeeeee", "#ffffff"), guide=FALSE) + ggplot2::theme( axis.line=ggplot2::element_blank(), - axis.text.y=ggplot2::element_blank(), - axis.ticks.y=ggplot2::element_blank(), - axis.text.x=text_colour, - axis.ticks.x=tick_colour, - # strip.text.y=ggplot2::element_text(angle=360, hjust=0), + axis.text.y=ggplot2::element_blank(), + axis.ticks.y=ggplot2::element_blank(), + axis.text.x=text_colour, + axis.ticks.x=tick_colour, + # strip.text.y=ggplot2::element_text(angle=360, hjust=0), strip.background=ggplot2::element_rect(fill="white", colour="white"), strip.text=ggplot2::element_text(family="Courier New", face="bold", size=11), legend.position="none", @@ -435,7 +435,7 @@ forest_plot_names <- function(dat, section=NULL, bottom=TRUE) #' Forest plot for multiple exposures and multiple outcomes #' #' Perform MR of multiple exposures and multiple outcomes. This plots the results. -#' +#' #' @param mr_res Results from [mr()]. #' @param exponentiate Convert effects to OR? Default is `FALSE`. #' @param single_snp_method Which of the single SNP methosd to use when only 1 SNP was used to estimate the causal effect? The default is `"Wald ratio"`. 
@@ -455,8 +455,8 @@ forest_plot_names <- function(dat, section=NULL, bottom=TRUE) forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald ratio", multi_snp_method="Inverse variance weighted", group_single_categories=TRUE, by_category=TRUE, in_columns=FALSE, threshold=NULL, xlab="", xlim=NULL, trans="identity",ao_slc=TRUE, priority="Cardiometabolic") { dat <- format_mr_results( - mr_res, - exponentiate=exponentiate, + mr_res, + exponentiate=exponentiate, single_snp_method=single_snp_method, multi_snp_method=multi_snp_method, # group_single_categories=group_single_categories, @@ -478,10 +478,10 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati dat$lab <- create_label(dat$sample_size, dat$outcome) legend <- cowplot::get_legend( - ggplot2::ggplot(dat, ggplot2::aes(x=effect, y=outcome)) + - ggplot2::geom_point(ggplot2::aes(colour=exposure)) + - ggplot2::scale_colour_brewer(type="qual") + - ggplot2::labs(colour="Exposure") + + ggplot2::ggplot(dat, ggplot2::aes(x=effect, y=outcome)) + + ggplot2::geom_point(ggplot2::aes(colour=exposure)) + + ggplot2::scale_colour_brewer(type="qual") + + ggplot2::labs(colour="Exposure") + ggplot2::theme(text=ggplot2::element_text(size=10)) ) @@ -490,9 +490,9 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati if(!in_columns) { return(forest_plot_basic( - dat, - bottom = TRUE, - xlab = xlab, + dat, + bottom = TRUE, + xlab = xlab, trans = trans, xlim = xlim, threshold = threshold @@ -500,8 +500,8 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati } else { l <- list() l[[1]] <- forest_plot_names( - dat, - section = NULL, + dat, + section = NULL, bottom = TRUE ) count <- 2 @@ -509,12 +509,12 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati for(i in seq_along(columns)) { l[[count]] <- forest_plot_basic( - dat, + dat, section=NULL, - bottom = TRUE, - colour_group=columns[i], - colour_group_first = FALSE, - xlab = paste0(xlab, " ", columns[i]), + bottom = TRUE, + colour_group=columns[i], + colour_group_first = FALSE, + xlab = paste0(xlab, " ", columns[i]), trans = trans, xlim = xlim, threshold = threshold @@ -524,9 +524,9 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati return( cowplot::plot_grid( gridExtra::arrangeGrob( - grobs=l, - ncol=length(columns) + 1, - nrow=1, + grobs=l, + ncol=length(columns) + 1, + nrow=1, widths=c(4, rep(5, length(columns))) ) ) @@ -542,10 +542,10 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati for(i in seq_along(sec)) { l[[i]] <- forest_plot_basic( - dat, - sec[i], - bottom = i==length(sec), - xlab = xlab, + dat, + sec[i], + bottom = i==length(sec), + xlab = xlab, trans = trans, xlim = xlim, threshold = threshold @@ -560,7 +560,7 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati gridExtra::arrangeGrob( legend, gridExtra::arrangeGrob(grobs=l, ncol=1, nrow=length(h), heights=h), - ncol=2, nrow=1, widths=c(1, 5) + ncol=2, nrow=1, widths=c(1, 5) ) ) ) @@ -574,20 +574,20 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati { h[i] <- length(unique(subset(dat, category==sec[i])$outcome)) l[[count]] <- forest_plot_names( - dat, - sec[i], + dat, + sec[i], bottom = i==length(sec) ) count <- count + 1 for(j in seq_along(columns)) { l[[count]] <- forest_plot_basic( - dat, - sec[i], - bottom = i==length(sec), - colour_group=columns[j], - colour_group_first = FALSE, - 
xlab = paste0(xlab, " ", columns[j]), + dat, + sec[i], + bottom = i==length(sec), + colour_group=columns[j], + colour_group_first = FALSE, + xlab = paste0(xlab, " ", columns[j]), trans = trans, xlim = xlim, threshold = threshold @@ -601,9 +601,9 @@ forest_plot <- function(mr_res, exponentiate=FALSE, single_snp_method="Wald rati return( cowplot::plot_grid( gridExtra::arrangeGrob( - grobs=l, - ncol=length(columns) + 1, - nrow=length(h), + grobs=l, + ncol=length(columns) + 1, + nrow=length(h), heights=h, widths=c(4, rep(5, length(columns))) ) diff --git a/R/forest_plot_1-to-many.R b/R/forest_plot_1-to-many.R index 8512a258..4083b492 100644 --- a/R/forest_plot_1-to-many.R +++ b/R/forest_plot_1-to-many.R @@ -1,14 +1,14 @@ #' Format MR results for a 1-to-many forest plot #' -#' This function formats user-supplied results for the [forest_plot_1_to_many()] function. -#' The user supplies their results in the form of a data frame. -#' The data frame is assumed to contain at least three columns of data: +#' This function formats user-supplied results for the [forest_plot_1_to_many()] function. +#' The user supplies their results in the form of a data frame. +#' The data frame is assumed to contain at least three columns of data: #' \enumerate{ -#' \item effect estimates, from an analysis of the effect of an exposure on an outcome; -#' \item standard errors for the effect estimates; and +#' \item effect estimates, from an analysis of the effect of an exposure on an outcome; +#' \item standard errors for the effect estimates; and #' \item a column of trait names, corresponding to the 'many' in a 1-to-many forest plot. #' } -#' +#' #' @param mr_res Data frame of results supplied by the user. #' @param b Name of the column specifying the effect of the exposure on the outcome. Default = `"b"`. #' @param se Name of the column specifying the standard error for b. Default = `"se"`. @@ -23,24 +23,24 @@ #' @return data frame. format_1_to_many <- function(mr_res, b="b",se="se",exponentiate=FALSE, ao_slc=FALSE,by=NULL,TraitM="outcome",addcols=NULL,weight=NULL) { - if(!is.null(by)){ + if (!is.null(by)) { mr_res<-mr_res[,names(mr_res)!="subcategory"] names(mr_res)[names(mr_res)==by]<-"subcategory" - }else{ + } else { mr_res$subcategory<-"" } - if(is.null(weight)) { + if (is.null(weight)) { mr_res$weight=3 } - if(TraitM=="exposure"){ #the plot function currently tries to plot separate plots for each unique exposure. This is a legacy of the original multiple exposures forest plot function and needs to be cleaned up. The function won't work if the TraitM column is called exposure + if (TraitM=="exposure") { #the plot function currently tries to plot separate plots for each unique exposure. This is a legacy of the original multiple exposures forest plot function and needs to be cleaned up. 
The function won't work if the TraitM column is called exposure names(mr_res)[names(mr_res)=="exposure"]<-"TraitM" TraitM<-"TraitM" } - names(mr_res)[names(mr_res)==b ]<-"b" - names(mr_res)[names(mr_res)==se ]<-"se" + names(mr_res)[names(mr_res)==b]<-"b" + names(mr_res)[names(mr_res)==se]<-"se" Letters<-c("A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z") Letters<-sort(c(paste0("A",Letters),paste0("B",Letters),paste0("C",Letters),paste0("D",Letters))) mr_res$outcome2<-mr_res[,TraitM] @@ -50,8 +50,8 @@ format_1_to_many <- function(mr_res, b="b",se="se",exponentiate=FALSE, ao_slc=FA mr_res$exposure<-"" # Get extra info on outcomes - if(ao_slc) - { + if(ao_slc) + { ao <- available_outcomes() ao$subcategory[ao$subcategory == "Cardiovascular"] <- "Cardiometabolic" ao$subcategory[ao$trait == "Type 2 diabetes"] <- "Cardiometabolic" @@ -60,9 +60,9 @@ format_1_to_many <- function(mr_res, b="b",se="se",exponentiate=FALSE, ao_slc=FA dat<-mr_res dat$index <- seq_len(nrow(dat)) - + if(ao_slc) - { + { dat <- merge(dat, ao, by.x="id.outcome", by.y="id") } dat <- dat[order(dat$b), ] @@ -78,11 +78,11 @@ format_1_to_many <- function(mr_res, b="b",se="se",exponentiate=FALSE, ao_slc=FA dat$up_ci <- exp(dat$up_ci) dat$lo_ci <- exp(dat$lo_ci) } - + # Organise cats dat$subcategory <- as.factor(dat$subcategory) - - if(!ao_slc) #generate a simple trait column. this contains only the outcome name (ie excludes consortium and year from the outcome column generated by mr()). This step caters to the possibility that a user's results contain a mixture of results obtained via MR-Base and correspondence. The later won't be present in the MR-Base database. However, still need to split the outcome name into trait, year and consortium. + + if(!ao_slc) #generate a simple trait column. this contains only the outcome name (ie excludes consortium and year from the outcome column generated by mr()). This step caters to the possibility that a user's results contain a mixture of results obtained via MR-Base and correspondence. The later won't be present in the MR-Base database. However, still need to split the outcome name into trait, year and consortium. { dat$trait<-as.character(dat[,TraitM]) @@ -113,18 +113,18 @@ format_1_to_many <- function(mr_res, b="b",se="se",exponentiate=FALSE, ao_slc=FA stringsAsFactors = FALSE ) - if(!is.null(addcols)){ + if (!is.null(addcols)) { dat2<-dat[,addcols] dat<-cbind(dat1,dat2) - if(length(addcols)==1){ + if (length(addcols)==1) { names(dat)[names(dat)=="dat2"]<-addcols } - }else{ + } else { dat<-dat1 } exps <- unique(dat$exposure) - + dat <- dat[order(dat$index), ] dat <- dat[order(dat$outcome), ] @@ -134,34 +134,34 @@ format_1_to_many <- function(mr_res, b="b",se="se",exponentiate=FALSE, ao_slc=FA #' Sort results for 1-to-many forest plot #' -#' This function sorts user-supplied results for the [forest_plot_1_to_many()] function. The user supplies their results in the form of a data frame. -#' +#' This function sorts user-supplied results for the [forest_plot_1_to_many()] function. The user supplies their results in the form of a data frame. +#' #' @param mr_res Data frame of results supplied by the user. #' @param b Name of the column specifying the effect of the exposure on the outcome. The default is `"b"`. #' @param trait_m The column specifying the names of the traits. Corresponds to 'many' in the 1-to-many forest plot. The default is `"outcome"`. -#' @param group Name of grouping variable in `mr_res`. 
-#' @param priority If `sort_action = 3`, choose which value of the `trait_m` variable should be given priority and go above the other `trait_m` values. -#' The trait with the largest effect size for the prioritised group will go to the top of the plot. -#' @param sort_action Choose how to sort results. +#' @param group Name of grouping variable in `mr_res`. +#' @param priority If `sort_action = 3`, choose which value of the `trait_m` variable should be given priority and go above the other `trait_m` values. +#' The trait with the largest effect size for the prioritised group will go to the top of the plot. +#' @param sort_action Choose how to sort results. #' \itemize{ -#' \item `sort_action = 1`: sort results by effect size within groups. Use the group order supplied by the user. -#' \item `sort_action = 2`: sort results by effect size and group. Overides the group ordering supplied by the user. +#' \item `sort_action = 1`: sort results by effect size within groups. Use the group order supplied by the user. +#' \item `sort_action = 2`: sort results by effect size and group. Overides the group ordering supplied by the user. #' \item `sort_action = 3`: group results for the same trait together (e.g. multiple results for the same trait from different MR methods). -#' \item `sort_action = 4`: sort by decreasing effect size (largest effect size at top and smallest at bottom). +#' \item `sort_action = 4`: sort by decreasing effect size (largest effect size at top and smallest at bottom). #' \item `sort_action = 5`: sort by increasing effect size (smallest effect size at top and largest at bottom). #' } #' #' @export #' @return data frame. -#' -sort_1_to_many <- function(mr_res,b="b",trait_m="outcome",sort_action=4,group=NULL,priority=NULL){ +#' +sort_1_to_many <- function(mr_res,b="b",trait_m="outcome",sort_action=4,group=NULL,priority=NULL) { mr_res[,trait_m]<-as.character(mr_res[,trait_m]) mr_res[,group]<-as.character(mr_res[,group]) - if(!b %in% names(mr_res)) warning("Column with effect estimates not found. Did you forget to specify the column of data containing your effect estimates?") - if(sort_action==1){ - if(is.null(group)) warning("You must indicate a grouping variable") - + if (!b %in% names(mr_res)) warning("Column with effect estimates not found. 
Did you forget to specify the column of data containing your effect estimates?") + if (sort_action==1) { + if (is.null(group)) warning("You must indicate a grouping variable") + # Numbers<-1:100 Letters<-c("A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z") Letters<-sort(c(paste0("A",Letters),paste0("B",Letters),paste0("C",Letters))) @@ -174,13 +174,13 @@ sort_1_to_many <- function(mr_res,b="b",trait_m="outcome",sort_action=4,group=NU mr_res<-mr_res[,!names(mr_res) %in% c("Index","Index2","Index3")] } - if(sort_action ==2){ - if(is.null(group)) warning("You must indicate a grouping variable") + if (sort_action ==2) { + if (is.null(group)) warning("You must indicate a grouping variable") mr_res<-mr_res[order(mr_res[,b],decreasing=TRUE),] mr_res<-mr_res[order(mr_res[,group]),] } - - if(sort_action==3){ + + if (sort_action==3) { if(is.null(group)) warning("You must indicate a grouping variable") if(is.null(priority)) warning("You must indicate which value of the grouping variable ",group," to use as the priority value") @@ -190,8 +190,7 @@ sort_1_to_many <- function(mr_res,b="b",trait_m="outcome",sort_action=4,group=NU mr_res1$b.sort[mr_res1[,trait_m]==priority]<-mr_res1[,b][mr_res1[,trait_m]==priority] # mr_res1$b.sort[mr_res1[,group]==priority]<-1000 - for(i in unique(mr_res1[,group])) - { + for (i in unique(mr_res1[,group])) { mr_res1$b.sort[mr_res1[,group] == i & is.na(mr_res1$b.sort)]<-mr_res1$b.sort[mr_res1[,group]== i & !is.na(mr_res1$b.sort)] } # mr_res1$b.sort[is.na(mr_res1$b.sort)]<-mr_res1$b.sort[!is.na(mr_res1$b.sort)] @@ -201,26 +200,25 @@ sort_1_to_many <- function(mr_res,b="b",trait_m="outcome",sort_action=4,group=NU mr_res<-mr_res[order(mr_res$b.sort,decreasing=TRUE),] groups<-unique(mr_res[,group]) List<-NULL - for(i in seq_along(groups)){ + for (i in seq_along(groups)) { Test<-mr_res[mr_res[,group]==groups[i],] Test1<-Test[Test[,trait_m] != priority,] Test2<-Test[Test[,trait_m] == priority,] List[[i]]<-rbind(Test2,Test1) } mr_res<-do.call(rbind,List) - } - if(sort_action ==4){ + if (sort_action ==4) { mr_res<-mr_res[order(mr_res[,b],decreasing=TRUE),] } - if(sort_action ==5){ + if (sort_action ==5) { mr_res<-mr_res[order(mr_res[,b],decreasing=FALSE),] } return(mr_res) - + } #' A basic forest plot @@ -270,7 +268,7 @@ forest_plot_basic2 <- function(dat, section=NULL, colour_group=NULL, colour_grou dat$up_ci <- pmin(dat$up_ci, xlim[2], na.rm=TRUE) } - if(is.null(up) || is.null(lo) ){ + if (is.null(up) || is.null(lo)) { up <- max(dat$up_ci, na.rm=TRUE) lo <- min(dat$lo_ci, na.rm=TRUE) } @@ -333,11 +331,11 @@ forest_plot_basic2 <- function(dat, section=NULL, colour_group=NULL, colour_grou ggplot2::scale_fill_manual(values=c("#eeeeee", "#ffffff"), guide=FALSE) + ggplot2::theme( axis.line=ggplot2::element_blank(), - axis.text.y=ggplot2::element_blank(), - axis.ticks.y=ggplot2::element_blank(), - axis.text.x=text_colour, - axis.ticks.x=tick_colour, - # strip.text.y=ggplot2::element_text(angle=360, hjust=0), + axis.text.y=ggplot2::element_blank(), + axis.ticks.y=ggplot2::element_blank(), + axis.text.x=text_colour, + axis.ticks.x=tick_colour, + # strip.text.y=ggplot2::element_text(angle=360, hjust=0), strip.background=ggplot2::element_rect(fill="white", colour="white"), strip.text=ggplot2::element_text(family="Courier New", face="bold", size=9), legend.position="none", @@ -398,20 +396,20 @@ forest_plot_names2 <- function(dat, section=NULL, var1="outcome2",bottom=TRUE,ti point_plot <- 
ggplot2::geom_point(ggplot2::aes(colour=exposure), size=2) outcome_labels <- ggplot2::geom_text( - ggplot2::aes(label=eval(parse(text=var1))), - x=lo, - y=mean(c(1, length(unique(dat$exposure)))), + ggplot2::aes(label=eval(parse(text=var1))), + x=lo, + y=mean(c(1, length(unique(dat$exposure)))), hjust=0, vjust=0.5, size=col_text_size,color=colour_scheme ) # print(paste0("title=",title)) if(section=="") main_title <- title - + dat$lab<-dat$outcome l <- data.frame(lab=sort(unique(dat$lab)), col="a", stringsAsFactors=FALSE) - l$col[1:nrow(l) %% 2 == 0] <- "b" + l$col[seq_len(nrow(l)) %% 2 == 0] <- "b" dat <- merge(dat, l, by="lab", all.x=TRUE) @@ -423,11 +421,11 @@ forest_plot_names2 <- function(dat, section=NULL, var1="outcome2",bottom=TRUE,ti ggplot2::scale_fill_manual(values=c("#eeeeee", "#ffffff"), guide=FALSE) + ggplot2::theme( axis.line=ggplot2::element_blank(), - axis.text.y=ggplot2::element_blank(), - axis.ticks.y=ggplot2::element_blank(), - axis.text.x=text_colour, - axis.ticks.x=tick_colour, - # strip.text.y=ggplot2::element_text(angle=360, hjust=0), + axis.text.y=ggplot2::element_blank(), + axis.ticks.y=ggplot2::element_blank(), + axis.text.x=text_colour, + axis.ticks.x=tick_colour, + # strip.text.y=ggplot2::element_text(angle=360, hjust=0), strip.background=ggplot2::element_rect(fill="white", colour="white"), strip.text=ggplot2::element_text(family="Courier New", face="bold", size=11), legend.position="none", @@ -485,9 +483,9 @@ forest_plot_addcol <- function(dat, section=NULL, addcol=NULL,bottom=TRUE,addcol point_plot <- ggplot2::geom_point(ggplot2::aes(colour=exposure), size=2) outcome_labels <- ggplot2::geom_text( - ggplot2::aes(label=eval(parse(text=addcol))), + ggplot2::aes(label=eval(parse(text=addcol))), x=lo, - y=mean(c(1, length(unique(dat$exposure)))), + y=mean(c(1, length(unique(dat$exposure)))), hjust=0, vjust=0.5, size=col_text_size,colour=colour_scheme ) @@ -507,11 +505,11 @@ forest_plot_addcol <- function(dat, section=NULL, addcol=NULL,bottom=TRUE,addcol ggplot2::scale_fill_manual(values=c("#eeeeee", "#ffffff"), guide=FALSE) + ggplot2::theme( axis.line=ggplot2::element_blank(), - axis.text.y=ggplot2::element_blank(), - axis.ticks.y=ggplot2::element_blank(), - axis.text.x=text_colour, - axis.ticks.x=tick_colour, - # strip.text.y=ggplot2::element_text(angle=360, hjust=0), + axis.text.y=ggplot2::element_blank(), + axis.ticks.y=ggplot2::element_blank(), + axis.text.x=text_colour, + axis.ticks.x=tick_colour, + # strip.text.y=ggplot2::element_text(angle=360, hjust=0), strip.background=ggplot2::element_rect(fill="white", colour="white"), strip.text=ggplot2::element_text(family="Courier New", face="bold", size=11), legend.position="none", @@ -533,13 +531,13 @@ forest_plot_addcol <- function(dat, section=NULL, addcol=NULL,bottom=TRUE,addcol return(p) } -#' 1-to-many forest plot +#' 1-to-many forest plot #' #' Plot results from an analysis of multiple exposures against a single outcome or a single exposure against multiple outcomes. #' Plots effect estimates and 95 percent confidence intervals. #' The ordering of results in the plot is determined by the order supplied by the user. -#' Users may find [sort_1_to_many()] helpful for sorting their results prior to using the 1-to-many forest plot. The plot function works best for 50 results and is not designed to handle more than 100 results. -#' +#' Users may find [sort_1_to_many()] helpful for sorting their results prior to using the 1-to-many forest plot. 
The plot function works best for 50 results and is not designed to handle more than 100 results. +#' #' @param mr_res Data frame of results supplied by the user. The default is `"mr_res"`. #' @param b Name of the column specifying the effect of the exposure on the outcome. The default is `"b"`. #' @param se Name of the column specifying the standard error for b. The default is `"se"`. @@ -555,43 +553,43 @@ forest_plot_addcol <- function(dat, section=NULL, addcol=NULL,bottom=TRUE,addcol #' @param exponentiate Convert log odds ratios to odds ratios? Default is `FALSE`. #' @param ao_slc Logical; retrieve trait subcategory information using available_outcomes(). Default is `FALSE`. #' @param trans Specify x-axis scale. e.g. "identity", "log2", etc. If set to "identity" an additive scale is used. If set to log2 the x-axis is plotted on a multiplicative / doubling scale (preferable when plotting odds ratios). Default is `"identity"`. -#' @param lo Lower limit of X axis to plot. -#' @param up upper limit of X axis to plot. -#' @param colour_scheme the general colour scheme for the plot. Default is to make all text and data points `"black"`. +#' @param lo Lower limit of X axis to plot. +#' @param up upper limit of X axis to plot. +#' @param colour_scheme the general colour scheme for the plot. Default is to make all text and data points `"black"`. #' @param shape_points the shape of the data points to pass to [ggplot2::geom_point()]. Default is set to `15` (filled square). #' @param col_text_size The default is `5`. #' @param weight The default is `NULL`. #' #' @export #' @return grid plot object -#' -forest_plot_1_to_many <- function(mr_res="mr_res", b="b",se="se",TraitM="outcome",col1_width=1,col1_title="",exponentiate=FALSE, trans="identity",ao_slc=TRUE,lo=NULL,up=NULL,by=NULL,xlab="Effect (95% confidence interval)",addcols=NULL,addcol_widths=NULL,addcol_titles="",subheading_size=6,shape_points=15,colour_scheme="black",col_text_size=5,weight=NULL){ +#' +forest_plot_1_to_many <- function(mr_res="mr_res", b="b",se="se",TraitM="outcome",col1_width=1,col1_title="",exponentiate=FALSE, trans="identity",ao_slc=TRUE,lo=NULL,up=NULL,by=NULL,xlab="Effect (95% confidence interval)",addcols=NULL,addcol_widths=NULL,addcol_titles="",subheading_size=6,shape_points=15,colour_scheme="black",col_text_size=5,weight=NULL) { # if(is.null(lo) | is.null(up)) warning("Values missing for the lower or upper bounds of the x axis. 
Did you forget to set the lo and up arguments?") - + xlim=NULL ncols=1+length(addcols) - if(addcol_titles==""){ + if (addcol_titles=="") { addcol_titles<-rep(addcol_titles,length(addcols)) } - + dat <- format_1_to_many( - mr_res=mr_res, + mr_res=mr_res, b=b, se=se, - exponentiate=exponentiate, + exponentiate=exponentiate, ao_slc=ao_slc, by=by, TraitM=TraitM, addcols=addcols, weight=weight ) - + legend <- cowplot::get_legend( - ggplot2::ggplot(dat, ggplot2::aes(x=effect, y=outcome)) + - ggplot2::geom_point(ggplot2::aes(colour=exposure)) + - ggplot2::scale_colour_brewer(type="qual") + - ggplot2::labs(colour="Exposure") + + ggplot2::ggplot(dat, ggplot2::aes(x=effect, y=outcome)) + + ggplot2::geom_point(ggplot2::aes(colour=exposure)) + + ggplot2::scale_colour_brewer(type="qual") + + ggplot2::labs(colour="Exposure") + ggplot2::theme(text=ggplot2::element_text(size=10)) ) @@ -607,7 +605,7 @@ forest_plot_1_to_many <- function(mr_res="mr_res", b="b",se="se",TraitM="outcome h[i] <- length(unique(subset(dat, category==sec[i])$outcome)) l[[count]] <- forest_plot_names2( - dat, + dat, sec[i], bottom = i==length(sec), title=col1_title, @@ -619,9 +617,9 @@ forest_plot_1_to_many <- function(mr_res="mr_res", b="b",se="se",TraitM="outcome count <- count + 1 - if(!is.null(addcols)){ + if (!is.null(addcols)) { - for(j in seq_along(addcols)){ + for (j in seq_along(addcols)) { l[[count]]<-forest_plot_addcol( dat, sec[i], @@ -639,15 +637,14 @@ forest_plot_1_to_many <- function(mr_res="mr_res", b="b",se="se",TraitM="outcome } - for(j in seq_along(columns)) - { + for (j in seq_along(columns)) { l[[count]] <- forest_plot_basic2( - dat, - sec[i], - bottom = i==length(sec), - colour_group=columns[j], - colour_group_first = FALSE, - xlab = paste0(xlab, " ", columns[j]), + dat, + sec[i], + bottom = i==length(sec), + colour_group=columns[j], + colour_group_first = FALSE, + xlab = paste0(xlab, " ", columns[j]), lo=lo, up=up, trans = trans, @@ -664,12 +661,12 @@ forest_plot_1_to_many <- function(mr_res="mr_res", b="b",se="se",TraitM="outcome return( cowplot::plot_grid( gridExtra::arrangeGrob( - grobs=l, - ncol=length(columns) + ncols, - nrow=length(h), + grobs=l, + ncol=length(columns) + ncols, + nrow=length(h), heights=h, widths=c(col1_width,addcol_widths, rep(5, length(columns))) - + ) ) ) diff --git a/R/format_mr_results2.R b/R/format_mr_results2.R index ab906663..41414d21 100644 --- a/R/format_mr_results2.R +++ b/R/format_mr_results2.R @@ -1,4 +1,4 @@ -#' Split outcome column +#' Split outcome column #' #' This function takes the outcome column from the results generated by [mr()] and splits it into separate columns for 'outcome name' and 'id'. #' @@ -9,7 +9,7 @@ split_outcome <- function(mr_res) { Pos<-grep("\\|\\|",mr_res$outcome) #the "||"" indicates that the outcome column was derived from summary data in MR-Base. Sometimes it wont look like this e.g. if the user has supplied their own outcomes - if(sum(Pos)!=0){ + if (sum(Pos)!=0) { Outcome<-as.character(mr_res$outcome[Pos]) Vars<-strsplit(Outcome,split= "\\|\\|") Vars<-unlist(Vars) @@ -22,7 +22,7 @@ split_outcome <- function(mr_res) return(mr_res) } -#' Split exposure column +#' Split exposure column #' #' This function takes the exposure column from the results generated by [mr()] and splits it into separate columns for 'exposure name' and 'id'. #' @@ -33,10 +33,10 @@ split_outcome <- function(mr_res) split_exposure <- function(mr_res) { Pos<-grep("\\|\\|",mr_res$exposure) #the "||"" indicates that the outcome column was derived from summary data in MR-Base. 
Sometimes it wont look like this e.g. if the user has supplied their own outcomes - # Pos2<-grep("\\|\\|",mr_res$exposure,invert=T) + # Pos2<-grep("\\|\\|",mr_res$exposure,invert=T) # mr_res2 <-mr_res[Pos2,] # mr_res1<-mr_res[Pos,] - if(sum(Pos)!=0){ + if (sum(Pos)!=0) { Exposure<-as.character(mr_res$exposure[Pos]) Vars<-strsplit(as.character(Exposure),split= "\\|\\|") Vars<-unlist(Vars) @@ -49,15 +49,15 @@ split_exposure <- function(mr_res) } -#' Generate odds ratios +#' Generate odds ratios #' #' This function takes b and se from [mr()] and generates odds ratios and 95 percent confidence intervals. #' #' @param mr_res Results from [mr()]. #' #' @export -#' @return data frame -generate_odds_ratios <- function(mr_res) +#' @return data frame +generate_odds_ratios <- function(mr_res) { mr_res$lo_ci <- mr_res$b - 1.96 * mr_res$se mr_res$up_ci <- mr_res$b + 1.96 * mr_res$se @@ -86,25 +86,25 @@ subset_on_method <- function(mr_res, single_snp_method="Wald ratio", multi_snp_m #' Combine all mr results #' -#' This function combines results of [mr()], [mr_heterogeneity()], [mr_pleiotropy_test()] and [mr_singlesnp()] into a single data frame. -#' It also merges the results with outcome study level characteristics in [available_outcomes()]. -#' If desired it also exponentiates results (e.g. if the user wants log odds ratio converted into odds ratios with 95 percent confidence intervals). -#' The exposure and outcome columns from the output from [mr()] contain both the trait names and trait ids. -#' The `combine_all_mrresults()` function splits these into separate columns by default. -#' +#' This function combines results of [mr()], [mr_heterogeneity()], [mr_pleiotropy_test()] and [mr_singlesnp()] into a single data frame. +#' It also merges the results with outcome study level characteristics in [available_outcomes()]. +#' If desired it also exponentiates results (e.g. if the user wants log odds ratio converted into odds ratios with 95 percent confidence intervals). +#' The exposure and outcome columns from the output from [mr()] contain both the trait names and trait ids. +#' The `combine_all_mrresults()` function splits these into separate columns by default. +#' #' @param res Results from [mr()]. #' @param het Results from [mr_heterogeneity()]. #' @param plt Results from [mr_pleiotropy_test()]. #' @param sin Results from [mr_singlesnp()]. -#' @param ao_slc Logical; if set to `TRUE` then outcome study level characteristics are retrieved from [available_outcomes()]. Default is `TRUE`. -#' @param Exp Logical; if set to `TRUE` results are exponentiated. Useful if user wants log odds ratios expressed as odds ratios. Default is `FALSE`. -#' @param split.exposure Logical; if set to `TRUE` the exposure column is split into separate columns for the exposure name and exposure ID. Default is `FALSE`. -#' @param split.outcome Logical; if set to `TRUE` the outcome column is split into separate columns for the outcome name and outcome ID. Default is `FALSE`. -#' +#' @param ao_slc Logical; if set to `TRUE` then outcome study level characteristics are retrieved from [available_outcomes()]. Default is `TRUE`. +#' @param Exp Logical; if set to `TRUE` results are exponentiated. Useful if user wants log odds ratios expressed as odds ratios. Default is `FALSE`. +#' @param split.exposure Logical; if set to `TRUE` the exposure column is split into separate columns for the exposure name and exposure ID. Default is `FALSE`. 
+#' @param split.outcome Logical; if set to `TRUE` the outcome column is split into separate columns for the outcome name and outcome ID. Default is `FALSE`. +#' #' @export #' @return data frame -# +# # library(TwoSampleMR) # library(MRInstruments) @@ -116,7 +116,7 @@ subset_on_method <- function(mr_res, single_snp_method="Wald ratio", multi_snp_m # ) # dat <- harmonise_data( -# exposure_dat = exp_dat, +# exposure_dat = exp_dat, # outcome_dat = chd_out_dat # ) @@ -135,11 +135,11 @@ subset_on_method <- function(mr_res, single_snp_method="Wald ratio", multi_snp_m combine_all_mrresults <- function(res,het,plt,sin,ao_slc=TRUE,Exp=FALSE,split.exposure=FALSE,split.outcome=FALSE) { het<-het[,c("id.exposure","id.outcome","method","Q","Q_df","Q_pval")] - + # Convert all factors to character # lapply(names(Res), FUN=function(x) class(Res[,x])) - Class<-unlist(lapply(names(res), FUN=function(x) class(res[,x]))) - if(any(Class == "factor")) { + Class<-unlist(lapply(names(res), FUN=function(x) class(res[,x]))) + if(any(Class == "factor")) { Pos<-which(unlist(lapply(names(res), FUN=function(x) class(res[,x])))=="factor") for(i in seq_along(Pos)){ res[,Pos[i]]<-as.character(res[,Pos[i]]) @@ -148,7 +148,7 @@ combine_all_mrresults <- function(res,het,plt,sin,ao_slc=TRUE,Exp=FALSE,split.ex # lapply(names(Het), FUN=function(x) class(Het[,x])) Class<-unlist(lapply(names(het), FUN=function(x) class(het[,x]))) - if(any(Class == "factor")) { + if(any(Class == "factor")) { Pos<-which(unlist(lapply(names(het), FUN=function(x) class(het[,x])))=="factor") for(i in seq_along(Pos)){ het[,Pos[i]]<-as.character(het[,Pos[i]]) @@ -157,7 +157,7 @@ combine_all_mrresults <- function(res,het,plt,sin,ao_slc=TRUE,Exp=FALSE,split.ex # lapply(names(Sin), FUN=function(x) class(Sin[,x])) Class<-unlist(lapply(names(sin), FUN=function(x) class(sin[,x]))) - if(any(Class == "factor")) { + if(any(Class == "factor")) { Pos<-which(unlist(lapply(names(sin), FUN=function(x) class(sin[,x])))=="factor") for(i in seq_along(Pos)){ sin[,Pos[i]]<-as.character(sin[,Pos[i]]) @@ -181,7 +181,7 @@ combine_all_mrresults <- function(res,het,plt,sin,ao_slc=TRUE,Exp=FALSE,split.ex if(ao_slc) { ao<-available_outcomes() - names(ao)[names(ao)=="nsnp" ]<-"nsnps.outcome.array" + names(ao)[names(ao)=="nsnp"]<-"nsnps.outcome.array" res<-merge(res,ao[,!names(ao) %in% c("unit","priority","sd","path","note","filename","access","mr")],by.x="id.outcome",by.y="id") } @@ -197,7 +197,7 @@ combine_all_mrresults <- function(res,het,plt,sin,ao_slc=TRUE,Exp=FALSE,split.ex } } - if(Exp){ + if (Exp) { res$or<-exp(res$b) res$or_lci95<-exp(res$b-res$se*1.96) res$or_uci95<-exp(res$b+res$se*1.96) @@ -212,11 +212,11 @@ combine_all_mrresults <- function(res,het,plt,sin,ao_slc=TRUE,Exp=FALSE,split.ex res<-merge(res,plt,by=c("id.outcome","id.exposure","Method"),all.x=TRUE) - if(split.exposure){ + if (split.exposure) { res<-split_exposure(res) } - - if(split.outcome){ + + if (split.outcome) { res<-split_outcome(res) } @@ -228,29 +228,29 @@ combine_all_mrresults <- function(res,het,plt,sin,ao_slc=TRUE,Exp=FALSE,split.ex return(res) } -#' Power prune +#' Power prune #' -#' When there are duplicate summary sets for a particular exposure-outcome combination, this function keeps the -#' exposure-outcome summary set with the highest expected statistical power. -#' This can be done by dropping the duplicate summary sets with the smaller sample sizes. -#' Alternatively, the pruning procedure can take into account instrument strength and outcome sample size. 
-#' The latter is useful, for example, when there is considerable variation in SNP coverage between duplicate summary sets -#' (e.g. because some studies have used targeted or fine mapping arrays). -#' If there are a large number of SNPs available to instrument an exposure, -#' the outcome GWAS with the better SNP coverage may provide better power than the outcome GWAS with the larger sample size. +#' When there are duplicate summary sets for a particular exposure-outcome combination, this function keeps the +#' exposure-outcome summary set with the highest expected statistical power. +#' This can be done by dropping the duplicate summary sets with the smaller sample sizes. +#' Alternatively, the pruning procedure can take into account instrument strength and outcome sample size. +#' The latter is useful, for example, when there is considerable variation in SNP coverage between duplicate summary sets +#' (e.g. because some studies have used targeted or fine mapping arrays). +#' If there are a large number of SNPs available to instrument an exposure, +#' the outcome GWAS with the better SNP coverage may provide better power than the outcome GWAS with the larger sample size. #' #' @param dat Results from [harmonise_data()]. -#' @param method Should the duplicate summary sets be pruned on the basis of sample size alone (`method = 1`) -#' or a combination of instrument strength and sample size (`method = 2`)? Default set to `1`. -#' When set to 1, the duplicate summary sets are first dropped on the basis of the outcome sample size (smaller duplicates dropped). -#' If duplicates are still present, remaining duplicates are dropped on the basis of the exposure sample size (smaller duplicates dropped). -#' When method is set to `2`, duplicates are dropped on the basis of instrument strength -#' (amount of variation explained in the exposure by the instrumental SNPs) and sample size, -#' and assumes that the SNP-exposure effects correspond to a continuous trait with a normal distribution (i.e. exposure cannot be binary). -#' The SNP-outcome effects can correspond to either a binary or continuous trait. If the exposure is binary then `method=1` should be used. -#' -#' @param dist.outcome The distribution of the outcome. Can either be `"binary"` or `"continuous"`. Default set to `"binary"`. -#' +#' @param method Should the duplicate summary sets be pruned on the basis of sample size alone (`method = 1`) +#' or a combination of instrument strength and sample size (`method = 2`)? Default set to `1`. +#' When set to 1, the duplicate summary sets are first dropped on the basis of the outcome sample size (smaller duplicates dropped). +#' If duplicates are still present, remaining duplicates are dropped on the basis of the exposure sample size (smaller duplicates dropped). +#' When method is set to `2`, duplicates are dropped on the basis of instrument strength +#' (amount of variation explained in the exposure by the instrumental SNPs) and sample size, +#' and assumes that the SNP-exposure effects correspond to a continuous trait with a normal distribution (i.e. exposure cannot be binary). +#' The SNP-outcome effects can correspond to either a binary or continuous trait. If the exposure is binary then `method=1` should be used. +#' +#' @param dist.outcome The distribution of the outcome. Can either be `"binary"` or `"continuous"`. Default set to `"binary"`. 
+#' #' @export #' @return data.frame with duplicate summary sets removed @@ -258,12 +258,12 @@ power_prune <- function(dat,method=1,dist.outcome="binary") { # dat[,c("eaf.exposure","beta.exposure","se.exposure","samplesize.outcome","ncase.outcome","ncontrol.outcome")] - if(method==1){ - L<-NULL + if (method==1) { + L<-NULL id.sets<-paste(split_exposure(dat)$exposure,split_outcome(dat)$outcome) id.set.unique<-unique(id.sets) dat$id.set<-as.numeric(factor(id.sets)) - for(i in seq_along(id.set.unique)){ + for (i in seq_along(id.set.unique)) { # print(i) print(paste("finding summary set for --", id.set.unique[i],"-- with largest sample size", sep="")) dat1<-dat[id.sets == id.set.unique[i],] @@ -271,14 +271,14 @@ power_prune <- function(dat,method=1,dist.outcome="binary") id.subset.unique<-unique(id.subset) dat1$id.subset<-as.numeric(factor(id.subset)) ncase<-dat1$ncase.outcome - if(is.null(ncase)){ + if (is.null(ncase)) { ncase<-NA } - if(any(is.na(ncase))){ + if (any(is.na(ncase))) { ncase<-dat1$samplesize.outcome if(dist.outcome=="binary") warning(paste("dist.outcome set to binary but case sample size is missing. Will use total sample size instead but power pruning may be less accurate")) } - if(any(is.na(ncase))) stop("sample size missing for at least 1 summary set") + if (any(is.na(ncase))) stop("sample size missing for at least 1 summary set") dat1<-dat1[order(ncase,decreasing=TRUE),] # id.expout<-paste(split_exposure(dat)$exposure,split_outcome(dat)$outcome) ncase<-ncase[order(ncase,decreasing=TRUE)] @@ -295,7 +295,7 @@ power_prune <- function(dat,method=1,dist.outcome="binary") # dat1<-dat1[,!names(dat1) %in% c("power.prune.ncase","power.prune.nexp")] # dat1[,c("samplesize.exposure","ncase.outcome","exposure","outcome")] dat1<-dat1[nexp==nexp[1],] - L[[i]]<-dat1 + L[[i]]<-dat1 } dat<-do.call(rbind,L) dat<-dat[,!names(dat1) %in% c("id.set","id.subset")] @@ -305,19 +305,19 @@ power_prune <- function(dat,method=1,dist.outcome="binary") return(dat) } - if(method==2){ - L<-NULL + if (method==2) { + L<-NULL id.sets<-paste(split_exposure(dat)$exposure,split_outcome(dat)$outcome) id.set.unique<-unique(id.sets) dat$id.set<-as.numeric(factor(id.sets)) - for(i in seq_along(id.set.unique)){ + for (i in seq_along(id.set.unique)) { dat1<-dat[id.sets == id.set.unique[i],] # unique(dat1[,c("exposure","outcome")]) id.subset<-paste(dat1$exposure,dat1$id.exposure,dat1$outcome,dat1$id.outcome) id.subset.unique<-unique(id.subset) dat1$id.subset<-as.numeric(factor(id.subset)) L1<-NULL - for(j in seq_along(id.subset.unique)){ + for (j in seq_along(id.subset.unique)) { # print(j) print(paste("identifying best powered summary set: ",id.subset.unique[j],sep="")) dat2<-dat1[id.subset ==id.subset.unique[j], ] @@ -326,27 +326,27 @@ power_prune <- function(dat,method=1,dist.outcome="binary") se<-dat2$se.exposure z<-dat2$beta.exposure/dat2$se.exposure n<-dat2$samplesize.exposure - b<-z/sqrt(2*p*(1-p)*(n+z^2)) - if(any(is.na(dat2$ncase.outcome))) stop(paste("number of cases missing for summary set: ",id.subset.unique[j],sep="")) + b<-z/sqrt(2*p*(1-p)*(n+z^2)) + if (any(is.na(dat2$ncase.outcome))) stop(paste("number of cases missing for summary set: ",id.subset.unique[j],sep="")) n.cas<-dat2$ncase.outcome n.con<-dat2$ncontrol.outcome - var<-1 # variance of risk factor assumed to be 1 + var<-1 # variance of risk factor assumed to be 1 r2<-2*b^2*p*(1-p)/var - if(any(is.na(r2))) warning("beta or allele frequency missing for some SNPs, which could affect accuracy of power pruning") + if (any(is.na(r2))) warning("beta or 
allele frequency missing for some SNPs, which could affect accuracy of power pruning") r2<-r2[!is.na(r2)] # k<-length(p[!is.na(p)]) #number of SNPs in the instrument / associated with the risk factor # n<-min(n) #sample size of the exposure/risk factor GWAS r2sum<-sum(r2) # sum of the r-squares for each SNP in the instrument # F<-r2sum*(n-1-k)/((1-r2sum*k ) - if(dist.outcome == "continuous"){ - iv.se<- 1/sqrt(mean(dat2$samplesize.outcome)*r2sum) #standard error of the IV should be proportional to this + if (dist.outcome == "continuous") { + iv.se<- 1/sqrt(mean(dat2$samplesize.outcome, na.rm = TRUE)*r2sum) #standard error of the IV should be proportional to this } - if(dist.outcome == "binary"){ + if (dist.outcome == "binary") { if(any(is.na(n.cas)) || any(is.na(n.con))) { warning("dist.outcome set to binary but number of cases or controls is missing. Will try using total sample size instead but power pruning will be less accurate") - iv.se<- 1/sqrt(mean(dat2$samplesize.outcome)*r2sum) + iv.se<- 1/sqrt(mean(dat2$samplesize.outcome, na.rm = TRUE)*r2sum) } else { - iv.se<-1/sqrt(mean(n.cas)*mean(n.con)*r2sum) #standard error of the IV should be proportional to this + iv.se<-1/sqrt(mean(n.cas, na.rm = TRUE)*mean(n.con, na.rm = TRUE)*r2sum) #standard error of the IV should be proportional to this } } # Power calculations to implement at some point @@ -355,24 +355,24 @@ power_prune <- function(dat,method=1,dist.outcome="binary") # ratio<-unique(n.cas/n.con) # sig<-alpha #alpha # b1=log(or) # assumed log odds ratio - # power<-pnorm(sqrt(n.outcome*r2sum*(ratio/(1+ratio))*(1/(1+ratio)))*b1-qnorm(1-sig/2)) + # power<-pnorm(sqrt(n.outcome*r2sum*(ratio/(1+ratio))*(1/(1+ratio)))*b1-qnorm(1-sig/2)) dat2$iv.se<-iv.se # dat2$power<-power L1[[j]]<-dat2 } - L[[i]]<-do.call(rbind,L1) + L[[i]]<-do.call(rbind,L1) } dat2<-do.call(rbind,L) dat2<-dat2[order(dat2$id.set,dat2$iv.se),] id.sets<-unique(dat2$id.set) id.keep<-NULL - for(i in seq_along(id.sets)){ + for (i in seq_along(id.sets)) { # print(i) # print(id.sets[i]) id.temp<-unique(dat2[dat2$id.set==id.sets[i],c("id.set","id.subset")]) id.keep[[i]]<-paste(id.temp$id.set,id.temp$id.subset)[1] } - + dat2$power.prune<-"drop" dat2$power.prune[paste(dat2$id.set,dat2$id.subset) %in% id.keep]<-"keep" # if(drop.duplicates == T) { @@ -387,18 +387,18 @@ power_prune <- function(dat,method=1,dist.outcome="binary") # dat2[dat2$id.set==1,c("iv.se","id.set","id.subset")] return(dat) - + } } -#' Size prune +#' Size prune #' -#' Whens there are duplicate summary sets for a particular exposure-outcome combination, -#' this function drops the duplicates with the smaller total sample size -#' (for binary outcomes, the number of cases is used instead of total sample size). +#' Whens there are duplicate summary sets for a particular exposure-outcome combination, +#' this function drops the duplicates with the smaller total sample size +#' (for binary outcomes, the number of cases is used instead of total sample size). #' #' @param dat Results from [harmonise_data()]. -#' +#' #' @export #' @return data frame diff --git a/R/harmonise.R b/R/harmonise.R index 13852809..82e585fd 100644 --- a/R/harmonise.R +++ b/R/harmonise.R @@ -6,7 +6,7 @@ #' @details #' Expects data in the format generated by [read_exposure_data()] and [extract_outcome_data()]. #' This means the inputs must be dataframes with the following columns: -#' +#' #' \code{outcome_dat}: #' \itemize{ #' \item \code{SNP} @@ -28,18 +28,18 @@ #' \item \code{eaf.exposure} #' } #' -#' The function tries to harmonise INDELs. 
If they are coded as sequence strings things work more smoothly. -#' If they are coded as D/I in one dataset it will try to convert them to sequences if the other dataset has adequate information. -#' If coded as D/I in one dataset and as a variant with equal length INDEL alleles in the other, the variant is dropped. +#' The function tries to harmonise INDELs. If they are coded as sequence strings things work more smoothly. +#' If they are coded as D/I in one dataset it will try to convert them to sequences if the other dataset has adequate information. +#' If coded as D/I in one dataset and as a variant with equal length INDEL alleles in the other, the variant is dropped. #' If one or both the datasets only has one allele (i.e. the effect allele) then harmonisation is naturally going to be more ambiguous and more variants will be dropped. #' #' @param exposure_dat Output from [read_exposure_data()]. #' @param outcome_dat Output from [extract_outcome_data()]. -#' @param action Level of strictness in dealing with SNPs. +#' @param action Level of strictness in dealing with SNPs. #' * `action = 1`: Assume all alleles are coded on the forward strand, i.e. do not attempt to flip alleles -#' * `action = 2`: Try to infer positive strand alleles, using allele frequencies for palindromes (default, conservative); -#' * `action = 3`: Correct strand for non-palindromic SNPs, and drop all palindromic SNPs from the analysis (more conservative). -#' If a single value is passed then this action is applied to all outcomes. +#' * `action = 2`: Try to infer positive strand alleles, using allele frequencies for palindromes (default, conservative); +#' * `action = 3`: Correct strand for non-palindromic SNPs, and drop all palindromic SNPs from the analysis (more conservative). +#' If a single value is passed then this action is applied to all outcomes. #' But multiple values can be supplied as a vector, each element relating to a different outcome. #' #' @export @@ -656,7 +656,7 @@ check_required_columns <- function(dat, type="exposure") index <- required_columns %in% names(dat) if(!all(index)) { - stop("The following required columns are missing from ", type, ": ", paste(required_columns[!index], collapse=", ")) + stop("The following required columns are missing from ", type, ": ", paste(required_columns[!index], collapse=", ")) } return(NULL) } diff --git a/R/heterogeneity.R b/R/heterogeneity.R index 9796ba4c..b063bb6d 100644 --- a/R/heterogeneity.R +++ b/R/heterogeneity.R @@ -1,5 +1,5 @@ #' Get heterogeneity statistics -#' +#' #' Get heterogeneity statistics. #' #' @param dat Harmonised exposure and outcome data. Output from [harmonise_data()]. @@ -21,7 +21,7 @@ mr_heterogeneity <- function(dat, parameters=default_parameters(), method_list = } res <- lapply(method_list, function(meth) { - get(meth)(x$beta.exposure, x$beta.outcome, x$se.exposure, x$se.outcome, parameters) + get(meth)(x$beta.exposure, x$beta.outcome, x$se.exposure, x$se.outcome, parameters) }) methl <- mr_method_list() het_tab <- data.frame( diff --git a/R/knit.R b/R/knit.R index d982e051..20c9d77f 100644 --- a/R/knit.R +++ b/R/knit.R @@ -43,11 +43,11 @@ knit_report <- function(input_filename, output_filename, ...) 
else if (is.md) return(knitr::knit(input_filename, output=paste0(name, ".md"), envir=parent.frame(), ...)) else if (is.pdf) - { + { return(rmarkdown::render(input_filename, rmarkdown::pdf_document(), intermediates_dir=getwd(), output_dir=getwd(), output_file=paste0(name, ".pdf"), clean = TRUE, envir=parent.frame(), ...)) } else if (is.docx) - { + { return(rmarkdown::render(input_filename, rmarkdown::word_document(), intermediates_dir=getwd(), output_dir=getwd(), output_file=paste0(name, ".docx"), clean = TRUE, envir=parent.frame(), ...)) } else diff --git a/R/ld.R b/R/ld.R index aa634a86..65bb6340 100644 --- a/R/ld.R +++ b/R/ld.R @@ -1,12 +1,12 @@ #' Perform LD clumping on SNP data #' #' Uses PLINK clumping method, where SNPs in LD within a particular window will be pruned. The SNP with the lowest p-value is retained. -#' +#' #' @details #' This function interacts with the OpenGWAS API, which houses LD reference panels for the 5 super-populations in the 1000 genomes reference panel. #' It includes only bi-allelic SNPs with MAF > 0.01, so it's quite possible that a variant you want to include in the clumping process will be absent. #' If it is absent, it will be automatically excluded from the results. -#' +#' #' You can check if your variants are present in the LD reference panel using [ieugwasr::ld_reflookup()]. #' #' This function does put load on the OpenGWAS servers, which makes life more difficult for other users. @@ -48,7 +48,7 @@ clump_data <- function(dat, clump_kb=10000, clump_r2=0.001, clump_p1=1, clump_p2 } else { pval_column <- "pval.exposure" } - + if(! "id.exposure" %in% names(dat)) { dat$id.exposure <- random_string(1) @@ -72,7 +72,7 @@ clump_data <- function(dat, clump_kb=10000, clump_r2=0.001, clump_p1=1, clump_p2 #' The data used for generating the LD matrix includes only bi-allelic SNPs with MAF > 0.01, #' so it's quite possible that a variant you want to include will be absent. #' If it is absent, it will be automatically excluded from the results. -#' +#' #' You can check if your variants are present in the LD reference panel using [ieugwasr::ld_reflookup()]. #' #' This function does put load on the OpenGWAS servers, which makes life more difficult for other users, diff --git a/R/ldsc.r b/R/ldsc.r index 2681da74..b0796a95 100644 --- a/R/ldsc.r +++ b/R/ldsc.r @@ -1,7 +1,6 @@ -Ghuber <- function (u, k = 30, deriv = 0) +Ghuber <- function(u, k = 30, deriv = 0) { - if (!deriv) - { + if (!deriv) { return(pmin(1, k/abs(u))) } else { return(abs(u) <= k) @@ -21,15 +20,13 @@ Ghuber <- function (u, k = 30, deriv = 0) #' @return model fit ldsc_h2_internal <- function(Z, r2, N, W=NULL) { - if (is.null(W)) - { + if (is.null(W)) { W <- rep(1, length(Z)) } tau <- (mean(Z^2) - 1) / mean(N * r2) Wv <- 1 / (1 + tau * N * r2)^2 id <- which(Z^2 > 30) - if (length(id) > 0) - { + if (length(id) > 0) { Wv[id] <- sqrt(Wv[id]) } mod <- MASS::rlm(I(Z^2) ~ I(N * r2), weight = W * Wv, @@ -57,14 +54,13 @@ ldsc_h2_internal <- function(Z, r2, N, W=NULL) #' #' Guo,B. and Wu,B. (2018) Principal component based adaptive association test of multiple traits using GWAS summary statistics. bioRxiv 269597; doi: 10.1101/269597 #' -#' Gua,B. and Wu,B. (2019) Integrate multiple traits to detect novel trait-gene association using GWAS summary data with an adaptive test approach. Bioinformatics. 2019 Jul 1;35(13):2251-2257. doi: 10.1093/bioinformatics/bty961. +#' Gua,B. and Wu,B. (2019) Integrate multiple traits to detect novel trait-gene association using GWAS summary data with an adaptive test approach. 
Bioinformatics. 2019 Jul 1;35(13):2251-2257. doi: 10.1093/bioinformatics/bty961. #' -#' https://github.com/baolinwu/MTAR +#' https://github.com/baolinwu/MTAR #' @keywords internal ldsc_rg_internal <- function(Zs, r2, h1, h2, N1, N2, Nc=0, W=NULL) { - if(is.null(W)) - { + if (is.null(W)) { W = rep(1,length(r2)) } @@ -76,8 +72,7 @@ ldsc_rg_internal <- function(Zs, r2, h1, h2, N1, N2, Nc=0, W=NULL) r0 <- 0 ## 1st round - if(any(Nc > 0)) - { + if (any(Nc > 0)) { rcf <- as.vector(MASS::rlm(Y ~ X, psi = Ghuber)$coef) r0 <- rcf[1] gv <- rcf[-1] @@ -88,13 +83,11 @@ ldsc_rg_internal <- function(Zs, r2, h1, h2, N1, N2, Nc=0, W=NULL) ## 2nd round Wv <- 1 / ((h1 * N1r2 + 1) * (h2 * N2r2 + 1) + (X * gv + r0)^2) id <- which(abs(Zs[,1] * Zs[,2]) > 30) - if(length(id) > 0) - { + if(length(id) > 0) { Wv[id] <- sqrt(Wv[id]) } - if(any(Nc > 0)) - { + if (any(Nc > 0)) { rcf <- MASS::rlm(Y ~ X, weight = W * Wv, psi = Ghuber, k = 30) } else { rcf <- MASS::rlm(Y ~ X - 1, weight = W * Wv, psi = Ghuber, k = 30) @@ -119,13 +112,12 @@ ldsc_rg_internal <- function(Zs, r2, h1, h2, N1, N2, Nc=0, W=NULL) #' #' Guo,B. and Wu,B. (2018) Principal component based adaptive association test of multiple traits using GWAS summary statistics. bioRxiv 269597; doi: 10.1101/269597 #' -#' Gua,B. and Wu,B. (2019) Integrate multiple traits to detect novel trait-gene association using GWAS summary data with an adaptive test approach. Bioinformatics. 2019 Jul 1;35(13):2251-2257. doi: 10.1093/bioinformatics/bty961. +#' Gua,B. and Wu,B. (2019) Integrate multiple traits to detect novel trait-gene association using GWAS summary data with an adaptive test approach. Bioinformatics. 2019 Jul 1;35(13):2251-2257. doi: 10.1093/bioinformatics/bty961. #' #' ldsc_h2 <- function(id, ancestry="infer", snpinfo = NULL, splitsize=20000) { - if(is.null(snpinfo)) - { + if (is.null(snpinfo)) { snpinfo <- ieugwasr::afl2_list("hapmap3") } @@ -140,12 +132,11 @@ ldsc_h2 <- function(id, ancestry="infer", snpinfo = NULL, splitsize=20000) stopifnot(nrow(d) > 0) - if(ancestry == "infer") - { + if (ancestry == "infer") { ancestry <- ieugwasr::infer_ancestry(d, snpinfo)$pop[1] } - d <- snpinfo %>% + d <- snpinfo %>% dplyr::select(rsid, l2=paste0("L2.", ancestry)) %>% dplyr::inner_join(., d, by="rsid") %>% dplyr::filter(complete.cases(.)) @@ -167,8 +158,7 @@ ldsc_h2 <- function(id, ancestry="infer", snpinfo = NULL, splitsize=20000) #' @return model fit ldsc_rg <- function(id1, id2, ancestry="infer", snpinfo = NULL, splitsize=20000) { - if(is.null(snpinfo)) - { + if (is.null(snpinfo)) { snpinfo <- ieugwasr::afl2_list("hapmap3") } @@ -189,22 +179,20 @@ ldsc_rg <- function(id1, id2, ancestry="infer", snpinfo = NULL, splitsize=20000) stopifnot(nrow(d2) > 0) - if(ancestry == "infer") - { + if (ancestry == "infer") { ancestry1 <- ieugwasr::infer_ancestry(d1, snpinfo) ancestry2 <- ieugwasr::infer_ancestry(d2, snpinfo) - if(ancestry1$pop[1] != ancestry2$pop[1]) - { + if (ancestry1$pop[1] != ancestry2$pop[1]) { stop("d1 ancestry is ", ancestry1$pop[1], " and d2 ancestry is ", ancestry2$pop[1]) } ancestry <- ancestry1$pop[1] } - d1 <- snpinfo %>% + d1 <- snpinfo %>% dplyr::select(rsid, l2=paste0("L2.", ancestry)) %>% dplyr::inner_join(., d1, by="rsid") - d2 <- snpinfo %>% + d2 <- snpinfo %>% dplyr::select(rsid, l2=paste0("L2.", ancestry)) %>% dplyr::inner_join(., d2, by="rsid") @@ -247,5 +235,6 @@ extract_split <- function(snplist, id, splitsize=20000) pbapply::pblapply(., function(x) { ieugwasr::associations(x, id, proxies=FALSE) - }) %>% dplyr::bind_rows() + }) %>% + 
dplyr::bind_rows() } diff --git a/R/leaveoneout.R b/R/leaveoneout.R index ab40326f..b845bee7 100644 --- a/R/leaveoneout.R +++ b/R/leaveoneout.R @@ -77,7 +77,7 @@ mr_leaveoneout <- function(dat, parameters=default_parameters(), method=mr_ivw) #' Plot results from leaveoneout analysis -#' +#' #' Plot results from leaveoneout analysis. #' #' @param leaveoneout_results Output from [mr_leaveoneout()]. @@ -119,8 +119,8 @@ mr_leaveoneout_plot <- function(leaveoneout_results) ggplot2::scale_linewidth_manual(values=c(0.3, 1)) + # xlim(c(min(c(0, d$b), na.rm=T), max(c(0, d$b), na.rm=T))) + ggplot2::theme( - legend.position="none", - axis.text.y=ggplot2::element_text(size=8), + legend.position="none", + axis.text.y=ggplot2::element_text(size=8), axis.ticks.y=ggplot2::element_line(linewidth=0), axis.title.x=ggplot2::element_text(size=8)) + ggplot2::labs(y="", x=paste0("MR leave-one-out sensitivity analysis for\n'", d$exposure[1], "' on '", d$outcome[1], "'")) diff --git a/R/make_dat.R b/R/make_dat.R index e5708d85..08874e3c 100644 --- a/R/make_dat.R +++ b/R/make_dat.R @@ -1,7 +1,7 @@ #' Convenient function to create a harmonised dataset #' #' Convenient function to create a harmonised dataset. -#' +#' #' @param exposures The default is `c("ieu-a-2", "ieu-a-301")` (BMI and LDL). #' @param outcomes The default is `c("ieu-a-7", "ieu-a-1001")` (CHD and EDU). #' @param proxies Look for proxies? Default = `TRUE` diff --git a/R/moe.R b/R/moe.R index b723436f..5fbf71b0 100644 --- a/R/moe.R +++ b/R/moe.R @@ -107,7 +107,7 @@ get_rsq <- function(dat) ind1 <- !is.na(dat$pval.exposure) & !is.na(dat$samplesize.exposure) dat$rsq.exposure <- NA if(sum(ind1) > 0) - { + { dat$rsq.exposure[ind1] <- get_r_from_pn( dat$pval.exposure[ind1], dat$samplesize.exposure[ind1] @@ -138,7 +138,7 @@ get_rsq <- function(dat) ind1 <- !is.na(dat$pval.outcome) & !is.na(dat$samplesize.outcome) dat$rsq.outcome <- NA if(sum(ind1) > 0) - { + { dat$rsq.outcome[ind1] <- get_r_from_pn( dat$pval.outcome[ind1], dat$samplesize.outcome[ind1] @@ -156,15 +156,15 @@ get_rsq <- function(dat) #' #' @param res Output from [mr_wrapper()]. #' @param rf The trained random forest for the methods. This is available to download at . -#' +#' #' @details #' The `mr_moe()` function modifies the `estimates` item in the list of results from the [mr_wrapper()] function. It does three things: -#' 1. Adds the MOE column, which is a predictor for each method for how well it performs in terms of high power and low type 1 error (scaled 0-1, where 1 is best performance). -#' 2. It renames the methods to be the estimating method + the instrument selection method. There are 4 instrument selection methods: Tophits (i.e. no filtering), directional filtering (DF, an unthresholded version of Steiger filtering), heterogeneity filtering (HF, removing instruments that make substantial (p < 0.05) contributions to Cochran's Q statistic), and DF + HF which is where DF is applied and the HF applied on top of that. +#' 1. Adds the MOE column, which is a predictor for each method for how well it performs in terms of high power and low type 1 error (scaled 0-1, where 1 is best performance). +#' 2. It renames the methods to be the estimating method + the instrument selection method. There are 4 instrument selection methods: Tophits (i.e. 
no filtering), directional filtering (DF, an unthresholded version of Steiger filtering), heterogeneity filtering (HF, removing instruments that make substantial (p < 0.05) contributions to Cochran's Q statistic), and DF + HF which is where DF is applied and the HF applied on top of that. #' 3. It orders the table to be in order of best performing method. -#' +#' #' Note that the mixture of experts has only been trained on datasets with at least 5 SNPs. If your dataset has fewer than 5 SNPs this function might return errors. -#' +#' #' @export #' @return List #' @examples @@ -174,17 +174,17 @@ get_rsq <- function(dat) #' a <- extract_instruments("ieu-a-2") #' b <- extract_outcome_data(a$SNP, 7) #' dat <- harmonise_data(a, b) -#' +#' #' # Apply all MR methods #' r <- mr_wrapper(dat) -#' +#' #' # Load the rf object containing the trained models #' load("rf.rdata") #' # Update the results with mixture of experts #' r <- mr_moe(r, rf) -#' -#' # Now you can view the estimates, and see that they have -#' # been sorted in order from most likely to least likely to +#' +#' # Now you can view the estimates, and see that they have +#' # been sorted in order from most likely to least likely to #' # be accurate, based on MOE prediction #' r[[1]]$estimates #'} @@ -226,7 +226,9 @@ mr_moe_single <- function(res, rf) MOE = predict(rf[[m]], metric, type="prob")[,2] ) return(d) - }) %>% bind_rows %>% arrange(desc(MOE)) + }) %>% + bind_rows %>% + arrange(desc(MOE)) if("MOE" %in% names(res$estimates)) { message("Overwriting previous MOE estimate") diff --git a/R/mr.R b/R/mr.R index 6d56a554..b50be4b0 100644 --- a/R/mr.R +++ b/R/mr.R @@ -46,7 +46,7 @@ mr <- function(dat, parameters=default_parameters(), method_list=subset(mr_metho #' Get list of available MR methods -#' +#' #' @export #' @return character vector of method names mr_method_list <- function() @@ -523,7 +523,7 @@ linreg <- function(x, y, w=rep(x,1)) bhat <- stats::cov(x*w,y*w, use="pair") / stats::var(x*w, na.rm=TRUE) ahat <- mean(y, na.rm=TRUE) - mean(x, na.rm=TRUE) * bhat yhat <- ahat + bhat * x - se <- sqrt(sum((yp - yhat)^2) / (sum(!is.na(yhat)) - 2) / t(x)%*%x ) + se <- sqrt(sum((yp - yhat)^2) / (sum(!is.na(yhat)) - 2) / t(x)%*%x) sum(w * (y-yhat)^2) se <- sqrt(sum(w*(y-yhat)^2) / (sum(!is.na(yhat)) - 2) / (sum(w*x^2))) diff --git a/R/mr_mode.R b/R/mr_mode.R index 04808a87..22533138 100644 --- a/R/mr_mode.R +++ b/R/mr_mode.R @@ -12,7 +12,7 @@ mr_mode <- function(dat, parameters=default_parameters(), mode_method="all") { if("mr_keep" %in% names(dat)) dat <- subset(dat, mr_keep) - if(nrow(dat) < 3) + if(nrow(dat) < 3) { warning("Need at least 3 SNPs") return(NULL) @@ -44,7 +44,7 @@ mr_mode <- function(dat, parameters=default_parameters(), mode_method="all") h <- max(0.00000001, s*cur_phi) #Compute the smoothed empirical density function densityIV <- stats::density(BetaIV.in, weights=weights, bw=h) - #Extract the point with the highest density as the point estimate + #Extract the point with the highest density as the point estimate beta[length(beta)+1] <- densityIV$x[densityIV$y==max(densityIV$y)] } return(beta) @@ -61,7 +61,7 @@ mr_mode <- function(dat, parameters=default_parameters(), mode_method="all") #Set up a matrix to store the results from each bootstrap iteration beta.boot <- matrix(nrow=nboot, ncol=length(beta_Mode.in)) - for(i in 1:nboot) + for(i in 1:nboot) { #Re-sample each ratio estimate using SEs derived not assuming NOME BetaIV.boot <- stats::rnorm(length(BetaIV.in), mean=BetaIV.in, sd=seBetaIV.in[,1]) @@ -106,7 +106,7 @@ mr_mode 
<- function(dat, parameters=default_parameters(), mode_method="all") beta_WeightedMode <- beta(BetaIV.in=BetaIV, seBetaIV.in=seBetaIV[,1], phi=phi) weights <- 1/seBetaIV[,1]^2 penalty <- stats::pchisq(weights * (BetaIV-beta_WeightedMode)^2, df=1, lower.tail=FALSE) - pen.weights <- weights*pmin(1, penalty*parameters$penk) # penalized + pen.weights <- weights*pmin(1, penalty*parameters$penk) # penalized beta_PenalisedMode <- beta(BetaIV.in=BetaIV, seBetaIV.in=sqrt(1/pen.weights), phi=phi) @@ -132,15 +132,15 @@ mr_mode <- function(dat, parameters=default_parameters(), mode_method="all") id.exposure <- ifelse("id.exposure" %in% names(dat), dat$id.exposure[1], "") id.outcome <- ifelse("id.outcome" %in% names(dat), dat$id.outcome[1], "") Results <- data.frame( - id.exposure = id.exposure, - id.outcome = id.outcome, - method = Method, - nsnp = length(b_exp), - b = beta_Mode, - se = se_Mode, - ci_low = CIlow_Mode, - ci_upp = CIupp_Mode, - pval = P_Mode, + id.exposure = id.exposure, + id.outcome = id.outcome, + method = Method, + nsnp = length(b_exp), + b = beta_Mode, + se = se_Mode, + ci_low = CIlow_Mode, + ci_upp = CIupp_Mode, + pval = P_Mode, stringsAsFactors=FALSE ) @@ -174,7 +174,7 @@ mr_mode <- function(dat, parameters=default_parameters(), mode_method="all") #' \item{se}{Standard error} #' \item{pval}{p-value} #' } -mr_weighted_mode <- function(b_exp, b_out, se_exp, se_out, parameters=default_parameters()) +mr_weighted_mode <- function(b_exp, b_out, se_exp, se_out, parameters=default_parameters()) { index <- !is.na(b_exp) & !is.na(b_out) & !is.na(se_exp) & !is.na(se_out) if(sum(index) < 3) @@ -204,7 +204,7 @@ mr_weighted_mode <- function(b_exp, b_out, se_exp, se_out, parameters=default_pa #' \item{se}{Standard error} #' \item{pval}{p-value} #' } -mr_simple_mode <- function(b_exp, b_out, se_exp, se_out, parameters=default_parameters()) +mr_simple_mode <- function(b_exp, b_out, se_exp, se_out, parameters=default_parameters()) { index <- !is.na(b_exp) & !is.na(b_out) & !is.na(se_exp) & !is.na(se_out) if(sum(index) < 3) @@ -236,7 +236,7 @@ mr_simple_mode <- function(b_exp, b_out, se_exp, se_out, parameters=default_para #' \item{se}{Standard error} #' \item{pval}{p-value} #' } -mr_weighted_mode_nome <- function(b_exp, b_out, se_exp, se_out, parameters=default_parameters()) +mr_weighted_mode_nome <- function(b_exp, b_out, se_exp, se_out, parameters=default_parameters()) { index <- !is.na(b_exp) & !is.na(b_out) & !is.na(se_exp) & !is.na(se_out) if(sum(index) < 3) @@ -252,7 +252,7 @@ mr_weighted_mode_nome <- function(b_exp, b_out, se_exp, se_out, parameters=defau #' MR simple mode estimator (NOME) -#' +#' #' MR simple mode estimator (NOME). #' #' @param b_exp Vector of genetic effects on exposure @@ -268,7 +268,7 @@ mr_weighted_mode_nome <- function(b_exp, b_out, se_exp, se_out, parameters=defau #' \item{se}{Standard error} #' \item{pval}{p-value} #' } -mr_simple_mode_nome <- function(b_exp, b_out, se_exp, se_out, parameters=default_parameters()) +mr_simple_mode_nome <- function(b_exp, b_out, se_exp, se_out, parameters=default_parameters()) { index <- !is.na(b_exp) & !is.na(b_out) & !is.na(se_exp) & !is.na(se_out) if(sum(index) < 3) diff --git a/R/multivariable_mr.R b/R/multivariable_mr.R index f095a874..27d07541 100644 --- a/R/multivariable_mr.R +++ b/R/multivariable_mr.R @@ -1,6 +1,6 @@ #' Extract exposure variables for multivariable MR #' -#' Requires a list of IDs from available_outcomes. For each ID, it extracts instruments. 
Then, it gets the full list of all instruments and extracts those SNPs for every exposure. Finally, it keeps only the SNPs that are a) independent and b) present in all exposures, and harmonises them to be all on the same strand. +#' Requires a list of IDs from available_outcomes. For each ID, it extracts instruments. Then, it gets the full list of all instruments and extracts those SNPs for every exposure. Finally, it keeps only the SNPs that are a) independent and b) present in all exposures, and harmonises them to be all on the same strand. #' #' @param id_exposure Array of IDs (e.g. c(299, 300, 302) for HDL, LDL, trigs) #' @param clump_r2 The default is `0.01`. @@ -45,7 +45,7 @@ mv_extract_exposures <- function(id_exposure, clump_r2=0.001, clump_kb=10000, ha tab <- table(d$SNP) keepsnps <- names(tab)[tab == length(id_exposure)-1] d <- subset(d, SNP %in% keepsnps) - + # Reshape exposures dh1 <- subset(d, id.outcome == id.outcome[1], select=c(SNP, exposure, id.exposure, effect_allele.exposure, other_allele.exposure, eaf.exposure, beta.exposure, se.exposure, pval.exposure)) dh2 <- subset(d, select=c(SNP, outcome, id.outcome, effect_allele.outcome, other_allele.outcome, eaf.outcome, beta.outcome, se.outcome, pval.outcome)) @@ -58,8 +58,8 @@ mv_extract_exposures <- function(id_exposure, clump_r2=0.001, clump_kb=10000, ha #' Attempt to perform MVMR using local data #' -#' Allows you to read in summary data from text files to format the multivariable exposure dataset. -#' +#' Allows you to read in summary data from text files to format the multivariable exposure dataset. +#' #' Note that you can provide an array of column names for each column, which is of length `filenames_exposure` #' #' @param filenames_exposure Filenames for each exposure dataset. Must have header with at least SNP column present. Following arguments are used for determining how to read the filename and clumping etc. @@ -91,8 +91,8 @@ mv_extract_exposures <- function(id_exposure, clump_r2=0.001, clump_kb=10000, ha #' @export #' @return List mv_extract_exposures_local <- function( - filenames_exposure, - sep = " ", + filenames_exposure, + sep = " ", phenotype_col = "Phenotype", snp_col = "SNP", beta_col = "beta", @@ -151,7 +151,7 @@ mv_extract_exposures_local <- function( for(i in seq_along(filenames_exposure)) { if(flag == "character") { - l_full[[i]] <- read_outcome_data(filenames_exposure[i], + l_full[[i]] <- read_outcome_data(filenames_exposure[i], sep = sep[i], phenotype_col = phenotype_col[i], snp_col = snp_col[i], @@ -171,7 +171,7 @@ mv_extract_exposures_local <- function( log_pval = log_pval[i] ) } else { - l_full[[i]] <- format_data(filenames_exposure[[i]], + l_full[[i]] <- format_data(filenames_exposure[[i]], type="outcome", phenotype_col = phenotype_col[i], snp_col = snp_col[i], @@ -228,7 +228,7 @@ mv_extract_exposures_local <- function( tab <- table(d$SNP) keepsnps <- names(tab)[tab == length(id_exposure)-1] d <- subset(d, SNP %in% keepsnps) - + # Reshape exposures dh1 <- subset(d, id.outcome == id.outcome[1], select=c(SNP, exposure, id.exposure, effect_allele.exposure, other_allele.exposure, eaf.exposure, beta.exposure, se.exposure, pval.exposure)) dh2 <- subset(d, select=c(SNP, outcome, id.outcome, effect_allele.outcome, other_allele.outcome, eaf.outcome, beta.outcome, se.outcome, pval.outcome)) @@ -246,7 +246,7 @@ mv_extract_exposures_local <- function( #' @param harmonise_strictness See the `action` option of [harmonise_data()]. The default is `2`. 
#' #' @export -#' @return List of vectors and matrices required for mv analysis. +#' @return List of vectors and matrices required for mv analysis. #' \describe{ #' \item{exposure_beta}{a matrix of beta coefficients, in which rows correspond to SNPs and columns correspond to exposures.} #' \item{exposure_se}{is the same as `exposure_beta`, but for standard errors.} @@ -257,7 +257,7 @@ mv_extract_exposures_local <- function( #' \item{outcome_pval}{an array of p-values for the outcome.} #' \item{outname}{A data frame with two variables, `id.outcome` and `outcome` which are character strings.} #' } -#' +#' mv_harmonise_data <- function(exposure_dat, outcome_dat, harmonise_strictness=2) { @@ -328,7 +328,7 @@ mv_harmonise_data <- function(exposure_dat, outcome_dat, harmonise_strictness=2) #' @return List of results mv_residual <- function(mvdat, intercept=FALSE, instrument_specific=FALSE, pval_threshold=5e-8, plots=FALSE) { - # This is a matrix of + # This is a matrix of beta.outcome <- mvdat$outcome_beta beta.exposure <- mvdat$exposure_beta pval.exposure <- mvdat$exposure_pval @@ -366,7 +366,7 @@ mv_residual <- function(mvdat, intercept=FALSE, instrument_specific=FALSE, pval_ } else { marginal_outcome[,i] <- stats::lm(beta.outcome ~ 0 + beta.exposure[, -c(i), drop=FALSE])$res mod <- summary(stats::lm(marginal_outcome[,i] ~ 0 + beta.exposure[,i])) - } + } } if(sum(index) > (nexp + as.numeric(intercept))) { @@ -410,7 +410,7 @@ mv_residual <- function(mvdat, intercept=FALSE, instrument_specific=FALSE, pval_ #' #' Performs modified multivariable MR analysis. #' For each exposure the instruments are selected then all exposures for those SNPs are regressed against the outcome together, weighting for the inverse variance of the outcome. -#' +#' #' @param mvdat Output from [mv_harmonise_data()]. #' @param intercept Should the intercept by estimated (`TRUE`) or force line through the origin (`FALSE`, default). #' @param instrument_specific Should the estimate for each exposure be obtained by using all instruments from all exposures (`FALSE`, default) or by using only the instruments specific to each exposure (`TRUE`). @@ -421,7 +421,7 @@ mv_residual <- function(mvdat, intercept=FALSE, instrument_specific=FALSE, pval_ #' @return List of results mv_multiple <- function(mvdat, intercept=FALSE, instrument_specific=FALSE, pval_threshold=5e-8, plots=FALSE) { - # This is a matrix of + # This is a matrix of beta.outcome <- mvdat$outcome_beta beta.exposure <- mvdat$exposure_beta pval.exposure <- mvdat$exposure_pval @@ -499,7 +499,7 @@ mv_multiple <- function(mvdat, intercept=FALSE, instrument_specific=FALSE, pval_ } #' Perform basic multivariable MR -#' +#' #' Performs initial multivariable MR analysis from Burgess et al 2015. #' For each exposure the outcome is residualised for all the other exposures, then unweighted regression is applied. 
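A minimal sketch of the multivariable workflow these docstrings describe, using the OpenGWAS-prefixed forms of the HDL/LDL/triglycerides IDs mentioned above; the prefixed IDs and the CHD outcome ID are assumptions made purely for illustration.

```r
# Illustrative sketch of the MVMR workflow documented above.
library(TwoSampleMR)

id_exposure <- c("ieu-a-299", "ieu-a-300", "ieu-a-302")  # assumed prefixed HDL/LDL/trig IDs
exposure_dat <- mv_extract_exposures(id_exposure)
outcome_dat  <- extract_outcome_data(exposure_dat$SNP, "ieu-a-7")  # assumed CHD outcome ID

mvdat <- mv_harmonise_data(exposure_dat, outcome_dat)
dim(mvdat$exposure_beta)  # SNPs x exposures matrix, as described in the return value above

res_multiple <- mv_multiple(mvdat)  # joint regression across instruments (default settings)
res_basic    <- mv_basic(mvdat)     # residual-based approach of Burgess et al 2015
```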
#' @@ -510,7 +510,7 @@ mv_multiple <- function(mvdat, intercept=FALSE, instrument_specific=FALSE, pval_ #' @return List of results mv_basic <- function(mvdat, pval_threshold=5e-8) { - # This is a matrix of + # This is a matrix of beta.outcome <- mvdat$outcome_beta beta.exposure <- mvdat$exposure_beta pval.exposure <- mvdat$exposure_pval @@ -571,7 +571,7 @@ mv_basic <- function(mvdat, pval_threshold=5e-8) #' @return List of results mv_ivw <- function(mvdat, pval_threshold=5e-8) { - # This is a matrix of + # This is a matrix of beta.outcome <- mvdat$outcome_beta beta.exposure <- mvdat$exposure_beta pval.exposure <- mvdat$exposure_pval @@ -636,7 +636,7 @@ mv_lasso_feature_selection <- function(mvdat) } #' Perform multivariable MR on subset of features -#' +#' #' The function proceeds as follows: #' \enumerate{ #' \item Select features (by default this is done using LASSO feature selection). @@ -665,7 +665,7 @@ mv_subset <- function(mvdat, features=mv_lasso_feature_selection(mvdat), interce mvdat$exposure_beta <- mvdat$exposure_beta[instruments,,drop=FALSE] mvdat$exposure_se <- mvdat$exposure_se[instruments,,drop=FALSE] - mvdat$exposure_pval <- mvdat$exposure_pval[instruments,,drop=FALSE] + mvdat$exposure_pval <- mvdat$exposure_pval[instruments,,drop=FALSE] mvdat$outcome_beta <- mvdat$outcome_beta[instruments] mvdat$outcome_se <- mvdat$outcome_se[instruments] mvdat$outcome_pval <- mvdat$outcome_pval[instruments] diff --git a/R/other_formats.R b/R/other_formats.R index 6b49a33f..d594c696 100644 --- a/R/other_formats.R +++ b/R/other_formats.R @@ -1,12 +1,12 @@ #' Convert TwoSampleMR format to MendelianRandomization format #' -#' The MendelianRandomization package offers MR methods that can +#' The MendelianRandomization package offers MR methods that can #' be used with the same data used in the TwoSampleMR package. This #' function converts from the TwoSampleMR format to the MRInput class. #' #' @param dat Output from the [harmonise_data()] function. #' @param get_correlations Default `FALSE`. If `TRUE` then extract the LD matrix for the SNPs from the European 1000 genomes data on the MR-Base server. -#' @param pop If `get_correlations` is `TRUE` then use the following +#' @param pop If `get_correlations` is `TRUE` then use the following #' #' @export #' @return List of MRInput objects for each exposure/outcome combination @@ -186,7 +186,7 @@ dat_to_RadialMR <- function(dat) } #' Radial IVW analysis -#' +#' #' @param b_exp Vector of genetic effects on exposure. #' @param b_out Vector of genetic effects on outcome. #' @param se_exp Standard errors of genetic effects on exposure. @@ -256,10 +256,10 @@ run_mrmix <- function(dat) x$eaf.exposure[index] <- x$eaf.outcome[index] } l <- MRMix::standardize( - x$beta.exposure, - x$beta.outcome, - x$se.exposure, - x$se.outcome, + x$beta.exposure, + x$beta.outcome, + x$se.exposure, + x$se.outcome, xunit, yunit, x$samplesize.exposure, diff --git a/R/query.R b/R/query.R index 0325a1e8..45006818 100644 --- a/R/query.R +++ b/R/query.R @@ -1,13 +1,13 @@ #' Get list of studies with available GWAS summary statistics through API #' #' @param opengwas_jwt Used to authenticate protected endpoints. Login to to obtain a jwt. Provide the jwt string here, or store in .Renviron under the keyname OPENGWAS_JWT. 
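A short usage sketch for the authentication behaviour this parameter documents; the token value shown in the comment is a placeholder.

```r
# Illustrative sketch: authenticate against OpenGWAS.
library(TwoSampleMR)

# Option 1: store the token once, e.g. in ~/.Renviron, so it is picked up automatically:
#   OPENGWAS_JWT=<your token>
ao <- available_outcomes()

# Option 2: pass the jwt string explicitly (placeholder value shown):
# ao <- available_outcomes(opengwas_jwt = "eyJhbGciOi...")
head(ao)
```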
-#' +#' #' @export #' @return Dataframe of details for all available studies available_outcomes <- function(opengwas_jwt = ieugwasr::get_opengwas_jwt()) { # .Deprecated("ieugwasr::gwasinfo()") - a <- ieugwasr::gwasinfo(opengwas_jwt=opengwas_jwt) + a <- ieugwasr::gwasinfo(opengwas_jwt=opengwas_jwt) return(a) } @@ -85,7 +85,7 @@ extract_outcome_data_internal <- function(snps, outcomes, proxies = TRUE, rsq = { d <- ieugwasr::associations( - variants = snps, + variants = snps, id = outcomes, proxies = proxies, r2 = rsq, @@ -98,20 +98,20 @@ extract_outcome_data_internal <- function(snps, outcomes, proxies = TRUE, rsq = } else if(length(snps) > length(outcomes)) { - # Split snps - n <- length(snps) + # Split snps + n <- length(snps) splits <- data.frame(snps=snps, chunk_id=rep(1:(ceiling(n/splitsize)), each=splitsize)[1:n]) d <- list() for(i in seq_along(outcomes)) { message(i, " of ", length(outcomes), " outcomes") - + d[[i]] <- plyr::ddply(splits, c("chunk_id"), function(x) { x <- plyr::mutate(x) message(" [>] ", x$chunk_id[1], " of ", max(splits$chunk_id), " chunks") out <- ieugwasr::associations( - variants = x$snps, + variants = x$snps, id = outcomes[i], proxies = proxies, r2 = rsq, @@ -135,14 +135,14 @@ extract_outcome_data_internal <- function(snps, outcomes, proxies = TRUE, rsq = for(i in seq_along(snps)) { message(i, " of ", length(snps), " snps") - + d[[i]] <- plyr::ddply(splits, c("chunk_id"), function(x) { x <- plyr::mutate(x) message(" [>] ", x$chunk_id[1], " of ", max(splits$chunk_id), " chunks") out <- ieugwasr::associations( - variants = snps[i], + variants = snps[i], id = x$outcomes, proxies = proxies, r2 = rsq, @@ -166,7 +166,7 @@ extract_outcome_data_internal <- function(snps, outcomes, proxies = TRUE, rsq = return(NULL) } d <- format_d(d) - if (nrow(d)>0){ + if (nrow(d)>0) { d$data_source.outcome <- "igd" return(d) } else { @@ -286,5 +286,5 @@ format_d <- function(d) warning("None of the provided SNPs can be used for MR analysis, they are missing required information.") } - return(d) + return(d) } diff --git a/R/read_data.R b/R/read_data.R index be04eb2f..5869b67c 100644 --- a/R/read_data.R +++ b/R/read_data.R @@ -122,8 +122,8 @@ read_exposure_data <- function(filename, clump=FALSE, sep=" ", phenotype_col="Ph #' Read exposure or outcome data #' -#' Reads in exposure data. Checks and organises columns for use with MR or enrichment tests. -#' Infers p-values when possible from beta and se. +#' Reads in exposure data. Checks and organises columns for use with MR or enrichment tests. +#' Infers p-values when possible from beta and se. #' #' @param dat Data frame. Must have header with at least SNP column present. #' @param type Is this the exposure or the outcome data that is being read in? The default is `"exposure"`. 
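The column expectations described above are easiest to see with a toy input; the data frame below is invented purely for illustration and uses `format_data()`'s default column names.

```r
# Illustrative sketch: format a toy data frame as exposure data.
library(TwoSampleMR)

sumstats <- data.frame(
  SNP = c("rs0000001", "rs0000002"),  # placeholder rsIDs
  beta = c(0.10, -0.05),
  se = c(0.02, 0.01),
  effect_allele = c("A", "T"),
  other_allele = c("G", "C"),
  eaf = c(0.30, 0.45),
  pval = c(5e-08, 1e-06),
  samplesize = c(50000, 50000)
)

exp_dat <- format_data(sumstats, type = "exposure")
```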
@@ -152,15 +152,15 @@ read_exposure_data <- function(filename, clump=FALSE, sep=" ", phenotype_col="Ph #' #' @export #' @return data frame -format_data <- function(dat, type="exposure", snps=NULL, header=TRUE, - phenotype_col="Phenotype", snp_col="SNP", - beta_col="beta", se_col="se", eaf_col="eaf", - effect_allele_col="effect_allele", - other_allele_col="other_allele", pval_col="pval", - units_col="units", ncase_col="ncase", - ncontrol_col="ncontrol", samplesize_col="samplesize", - gene_col="gene", id_col="id", min_pval=1e-200, - z_col="z", info_col="info", chr_col="chr", +format_data <- function(dat, type="exposure", snps=NULL, header=TRUE, + phenotype_col="Phenotype", snp_col="SNP", + beta_col="beta", se_col="se", eaf_col="eaf", + effect_allele_col="effect_allele", + other_allele_col="other_allele", pval_col="pval", + units_col="units", ncase_col="ncase", + ncontrol_col="ncontrol", samplesize_col="samplesize", + gene_col="gene", id_col="id", min_pval=1e-200, + z_col="z", info_col="info", chr_col="chr", pos_col="pos", log_pval=FALSE) { @@ -197,7 +197,7 @@ if (inherits(dat, "data.table")) { { dat <- subset(dat, SNP %in% snps) } - + if(! phenotype_col %in% names(dat)) { message("No phenotype name specified, defaulting to '", type, "'.") @@ -216,7 +216,7 @@ if (inherits(dat, "data.table")) { } # Remove duplicated SNPs - dat <- plyr::ddply(dat, type, function(x){ + dat <- plyr::ddply(dat, type, function(x) { x <- plyr::mutate(x) dup <- duplicated(x$SNP) if(any(dup)) @@ -224,11 +224,11 @@ if (inherits(dat, "data.table")) { warning("Duplicated SNPs present in exposure data for phenotype '", x[[type]][1], ". Just keeping the first instance:\n", paste(x$SNP[dup], collapse="\n")) x <- x[!dup,] } - return(x) + return(x) }) # Check if columns required for MR are present - mr_cols_required <- c(snp_col, beta_col, se_col, effect_allele_col) + mr_cols_required <- c(snp_col, beta_col, se_col, effect_allele_col) mr_cols_desired <- c(other_allele_col, eaf_col) if(! all(mr_cols_required %in% names(dat))) { @@ -372,7 +372,7 @@ if (inherits(dat, "data.table")) { } } } - + # If no pval column then create it from beta and se if available if("beta.outcome" %in% names(dat) && "se.outcome" %in% names(dat) && ! "pval.outcome" %in% names(dat)) { @@ -380,7 +380,7 @@ if (inherits(dat, "data.table")) { dat$pval.outcome <- stats::pnorm(abs(dat$beta.outcome)/dat$se.outcome, lower.tail=FALSE) * 2 dat$pval_origin.outcome <- "inferred" } - + if(ncase_col %in% names(dat)) { @@ -401,7 +401,7 @@ if (inherits(dat, "data.table")) { } } - + if(samplesize_col %in% names(dat)) { @@ -418,7 +418,7 @@ if (inherits(dat, "data.table")) { if(any(index)) { message("Generating sample size from ncase and ncontrol") - dat$samplesize.outcome[index] <- dat$ncase.outcome[index] + dat$ncontrol.outcome[index] + dat$samplesize.outcome[index] <- dat$ncase.outcome[index] + dat$ncontrol.outcome[index] } } } else if("ncontrol.outcome" %in% names(dat) && "ncase.outcome" %in% names(dat)) @@ -431,7 +431,7 @@ if (inherits(dat, "data.table")) { { names(dat)[which(names(dat) == gene_col)[1]] <- "gene.outcome" } - + if(info_col %in% names(dat)) { names(dat)[which(names(dat) == info_col)[1]] <- "info.outcome" @@ -570,7 +570,7 @@ format_gtex_eqtl <- function(gtex_eqtl_subset, type="exposure") #' Get data from metabolomic QTL results #' #' See [format_data()]. -#' +#' #' @param metab_qtls_subset Selected rows from \code{metab_qtls} data loaded from \code{MRInstruments} package. #' @param type Are these data used as `"exposure"` or `"outcome"`? 
Default is `"exposure"`. #' @@ -579,7 +579,7 @@ format_gtex_eqtl <- function(gtex_eqtl_subset, type="exposure") format_metab_qtls <- function(metab_qtls_subset, type="exposure") { stopifnot(type %in% c("exposure", "outcome")) - + if(length(unique(metab_qtls_subset$phenotype)) > 1) { @@ -606,7 +606,7 @@ format_metab_qtls <- function(metab_qtls_subset, type="exposure") format_proteomic_qtls <- function(proteomic_qtls_subset, type="exposure") { stopifnot(type %in% c("exposure", "outcome")) - + if(length(unique(proteomic_qtls_subset$analyte)) > 1) { @@ -633,7 +633,7 @@ format_proteomic_qtls <- function(proteomic_qtls_subset, type="exposure") format_aries_mqtl <- function(aries_mqtl_subset, type="exposure") { stopifnot(type %in% c("exposure", "outcome")) - + aries_mqtl_subset$Phenotype <- paste0(aries_mqtl_subset$cpg, " (", aries_mqtl_subset$age, ")") if(length(unique(aries_mqtl_subset$Phenotype)) > 1) diff --git a/R/rucker.R b/R/rucker.R index 3a7b8b1c..0ffd531a 100644 --- a/R/rucker.R +++ b/R/rucker.R @@ -1,8 +1,8 @@ #' I-squared calculation #' -#' This function calculates the \eqn{I^2} statistic. +#' This function calculates the \eqn{I^2} statistic. #' To use it for the \eqn{I^2_{GX}} metric ensure that the effects are all the same sign (e.g. \code{abs(y)}). -#' +#' #' @param y Vector of effects. #' @param s Vector of standard errors. #' @@ -41,8 +41,10 @@ PM <- function(y = y, s = s, Alpha = 0.1) typS = sum(v*(k-1))/(sum.v^2 - sum(v^2)) for(j in 1:L) { - tausq = 0 ; F = 1 ;TAUsq = NULL - while(F>0) + tausq = 0 + Fstat = 1 + TAUsq = NULL + while(Fstat>0) { TAUsq = c(TAUsq, tausq) w = 1/(s^2+tausq) @@ -51,9 +53,9 @@ PM <- function(y = y, s = s, Alpha = 0.1) yW = sum(y*w)/sum.w Q1 = sum(w*(y-yW)^2) Q2 = sum(w2*(y-yW)^2) - F = Q1-Quant[j] - Ftau = max(F,0) - delta = F/Q2 + Fstat = Q1-Quant[j] + Ftau = max(Fstat,0) + delta = Fstat/Q2 tausq = tausq + delta } MU[j] = yW @@ -97,7 +99,7 @@ mr_rucker_internal <- function(dat, parameters=default_parameters()) { if("mr_keep" %in% names(dat)) dat <- subset(dat, mr_keep) - if(nrow(dat) < 3) + if(nrow(dat) < 3) { warning("Need at least 3 SNPs") return(NULL) @@ -179,7 +181,7 @@ mr_rucker_internal <- function(dat, parameters=default_parameters()) se0_egger_fe <- stats::coefficients(mod_egger)[1,2] / max(mod_egger$sigma, 1) if(parameters$test_dist == "z") { - pval0_egger_fe <- stats::pnorm(abs(b0_egger_fe/se0_egger_fe), lower.tail=FALSE) * 2 + pval0_egger_fe <- stats::pnorm(abs(b0_egger_fe/se0_egger_fe), lower.tail=FALSE) * 2 } else { pval0_egger_fe <- stats::pt(abs(b0_egger_fe/se0_egger_fe), nsnp-2, lower.tail=FALSE) * 2 } @@ -344,7 +346,7 @@ mr_rucker_bootstrap <- function(dat, parameters=default_parameters()) ggplot2::geom_density(ggplot2::aes_string(fill="model_name"), alpha=0.4) + ggplot2::geom_vline(data=res, ggplot2::aes_string(xintercept="Estimate", colour="Method")) + ggplot2::scale_colour_brewer(type="qual") + - ggplot2::scale_fill_brewer(type="qual") + + ggplot2::scale_fill_brewer(type="qual") + ggplot2::labs(fill="Bootstrap estimates", colour="") return(list(rucker=rucker, res=res, bootstrap_estimates=modsel, boostrap_q=bootstrap, q_plot=p1, e_plot=p2)) @@ -464,7 +466,7 @@ mr_rucker_jackknife_internal <- function(dat, parameters=default_parameters()) ggplot2::geom_density(ggplot2::aes_string(fill="model_name"), alpha=0.4) + ggplot2::geom_vline(data=res, ggplot2::aes_string(xintercept="Estimate", colour="Method")) + ggplot2::scale_colour_brewer(type="qual") + - ggplot2::scale_fill_brewer(type="qual") + + ggplot2::scale_fill_brewer(type="qual") + 
ggplot2::labs(fill="Bootstrap estimates", colour="") return(list(rucker=rucker, res=res, bootstrap_estimates=modsel, boostrap_q=bootstrap, q_plot=p1, e_plot=p2)) @@ -504,7 +506,7 @@ mr_rucker_cooksdistance <- function(dat, parameters=default_parameters()) index <- rucker$cooksdistance > cooks_threshold i <- i + 1 } - + rucker$removed_snps <- dat_orig$SNP[! dat_orig$SNP %in% dat$SNP] rucker$selected$Method <- "Rucker (CD)" rucker$rucker$Method <- paste0(rucker$rucker$Method, " (CD)") diff --git a/R/scatterplot.R b/R/scatterplot.R index bc5f7c8d..858525f9 100644 --- a/R/scatterplot.R +++ b/R/scatterplot.R @@ -1,7 +1,7 @@ #' Create scatter plot with lines showing the causal estimate for different MR tests #' #' Requires dev version of ggplot2 -#' +#' #' @param mr_results Output from [mr()]. #' @param dat Output from [harmonise_data()]. #' @export @@ -50,8 +50,8 @@ mr_scatter_plot <- function(mr_results, dat) blank_plot <- function(message) { - ggplot2::ggplot(data.frame(a=0,b=0,n=message)) + - ggplot2::geom_text(ggplot2::aes(x=a,y=b,label=n)) + - ggplot2::labs(x=NULL,y=NULL) + + ggplot2::ggplot(data.frame(a=0,b=0,n=message)) + + ggplot2::geom_text(ggplot2::aes(x=a,y=b,label=n)) + + ggplot2::labs(x=NULL,y=NULL) + ggplot2::theme(axis.text=ggplot2::element_blank(), axis.ticks=ggplot2::element_blank()) } diff --git a/R/singlesnp.R b/R/singlesnp.R index 21e18aa9..26c3bfde 100644 --- a/R/singlesnp.R +++ b/R/singlesnp.R @@ -121,8 +121,8 @@ mr_forest_plot <- function(singlesnp_results, exponentiate=FALSE) ggplot2::scale_linewidth_manual(values=c(0.3, 1)) + # xlim(c(min(c(0, d$b), na.rm=T), max(c(0, d$b), na.rm=T))) + ggplot2::theme( - legend.position="none", - axis.text.y=ggplot2::element_text(size=8), + legend.position="none", + axis.text.y=ggplot2::element_text(size=8), axis.ticks.y=ggplot2::element_line(linewidth=0), axis.title.x=ggplot2::element_text(size=8)) + ggplot2::labs(y="", x=paste0("MR effect size for\n'", d$exposure[1], "' on '", d$outcome[1], "'")) @@ -200,9 +200,9 @@ mr_funnel_plot <- function(singlesnp_results) ggplot2::geom_point() + ggplot2::geom_vline(data=subset(d, SNP %in% am), ggplot2::aes(xintercept=b, colour = SNP)) + # ggplot2::scale_colour_brewer(type="qual") + - ggplot2::scale_colour_manual(values = c("#a6cee3", - "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", - "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", + ggplot2::scale_colour_manual(values = c("#a6cee3", + "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", + "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928")) + ggplot2::labs(y=expression(1/SE[IV]), x=expression(beta[IV]), colour="MR Method") + ggplot2::theme(legend.position="top", legend.direction="vertical") diff --git a/R/steiger.R b/R/steiger.R index 663761e9..86f4c6cf 100644 --- a/R/steiger.R +++ b/R/steiger.R @@ -29,17 +29,17 @@ steiger_sensitivity <- function(rgx_o, rgy_o, ...) 
d$rgx <- rgx_o / d$rxx_o d$z <- d$rgy - d$rgx d$z[d$type=="A"] <- 0 - mycolors.trans = grDevices::rgb(c(255,0), c(0,0), - c(0,255),alpha = c(70,255), maxColorValue = 255) + mycolors.trans = grDevices::rgb(c(255,0), c(0,0), + c(0,255),alpha = c(70,255), maxColorValue = 255) temp <- lattice::wireframe( - z ~ rxx_o * ryy_o, - groups=type, - data=d, - scales=list(arrows=FALSE), - col.groups = mycolors.trans, - drape=FALSE, - ylab=expression(rho[xx[o]]), + z ~ rxx_o * ryy_o, + groups=type, + data=d, + scales=list(arrows=FALSE), + col.groups = mycolors.trans, + drape=FALSE, + ylab=expression(rho[xx[o]]), xlab=expression(rho[yy[o]]), zlab=expression(rho[gy]-rho[gx]), par.settings = list(axis.line=list(col="transparent")), @@ -130,13 +130,13 @@ mr_steiger <- function(p_exp, p_out, n_exp, n_out, r_exp, r_out, r_xxo = 1, r_yy rtest <- psych::r.test(n = mean(n_exp), n2 = mean(n_out), r12 = r_exp, r34 = r_out) rtest_adj <- psych::r.test(n = mean(n_exp), n2 = mean(n_out), r12 = r_exp_adj, r34 = r_out_adj) l <- list( - r2_exp = r_exp^2, - r2_out = r_out^2, - r2_exp_adj = r_exp_adj^2, - r2_out_adj = r_out_adj^2, - correct_causal_direction = r_exp > r_out, + r2_exp = r_exp^2, + r2_out = r_out^2, + r2_exp_adj = r_exp_adj^2, + r2_out_adj = r_out_adj^2, + correct_causal_direction = r_exp > r_out, steiger_test = stats::pnorm(-abs(rtest[["z"]])) * 2, - correct_causal_direction_adj = r_exp_adj > r_out_adj, + correct_causal_direction_adj = r_exp_adj > r_out_adj, steiger_test_adj = stats::pnorm(-abs(rtest_adj[["z"]])) * 2, vz = sensitivity$vz, vz0 = sensitivity$vz0, @@ -173,11 +173,11 @@ directionality_test <- function(dat) } dtest <- plyr::ddply(dat, c("id.exposure", "id.outcome"), function(x) { - if(!"r.exposure" %in% names(x)) + if(!"r.exposure" %in% names(x)) { x$r.exposure <- NA } - if(!"r.outcome" %in% names(x)) + if(!"r.outcome" %in% names(x)) { x$r.outcome <- NA } @@ -247,13 +247,13 @@ mr_steiger2 <- function(r_exp, r_out, n_exp, n_out, r_xxo = 1, r_yyo=1, ...) rtest <- psych::r.test(n = mean(n_exp), n2 = mean(n_out), r12 = r_exp, r34 = r_out) rtest_adj <- psych::r.test(n = mean(n_exp), n2 = mean(n_out), r12 = r_exp_adj, r34 = r_out_adj) l <- list( - r2_exp = r_exp^2, - r2_out = r_out^2, - r2_exp_adj = r_exp_adj^2, - r2_out_adj = r_out_adj^2, - correct_causal_direction = r_exp > r_out, + r2_exp = r_exp^2, + r2_out = r_out^2, + r2_exp_adj = r_exp_adj^2, + r2_out_adj = r_out_adj^2, + correct_causal_direction = r_exp > r_out, steiger_test = stats::pnorm(-abs(rtest[["z"]]))*2, - correct_causal_direction_adj = r_exp_adj > r_out_adj, + correct_causal_direction_adj = r_exp_adj > r_out_adj, steiger_test_adj = stats::pnorm(-abs(rtest_adj[["z"]]))*2, vz = sensitivity$vz, vz0 = sensitivity$vz0, diff --git a/R/steiger_filtering.R b/R/steiger_filtering.R index 6acea46d..9badb56e 100644 --- a/R/steiger_filtering.R +++ b/R/steiger_filtering.R @@ -1,23 +1,23 @@ #' Steiger filtering function -#' +#' #' This function takes an object from [harmonise_data()] and does the following: #' If there is no rsq.exposure or rsq.outcome column it will try to estimate it. -#' This is done differently for traits that have "log odds" units. -#' To estimate rsq for quantitative traits there must be either p-values and sample sizes for each SNP, -#' or effect sizes and standard errors AND the units are in SD units (the column must contain "SD"). -#' To estimate rsq for binary traits the units must be called "log odds" and there must be beta.exposure, -#' eaf.exposure, ncase.exposure, ncontrol.exposure, prevalence.exposure. 
-#' The same principles apply for calculating the rsq for the outcome trait, except column names are beta.outcome etc. +#' This is done differently for traits that have "log odds" units. +#' To estimate rsq for quantitative traits there must be either p-values and sample sizes for each SNP, +#' or effect sizes and standard errors AND the units are in SD units (the column must contain "SD"). +#' To estimate rsq for binary traits the units must be called "log odds" and there must be beta.exposure, +#' eaf.exposure, ncase.exposure, ncontrol.exposure, prevalence.exposure. +#' The same principles apply for calculating the rsq for the outcome trait, except column names are beta.outcome etc. #' If prevalence isn't supplied then it uses 0.1 by default. -#' -#' Once rsq is calculated for the exposure and outcome, it will then perform the Steiger test for each SNP to see if the rsq of the exposure is larger than the rsq of the outcome. -#' -#' Note that Steiger filtering, while useful, does have its own pitfalls. -#' Try to use replication effect estimates for the exposure (which are not biased by winner's curse), +#' +#' Once rsq is calculated for the exposure and outcome, it will then perform the Steiger test for each SNP to see if the rsq of the exposure is larger than the rsq of the outcome. +#' +#' Note that Steiger filtering, while useful, does have its own pitfalls. +#' Try to use replication effect estimates for the exposure (which are not biased by winner's curse), #' and note that if there is strong antagonistic confounding or differential measurement error between the exposure and outcome then the causal directions could be inferred incorrectly. #' #' @param dat Output from [harmonise_data()]. -#' +#' #' @export #' @return [harmonise_data()] style data frame with additional columns rsq.exposure, rsq.outcome, steiger_dir (which is `TRUE` if the rsq.exposure is larger than rsq.outcome) and steiger_pval which is a test to see if rsq.exposure is significantly larger than rsq.outcome. 
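A small sketch of applying the filtering described above; the instrument and outcome IDs are reused from examples elsewhere in this documentation, and the p-value threshold in the final step is an optional choice rather than a requirement of the function.

```r
# Illustrative sketch: Steiger filtering on harmonised data.
library(TwoSampleMR)

a <- extract_instruments("ieu-a-2")
b <- extract_outcome_data(a$SNP, "ieu-a-7")
dat <- harmonise_data(a, b)

dat <- steiger_filtering(dat)
table(dat$steiger_dir)  # TRUE where rsq.exposure > rsq.outcome

# One optional convention: keep SNPs whose inferred direction is exposure -> outcome
dat_flt <- subset(dat, steiger_dir & steiger_pval < 0.05)

# The overall exposure -> outcome orientation can also be checked with:
directionality_test(dat)
```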
steiger_filtering <- function(dat) @@ -45,9 +45,9 @@ steiger_filtering_internal <- function(dat) dat <- add_rsq(dat) st <- psych::r.test( - n = dat$effective_n.exposure, - n2 = dat$effective_n.outcome, - r12 = sqrt(dat$rsq.exposure), + n = dat$effective_n.exposure, + n2 = dat$effective_n.outcome, + r12 = sqrt(dat$rsq.exposure), r34 = sqrt(dat$rsq.outcome) ) dat$steiger_dir <- dat$rsq.exposure > dat$rsq.outcome diff --git a/inst/reports/mr_report.Rmd b/inst/reports/mr_report.Rmd index 19cc1aa6..072632f0 100644 --- a/inst/reports/mr_report.Rmd +++ b/inst/reports/mr_report.Rmd @@ -1,9 +1,7 @@ ```{r init, echo=FALSE} - suppressPackageStartupMessages(library(knitr)) suppressPackageStartupMessages(library(Cairo)) opts_chunk$set(warning=FALSE, echo=FALSE, message=FALSE, results='asis', fig.width=6, fig.height=6, dev="png", fig.path=paste0('figure/', sanitise_string(title))) - ``` # Two sample MR report @@ -18,10 +16,8 @@ Date: **`r format(Sys.time(), '%d %B, %Y')`** ### Results from two sample MR: -```{r chunk1 } - +```{r chunk1} kable(tablist$mr, row.names=FALSE) - ``` --- @@ -29,29 +25,23 @@ kable(tablist$mr, row.names=FALSE) ### Heterogeneity tests ```{r chunk2} - kable(tablist$mr_heterogeneity, row.names=FALSE) - ``` --- ### Test for directional horizontal pleiotropy -```{r chunk3 } - +```{r chunk3} kable(tablist$mr_pleiotropy_test, row.names=FALSE) - ``` --- ### Test that the exposure is upstream of the outcome -```{r chunk4 } - +```{r chunk4} kable(tablist$directionality_test, row.names=FALSE) - ``` Note - R^2 values are approximate @@ -60,38 +50,30 @@ Note - R^2 values are approximate ### Forest plot of single SNP MR -```{r chunk5 } - +```{r chunk5} plotlist$mr_forest_plot - ``` --- ### Comparison of results using different MR methods -```{r chunk6 } - +```{r chunk6} plotlist$mr_scatter_plot - ``` --- ### Funnel plot -```{r chunk7 } - +```{r chunk7} plotlist$mr_funnel_plot - ``` --- ### Leave-one-out sensitivity analysis -```{r chunk8 } - +```{r chunk8} plotlist$mr_leaveoneout_plot - ``` diff --git a/inst/sandpit/harmonise.R b/inst/sandpit/harmonise.R index ecdd14a4..16f03c91 100644 --- a/inst/sandpit/harmonise.R +++ b/inst/sandpit/harmonise.R @@ -2,7 +2,7 @@ # Possibilities # Both alleles in E and O and freq for both -# - +# - # 2E + 2O + Ef + Of # 2E + 2O + Ef diff --git a/inst/sandpit/mysql.R b/inst/sandpit/mysql.R index 70adfdf6..2dec8bcb 100644 --- a/inst/sandpit/mysql.R +++ b/inst/sandpit/mysql.R @@ -36,11 +36,11 @@ dim(d) # TMG_F1WnTL # use mrbase; -# +# # describe assoc; # describe snps; # describe study; -# +# # SELECT COUNT(*) FROM study; # SELECT COUNT(*) FROM snps; # SELECT COUNT(*) FROM assoc; @@ -49,18 +49,18 @@ dim(d) # SELECT * FROM study limit 10; # SELECT * FROM snps WHERE name='rs13078807'; # SELECT * FROM assoc WHERE snp=207707; -# +# # SELECT * FROM assoc limit 10; # SELECT * FROM assoc WHERE snp=2223704; -# +# # SELECT a.*, b.*, c.* # FROM assoc a, snps b, study c # WHERE a.snp=b.id AND a.study=c.id # AND (b.name='rs10900000' OR b.name='rs10000010' OR b.name='rs10000092') # AND (c.filename='cardiogramplusc4d_180814_update_data.txt.uniform.af.txt' OR c.filename='All_ancestries_SNP_gwas_mc_merge_nogc.tbl.uniq.gz.uniform.af.txt') # ORDER BY filename; -# -# +# +# # SELECT a.*, b.*, c.* # FROM assoc a, snps b, study c # WHERE a.snp=b.id AND a.study=c.id diff --git a/inst/sandpit/sorting_gwas_catalog.R b/inst/sandpit/sorting_gwas_catalog.R index cb4f2988..6b0f55fb 100644 --- a/inst/sandpit/sorting_gwas_catalog.R +++ b/inst/sandpit/sorting_gwas_catalog.R @@ -17,7 +17,7 @@ b 
<- subset(a, select=c(DISEASE.TRAIT, PUBMEDID, FIRST.AUTHOR, DATE, SNPS,STRONG
 b$RISK.ALLELE.FREQUENCY <- as.numeric(b$RISK.ALLELE.FREQUENCY) #exclude SNPs with missing rsids
-b<-b[grep("rs",b$SNPS,ignore.case=TRUE,),] #exclude SNPs without an rsid 
+b<-b[grep("rs",b$SNPS,ignore.case=TRUE,),] #exclude SNPs without an rsid
 b[grep("-",b$STRONGEST.SNP.RISK.ALLELE,ignore.case=TRUE,invert=TRUE),]

 # Get effect allele
@@ -28,7 +28,7 @@ b$effect_allele<-lapply(seq_along(b$STRONGEST.SNP.RISK.ALLELE),FUN=function(x) s
 b$DATE <- as.Date(b$DATE, format="%d-%b-%y")
 b$year <- format(b$DATE, "%Y")

-# Try to get the units 
+# Try to get the units
 Start<-unlist(lapply(b$X95..CI..TEXT.,FUN=function(x) unlist(gregexpr("] ",x))+2))
 Stop<-nchar(b$X95..CI..TEXT.)
 b$units<-unlist(lapply(seq_along(Start) ,FUN=function(x) substr(b$X95..CI..TEXT.[x],Start[x],Stop[x])))
diff --git a/inst/sandpit/test_rucker.R b/inst/sandpit/test_rucker.R
index bbbb9258..224ce437 100644
--- a/inst/sandpit/test_rucker.R
+++ b/inst/sandpit/test_rucker.R
@@ -201,8 +201,10 @@ if (weights==2) {W = 1/(seBetaYG^2/BXG^2 + (BYG^2)*seBetaXG^2/BXG^4)}
 BIVw = BIV*sqrt(W)
 sW = sqrt(W)
-IR = lm(BIVw ~ -1+sW);IVW[i] = IR$coef[1]
-MR = lm(BIVw ~ sW);E[i] = MR$coef[2]
+IR = lm(BIVw ~ -1+sW)
+IVW[i] = IR$coef[1]
+MR = lm(BIVw ~ sW)
+E[i] = MR$coef[2]
 DF1 = length(BetaYG)-1
 phi_IVW = summary(IR)$sigma^2
diff --git a/inst/sandpit/test_selection.R b/inst/sandpit/test_selection.R
index ad48c5b9..1a96c0df 100644
--- a/inst/sandpit/test_selection.R
+++ b/inst/sandpit/test_selection.R
@@ -10,9 +10,9 @@ find_invalid_instruments <- function(d1, d2, d3, steiger_thresh=0.05)
 for (i in seq_len(nrow(d1)))
 {
 l0[[i]] <- mr_steiger(
- d2$pval[i], 
- d1$pval[i], 
- d2$n[i], 
+ d2$pval[i],
+ d1$pval[i],
+ d2$n[i],
 d1$n[i]
 )
 }
diff --git a/inst/sandpit/workflow.R b/inst/sandpit/workflow.R
index 56ad6212..1e01d9a0 100644
--- a/inst/sandpit/workflow.R
+++ b/inst/sandpit/workflow.R
@@ -48,8 +48,8 @@ m <- mr(d)
 mrs <- mr_leaveoneout(dat)
 p <- mr_leaveoneout_plot(mrs)
 mr_report(mr_results, dat, path="inst/reports", output_path=".")
-ggplot(mrs[[3]], aes(x=SNP, y=b)) + 
-geom_point() + 
+ggplot(mrs[[3]], aes(x=SNP, y=b)) +
+geom_point() +
 geom_errorbar(aes(ymin=b-se, ymax=b+se),width=0, colour=grey) +
 coord_flip()
diff --git a/man/mr_moe.Rd b/man/mr_moe.Rd
index dcd79ac9..c29da716 100644
--- a/man/mr_moe.Rd
+++ b/man/mr_moe.Rd
@@ -44,8 +44,8 @@ load("rf.rdata")
 # Update the results with mixture of experts
 r <- mr_moe(r, rf)

-# Now you can view the estimates, and see that they have 
-# been sorted in order from most likely to least likely to 
+# Now you can view the estimates, and see that they have
+# been sorted in order from most likely to least likely to
 # be accurate, based on MOE prediction
 r[[1]]$estimates
 }
diff --git a/tests/testthat/test_add_metadata.r b/tests/testthat/test_add_metadata.r
index 7aa71c54..f893197d 100644
--- a/tests/testthat/test_add_metadata.r
+++ b/tests/testthat/test_add_metadata.r
@@ -14,7 +14,7 @@ skip_on_cran()
 # d6 <- extract_instruments("ieu-a-2")[1:5,]
 # d7 <- extract_outcome_data(exposure$SNP, 'ieu-a-2')
 # d8 <- extract_outcome_data(exposure$SNP, 'ukb-d-30710_irnt')
-# d9 <- extract_instruments("bbj-a-1") 
+# d9 <- extract_instruments("bbj-a-1")
 # d10 <- extract_instruments("ieu-b-109")
 # save(d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, file="inst/extdata/test_add_metadata.RData", compress = "xz")
diff --git a/tests/testthat/test_instruments.R b/tests/testthat/test_instruments.R
index 0d2a5c06..937b9c61 100644
--- a/tests/testthat/test_instruments.R
+++ b/tests/testthat/test_instruments.R
@@ -18,7 +18,7 @@ test_that("server and mrinstruments 2", {
 if (class(exp_dat) == "try-error") skip("Server issues")
 expect_true(length(unique(exp_dat$id)) == 1)
 })
- 
+
 test_that("server and mrinstruments 3", {
 # yes no
 exp_dat <- try(extract_instruments(outcomes=c("ieu-a-2", "ieu-a-1032")))
diff --git a/tests/testthat/test_otherformats.R b/tests/testthat/test_otherformats.R
index b610866e..d2689048 100644
--- a/tests/testthat/test_otherformats.R
+++ b/tests/testthat/test_otherformats.R
@@ -7,7 +7,7 @@ load(system.file("extdata", "test_commondata.RData", package="TwoSampleMR"))
 test_that("MRInput", {
 skip_if(getRversion() < '4.4.0') # because MendelianRandomization will not be installed
 w <- dat_to_MRInput(dat, get_correlations=FALSE)
- expect_true(length(w) == 1) 
+ expect_true(length(w) == 1)
 expect_true(class(w) == "list")
 expect_true(class(w[[1]]) == "MRInput")
 })
@@ -26,7 +26,7 @@ test_that("MRInput with cor", {

 test_that("mrpresso", {
 expect_warning(w <- run_mr_presso(dat))
- expect_true(length(w) == 1) 
+ expect_true(length(w) == 1)
 expect_true(class(w) == "list")
 expect_true(class(w[[1]]) == "list")
 expect_true(length(w[[1]]) == 2)
diff --git a/vignettes/harmonise.Rmd b/vignettes/harmonise.Rmd
index cb0101fd..e04e0637 100644
--- a/vignettes/harmonise.Rmd
+++ b/vignettes/harmonise.Rmd
@@ -53,7 +53,7 @@ To harmonise the exposure and outcome data, do the following:

 ```{r}
 dat <- harmonise_data(
- exposure_dat = bmi_exp_dat, 
+ exposure_dat = bmi_exp_dat,
 outcome_dat = chd_out_dat
 )
 ```
diff --git a/vignettes/outcome.Rmd b/vignettes/outcome.Rmd
index e98d0e41..2061f23c 100644
--- a/vignettes/outcome.Rmd
+++ b/vignettes/outcome.Rmd
@@ -33,7 +33,7 @@ chd_out_dat1 <- extract_outcome_data(
 outcomes = 'ieu-a-7'
 )
 chd_out_dat2 <- extract_outcome_data(
- snps = c("rs234", "rs17097147"), 
+ snps = c("rs234", "rs17097147"),
 outcomes = c('ieu-a-2', 'ieu-a-7')
 )
 save(ao, bmi_exp_dat, chd_out_dat1, chd_out_dat2, file = file.path("inst", "extdata", "vig_outcome.RData"), compress = "xz")
@@ -103,7 +103,7 @@ chd_out_dat1 <- extract_outcome_data(
 The `extract_outcome_data()` function is flexible. The `snps` argument only requires an array of rsIDs, and the `outcomes` argument can be a vector of outcomes, e.g.
 ```{r eval=FALSE}
 chd_out_dat2 <- extract_outcome_data(
- snps = c("rs234", "rs17097147"), 
+ snps = c("rs234", "rs17097147"),
 outcomes = c('ieu-a-2', 'ieu-a-7')
 )
 ```
diff --git a/vignettes/perform_mr.Rmd b/vignettes/perform_mr.Rmd
index e0f289ec..59d83c85 100644
--- a/vignettes/perform_mr.Rmd
+++ b/vignettes/perform_mr.Rmd
@@ -235,7 +235,7 @@ p4[[1]]

 ## 1-to-many forest plot

-A 1-to-many MR analysis interrogates the effect of a single exposure on multiple outcomes or multiple exposures on a single outcome. The results of this analysis can be visualised using the 1-to-many forest plot, with or without stratification on a categorical variable. From a visual point of view, the function works best for 50 or fewer results and is not really designed to handle more than 100 results. If your number of results is much greater than 50, it may be better to split these across two separate plots. For example, if you have 100 sets of results you could divide these equally across two plots and then combine the two plots together in another programme like PowerPoint. The function assumes the results are already in the right order for plotting. As such, users are advised to sort their results according to how they would like them to appear in the plot. Users can use their own code to do this or they can use the `sort_1_to_many()` function. 
+A 1-to-many MR analysis interrogates the effect of a single exposure on multiple outcomes or multiple exposures on a single outcome. The results of this analysis can be visualised using the 1-to-many forest plot, with or without stratification on a categorical variable. From a visual point of view, the function works best for 50 or fewer results and is not really designed to handle more than 100 results. If your number of results is much greater than 50, it may be better to split these across two separate plots. For example, if you have 100 sets of results you could divide these equally across two plots and then combine the two plots together in another programme like PowerPoint. The function assumes the results are already in the right order for plotting. As such, users are advised to sort their results according to how they would like them to appear in the plot. Users can use their own code to do this or they can use the `sort_1_to_many()` function.

 ### Step 1: generate 1-to-many MR results

@@ -248,7 +248,7 @@ chd_out_dat <- extract_outcome_data(
 )

 dat2 <- harmonise_data(
- exposure_dat = exp_dat, 
+ exposure_dat = exp_dat,
 outcome_dat = chd_out_dat
 )
 res <- mr(dat2)
@@ -261,9 +261,9 @@ res <- mr(dat2)

 In this example we wish to plot results from an MR analysis of the effect of multiple exposures on coronary heart disease, with results sorted by decreasing effect size (largest effect at the top of the plot) and with one MR method for each unique exposure-outcome combination. We will also make the size of each point estimate proportional to its inverse variance. This is a useful way to draw attention towards the most reliable results and away from results with very wide confidence intervals. To specify the size of the point estimate, set the weight argument to the name of the column in the data with the weight information.

 ```{r cache=FALSE, warning=FALSE, eval=FALSE}
-res <- subset_on_method(res) # default is to subset on either the IVW method (>1 instrumental SNP) or Wald ratio method (1 instrumental SNP). 
+res <- subset_on_method(res) # default is to subset on either the IVW method (>1 instrumental SNP) or Wald ratio method (1 instrumental SNP).
 res <- sort_1_to_many(res, b = "b", sort_action = 4) # this sorts results by decreasing effect size (largest effect at top of the plot)
-res <- split_exposure(res) # to keep the Y axis label clean we exclude the exposure ID labels from the exposure column 
+res <- split_exposure(res) # to keep the Y axis label clean we exclude the exposure ID labels from the exposure column
 res$weight <- 1/res$se
 min(exp(res$b - 1.96*res$se)) # identify value for 'lo' in forest_plot_1_to_many
@@ -343,7 +343,7 @@ In this next example we plot the results from an analysis of the effect of multi

 ```{r cache=FALSE, warning=FALSE, fig.height=10, eval=FALSE}
 res <- mr(dat2)
-res <- split_exposure(res) # to keep the Y axis label clean we exclude the exposure ID labels from the exposure column 
+res <- split_exposure(res) # to keep the Y axis label clean we exclude the exposure ID labels from the exposure column
 res <- sort_1_to_many(
@@ -377,7 +377,7 @@ forest_plot_1_to_many(
 In this next example we plot the same results as above but with results stratified by a grouping variable. We also select one MR method for each unique exposure-outcome combination and sort the results by decreasing effect size within each group (i.e. largest effect at the top).

 ```{r cache=FALSE, warning=FALSE, eval=FALSE}
-res <- mr(dat2) 
+res <- mr(dat2)
 res <- split_exposure(res)
 res <- subset_on_method(res)
 res$subcategory[res$exposure %in% c("Adiponectin", "Hip circumference", "Waist circumference")] <- "Group 1"
@@ -422,12 +422,12 @@ dis_dat <- extract_outcome_data(
 )

 dat3 <- harmonise_data(
- exposure_dat = exp_dat, 
+ exposure_dat = exp_dat,
 outcome_dat = dis_dat
 )

 res <- mr(dat3, method_list = c("mr_wald_ratio", "mr_ivw"))
-res <- split_outcome(res) # to keep the Y axis label clean we exclude the outcome ID labels from the outcome column 
+res <- split_outcome(res) # to keep the Y axis label clean we exclude the outcome ID labels from the outcome column
 res <- sort_1_to_many(res, b = "b", sort_action = 4) # this sorts results by decreasing effect size (largest effect at top of the plot)
 ```
@@ -474,7 +474,7 @@ plot2

 pdf("plot1.pdf", height = 10, width = 8)
 plot1
-dev.off() 
+dev.off()
 ```

 * * *
@@ -538,11 +538,11 @@ These tests are obtained using:

 ```{r eval=FALSE, warning=FALSE}
 mr_steiger(
- p_exp = dat$pval.exposure, 
- p_out = dat$pval.outcome, 
- n_exp = dat$samplesize.exposure, 
- n_out = dat$samplesize.outcome, 
- r_xxo = 1, 
+ p_exp = dat$pval.exposure,
+ p_out = dat$pval.outcome,
+ n_exp = dat$samplesize.exposure,
+ n_out = dat$samplesize.outcome,
+ r_xxo = 1,
 r_yyo = 1,
 r_exp=0
 )
@@ -635,7 +635,7 @@ Here `ld_matrix()` returns the LD correlation values (not R^2^) for each pair of

 ```{r}
 dat <- harmonise_data(
- exposure_dat = bmi_exp_dat, 
+ exposure_dat = bmi_exp_dat,
 outcome_dat = chd_out_dat
 )
 ```
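The vignette hunks above touch the `harmonise_data()`, `mr()` and `mr_steiger()` chunks of `vignettes/perform_mr.Rmd`. For anyone who wants to re-run those chunks while reviewing this whitespace cleanup, here is a minimal sketch of how the calls fit together. It is illustrative only, not part of the changeset, and assumes access to OpenGWAS and the `ieu-a-2` / `ieu-a-7` datasets used in the package vignettes.

```r
# Minimal sketch (not from the diff): strings together the vignette calls
# touched above. Assumes OpenGWAS access and the 'ieu-a-2' (exposure) and
# 'ieu-a-7' (outcome) datasets used in the package vignettes.
library(TwoSampleMR)

# Instruments for the exposure, and the same SNPs looked up in the outcome
bmi_exp_dat <- extract_instruments("ieu-a-2")
chd_out_dat <- extract_outcome_data(
  snps = bmi_exp_dat$SNP,
  outcomes = "ieu-a-7"
)

# Align effect alleles across exposure and outcome
dat <- harmonise_data(
  exposure_dat = bmi_exp_dat,
  outcome_dat = chd_out_dat
)

# Restrict to the Wald ratio and IVW methods, as in the vignette chunk above
res <- mr(dat, method_list = c("mr_wald_ratio", "mr_ivw"))

# Steiger directionality test with the same arguments as the vignette chunk
steiger <- mr_steiger(
  p_exp = dat$pval.exposure,
  p_out = dat$pval.outcome,
  n_exp = dat$samplesize.exposure,
  n_out = dat$samplesize.outcome,
  r_xxo = 1,
  r_yyo = 1,
  r_exp = 0
)
```

Here `r_xxo = 1`, `r_yyo = 1` and `r_exp = 0` simply mirror the values used in the vignette chunk above.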