Skip to content

Commit

Permalink
New functions for v0.3.0
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-sandercock authored Jun 25, 2024
1 parent 207b103 commit ca2727c
Show file tree
Hide file tree
Showing 7 changed files with 436 additions and 4 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: BIGr
Title: (B)reeding (I)nsight (G)enomics Functions for Polypoid and Diploid Species
Version: 0.2.0
Version: 0.3.0
Author: Alexander M. Sandercock, Josue Chinchilla Vargas, Shufen Chen, Manoj Sapkota, Meng Lin, Dongyan Zhao, and Breeding Insight Team
Maintainer: Alexander M. Sandercock <ams866@cornell.edu>
Description: This package contains the functions developed within Breeding Insight to analyze diploid and polyploid breeding and genetic data.
Expand All @@ -16,5 +16,6 @@ Imports:
Rdpack (>= 0.7),
readr (>= 2.1.5),
reshape2 (>= 1.4.4),
tidyr (>= 1.3.1)
tidyr (>= 1.3.1),
vcfR (>= 1.15.0)
RdMacros: Rdpack
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

S3method(filter,VCF)
export(calculate_Het)
export(calculate_MAF)
export(capture_diversity.Gmat)
Expand All @@ -23,3 +24,7 @@ importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(utils,read.table)
importFrom(utils,write.table)
importFrom(vcfR,extract.gt)
importFrom(vcfR,maf)
importFrom(vcfR,read.vcfR)
importFrom(vcfR,write.vcf)
2 changes: 1 addition & 1 deletion R/dosage2vcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ dosage2vcf <- function(dart.report, dart.counts, ploidy, output.file) {
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype, where 1 is the count of alternate alleles">',
'##FORMAT=<ID=UD,Number=1,Type=Integer,Description="Dosage count of reference alleles from updog, where 0 = homozygous reference">',
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">',
'##FORMAT=<ID=RA,Number=1,Type=String,Description="Reference allele read depth">',
'##FORMAT=<ID=RA,Number=1,Type=Integer,Description="Reference allele read depth">',
'##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">',
'##FORMAT=<ID=MPP,Number=1,Type=Float,Description="Maximum posterior probability for that dosage call from updog">'
)
Expand Down
357 changes: 357 additions & 0 deletions R/filter.VCF.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,357 @@
#' Filter a VCF file
#'
#' This function will filter a VCF file or vcfR object and export the updated version
#'
#' This function will input a VCF file or vcfR object and filter based on the user defined options.
#' The output file will be saved to the location and with the name that is specified.
#' The VCF format is v4.3
#'
#' @param vcf.file vcfR object or path to VCF file. Can be unzipped (.vcf) or gzipped (.vcf.gz).
#' @param filter.OD Updog filter
#' @param filter.BIAS.min Updog filter (requires a value for both BIAS.min and BIAS.max)
#' @param filter.BIAS.max Updog filter (requires a value for both BIAS.min and BIAS.max)
#' @param filter.DP Total read depth at each SNP filter
#' @param filter.MPP Updog filter
#' @param filter.PMC Updog filter
#' @param filter.MAF Minor allele frequency filter
#' @param filter.SAMPLE.miss Sample missing data filter
#' @param filter.SNP.miss SNP missing data filter
#' @param ploidy The ploidy of the species being analyzed
#' @param output.file output file name
#' @return A gzipped vcf file
#' @importFrom vcfR read.vcfR
#' @importFrom vcfR write.vcf
#' @importFrom vcfR maf
#' @importFrom vcfR extract.gt
#' @examples
#' ## Use file paths for each file on the local system
#'
#'
#' #filter.VCF(vcf.file = "example_dart_Dosage_Report.csv",
#' # filter.OD = 0.5,
#' # ploidy = 2,
#' # output.file = "name_for_vcf")
#'
#' ##The function will output the filtered VCF to the current working directory
#'
#' @export
filter.VCF <- function(vcf.file,
filter.OD = NULL,
filter.BIAS.min = NULL,
filter.BIAS.max = NULL,
filter.DP = NULL,
filter.MPP = NULL,
filter.PMC = NULL,
filter.MAF = NULL,
filter.SAMPLE.miss = NULL,
filter.SNP.miss = NULL,
ploidy,
output.file) {

#Should allow for any INFO field to be entered to be filtered

# Import VCF (can be .vcf or .vcf.gz)
if (class(vcf.file) != "vcfR"){
vcf <- read.vcfR(vcf.file)
}

#Getting starting number of SNPs and Samples
starting_snps <- nrow(vcf)
starting_samples <- ncol(vcf@gt)-1 #subtract 1 to not include the FORMAT column

# Determine the number of items in the FORMAT field
format_string <- vcf@gt[1, "FORMAT"]
format_fields <- strsplit(format_string, ":")[[1]]
num_fields <- length(format_fields)
gt_pos <- which(format_fields == "GT")
# Create the NA format string, replacing only GT with "./."
missing_gt <- paste(rep(".", ploidy), collapse = "/")
na_fields <- rep(".", num_fields)
na_fields[gt_pos] <- missing_gt
na_format <- paste(na_fields, collapse = ":")

# Extract the DP values
if ("DP" %in% format_fields && !is.null(filter.DP)) {
cat("Filtering by DP\n")
dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE)
# Identify cells to modify based on the DP threshold
threshold <- as.numeric(filter.DP)
to_modify <- dp < threshold
# Replace cells in the vcf@gt matrix with NA format string where to_modify is TRUE
vcf@gt[, -1][to_modify] <- na_format
# Remove extra matrices
rm(to_modify)
rm(dp)
}

#Filter if the MPP field is present
if ("MPP" %in% format_fields && !is.null(filter.MPP)) {
cat("Filtering by MPP\n")
# Extract the MPP values
mpp <- extract.gt(vcf, element = "MPP", as.numeric = TRUE)
# Identify cells to modify based on the DP threshold
threshold <- as.numeric(filter.MPP) #Need to make a variable for user to enter
to_modify <- mpp > threshold
# Replace cells in the vcf@gt matrix with NA format string where to_modify is TRUE
vcf@gt[, -1][to_modify] <- na_format
#remove extra matrices
rm(to_modify)
rm(mpp)
}

## Filter based on INFO column (example: DP > 10)
#Get INFO column
info <- vcf@fix[, "INFO"]
# Function to extract INFO IDs from a single INFO string
extract_info_ids <- function(info_string) {
# Split the INFO string by ';'
info_parts <- strsplit(info_string, ";")[[1]]
# Extract the part before the '=' in each segment
info_ids <- gsub("=.*", "", info_parts)
return(info_ids)
}

# Apply the function to the first INFO string
info_ids <- extract_info_ids(info[1])

##Filter by the INFO fields if present (maybe make small internal function)
#DP
#if ("DP" %in% info_ids) {
# dp_values <- as.numeric(sub(".*DP=([0-9]+);.*", "\\1", info))
# vcf <- vcf[dp_values > 1000, ]
#}

#OD
if ("OD" %in% info_ids && !is.null(filter.OD)) {
cat("Filtering by OD\n")
od_values <- as.numeric(sub(".*OD=([0-9]+);.*", "\\1", info))
vcf <- vcf[od_values < as.numeric(filter.OD), ]
snps_removed <- starting_snps - nrow(vcf)
}

#BIAS (need to add user variables)
if ("BIAS" %in% info_ids && !is.null(filter.BIAS.min) && !is.null(filter.BIAS.max)) {
cat("Filtering by BIAS\n")
bias_values <- as.numeric(sub(".*BIAS=([0-9]+);.*", "\\1", info))
vcf <- vcf[bias_values > as.numeric(filter.BIAS.min) & bias_values < as.numeric(filter.BIAS.max), ]
}

#PMC (need to add user variables)
if ("PMC" %in% info_ids && !is.null(filter.PMC)) {
cat("Filtering by PMC\n")
pmc_values <- as.numeric(sub(".*PMC=([0-9]+);.*", "\\1", info))
vcf <- vcf[pmc_values < as.numeric(filter.PMC), ]
}

# Example: Filter based on missing data for samples and SNPs
if (!is.null(filter.SAMPLE.miss) || !is.null(filter.SNP.miss)){
gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf))

if (!is.null(filter.SNP.miss)) {
cat("Filtering by SNP missing data\n")
snp_missing_data <- rowMeans(is.na(gt_matrix))
vcf <- vcf[snp_missing_data < as.numeric(filter.SNP.miss), ]
gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)
}

if (!is.null(filter.SAMPLE.miss)) {
cat("Filtering by Sample missing data\n")
# Calculate the proportion of missing data for each sample
sample_missing_data <- colMeans(is.na(gt_matrix))
# Identify samples to keep based on the missing data threshold
samples_to_keep <- names(sample_missing_data)[sample_missing_data < as.numeric(filter.SAMPLE.miss)]
# Include "FORMAT" column in the samples to keep
samples_to_keep <- c("FORMAT", samples_to_keep)
# Subset the VCF object to keep only the desired samples
vcf <- vcf[, colnames(vcf@gt) %in% samples_to_keep]
}

# Remove matrices
rm(gt_matrix)
}

##Convert GT to dosage
#gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf))

# Function to determine the ploidy level from a genotype string
#determine_ploidy <- function(gt) {
# if (is.na(gt)) {
# return(NA)
# }
# return(length(strsplit(gt, "[|/]")[[1]]))
#}

# Function to find a non-NA example genotype to determine ploidy
#find_example_gt <- function(matrix) {
# for (i in seq_len(nrow(matrix))) {
# for (j in seq_len(ncol(matrix))) {
# if (!is.na(matrix[i, j])) {
# return(matrix[i, j])
# }
# }
# }
# return(NA) # Return NA if no non-NA genotype is found
#}

# Find a non-NA example genotype
#example_gt <- find_example_gt(gt_matrix)

# Determine the ploidy level
#if (!is.na(example_gt)) {
# ploidy <- determine_ploidy(example_gt)
#} else {
# stop("No non-NA genotype found to determine ploidy.")
#}

# Generate lookup table for genotypes to dosage conversion
#generate_lookup_table <- function(ploidy) {
# possible_alleles <- 0:ploidy
# genotypes <- expand.grid(rep(list(possible_alleles), ploidy))
# genotypes <- apply(genotypes, 1, function(x) paste(x, collapse = "/"))
# dosage_values <- rowSums(expand.grid(rep(list(possible_alleles), ploidy)))
# lookup_table <- setNames(dosage_values, genotypes)
# return(lookup_table)
#}

# Generate the lookup table
#lookup_table <- generate_lookup_table(ploidy)

# Function to convert genotype to dosage using the lookup table
#genotype_to_dosage <- function(gt, lookup_table) {
# if (is.na(gt)) {
# return(NA)
# }
# return(lookup_table[[gt]])
#}

# Function to convert genotype matrix to dosage matrix using vectorized operations
#convert_genotypes_to_dosage <- function(gt_matrix, lookup_table) {
# unique_gts <- unique(gt_matrix)
# gt_to_dosage <- setNames(rep(NA, length(unique_gts)), unique_gts)
# valid_gts <- unique_gts[unique_gts %in% names(lookup_table)]
# gt_to_dosage[valid_gts] <- lookup_table[valid_gts]
# dosage_matrix <- gt_to_dosage[gt_matrix]
#colnames(dosage_matrix) <- colnames(gt_matrix)
#row.names(dosage_matrix) <- row.names(gt_matrix)
# return(matrix(as.numeric(dosage_matrix), nrow = nrow(gt_matrix), ncol = ncol(gt_matrix)))
#}

# Convert the genotype matrix to dosage matrix
#dosage_matrix <- convert_genotypes_to_dosage(gt_matrix, lookup_table)

##MAF filter
#Compare my lengthy process to estimate MAF with vcfR::maf() function
#The BIGr::calculate_MAF(dosage_matrix, ploidy) is the exact same as the vcfR::maf() calculations
#The step where I extract UD and calculate MAF is different...
#if ("UD" %in% format_fields) {
# maf_df <- BIGr::calculate_MAF(extract.gt(vcf, element = "UD", as.numeric = TRUE), ploidy = ploidy)
#} else {
#convert genotypes to dosage and filter
# maf_df <- BIGr::calculate_MAF(dosage_matrix, ploidy)
#}
#Need to confirm that vcfR::maf will work with any ploidy...if not, use my code
if (!is.null(filter.MAF)) {
cat("Filtering by MAF\n")
maf_df <- data.frame(vcfR::maf(vcf, element = 2))
vcf <- vcf[maf_df$Frequency > as.numeric(filter.MAF), ]
}
### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz)
cat("Exporting VCF\n")
if (!class(vcf.file) == "vcfR"){
base_name <- basename(vcf.file)
output_name <- paste0("filtered_",base_name,".vcf.gz")
vcfR::write.vcf(vcf, file = output_name)
}else{
base_name <- output.file
output_name <- paste0("filtered_",base_name,".vcf.gz")
vcfR::write.vcf(vcf, file = output_name)
}

#Message that includes the output vcf stats
print(vcf)

#Message
samples_removed <- starting_samples - (ncol(vcf@gt)-1)
SNPs_removed <- starting_snps - nrow(vcf)
message("Samples removed due to filtering: ",samples_removed)
message("SNPs removed due to filtering: ",SNPs_removed)
message("Complete!")
}
#This is not reliable, so no longer use this shortcut to get dosage matrix
#test2 <- vcfR2genlight(vcf)


#####Testing custom VCF reading function######
# Open the gzipped VCF file
#con <- gzfile("/Users/ams866/Desktop/output.vcf", "rt")

# Read in the entire file
#lines <- readLines(con)
#close(con)
# Read in the entire file
#lines <- readLines("/Users/ams866/Desktop/output.vcf")
# Filter out lines that start with ##
#filtered_lines <- lines[!grepl("^##", lines)]
# Create a temporary file to write the filtered lines
#temp_file <- tempfile()
#writeLines(filtered_lines, temp_file)
# Read in the filtered data using read.table or read.csv
#vcf_data <- read.table(temp_file, header = TRUE, sep = "\t", comment.char = "", check.names = FALSE)
# Clean up the temporary file
#unlink(temp_file)

##Extract INFO column and Filter SNPs by those values
#Update the filtering options by the items present in the INFO column?

# Load required library
#library(dplyr)

# Split INFO column into key-value pairs
#vcf_data_parsed <- vcf_data %>%
# mutate(INFO_PARSED = strsplit(INFO, ";")) %>%
# unnest(INFO_PARSED) %>%
# separate(INFO_PARSED, into = c("KEY", "VALUE"), sep = "=") %>%
# spread(KEY, VALUE)

#Filter by DP
#filtered_vcf_data <- vcf_data_parsed %>%
# filter(as.numeric(DP) > 10)

# View the filtered dataframe
#print(filtered_vcf_data)

##Extracting and filtering by FORMAT column
# Identify the columns that are not sample columns
#non_sample_cols <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT")
# Identify the sample columns
#sample_cols <- setdiff(names(vcf_data), non_sample_cols)
# Extract FORMAT keys
#format_keys <- strsplit(as.character(vcf_data$FORMAT[1]), ":")[[1]]
# Split SAMPLE columns based on FORMAT
#vcf_data_samples <- vcf_data %>%
# mutate(across(all_of(sample_cols), ~strsplit(as.character(.), ":"))) %>%
# mutate(across(all_of(sample_cols), ~map(., ~setNames(as.list(.), format_keys)))) %>%
# unnest_wider(all_of(sample_cols), names_sep = "_")

# View the parsed dataframe
#print(head(vcf_data_samples))

# Create separate dataframes for each FORMAT variable
#format_dfs <- lapply(format_keys, function(format_key) {
# vcf_data_samples %>%
# select(ID, ends_with(paste0("_", format_key))) %>%
# column_to_rownames("ID")
#})

# Assign names to the list elements
#names(format_dfs) <- format_keys

# Access the separate dataframes
#gt_df <- format_dfs$GT # Genotype dataframe
#ad_df <- format_dfs$AD # Allelic depths dataframe

#*I think the above method is okay if you only need to filter at the INFO level,
#*But I think if you want to filter for FORMAT, that vcfR is probably best,
#*Will need to explore further if I can easily just filter for MPP by checking if it is above a
#*threshold, and then converting the GT and UD values to NA if so...
#*If that is efficient and works, then I will just use this custom VCF method...
Loading

0 comments on commit ca2727c

Please sign in to comment.