Skip to content

Source code used to generate the Figures for Kleidendorst et al, Nat Prot

License

Notifications You must be signed in to change notification settings

GuidoBarzaghi/Kleinendorst_et_al

 
 

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

9 Commits
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Kleinendorst_et_al

GuidoBarzaghi 18/07/2021

Fig.2

library(BSgenome.Mmusculus.UCSC.mm10)
library(tidyverse)

# Get list of ChIP confirmed JASPAR TFBSs
JASPAR_TFBSs = readRDS("/g/krebs/krebs/analysis/SMF/MM/analysis/mapped_jaspar_ChIP_annotated.rds")
JASPAR_TFBSs_ChIP = JASPAR_TFBSs[JASPAR_TFBSs$isBound == TRUE]

UpstreamBins = GRanges(seqnames(JASPAR_TFBSs_ChIP), IRanges(start = end(JASPAR_TFBSs_ChIP) - (width(JASPAR_TFBSs_ChIP)/2) - 35, end = end(JASPAR_TFBSs_ChIP) - (width(JASPAR_TFBSs_ChIP)/2) - 25))
UpstreamBinSeq = getSeq(BSgenome.Mmusculus.UCSC.mm10, UpstreamBins)
TFBSBins = GRanges(seqnames(JASPAR_TFBSs_ChIP), IRanges(start = end(JASPAR_TFBSs_ChIP) - (width(JASPAR_TFBSs_ChIP)/2) - 15, end = end(JASPAR_TFBSs_ChIP) - (width(JASPAR_TFBSs_ChIP)/2) + 15))
TFBSBinsSeq = getSeq(BSgenome.Mmusculus.UCSC.mm10, TFBSBins)
DownstreamBins = GRanges(seqnames(JASPAR_TFBSs_ChIP), IRanges(start = end(JASPAR_TFBSs_ChIP) - (width(JASPAR_TFBSs_ChIP)/2) + 25, end = end(JASPAR_TFBSs_ChIP) - (width(JASPAR_TFBSs_ChIP)/2) + 35))
DownstreamBinSeq = getSeq(BSgenome.Mmusculus.UCSC.mm10, DownstreamBins)

Covered_SE = vcountPattern(pattern = "DGCHN", subject = UpstreamBinSeq, fixed = F) > 0 & 
  vcountPattern(pattern = "DGCHN", subject = TFBSBinsSeq, fixed = F) > 0 &
  vcountPattern(pattern = "DGCHN", subject = DownstreamBinSeq, fixed = F) > 0

UpstreamCoverage = vcountPattern(pattern = "GC", subject = UpstreamBinSeq) > 0 | vcountPattern(pattern = "CG", subject = UpstreamBinSeq) > 0
TFBSCoverage = vcountPattern(pattern = "GC", subject = TFBSBinsSeq) > 0 | vcountPattern(pattern = "CG", subject = TFBSBinsSeq) > 0
DownstreamCoverage = vcountPattern(pattern = "GC", subject = DownstreamBinSeq) > 0 | vcountPattern(pattern = "CG", subject = DownstreamBinSeq) > 0
Covered_DE = UpstreamCoverage & TFBSCoverage & DownstreamCoverage

JASPAR_TFBSs_ChIP %>%
  as_tibble() %>%
  mutate(TFBS = names(JASPAR_TFBSs_ChIP)) %>%
  mutate(Covered_SE = Covered_SE, Covered_DE = Covered_DE) %>%
  select(TFBS, name, Covered_SE, Covered_DE) -> TFBS_coverage_DF

# N.b.: because I only ask for TFBSs to contain AT LEAST one cytosine, overcounting GCGs has no consequence

# TF-wise
TFBS_coverage_DF %>%
  group_by(name) %>%
  summarise(Count = sum(Covered_SE)) %>%
  mutate(ExpType = "SE SMF") -> SE_TFBSs
TFBS_coverage_DF %>%
  group_by(name) %>%
  summarise(Count = sum(Covered_DE)) %>%
  mutate(ExpType = "DE SMF") -> DE_TFBSs
TFBS_coverage_DF %>%
  group_by(name) %>%
  summarise(Count = n()) %>%
  mutate(ExpType = "Total") -> Tot_TFBSs

rbind(SE_TFBSs, DE_TFBSs, Tot_TFBSs) %>%
  spread(ExpType, Count) %>%
  gather(ExpType, nrTFBSs, -name, -Total) %>%
  mutate(perc = (nrTFBSs / Total)*100) %>%
  select(-nrTFBSs) %>%
  spread(ExpType, perc) %>%
  ggplot() +
  geom_point(aes(x=`DE SMF`, y=`SE SMF`, size=Total)) +
  ggrepel::geom_label_repel(aes(x=`DE SMF`, y=`SE SMF`, label=name), force = 3, box.padding = 0.5, max.overlaps = 15) +
  xlab("% TFBS covered with DE SMF") +
  ylab("% TFBS covered with SE SMF") +
  theme_classic()

Fig.3

library(SingleMoleculeFootprinting)
library(BSgenome)
library(tidyverse)

Qinput = "/g/krebs/barzaghi/HTS/SMF/MM/2021-05-06-000000000-JFM6H_Lambda/aln/Qinput.txt"
LambdaGenomeSeq = readDNAStringSet(filepath = "/g/krebs/barzaghi/analyses/misc/NP_rebuttal/LambdaGenome.fasta")
names(LambdaGenomeSeq) = "J02459.1"
QuasRprj = QuasR::qAlign(sampleFile = Qinput, genome = "/g/krebs/barzaghi/analyses/misc/NP_rebuttal/LambdaGenome.fasta", paired = 'fr', bisulfite = "undir")
QuasRprj@aligner = "Rbowtie"
Methylation = QuasR::qMeth(proj = QuasRprj, mode = "allC")

GCs = FilterContextCytosines(MethGR = Methylation, genome = LambdaGenomeSeq, context = "GC")
CGs = FilterContextCytosines(MethGR = Methylation, genome = LambdaGenomeSeq, context = "CG")

GCs_collapsed = CollapseStrands(MethGR = GCs, context = "GC")
CGs_collapsed = CollapseStrands(MethGR = CGs, context = "HCG")

GCs_filtered = CoverageFilter(MethGR = GCs_collapsed, thr = 20)
CGs_filtered = CoverageFilter(MethGR = CGs_collapsed, thr = 20)

start(GCs_filtered) = start(GCs_filtered) - 2
end(GCs_filtered) = end(GCs_filtered) + 1
start(CGs_filtered) = start(CGs_filtered) - 1
end(CGs_filtered) = end(CGs_filtered) + 2
AllCs = c(GCs_filtered, CGs_filtered)

KMERS = getSeq(LambdaGenomeSeq, AllCs)
AllCs$Kmer_Context = KMERS

AllCs %>%
  as_tibble() %>%
  filter(GenomicContext == "CG") %>%
  select(-seqnames, -end, -strand, -width) %>%
  gather(Measure, Value, -Kmer_Context, -GenomicContext, -start) %>%
  separate(Measure, into = c("Sample", "Measure"), sep = "_", remove = TRUE) %>%
  spread(Measure, Value) %>%
  filter(str_detect(Sample, "CG")) -> EDA_DF_CGs
AllCs %>%
  as_tibble() %>%
  filter(GenomicContext == "GC") %>%
  select(-seqnames, -end, -strand, -width) %>%
  gather(Measure, Value, -Kmer_Context, -GenomicContext, -start) %>%
  separate(Measure, into = c("Sample", "Measure"), sep = "_", remove = TRUE) %>%
  spread(Measure, Value)  %>%
  filter(str_detect(Sample, "GC"))-> EDA_DF_GCs

grep("Kmer_Context|GenomicContext", sort(unique(gsub("_Coverage$|_MethRate$", "", colnames(elementMetadata(AllCs))))), invert = TRUE, value = TRUE) %>%
  data.frame() %>%
  rename("Sample" = ".") %>%
  mutate(Enzyme = gsub("[[:digit:]]|R", "", Sample), Conc = gsub("[[:alpha:]]", "", Sample)) %>%
  mutate(Concentration = c(0,0.75,10,1.5,2,3,4,
                           0,0.2,0.7,10,1.2,1.5,2,
                           0.2,0.7,10,1.2,1.5,2)) %>%
  mutate(Replicate = c(rep(1, 14), rep(2, 6))) %>%
  select(-Conc) -> SamplesDict

plyr::join(rbind(EDA_DF_CGs, EDA_DF_GCs), SamplesDict) -> Plotting_DF
Plotting_DF %>%
  group_by(Kmer_Context, Enzyme, Concentration) %>%
  summarise(Median_MethRate = median(MethRate, na.rm = TRUE)) %>%
  ungroup() -> Summarised_Plotting_DF

Summarised_Plotting_DF %>%
  group_by(Kmer_Context, Enzyme) %>%
  arrange((Concentration)) %>%
  mutate(MethRate_seqDiff = c(Median_MethRate[1], diff(Median_MethRate))) %>%
  ungroup() %>%
  mutate(Concentration = factor(Concentration, levels = rev(unique(Concentration)))) %>%
  ggplot() +
  geom_bar(aes(reorder(Kmer_Context, Median_MethRate, na.rm=TRUE), MethRate_seqDiff*100, fill=Concentration), stat='identity', position = "stack") +
  facet_wrap(~Enzyme, scales = "free_x", labeller = as_labeller(c("GC" = "M.CviPI", "CG" = "M.SssI"))) +
  xlab("") +
  ylab("Methylation (%)") +
  theme_classic() +
  labs(fill = "Concentration\n[U/\u03bcg]") +
  theme(axis.text.x = element_text(angle=90, vjust = 0.5), text = element_text(size = 25)) +
  scale_y_continuous(breaks = c(0,100)) +
  scale_fill_manual(values = rev(c(RColorBrewer::brewer.pal(name = "Reds", n=9), "black")))

Fig.6

library(SingleMoleculeFootprinting) # for this use the dev version https://github.com/Krebslabrep/SingleMoleculeFootprinting/tree/dev
library(ggplot2)
library(tidyverse)
library(BSgenome.Mmusculus.UCSC.mm10)

Qinput = "/g/krebs/barzaghi/HTS/SMF/MM/QuasR_input_files/QuasR_input_AllCanWGpooled_dprm.txt"

MiSeq_Samples = grep("NO.*_MiSeq$", readr::read_delim(Qinput, "\t")$SampleName, value = TRUE)
NextSeq_Sample = "SMF_MM_TKO_as_NO_R_NextSeq"

chr19 = GRanges(seqinfo(BSgenome.Mmusculus.UCSC.mm10))[19]

CallContextMethylation(sampleSheet = Qinput,
                       sample = MiSeq_Samples,
                       genome = BSgenome.Mmusculus.UCSC.mm10,
                       RegionOfInterest = chr19,
                       coverage = 1,
                       returnSM = FALSE) -> MiSeq
CallContextMethylation(sampleSheet = Qinput,
                       sample = NextSeq_Sample,
                       genome = BSgenome.Mmusculus.UCSC.mm10,
                       RegionOfInterest = chr19,
                       coverage = 20,
                       returnSM = FALSE) -> NextSeq

LowCoverageMethRateDistribution(LowCoverage = MiSeq$DGCHN,
                                LowCoverage_samples = MiSeq_Samples,
                                HighCoverage = NextSeq$DGCHN,
                                HighCoverage_samples = NextSeq_Sample,
                                bins = 50, returnDF = TRUE, returnPlot = TRUE, MSE = TRUE, return_MSE_DF = TRUE, return_MSE_plot = TRUE) -> Results

Results$MethylationDistribution_DF %>%
  mutate(`Sample quality` = ifelse(Sample %in% c("SMF_MM_NP_NO_R3_MiSeq", "SMF_MM_NP_NO_R4_MiSeq"), "bad", "Good")) %>%
  ggplot(aes(x=ExpectedMeth*100,y=ObservedMeth*100, group=interaction(Sample,Coverage), color=`Sample quality`)) +
  geom_line(aes(linetype=Coverage, size=Coverage)) +
  ylab("Observed Methylation (%)") +
  xlab("Expected Methylation (%)") +
  theme_classic() +
  guides(color = guide_legend(override.aes = list(size = 1)), linetype = guide_legend(override.aes = list(size = 1))) +
  theme(text = element_text(size = 15), legend.key.width = unit(5, "mm")) +
  scale_x_continuous(breaks = c(0,100)) +
  scale_y_continuous(breaks = c(0,100)) +
  scale_size_manual("type", values = c(2.5, 1), guide = "none") -> pl1

Results$MSE_DF %>%
  mutate(`Sample quality` = ifelse(Sample %in% c("SMF_MM_NP_NO_R3_MiSeq", "SMF_MM_NP_NO_R4_MiSeq"), "bad", "Good")) %>%
  filter(Sample != "SMF_MM_TKO_as_NO_R_NextSeq") %>%
  mutate(Sample = gsub("_", " ",
                       gsub("NP", "Neural_progenitor",
                            gsub("ES", "mESC",
                                 gsub("_MiSeq$|_NO|SMF_|MM_", "", Sample))))) %>%
  ggplot(aes(Sample, MSE, fill=`Sample quality`)) +
    geom_bar(stat = 'identity') +
    ylab("Mean squared error\n(MSE)") +
    xlab("") +
    theme_classic() +
    scale_y_continuous(breaks = c(0, 0.06)) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), text = element_text(size = 15)) -> pl2
  
cowplot::plot_grid(pl1, pl2, nrow = 1, align = "none")

Fig.7

The right panels in Fig.7 was produced as shown in the vignette of the SingleMoleculeFootprinting Bioconductor package. The left panel of the same figure was produced using analogous code on a bait capture dataset.

About

Source code used to generate the Figures for Kleidendorst et al, Nat Prot

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published