Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
LacroixLaurent committed Jul 26, 2024
0 parents commit 1692746
Show file tree
Hide file tree
Showing 136 changed files with 12,878 additions and 0 deletions.
49 changes: 49 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData
.RDataTmp

# User-specific files
.Ruserdata

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/
*.Rproj
# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# R Environment Variables
.Renviron

# pkgdown site
docs/

# translation temp files
po/*~

# RStudio Connect folder
rsconnect/
31 changes: 31 additions & 0 deletions BrdU_Basecalling/BrdU_Configuration4megalodon.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Basic configuration file for ONT Guppy basecaller software.

# Data trimming.
trim_strategy = dna
trim_threshold = 2.5
trim_min_events = 3

# Basecalling.
model_file = ./BrdU_Basecalling/model_adversial_bug.json
chunk_size = 2000
gpu_runners_per_device = 4
chunks_per_runner = 512
chunks_per_caller = 10000
overlap = 50
qscore_offset = 0.4364
qscore_scale = 0.8409
builtin_scripts = 1

# Calibration strand detection
calib_reference = lambda_3.6kb.fasta
calib_min_sequence_length = 3000
calib_max_sequence_length = 3800
calib_min_coverage = 0.6

# Output.
records_per_fastq = 4000
min_qscore = 7.0

# Telemetry
#ping_url = https://ping.oxfordnanoportal.com/basecall
#ping_segment_duration = 60
431 changes: 431 additions & 0 deletions BrdU_Basecalling/README.html

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions BrdU_Basecalling/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# NanoTiming
### Laurent Lacroix (laurent.lacroix@inserm.fr)
***
### BrdU Base calling

Raw data are available from ENA repository under accession number PRJEB76824.
A fast5 sample from the WT_rep3 is available from the Zenodo repository under the DOI [10.5281/zenodo.12668295](https://doi.org/10.5281/zenodo.12668295).

In order to test this procedure, the WT_rep3.tar.gz file should be downloaded and expanded into a WT_rep3/fast5 folder.

This procedure is for ONT R9.4.1 data type acquired in a fast5 format.

*basecalling_sample.sh* contains a example of the base calling procedure going from RawData to the megalodon mod_mappings.bam file.

The mod_mappings.bam data should then be processed using the script in the *NanoTiming_Data* folder.

For this project, we have built a reference genome from ONT and Ilumina reads the the BT1 yeast strain. This strain was reported in Theulot et all 2022. The procedure used to build this reference genome is explained in the [manuscript](https://doi.org/10.1101/2024.07.05.602252) and the corresponding annotation is detailed in the *Reference_Genome* folder.

***
25 changes: 25 additions & 0 deletions BrdU_Basecalling/basecalling_sample.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Sample of the basecalling procedure used with our BrdU trained model
# This requires the ONT megalodon and guppy software
# We used guppy v4.4.1 GPU and megalogon v2.2.9 with minimap2 v2.24.
# the configuration file should point to the modified base model
# see BrdU_Configutation4megalodon.cfg
# (the model file is provided as a compressed archive (model_adversial_bug.json.zip))
# a conda env was created using the yml file provided within miniconda3
conda env create -f meg2.2.9env.yml
# then within this env, we used megalodon with the following parameters
conda activate mega_2.2.9
WORKDIR=NanoTiming
EXP=WT_rep3
REF="$WORKDIR"/Reference_Genome/BT1_multiUra.fa

GUPPYPATH="~/src/ont-guppy_4.4.1"
CFG=BrdU_Configuration4megalodon.cfg
# the config file is placed in the "$GUPPYPATH"/data folder
INPUT="$WORKDIR"/data/"$EXP"/fast5
OUTPUT="$WORKDIR"/data/"$EXP"/mega

nohup megalodon "$INPUT" --guppy-server-path "$GUPPYPATH"/bin/guppy_basecall_server --outputs mod_mappings --reference "$REF" --output-dir "$OUTPUT" --guppy-config "$CFG" --disable-mod-calibration --overwrite --device cuda:0 --processes 20 --guppy-timeout 90 --mod-min-prob 0 &
# please notice than for experiment with many long reads, we had to increase the guppy-timeout to 600
# also notice that this computer was equiped with a NVidia RTX2080Ti GPU

# this generate a megalodon's style mod-mappings.bam file in the "$EXP"/mega folder
56 changes: 56 additions & 0 deletions BrdU_Basecalling/meg2.2.9env.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
name: megalodon_2.2.9
channels:
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- ca-certificates=2022.12.7=ha878542_0
- certifi=2016.9.26=py36_0
- ld_impl_linux-64=2.40=h41732ed_0
- libffi=3.4.2=h7f98852_5
- libgcc-ng=12.2.0=h65d4601_19
- libgomp=12.2.0=h65d4601_19
- libnsl=2.0.0=h7f98852_0
- libsqlite=3.40.0=h753d276_0
- libstdcxx-ng=12.2.0=h46fd767_19
- libzlib=1.2.13=h166bdaf_4
- ncurses=6.3=h27087fc_1
- openssl=1.1.1t=h0b41bf4_0
- pip=20.0.2=py36_1
- python=3.6.15=hb7a2778_0_cpython
- python_abi=3.6=2_cp36m
- readline=8.2=h8228510_1
- setuptools=49.6.0=py36h5fab9bb_3
- sqlite=3.40.0=h4ff8645_0
- tk=8.6.12=h27826a3_0
- wheel=0.34.2=py36_0
- xz=5.2.6=h166bdaf_0
- pip:
- cached-property==1.5.2
- cycler==0.11.0
- cython==0.29.34
- h5py==3.1.0
- importlib-resources==5.4.0
- kiwisolver==1.3.1
- mappy==2.24
- matplotlib==3.3.4
- megalodon==2.2.9
- numpy==1.19.5
- ont-fast5-api==4.1.1
- ont-pyguppy-client-lib==4.4.1
- packaging==21.3
- pandas==1.1.5
- pillow==8.4.0
- progressbar33==2.4
- pyparsing==3.0.9
- pysam==0.21.0
- python-dateutil==2.8.2
- pytz==2023.3
- scipy==1.5.4
- seaborn==0.11.2
- six==1.16.0
- sklearn==0.0.post4
- tqdm==4.64.1
- zipp==3.6.0
prefix: /users/rce/hyrien/miniconda3/envs/megalodon_2.2.9
Binary file added BrdU_Basecalling/model_adversial_bug.json.zip
Binary file not shown.
76 changes: 76 additions & 0 deletions Figures_Article/Fig1b.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
### Figure 1b
### plot BRdU content in individual reads

suppressMessages(library(tidyverse))
library(patchwork)
library(ggprism)
theme_set(theme_bw())
`%+%` <- paste0

setwd("/Users/ll/work/RStudioProjects/NanoTiming")
path2fig <- "Figures_Article/Figures_pdf/"
path2data <- "Figures_Article/Figures_data/"

nanot_data <- bind_rows(
readRDS(path2data %+% "nanoT_WT_rep1_alldata.rds"),
readRDS(path2data %+% "nanoT_WT_10_alldata.rds"),
readRDS(path2data %+% "nanoT_WT_20_alldata.rds"),
readRDS(path2data %+% "nanoT_WT_100_alldata.rds"),
readRDS(path2data %+% "nanoT_WT_1000_alldata.rds")
)

# Figure1b BrdU variation in reads
# filtering non substituted reads

set.seed(37)

toplot <- nanot_data %>%
mutate(dose=as_factor(dose))%>%
mutate(length=end-start+1)%>%
filter(length>75000)%>%
unnest(cols = c(signalr))%>%
group_by(read_id)%>%
mutate(median_b_read=median(signal,na.rm=T))%>%
mutate(mean_b_read=mean(signal,na.rm=T))%>%
filter(median_b_read> 0.02)%>%
ungroup()%>%
nest(cols = c(positions,signal))%>%
#subset of 50 reads per dose
group_by(dose)%>%
sample_n(50)%>%
arrange(median_b_read)%>%
mutate(track_i=row_number())%>%
unnest(cols = c(cols))%>%
mutate(x=positions-start) %>%
ungroup()

f1b <- ggplot(toplot)+
geom_line(aes(x=x,y=track_i,group=paste(read_id),color=signal),linewidth=1)+
facet_wrap(~dose,ncol=5)+
scale_color_viridis_c( option = "C",limits=c(0,1))+
labs(x="Position along the read (kb)", y="", color="BrdU content")+
scale_x_continuous(guide = "prism_minor",
labels=scales::unit_format(big.mark ="",suffix="",scale=1e-3,sep=""),
expand=expansion(mult=c(0,0.01)))+
scale_y_continuous(expand=expansion(mult=c(0.01,0.015)))+
theme_bw()+
theme(axis.text.y=element_blank(),axis.ticks.y=element_blank())+
ggtitle("Figure 1b")
ggsave(plot=f1b, file=path2fig %+% "Fig1b.pdf",w=5.5,h=3)


f1b <- ggplot(toplot)+
# geom_line(aes(x=x,y=track_i,group=paste(read_id),color=signal),linewidth=1)+
ggrastr::rasterise(geom_line(aes(x=x,y=track_i,group=paste(read_id),color=signal),linewidth=0.8),dpi=1200)+
facet_wrap(~dose,ncol=5)+
scale_color_viridis_c( option = "C",limits=c(0,1))+
labs(x="Position along the read (kb)", y="", color="BrdU content")+
scale_x_continuous(guide = "prism_minor",
labels=scales::unit_format(big.mark ="",suffix="",scale=1e-3,sep=""),
expand=expansion(mult=c(0,0.01)))+
scale_y_continuous(expand=expansion(mult=c(0.01,0.015)))+
theme_bw()+
theme(axis.text.y=element_blank(),axis.ticks.y=element_blank())

#ggsave(plot=f1b, file=path2fig %+% "Fig1b.pdf",w=5.5,h=3)
ggsave(plot=f1b, file=path2fig %+% "Fig1b_rast.pdf",w=5.5,h=3)
54 changes: 54 additions & 0 deletions Figures_Article/Fig1c.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#### Figure 1c
#### comparing nanoT data to sortseq vs BrdU

suppressMessages(library(tidyverse))
library(ggpubr)
theme_set(theme_bw())
mypal <- c(paletteer::paletteer_d("ggthemes::Classic_20"),"grey40","black")
`%+%` <- paste0

setwd("/Users/ll/work/RStudioProjects/NanoTiming")
path2fig <- "Figures_Article/Figures_pdf/"
path2data <- "Figures_Article/Figures_data/"

### Import Sortseq data

SortSeq_WT <- readRDS(path2data %+% "sortseq_WT.rds")

#### Import nanoT data

nanoT05 <- readRDS(path2data %+% "nanoT_WT_rep1.rds")
nanoT10 <- readRDS(path2data %+% "nanoT_WT_10.rds")
nanoT20 <- readRDS(path2data %+% "nanoT_WT_20.rds")
nanoT100 <- readRDS(path2data %+% "nanoT_WT_100.rds")
nanoT1000 <- readRDS(path2data %+% "nanoT_WT_1000.rds")

### Plot
toplot_nano <- bind_rows(nanoT05,nanoT10,nanoT20,nanoT100,nanoT1000) %>% rename(raw=mean_br_bin)

toplot <- full_join(toplot_nano,SortSeq_WT) %>% mutate(BrdU=factor(BrdU,levels=c("5 µM","10 µM","20 µM","100 µM","1000 µM")))

f1c <- ggplot(toplot,aes(x=timing,y=raw,col=BrdU))+
geom_point(size=0.2,alpha=0.2,shape=16)+
scale_color_manual(values=mypal,guide = guide_legend(reverse = TRUE) )+
geom_smooth(method="lm",se=F)+
stat_cor(label.x=2.1,label.y = c(0,0.05,0.1,0.15,0.2),method="spearman",cor.coef.name="rho",show.legend=F,aes(label=after_stat(r.label)))+
ylab("Mean BrdU content")+
xlab("Relative copy number (from sort-seq)")+
ggtitle("Figure 1c")

ggsave(plot=f1c,file=path2fig %+% "Fig1c.pdf",h=3,w=4.5)


f1c <- ggplot(toplot,aes(x=timing,y=raw,col=BrdU))+
# geom_point(size=0.2,alpha=0.2,shape=16)+
ggrastr::rasterise(geom_point(size=0.2,alpha=0.1,shape=16),dpi=600)+
scale_color_manual(values=mypal,guide = guide_legend(reverse = TRUE) )+
geom_smooth(method="lm",se=F)+
stat_cor(label.x=2.1,label.y = c(0,0.05,0.1,0.15,0.2),method="spearman",cor.coef.name="rho",show.legend=F,aes(label=after_stat(r.label)))+
ylab("Mean BrdU content")+
xlab("Relative copy number (from sort-seq)")

#ggsave(plot=f1c,file=path2fig %+% "Fig1c.pdf",h=3,w=4.5)
ggsave(plot=f1c,file=path2fig %+% "Fig1c_rast.pdf",h=3,w=4.5)

Loading

0 comments on commit 1692746

Please sign in to comment.