Skip to content

Commit

Permalink
fixed motif and added breakpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
derekwong90 committed Jun 30, 2023
1 parent 2d6d06e commit eaa6ef4
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 202 deletions.
89 changes: 0 additions & 89 deletions breakpoint/runners/breakpoints_167.sh

This file was deleted.

25 changes: 13 additions & 12 deletions breakpoint/runners/breakpoints_genome.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
INPUTDIR=/cluster/projects/pughlab/external_data/TGL49_CHARM/LFS/LFS_WG/bams
basedir=/cluster/projects/pughlab/projects/CHARM/LFS/breakpoints
INPUTDIR=/cluster/projects/pughlab/hereditary/external_data/TGL49_CHARM/LFS/LFS_WG/bams
basedir=/cluster/projects/pughlab/hereditary/projects/CHARM/LFS/breakpoint
ref=/cluster/projects/pughlab/references/TGL/hg38/hg38_random.fa
picard_dir=/cluster/tools/software/picard/2.10.9
frags=/cluster/projects/pughlab/bin/fragmentomics/R
frags=/cluster/projects/pughlab/bin/fragmentomics/v2/breakpoint/R
outdir=$basedir/output/genome
shdir=$basedir/sh_scripts/genome

Expand All @@ -25,47 +25,48 @@ module load bedtools/2.27.1
module load R/4.0.0\n" > $shdir/${bam}.sh

### Remove duplicates
echo -e "java -jar $picard_dir/picard.jar MarkDuplicates \
echo -e "#java -jar $picard_dir/picard.jar MarkDuplicates \
I=$INPUTDIR/${bam}.bam \
O=$outdir/${bam}_deduped.bam \
M=$outdir/${bam}_metrics.txt \
TMP_DIR=$outdir \
REMOVE_DUPLICATES=true \
REMOVE_SEQUENCING_DUPLICATES=true
samtools sort -n $outdir/${bam}_deduped.bam -o $outdir/${bam}_deduped_sorted.bam\n" >> $shdir/${bam}.sh
#samtools sort -n $outdir/${bam}_deduped.bam -o $outdir/${bam}_deduped_sorted.bam\n" >> $shdir/${bam}.sh

### Sort bam by name and convert to bedpe format
echo -e "samtools view -bf 0x2 $outdir/${bam}_deduped_sorted.bam | \
echo -e "#samtools view -bf 0x2 $outdir/${bam}_deduped_sorted.bam | \
bedtools bamtobed \
-i stdin \
-bedpe > $outdir/${bam}.bedpe\n" >> $shdir/${bam}.sh

### Format bedpe to bed
echo -e "Rscript $frags/breakpoint_format_bedpe.R \
echo -e "#Rscript $frags/breakpoint_format_bedpe.R \
--id $bam \
--bedpe $outdir/${bam}.bedpe \
--outdir $outdir\n" >> $shdir/${bam}.sh

### Get FASTA sequences
echo -e "bedtools getfasta \
echo -e "#bedtools getfasta \
-bedOut \
-fi $ref \
-bed $outdir/${bam}_5.bed > $outdir/${bam}_fasta_5.bed
bedtools getfasta \
#bedtools getfasta \
-bedOut \
-fi $ref \
-bed $outdir/${bam}_3.bed > $outdir/${bam}_fasta_3.bed\n" >> $shdir/${bam}.sh

### Convert FASTA to end motif context frequencies
echo -e "Rscript $frags/motif_get_contexts.R \
echo -e "Rscript $frags/breakpoint_get_contexts.R \
--id $bam \
--fasta_5 $outdir/${bam}_fasta_5.bed \
--fasta_3 $outdir/${bam}_fasta_3.bed \
--outdir $outdir\n" >> $shdir/${bam}.sh

### Remove intermediate files
echo -e "if [[ -f "$outdir/${bam}_raw.txt" ]]
echo -e "if [[ -f "$outdir/${bam}_count.txt" ]]
then
echo -e \"Output completed sucessfully\"
rm $outdir/${bam}*.bam*
Expand All @@ -80,5 +81,5 @@ cd $shdir

ls *.sh > files
for file in $(cat files);do
sbatch -c 1 -t 10:00:00 -p all --mem 24G $file
sbatch -c 1 -t 10:00:00 -p superhimem --mem 120G $file
done
92 changes: 0 additions & 92 deletions breakpoint/runners/breakpoints_sites.sh

This file was deleted.

14 changes: 7 additions & 7 deletions breakpoint/scripts/breakpoint_format_bedpe.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ bedpe <- bedpe[bedpe$length <= 600, ]
bedpe <- bedpe[, c("V1", "V2", "V6")]
bedpe <- bedpe[order(factor(bedpe$V1, levels = chrs),
bedpe$V2), ]
bedpe$Start <- ifelse(bedpe$V2 < bedpe$V6, bedpe$V2, bedpe$V6) - 1
bedpe$End <- ifelse(bedpe$V6 > bedpe$V2, bedpe$V6, bedpe$V2) - 1
bedpe$Start <- ifelse(bedpe$V2 < bedpe$V6, bedpe$V2, bedpe$V6)
bedpe$End <- ifelse(bedpe$V6 > bedpe$V2, bedpe$V6, bedpe$V2)

### Get 2 bases +/- breakpoint
bedpe$front <- bedpe$Start + 3
bedpe$Start <- bedpe$Start - 3
### Get 25 bases +/- breakpoint
bedpe$front <- bedpe$Start + 15
bedpe$Start <- bedpe$Start - 15

bedpe$back <- bedpe$End - 3
bedpe$End <- bedpe$End + 3
bedpe$back <- bedpe$End - 15
bedpe$End <- bedpe$End + 15

bedpe_1 <- bedpe[, c("V1", "Start", "front")]
bedpe_2 <- bedpe[, c("V1", "back", "End")]
Expand Down
88 changes: 88 additions & 0 deletions breakpoint/scripts/breakpoint_get_contexts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# file: runFrag.R
# author: Derek Wong, Ph.D
# date: Aug 3rd, 2022

library(optparse)

## Set script variables
option_list <- list(
make_option(c("--id"), type = "character", help = "sample id. Required"),
make_option(c("--fasta_5"), type = "character", help = "Path to fasta file. Required."),
make_option(c("--fasta_3"), type = "character", help = "Path to fasta file. Required."),
make_option(c("--outdir"), type = "character", help = "Path to output directory. Required.")
)
parseobj <- OptionParser(option_list=option_list)
opt <- parse_args(parseobj)
print(opt)
options(scipen=999, stringsAsFactors=F)

## Load required packages
library(tidyverse)
library(data.table)
options(stringsAsFactors=FALSE)
options(bitmapType='cairo')

## Get variables from input script
id <- opt$id
fasta_file_5 <- opt$fasta_5
fasta_file_3 <- opt$fasta_3
outdir <- opt$outdir

## Read in fasta
fasta_5 <- read.delim(fasta_file_5, header = FALSE)
fasta_3 <- read.delim(fasta_file_3, header = FALSE)

### Prune and format fastas
fasta_5$V4 <- toupper(fasta_5$V4)
fasta_5 <- as.data.table(fasta_5)
fasta_5 <- fasta_5[!(fasta_5$V4 %like% "N"), ]
fasta_5$length <- nchar(fasta_5$V4)
#fasta_5 <- fasta_5[fasta_5$length == 4, ]

fasta_3$V4 <- toupper(fasta_3$V4)
fasta_3 <- as.data.table(fasta_3)
fasta_3 <- fasta_3[!(fasta_3$V4 %like% "N"), ]
fasta_3$length <- nchar(fasta_3$V4)
#fasta_3 <- fasta_3[fasta_3$length == 4, ]

### Reverse 3' end
#fasta_3 <- fasta_3[1:1000, ] ### Use this for quick tests
splits <- strsplit(fasta_3$V4, "")
reversed <- lapply(splits, rev)
concat <- lapply(reversed, function(x){paste(x, collapse = "")})
fasta_3$V4 <- unlist(concat)
fasta_3$V4 <- chartr("ATGC","TACG", fasta_3$V4)

### Count the nucleotides
fasta <- bind_rows(fasta_5, fasta_3)
motif <- strsplit(fasta$V4, "")
motif <- do.call(rbind, motif)

counts <- data.frame(nucleotide = c("A", "T", "G", "C"))

for (i in 1:ncol(motif)) {
x <- as.data.frame(table(motif[, i]))

counts <- merge(counts, x, by.x = "nucleotide", by.y = "Var1", all = TRUE)
}

counts[is.na(counts)] <- 0
colnames(counts) <- c("nucleotide", seq(-15, -15 + (ncol(counts) - 2)))

total <- sum(counts$`-15`)

frequency <- counts[, 2:ncol(counts)]/total
ratio <- frequency/c(0.2951856, 0.2039184, 0.2047607, 0.2961353)

frequency$nucleotide <- counts$nucleotide
ratio$nucleotide <- counts$nucleotide

frequency <- frequency %>%
select(nucleotide, everything())
ratio <- ratio %>%
select(nucleotide, everything())

### Write file
write.table(counts, file.path(outdir, paste0(id, "_count.txt")), sep = "\t", row.names = FALSE)
write.table(frequency, file.path(outdir, paste0(id, "_frequency.txt")), sep = "\t", row.names = FALSE)
write.table(ratio, file.path(outdir, paste0(id, "_ratio.txt")), sep = "\t", row.names = FALSE)
4 changes: 2 additions & 2 deletions end_motif/scripts/motif_format_bedpe.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ bedpe <- bedpe[bedpe$length <= 600, ]
bedpe <- bedpe[, c("V1", "V2", "V6")]
bedpe <- bedpe[order(factor(bedpe$V1, levels = chrs),
bedpe$V2), ]
bedpe$Start <- ifelse(bedpe$V2 < bedpe$V6, bedpe$V2, bedpe$V6) - 1
bedpe$End <- ifelse(bedpe$V6 > bedpe$V2, bedpe$V6, bedpe$V2) - 1
bedpe$Start <- ifelse(bedpe$V2 < bedpe$V6, bedpe$V2, bedpe$V6) # - 1
bedpe$End <- ifelse(bedpe$V6 > bedpe$V2, bedpe$V6, bedpe$V2) # - 1

### Get last 4 bases
bedpe$front <- bedpe$Start + 4
Expand Down

0 comments on commit eaa6ef4

Please sign in to comment.