From f7b91f9ba293e4e440255b7154892a9f577b2b6c Mon Sep 17 00:00:00 2001 From: derekwong90 <78445989+derekwong90@users.noreply.github.com> Date: Fri, 30 Jun 2023 13:49:51 -0700 Subject: [PATCH] added griffin --- griffin/runners/GC_correction.sh | 71 ++ griffin/runners/nucleosome_profiling.sh | 107 +++ griffin/scripts/griffin_GC_bias.py | 459 +++++++++++ griffin/scripts/griffin_GC_counts.py | 261 ++++++ griffin/scripts/griffin_calc_GC_frequency.py | 199 +++++ griffin/scripts/griffin_calc_coverage.py | 758 ++++++++++++++++++ griffin/scripts/griffin_filter_sites.py | 293 +++++++ griffin/scripts/griffin_plot.py | 160 ++++ griffin/site_configs/DHS_sites.yaml | 18 + .../site_configs/Immune_Calderon_sites.yaml | 42 + griffin/site_configs/Immune_sites.yaml | 15 + griffin/site_configs/LFS_sites.yaml | 7 + griffin/site_configs/TCGA_sites.yaml | 25 + griffin/site_configs/TFBS_Ulz_sites.yaml | 506 ++++++++++++ griffin/site_configs/TFBS_sites.yaml | 337 ++++++++ griffin/site_configs/TP53_sites.yaml | 12 + griffin/site_configs/hematopoietic_sites.yaml | 3 + griffin/site_configs/housekeeping_sites.yaml | 8 + griffin/site_configs/maneTSS_sites.yaml | 2 + griffin/site_configs/uveal_sites.yaml | 6 + 20 files changed, 3289 insertions(+) create mode 100644 griffin/runners/GC_correction.sh create mode 100644 griffin/runners/nucleosome_profiling.sh create mode 100755 griffin/scripts/griffin_GC_bias.py create mode 100755 griffin/scripts/griffin_GC_counts.py create mode 100755 griffin/scripts/griffin_calc_GC_frequency.py create mode 100755 griffin/scripts/griffin_calc_coverage.py create mode 100755 griffin/scripts/griffin_filter_sites.py create mode 100755 griffin/scripts/griffin_plot.py create mode 100755 griffin/site_configs/DHS_sites.yaml create mode 100755 griffin/site_configs/Immune_Calderon_sites.yaml create mode 100755 griffin/site_configs/Immune_sites.yaml create mode 100755 griffin/site_configs/LFS_sites.yaml create mode 100755 griffin/site_configs/TCGA_sites.yaml create mode 100755 griffin/site_configs/TFBS_Ulz_sites.yaml create mode 100755 griffin/site_configs/TFBS_sites.yaml create mode 100755 griffin/site_configs/TP53_sites.yaml create mode 100755 griffin/site_configs/hematopoietic_sites.yaml create mode 100755 griffin/site_configs/housekeeping_sites.yaml create mode 100755 griffin/site_configs/maneTSS_sites.yaml create mode 100755 griffin/site_configs/uveal_sites.yaml diff --git a/griffin/runners/GC_correction.sh b/griffin/runners/GC_correction.sh new file mode 100644 index 0000000..1ec4eb4 --- /dev/null +++ b/griffin/runners/GC_correction.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +griffin=/cluster/projects/pughlab/bin/Griffin/v0.2.0 +basedir=/cluster/projects/pughlab/projects/CHARM/LFS/griffin2 +ref=/cluster/projects/pughlab/references/TGL/hg38/hg38_random.fa +input=/cluster/projects/pughlab/external_data/TGL49_CHARM/LFS/LFS_WG/bams +outdir=$basedir/output/GC_correction +shdir=$basedir/sh_scripts/GC_correction + +mkdir -p $outdir +mkdir -p $shdir +mkdir -p $outdir/mappability_bias +mkdir -p $outdir/mappability_plots +mkdir -p $outdir/tmp + +cd $input +ls *bam > $shdir/bams + +cd $shdir +sed 's/....$//' bams > bam +mv bam bams + +for bam in $(cat bams);do + +name=${bam:0:25} +echo $bam +echo $name + +mkdir -p $outdir/tmp/$name + +echo -e "#!/bin/bash +source activate base +conda activate griffin2" > $shdir/${name}.sh + +echo -e "$griffin/scripts/griffin_mappability_correction.py \ +--bam_file $input/${bam}.bam \ +--bam_file_name $name \ +--output $outdir/mappability_bias/${name}.mappability_bias.txt \ 
+--output_plot $outdir/mappability_plots/${name}.mappability_bias.pdf \ +--mappability $griffin/Ref/k50.Umap.MultiTrackMappability.hg38.bw \ +--exclude_paths $griffin/Ref/encode_unified_GRCh38_exclusion_list.bed \ +--chrom_sizes $griffin/Ref/hg38.standard.chrom.sizes \ +--map_quality 20 \ +--CPU 8 \ +--tmp_dir $outdir/tmp/$name" >> $shdir/${name}.sh + +echo -e "$griffin/scripts/griffin_GC_counts.py \ +--bam_file $input/${bam}.bam \ +--bam_file_name $name \ +--mappable_regions_path $griffin/Ref/k100_minus_exclusion_lists.mappable_regions.hg38.bed \ +--ref_seq $ref \ +--chrom_sizes $griffin/Ref/hg38.standard.chrom.sizes \ +--out_dir $outdir \ +--map_q 20 \ +--size_range 15 500 \ +--CPU 8" >> $shdir/${name}.sh + +echo -e "$griffin/scripts/griffin_GC_bias.py \ +--bam_file_name $name \ +--mappable_name k100_minus_exclusion_lists.mappable_regions.hg38 \ +--genome_GC_frequency $griffin/Ref/genome_GC_frequency \ +--out_dir $outdir/ \ +--size_range 15 500" >> $shdir/${name}.sh + +done + +cd $shdir +ls *.sh > files +for file in $(cat files);do +sbatch -p all -c 8 --mem 8G -t 24:00:00 $file +done diff --git a/griffin/runners/nucleosome_profiling.sh b/griffin/runners/nucleosome_profiling.sh new file mode 100644 index 0000000..89fcee0 --- /dev/null +++ b/griffin/runners/nucleosome_profiling.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +analysis=hematopoetic +CPU=1 +mem=8G + +griffin=/cluster/projects/pughlab/bin/Griffin/v0.2.0 +basedir=/cluster/projects/pughlab/projects/CHARM/LFS/griffin2 +sites=$griffin/site_configs/${analysis}_sites.yaml +ref=/cluster/projects/pughlab/references/TGL/hg38/hg38_random.fa +input=/cluster/projects/pughlab/external_data/TGL49_CHARM/LFS/LFS_WG/bams +counts=$basedir/output/GC_correction +outdir=$basedir/output/nucleosome_profiling/$analysis +shdir=$basedir/sh_scripts/nucleosome_profiling/$analysis + +encode_exclude=$griffin/Ref/encode_unified_GRCh38_exclusion_list.bed +centromere_path=$griffin/Ref/hg38_centromeres.bed +gap_path=$griffin/Ref/hg38_gaps.bed +patch_path=$griffin/Ref/hg38_fix_patches.bed +alternative_haplotype_path=$griffin/Ref/hg38_alternative_haplotypes.bed + +mkdir -p $outdir +mkdir -p $outdir/tmp +mkdir -p $outdir/results +mkdir -p $shdir + +cd $input +ls *bam > $shdir/bams + +cd $shdir +sed 's/....$//' bams > bam +mv bam bams + +for bam in $(cat bams);do + +name=${bam:0:25} +echo $bam +echo $name + +echo -e "#!/bin/bash\n +source activate base\n +conda activate griffin2\n" > $shdir/${name}.sh + +echo -e "$griffin/scripts/griffin_coverage.py \ +--sample_name $name \ +--bam $input/${bam}.bam \ +--GC_bias $counts/GC_bias/${name}.GC_bias.txt \ +--mappability_bias $counts/mappability_bias/${name}.mappability_bias.txt \ +--mappability_correction True \ +--tmp_dir $outdir/tmp \ +--reference_genome $ref \ +--mappability_bw $griffin/Ref/k50.Umap.MultiTrackMappability.hg38.bw \ +--chrom_sizes_path $griffin/Ref/hg38.standard.chrom.sizes \ +--sites_yaml $sites \ +--griffin_scripts $griffin/scripts \ +--chrom_column Chrom \ +--position_column position \ +--strand_column Strand \ +--chroms chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 \ +--norm_window -5000 5000 \ +--size_range 100 200 \ +--map_quality 20 \ +--number_of_sites none \ +--sort_by none \ +--ascending none \ +--CPU $CPU\n" >> $shdir/${name}.sh + +echo -e "$griffin/scripts/griffin_merge_sites.py \ +--sample_name $name \ +--uncorrected_bw_path $outdir/tmp/$name/tmp_bigWig/${name}.uncorrected.bw \ +--GC_corrected_bw_path 
$outdir/tmp/$name/tmp_bigWig/${name}.GC_corrected.bw \ +--GC_map_corrected_bw_path $outdir/tmp/$name/tmp_bigWig/${name}.GC_map_corrected.bw \ +--mappability_correction False \ +--tmp_dir $outdir/tmp \ +--results_dir $outdir/results \ +--mappability_bw $griffin/Ref/k50.Umap.MultiTrackMappability.hg38.bw \ +--chrom_sizes_path $griffin/Ref/hg38.standard.chrom.sizes \ +--sites_yaml $sites \ +--griffin_scripts $griffin/scripts \ +--chrom_column Chrom \ +--position_column position \ +--strand_column Strand \ +--chroms chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 \ +--norm_window -5000 5000 \ +--save_window -1000 1000 \ +--fft_window -960 960 \ +--fft_index 10 \ +--smoothing_length 165 \ +--exclude_paths $encode_exclude $centromere_path $gap_path $patch_path $alternative_haplotype_path \ +--step 15 \ +--CNA_normalization False \ +--individual False \ +--smoothing True \ +--exclude_outliers True \ +--exclude_zero_mappability True \ +--number_of_sites none \ +--sort_by none \ +--ascending none \ +--CPU $CPU\n" >> $shdir/${name}.sh + +done + +cd $shdir +ls *.sh > files +for file in $(cat files);do +sbatch -c $CPU --mem $mem -t 24:00:00 $file +done diff --git a/griffin/scripts/griffin_GC_bias.py b/griffin/scripts/griffin_GC_bias.py new file mode 100755 index 0000000..8dce11a --- /dev/null +++ b/griffin/scripts/griffin_GC_bias.py @@ -0,0 +1,459 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +import pysam +import os +#import pybedtools #not used +import pandas as pd +import numpy as np +import time +import argparse +import sys +from matplotlib import pyplot as plt + + +# In[ ]: + + +# %matplotlib inline + +# bam_file_name = 'MBC_1041_1_ULP' +# mapable_name = 'repeat_masker.mapable.k50.Umap.hg38' +# genome_GC_frequency = '/fh/fast/ha_g/user/adoebley/projects/griffin_paper/genome_GC_frequency/results' +# out_dir = 'tmp' +# size_range = [15,500] + + +# In[ ]: + + +parser = argparse.ArgumentParser() + +parser.add_argument('--bam_file_name', help='sample name (does not need to match actual file name)', required=True) + +parser.add_argument('--mapable_name', help='name of mapable regions file (with .bed removed)', required=True) + +parser.add_argument('--genome_GC_frequency',help='folder containing GC counts in the reference sequence (made by generate_reference_files.snakemake)',required=True) + +parser.add_argument('--out_dir',help='folder for GC bias results',required=True) + +parser.add_argument('--size_range',help='range of read sizes to be included',nargs=2, type=int, required=True) + +args = parser.parse_args() + +bam_file_name = args.bam_file_name +mapable_name=args.mapable_name +genome_GC_frequency = args.genome_GC_frequency +out_dir = args.out_dir +size_range = args.size_range + + +# In[ ]: + + +print('arguments provided:') + +print('\tbam_file_name = "'+bam_file_name+'"') +print('\tmapable_name = "'+mapable_name+'"') + +print('\tgenome_GC_frequency = "'+genome_GC_frequency+'"') +out_dir = out_dir.rstrip('/') +print('\tout_dir = "'+out_dir+'"') + +print('\tsize_range = '+str(size_range)) + + +# In[ ]: + + +#For now I'm going to keep the smoothing bin size as a set variable +GC_smoothing_step = 20 + + +# In[ ]: + + +#input is the out_file from the previous step +in_file = out_dir +'/'+mapable_name+'/GC_counts/'+ bam_file_name+'.GC_counts.txt' +print('in_file:',in_file) + +#output is smoothed version +smoothed_out_file = out_dir +'/'+mapable_name+'/GC_bias/'+ bam_file_name+'.GC_bias.txt' + +#plot files 
+plot_file1 = out_dir +'/'+mapable_name+'/GC_plots/'+ bam_file_name+'.GC_bias.pdf' +plot_file2 = out_dir +'/'+mapable_name+'/GC_plots/'+ bam_file_name+'.GC_bias.summary.pdf' +plot_file3 = out_dir +'/'+mapable_name+'/GC_plots/'+ bam_file_name+'.GC_bias.key_lengths.pdf' + +print('out_file:',smoothed_out_file) +sys.stdout.flush() + + +# In[ ]: + + +#create output folders if needed +if not os.path.exists(out_dir +'/'+mapable_name+'/GC_plots/'): + os.mkdir(out_dir +'/'+mapable_name+'/GC_plots/') +if not os.path.exists(out_dir +'/'+mapable_name+'/GC_bias/'): + os.mkdir(out_dir +'/'+mapable_name+'/GC_bias/') + + +# In[ ]: + + +#import the GC info from the genome +frequency_prefix = genome_GC_frequency+'/'+mapable_name+'.' + +GC_freq = pd.DataFrame() +for i in range(size_range[0],size_range[1]+1): + current_path = frequency_prefix+str(i)+'bp.GC_frequency.txt' + current_data = pd.read_csv(current_path,sep='\t') + GC_freq = GC_freq.append(current_data, ignore_index=True) + +GC_freq['GC_content']=GC_freq['num_GC']/GC_freq['length'] +GC_freq = GC_freq.sort_values(by=['GC_content','length']).reset_index(drop=True) + + +# In[ ]: + + +#import GC counts from the sample +GC_df = pd.read_csv(in_file, sep='\t') + +GC_df['GC_content']=GC_df['num_GC']/GC_df['length'] +GC_df = GC_df.sort_values(by=['GC_content','length']).reset_index(drop=True) + + +# In[ ]: + + +#calculate the GC_bias +new_df = pd.DataFrame() +for length in range(size_range[0],size_range[1]+1): + current = GC_df[GC_df['length']==length].copy().reset_index(drop=True) + current_freq = GC_freq[GC_freq['length']==length].copy().reset_index(drop=True) + + #save the frequency of each GC content in the genome + current['number_of_positions']=current_freq['number_of_fragments'] + + #calculate the GC bias + current_bias = current['number_of_fragments']/current['number_of_positions'] + current['GC_bias'] = current_bias + + #normalize to a mean of 1 for each fragment length(compute GC bias does this same thing) + current['GC_bias'] = current['GC_bias']/np.nanmean(current['GC_bias']) + new_df = new_df.append(current, ignore_index=True) + + #print(length,len(current['GC_bias']),np.nanmean(current['GC_bias'])) + +new_df = new_df.sort_values(by=['GC_content','length']).reset_index(drop=True) + + +# In[ ]: + + +def median_smoothing(current,fraction): + bin_size=int(len(current)*fraction) + if bin_size<50: + bin_size=50 + medians = [] + + for i in range(len(current)): + start = int(i-bin_size/2) + end = int(i+bin_size/2) + #if the bin starts before the beginning, just take the first bin + if start<0: + start=0 + end=bin_size + #if the bin extends beyond the end, take the last bin + if end>=len(current): + start=len(current)-bin_size + end=len(current) + current_median = np.nanmedian(current['GC_bias'].iloc[start:end]) + medians.append(current_median) + return(medians) + + +# In[ ]: + + + + + +# In[ ]: + + +#smooth GC bias by size bin + +start_time = time.time() + +new_df2 = pd.DataFrame() +for length in new_df['length'].unique(): + if length%20==0: + print(length, time.time()-start_time) + sys.stdout.flush() + + #get a bin of similar sized fragments + min_len = int(length - (GC_smoothing_step/2)) + max_len = int(length + (GC_smoothing_step/2)) + + current = new_df[(new_df['length']>=min_len) & (new_df['length']<=max_len)].copy() + + #perform smoothing + fit = median_smoothing(current,.05) + current['smoothed_GC_bias']=fit + + #only keep smoothed values for the selected length + current = current[current['length']==length] + + #get rid of values for GC contents 
that are never observed + current['smoothed_GC_bias'] = np.where(current['number_of_positions']==0,np.nan,current['smoothed_GC_bias']) + + #normalize to a mean of 1 + current['smoothed_GC_bias'] = current['smoothed_GC_bias']/np.nanmean(current['smoothed_GC_bias']) + + new_df2 = new_df2.append(current,ignore_index=True) + + #print(length,len(current),np.nanmean(current['smoothed_GC_bias'])) + +new_df = new_df2 + + +# In[ ]: + + +new_df[new_df['length']==200]#['GC_bias'].sum()/len(new_df[new_df['length']==200]) + + +# In[ ]: + + +#export results +new_df2.to_csv(smoothed_out_file,sep='\t',index=False) + + +# In[ ]: + + +#generate one plot per size bin + +#set up a figure for plotting +plot_indexes = np.arange(size_range[0]+GC_smoothing_step,size_range[1]+GC_smoothing_step,GC_smoothing_step) +lengths_to_plot = plot_indexes +x_dim = 6 +y_dim = int(np.ceil(len(plot_indexes)/6)) +empty_plots = int(x_dim*y_dim - len(plot_indexes)) +plot_indexes = np.append(plot_indexes,[np.nan for m in range(empty_plots)]) +plot_indexes = np.reshape(plot_indexes,(y_dim,x_dim)) +fig, axes = plt.subplots(y_dim,x_dim, figsize = (5*x_dim,3.5*y_dim), sharex = True, sharey = True) +axes = axes.reshape(y_dim,x_dim) #make sure the axes array is two dimension (just in case it has less than 7 value) + +#do the plotting +min_len = 0 +for max_len in lengths_to_plot: + if max_len%20==0: + print(max_len) + + #pick the axis + current_index = np.where(plot_indexes==max_len) + current_index = (current_index[0][0],current_index[1][0]) + current_ax = axes[current_index] + + #pick the data + current1 = new_df2[(new_df2['length']>min_len) & (new_df2['length']<=max_len)].copy() + + #plot the smoothed data over top + for length2 in current1['length'].unique(): + current2 = current1[current1['length']==length2] + current_ax.plot(current2['GC_content'],current2['smoothed_GC_bias'], label=str(length2)+'bp') + + current_ax.set_title(str(min_len) + 'bp to '+str(max_len)+'bp') + current_ax.legend(ncol = 2) + + min_len = max_len + +for i in range(x_dim): + axes[y_dim-1,i].set_xlabel('GC content') + +for i in range(y_dim): + axes[i,0].set_ylabel('coverage bias') + +ylim = axes[0,0].get_ylim() + +old_title = axes[0,0].get_title() +axes[0,0].set_title(bam_file_name+'\n'+mapable_name + '\n' + old_title) + +fig.tight_layout() + +plt.savefig(plot_file1) + +plt.close('all') + + +# In[ ]: + + +#key lengths +selected_lengths = np.arange(100,201,GC_smoothing_step) + +fig,ax = plt.subplots(1) + +# for_color = len(selected_lengths)-1 +# color = (1-(i/for_color),.5*(1-(i/for_color)), i/for_color) + +for i,length in enumerate(selected_lengths): + current = new_df2[new_df2['length']==length] + ax.plot(current['GC_content'],current['smoothed_GC_bias'], label = str(length)+'bp') + +ax.legend(ncol = 2, bbox_to_anchor = [1,1], loc = 'upper left') + +ax.set_xlabel('GC content') +ax.set_ylabel('coverage bias') +ax.set_title(bam_file_name+'\n'+mapable_name) + +fig.tight_layout() +fig.savefig(plot_file3) +plt.close('all') + + +# In[ ]: + + +#summary figure +selected_lengths = np.arange(size_range[0],size_range[1],GC_smoothing_step) + +fig,ax = plt.subplots(1) + +for length in selected_lengths: + current = new_df2[new_df2['length']==length] + ax.plot(current['GC_content'],current['smoothed_GC_bias'], label = str(length)+'bp') +ax.legend(ncol = 2, bbox_to_anchor = [1,1], loc = 'upper left') + +ax.set_xlabel('GC content') +ax.set_ylabel('coverage bias') +ax.set_title(bam_file_name+'\n'+mapable_name) + +fig.tight_layout() +fig.savefig(plot_file2) +plt.close('all') + + 
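+# In[ ]:
+
+
+#optional sanity check (a sketch, not required by the workflow; check_df and per_length_mean
+#are illustrative names): reload the smoothed GC bias table written above and confirm that
+#smoothed_GC_bias averages ~1 within each fragment length, as enforced by the per-length
+#normalization step earlier in this script.
+# check_df = pd.read_csv(smoothed_out_file, sep='\t')
+# per_length_mean = check_df.groupby('length')['smoothed_GC_bias'].mean()
+# print(per_length_mean.describe()) #means should cluster tightly around 1.0
+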
+# In[ ]: + + + + + +# In[ ]: + + +# plot_file4 = out_dir +'/'+mapable_name+'/GC_plots/'+ bam_file_name+'.GC_bias.test.pdf' + +# selected_lengths = np.arange(size_range[0],size_range[1],GC_smoothing_step) + +# fig,ax = plt.subplots(1) + +# for length in selected_lengths: +# current = new_df2[new_df2['length']==length] + +# ax.plot(current['GC_content'],current['GC_bias'],alpha=.2,marker='.') + +# #reset the color cycle +# # for Matplotlib version >= 1.5 +# plt.gca().set_prop_cycle(None) + + +# for length in selected_lengths: +# current = new_df2[new_df2['length']==length] + +# ax.plot(current['GC_content'],current['smoothed_GC_bias'], label = length) + + +# ax.legend(ncol = 2, bbox_to_anchor = [1,1]) + +# ax.set_xlabel('GC content') +# ax.set_ylabel('coverage bias') +# ax.set_title(bam_file_name+'\n'+mapable_name) +# ax.set_ylim(-.1,new_df2['smoothed_GC_bias'].max()+.1) + +# fig.tight_layout() +# fig.savefig(plot_file4) + + +# In[ ]: + + +# #generate one plot per size bin +# #raw_data +# plot_file4 = out_dir +'/'+mapable_name+'/GC_plots/'+ bam_file_name+'.GC_bias.test.pdf' + +# #set up a figure for plotting +# plot_indexes = np.arange(size_range[0]+GC_smoothing_step,size_range[1]+GC_smoothing_step,GC_smoothing_step) +# lengths_to_plot = plot_indexes +# x_dim = 6 +# y_dim = int(np.ceil(len(plot_indexes)/6)) +# empty_plots = int(x_dim*y_dim - len(plot_indexes)) +# plot_indexes = np.append(plot_indexes,[np.nan for m in range(empty_plots)]) +# plot_indexes = np.reshape(plot_indexes,(y_dim,x_dim)) +# fig, axes = plt.subplots(y_dim,x_dim, figsize = (5*x_dim,3.5*y_dim), sharex = True, sharey = True) +# axes = axes.reshape(y_dim,x_dim) #make sure the axes array is two dimension (just in case it has less than 7 value) + +# #do the plotting +# min_len = 0 +# for max_len in lengths_to_plot: +# if max_len%20==0: +# print(max_len) + +# #pick the axis +# current_index = np.where(plot_indexes==max_len) +# current_index = (current_index[0][0],current_index[1][0]) +# current_ax = axes[current_index] + +# #pick the data +# current1 = new_df2[(new_df2['length']>min_len) & (new_df2['length']<=max_len)].copy() + +# #plot the raw data +# for length2 in current1['length'].unique(): +# current2 = current1[current1['length']==length2] +# current_ax.plot(current2['GC_content'],current2['GC_bias'],alpha=.2,marker='.') + +# #reset the color cycle +# # for Matplotlib version >= 1.5 +# plt.gca().set_prop_cycle(None) + +# #plot the smoothed data over top +# for length2 in current1['length'].unique(): +# current2 = current1[current1['length']==length2] +# current_ax.plot(current2['GC_content'],current2['smoothed_GC_bias'], label=length2) + +# current_ax.set_title(str(min_len) + 'bp to '+str(max_len)+'bp') +# current_ax.legend(ncol = 2) + +# min_len = max_len + +# for i in range(x_dim): +# axes[y_dim-1,i].set_xlabel('GC content') + +# for i in range(y_dim): +# axes[i,0].set_ylabel('coverage bias') + +# axes[0,0].set_ylim(ylim) + +# old_title = axes[0,0].get_title() +# axes[0,0].set_title(bam_file_name+'\n'+mapable_name + '\n' + old_title) + +# fig.tight_layout() + +# plt.savefig(plot_file4) +# plt.close('all') + + +# In[ ]: + + + + diff --git a/griffin/scripts/griffin_GC_counts.py b/griffin/scripts/griffin_GC_counts.py new file mode 100755 index 0000000..5399748 --- /dev/null +++ b/griffin/scripts/griffin_GC_counts.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +import pysam +import os + +import pandas as pd +import numpy as np +import time +import argparse +import sys + +from multiprocessing 
import Pool + + +# In[ ]: + + +# ##arguments for testing + +# bam_file_path = '/fh/scratch/delete90/ha_g/realigned_bams/cfDNA_MBC_ULP_hg38/realign_bam_paired_snakemake-master/results/MBC_1041_1_ULP/MBC_1041_1_ULP_recalibrated.bam' +# bam_file_name = 'MBC_1041_1_ULP' +# mapable_path = '../../downloads/genome/repeat_masker.mapable.k50.Umap.hg38.bedGraph' + +# ref_seq_path = '/fh/fast/ha_g/grp/reference/GRCh38/GRCh38.fa' +# chrom_sizes_path = '/fh/fast/ha_g/grp/reference/GRCh38/hg38.standard.chrom.sizes' + +# out_dir = './tmp/' + +# map_q = 20 +# size_range = [15,500] + +# CPU = 4 + + +# In[ ]: + + +parser = argparse.ArgumentParser() + +parser.add_argument('--bam_file', help='sample_bam_file', required=True) +parser.add_argument('--bam_file_name', help='sample name (does not need to match actual file name)', required=True) +parser.add_argument('--mapable_regions', help='highly mapable regions to be used in GC correction, bedGraph or bed foramt', required=True) + +parser.add_argument('--ref_seq',help='reference sequence (fasta format)',required=True) +parser.add_argument('--chrom_sizes',help='path to chromosome sizes for the reference seq',required=True) + +parser.add_argument('--out_dir',help='folder for GC bias results',required=True) + +parser.add_argument('--map_q',help='minimum mapping quality for reads to be considered',type=int,required=True) +parser.add_argument('--size_range',help='range of read sizes to be included',nargs=2, type=int, required=True) + +parser.add_argument('--CPU',help='number of CPU for parallelizing', type=int, required=True) + +args = parser.parse_args() + +bam_file_path = args.bam_file +bam_file_name = args.bam_file_name +mapable_path=args.mapable_regions + +ref_seq_path = args.ref_seq +chrom_sizes_path = args.chrom_sizes +out_dir = args.out_dir + +map_q = args.map_q +size_range = args.size_range +CPU = args.CPU + + +# In[ ]: + + +print('arguments provided:') + +print('\tbam_file_path = "'+bam_file_path+'"') +print('\tbam_file_name = "'+bam_file_name+'"') +print('\tmapable_regions = "'+mapable_path+'"') + +print('\tref_seq_path = "'+ref_seq_path+'"') +print('\tchrom_sizes_path = "'+chrom_sizes_path+'"') +print('\tout_dir = "'+out_dir+'"') + +print('\tmap_q = '+str(map_q)) +print('\tsize_range = '+str(size_range)) +print('\tCPU = '+str(CPU)) + + +# In[ ]: + + +mapable_name = mapable_path.rsplit('/',1)[1].rsplit('.',1)[0] +out_file = out_dir +'/'+mapable_name+'/GC_counts/'+ bam_file_name+'.GC_counts.txt' + +print('out_file',out_file) + + +# In[ ]: + + +#create a directory for the GC data +if not os.path.exists(out_dir +'/'+mapable_name): + os.mkdir(out_dir +'/'+mapable_name) +if not os.path.exists(out_dir +'/'+mapable_name+'/GC_counts/'): + os.mkdir(out_dir +'/'+mapable_name+'/GC_counts/') + + +# In[ ]: + + +#import filter +mapable_intervals = pd.read_csv(mapable_path, sep='\t', header=None) + +#remove non standard chromosomes and X and Y +chroms = ['chr'+str(m) for m in range(1,23)] +mapable_intervals = mapable_intervals[mapable_intervals[0].isin(chroms)] + +print('chroms:', chroms) +print('number_of_intervals:',len(mapable_intervals)) + +sys.stdout.flush() + + +# In[ ]: + + +def collect_reads(sublist): + #create a dict for holding the frequency of each read length and GC content + GC_dict = {} + for length in range(size_range[0],size_range[1]+1): + GC_dict[length]={} + for num_GC in range(0,length+1): + GC_dict[length][num_GC]=0 + + #import the bam file + #this needs to be done within the loop otherwise it gives a truncated file warning + bam_file = 
pysam.AlignmentFile(bam_file_path, "rb") + print('sublist intervals:',len(sublist)) + + #this might also need to be in the loop + #import the ref_seq + ref_seq=pysam.FastaFile(ref_seq_path) + + for i in range(len(sublist)): + chrom = sublist.iloc[i][0] + start = sublist.iloc[i][1] + end = sublist.iloc[i][2] + if i%5000==0: + print('interval',i,':',chrom,start,end,'seconds:',np.round(time.time()-start_time)) + sys.stdout.flush() + #fetch any read that overlaps the inteterval (don't need to extend the interval because the fetch function does this automatically) + fetched = bam_file.fetch(chrom,start,end) + for read in fetched: + #use both fw (positive template length) and rv (negative template length) reads + if (read.is_reverse==False and read.template_length>=size_range[0] and read.template_length<=size_range[1]) or (read.is_reverse==True and -read.template_length>=size_range[0] and -read.template_length<=size_range[1]): + #qc filters, some longer fragments are considered 'improper pairs' but I would like to keep these + if read.is_paired==True and read.mapping_quality>=map_q and read.is_duplicate==False and read.is_qcfail==False: + if read.is_reverse==False: + read_start = read.reference_start + read_end = read.reference_start+read.template_length + elif read.is_reverse==True: + read_end = read.reference_start + read.reference_length + read_start = read_end + read.template_length + + fragment_seq = ref_seq.fetch(read.reference_name,read_start,read_end) + #tally up the GC content + fragment_seq=fragment_seq.replace('g','G').replace('c','C').replace('a','A').replace('t','T').replace('n','N') + + # ################# + # ##logic check#### + # ################# + # if read.is_reverse==False: + # if fragment_seq[0:read.reference_length]==read.query_sequence and len(fragment_seq)==read.template_length: + # print('fw match',read.reference_length) + # else: + # print(fragment_seq[0:read.reference_length],read.reference_length,'fw') + # print(read.query_sequence,len(read.query_sequence),'fw') + # print(len(fragment_seq),read.template_length) + # print('\n') + # elif read.is_reverse==True: + # if fragment_seq[-read.reference_length:]==read.query_sequence and len(fragment_seq)==-read.template_length: + # print('rv match',read.reference_length) + # else: + # print(fragment_seq[-read.reference_length:],read.reference_length,'rv') + # print(read.query_sequence,len(read.query_sequence),'rv') + # print(len(fragment_seq),read.template_length) + # print('\n') + # ################# + + #split and convert to numpy array + fragment_seq = np.array(list(fragment_seq)) + #replace with values + fragment_seq[(fragment_seq=='G') | (fragment_seq=='C')]=1 + fragment_seq[(fragment_seq=='A') | (fragment_seq=='T')]=0 + fragment_seq[(fragment_seq=='N')]=np.random.randint(2) #choose a random 0 or 1 for N (so that you always get an integer) #should be very rare if the filter is done right + fragment_seq = fragment_seq.astype(int) + + num_GC = int(fragment_seq.sum()) + GC_dict[abs(read.template_length)][num_GC]+=1 + + print('done') + return(GC_dict) + + +# In[ ]: + + +start_time = time.time() +p = Pool(processes=CPU) #use the available CPU +sublists = np.array_split(mapable_intervals,CPU) #split the list into sublists, one per CPU + +GC_dict_list = p.map(collect_reads, sublists, 1) + + +# In[ ]: + + +all_GC_df = pd.DataFrame() +for i,GC_dict in enumerate(GC_dict_list): + GC_df = pd.DataFrame() + for length in GC_dict.keys(): + current = pd.Series(GC_dict[length]).reset_index() + current = 
current.rename(columns={'index':'num_GC',0:'number_of_fragments'}) + current['length']=length + current = current[['length','num_GC','number_of_fragments']] + GC_df = GC_df.append(current, ignore_index=True) + GC_df = GC_df.set_index(['length','num_GC']) + all_GC_df[i] = GC_df['number_of_fragments'] + del(GC_df,GC_dict) + +all_GC_df = all_GC_df.sum(axis=1) +all_GC_df = pd.DataFrame(all_GC_df).rename(columns = {0:'number_of_fragments'}) +all_GC_df = all_GC_df.reset_index() +all_GC_df.to_csv(out_file,sep='\t',index=False) + + +# In[ ]: + + +print('done') + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# In[ ]: + + + + diff --git a/griffin/scripts/griffin_calc_GC_frequency.py b/griffin/scripts/griffin_calc_GC_frequency.py new file mode 100755 index 0000000..fcc1ca1 --- /dev/null +++ b/griffin/scripts/griffin_calc_GC_frequency.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +import pysam +import os +import pandas as pd +import numpy as np +import time +import argparse +import sys + + +# In[2]: + + +#This script calculates the frequency of each GC content for fragments that overlap the non-blacklisted areas +#This is performed for each fragment size in the range specified +#this only needs to be performed once for each filter + + +# In[3]: + + +# #arguments for testing +# mapable_path = '/fh/fast/ha_g/user/adoebley/projects/griffin_paper/downloads/genome/repeat_masker.mapable.k50.Umap.hg38.bedGraph' + +# ref_seq_path = '/fh/fast/ha_g/grp/reference/GRCh38/GRCh38.fa' +# chrom_sizes_path = '/fh/fast/ha_g/grp/reference/GRCh38/hg38.standard.chrom.sizes' +# out_dir = './tmp' + +# # step = 1 +# length = 50 #fragment length + + +# In[4]: + + +parser = argparse.ArgumentParser() + +parser.add_argument('--mapable_regions', help='highly mapable regions to be used in GC correction, bed or bedGraph format', required=True) +parser.add_argument('--ref_seq',help='reference sequence (fasta format)',required=True) +parser.add_argument('--chrom_sizes',help='path to chromosome sizes for the reference seq',required=True) +parser.add_argument('--out_dir',help='folder for results',required=True) +parser.add_argument('--fragment_length',help='length of fragment (in bp) for which GC will be calculated',type=int, required=True) + +args = parser.parse_args() + +mapable_path=args.mapable_regions +ref_seq_path = args.ref_seq +chrom_sizes_path = args.chrom_sizes +out_dir = args.out_dir +length = args.fragment_length + + +# In[5]: + + +print('arguments provided:') + +print('\tmapable_path = "'+mapable_path+'"') +print('\tref_seq_path = "'+ref_seq_path+'"') +print('\tchrom_sizes_path = "'+chrom_sizes_path+'"') +print('\tout_dir = "'+out_dir+'"') +print('\tlength = '+str(length)) + + +# In[6]: + + +mapable_name = mapable_path.rsplit('/',1)[1].rsplit('.',1)[0] +out_file = out_dir+'/'+mapable_name+'.'+str(length)+'bp.GC_frequency.txt' +print('output path:',out_file) + +if not os.path.exists(out_dir): + os.mkdir(out_dir) + + +# In[7]: + + +sys.stdout.flush() + + +# In[8]: + + +#import filter +mapable_intervals = pd.read_csv(mapable_path, sep='\t', header=None) + +#keep autosomes only +chroms = ['chr'+str(m) for m in range(1,23)] +mapable_intervals = mapable_intervals[mapable_intervals[0].isin(chroms)] + +print('chroms:', chroms) +print('number_of_intervals:',len(mapable_intervals)) +sys.stdout.flush() + + +# In[9]: + + +#get chrom sizes info +chrom_sizes = pd.read_csv(chrom_sizes_path, sep='\t', header=None) + +#also keep as a dict +chrom_size_dict = chrom_sizes.set_index(0).to_dict()[1] + + +# In[10]: + 
+ +#import the ref_seq +ref_seq=pysam.FastaFile(ref_seq_path) + + +# In[11]: + + +#create the GC frequencies dict +GC_dict = {} + +GC_dict={} +for num_GC in range(0,length+1): + GC_dict[num_GC]=0 + + +# In[12]: + + +start_time = time.time() + +k = length #just keeping this compatable with the previous version + +for i in range(len(mapable_intervals)): + chrom = mapable_intervals.iloc[i][0] + start = mapable_intervals.iloc[i][1] + end = mapable_intervals.iloc[i][2] + if i%5000==0: + print('interval',i,':',chrom,start,end,'seconds:',np.round(time.time()-start_time)) + sys.stdout.flush() + #adjust the start and end so it includes all fragments that overlap the interval + adjusted_start = start-k + adjusted_end = end+k + + if adjusted_start<0: + adjusted_start = 0 + if adjusted_end>chrom_size_dict[chrom]: + adjusted_end = chrom_sizes_dict[chrom] + print(chrom,chrom_sizes_dict[chrom],'adjusting_end') + + fetched = ref_seq.fetch(chrom,adjusted_start,adjusted_end) + fetched = fetched.replace('g','G').replace('c','C').replace('a','A').replace('t','T').replace('n','N') + fetched = np.array(list(fetched.replace('G','1').replace('C','1').replace('A','0').replace('T','0').replace('N','2')),dtype=float) + + #swap the 2 for a random 1 or 0 #there has to be a better way to do this but I can't figure it out + #the 0 or 1 is required because the sliding window sum algorithm only does integers + #unknown nucleotides should be quite rare if the filter is done correctly + fetched[fetched==2]=np.random.randint(2) #random integer in range(2) (i.e. 0 or 1) + + n=len(fetched) + + window_sum = int(sum(fetched[:k])) + #print(k,len(fetched[:k]),window_sum) + + GC_dict[window_sum]+=1 + for i in range(n-k): + window_sum = int(window_sum - fetched[i] + fetched[i+k]) + #print(k,window_sum) + GC_dict[window_sum]+=1 + + +# In[13]: + + +#convert to df and export +GC_df = pd.DataFrame() +#save GC dict +current = pd.Series(GC_dict).reset_index() +current = current.rename(columns={'index':'num_GC',0:'number_of_fragments'}) +current['length']=length +current = current[['length','num_GC','number_of_fragments']] +GC_df = GC_df.append(current, ignore_index=True) +GC_df.to_csv(out_file,sep='\t',index=False) + + +# In[14]: + + +print('done') + + +# In[ ]: + + + + diff --git a/griffin/scripts/griffin_calc_coverage.py b/griffin/scripts/griffin_calc_coverage.py new file mode 100755 index 0000000..3fec814 --- /dev/null +++ b/griffin/scripts/griffin_calc_coverage.py @@ -0,0 +1,758 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +import os +import sys +import argparse +import pandas as pd +import pysam +import numpy as np +import time +from scipy.signal import savgol_filter +import yaml +from multiprocessing import Pool + +# import warnings +# warnings.filterwarnings('error') + + +# In[ ]: + + +# from matplotlib import pyplot as plt +# %matplotlib inline + +# #sample specific params for testing +# #ER_pos_merged: +# sample_name = 'HD45.ctDNA.WGS.FC19269447' +# bam_path = '/fh/scratch/delete90/ha_g/realigned_bams/cfDNA_deepWGS_hg38/deepWGS_fastq_to_bam_paired_snakemake/results/HD45.ctDNA.WGS.FC19269447/HD45.ctDNA.WGS.FC19269447_recalibrated.bam' +# GC_bias_path = '/fh/fast/ha_g/user/adoebley/projects/griffin_paper/GC_correction/MBC_GC_correction/results/repeat_masker.mapable.k50.Umap.hg38/GC_bias/HD45.ctDNA.WGS.FC19269447.GC_bias.txt' +# background_normalization = 'none' +# ref_seq_path = '/fh/fast/ha_g/grp/reference/GRCh38/GRCh38.fa' + +# # #additional params for testing +# sites_yaml = 
'/fh/fast/ha_g/user/adoebley/projects/griffin_paper/tests/test_site_lists/test_sites_locations.yaml' +# results_dir = 'tmp' + +# chrom_col = 'Chrom' +# chroms = ['chr'+str(m) for m in np.arange(1,23)] +# norm_window = [-5000, 5000] #for testing +# plot_window = [-500, 1000]#for testing +# fragment_length = 165 + +# step = 15 +# sz_range = [100, 200] +# map_q = 20 +# strand_col = 'Strand' + +# individual = 'False' +# smoothing = 'True' + +# number_of_sites = 1000 +# sort_by = 'Chrom' +# #sort_by = 'peak.count' +# ascending = 'False' + +# CPU = 4 +# erase_intermediates = 'True' + +# debugging = True + + +# In[ ]: + + +parser = argparse.ArgumentParser() + +parser.add_argument('--sample_name', help='name of sample', required=True) +parser.add_argument('--bam', help='bam file', required=True) +parser.add_argument('--GC_bias', help='GC bias info', required=True) +parser.add_argument('--background_normalization', help='None or local', required=True) +parser.add_argument('--reference_genome',help = 'path to the reference genome',required=True) + +parser.add_argument('--sites_yaml', help='.bed file of sites', required=True) +parser.add_argument('--results_dir', help='directory for coverage_data', required=True) + +parser.add_argument('--chrom_column',help='name of column containing chromosome number', default='Chrom') +parser.add_argument('--chroms', help='chroms to include when selecting sites', nargs='*', default=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22']) +parser.add_argument('--norm_window',help='start and end of the window to be used for normalization',nargs=2, type=int, default=(-5000,5000)) +parser.add_argument('--plot_window',help='start and end of window to be plotted',nargs=2, type=int, default=(-1000,1000)) +parser.add_argument('--fragment_length',help='length of fragment (in bp) for which GC will be calculated, default 165',type=int, default=165) + +parser.add_argument('--step',help='step size when calculating coverage', type=int, default=5) +parser.add_argument('--size_range',help='acceptable size range for fragments (to filter out genomic contamination)',nargs=2, type=int, default=(0,500)) +parser.add_argument('--map_quality',help='minimum mapping quality', type=int, default=60) +parser.add_argument('--strand_column',help='name of column containing the strand (+ or -)', default='Strand') + +parser.add_argument('--individual',help='save individual site coverage. TRUE WILL RESULT IN HUGE OUTPUT FILES. 
(True/False)',default='False', required = True) +parser.add_argument('--smoothing',help='whether to use a savgol filter to smooth sites (True/False)', required = True) + +parser.add_argument('--num_sites',help='number of sites to analyze', default='NA') +parser.add_argument('--sort_by',help='how to select the sites to analyze', default='none') +parser.add_argument('--ascending',help='whether to sort in ascending or descending order when selecting sites', default='NA') + +parser.add_argument('--cpu',help='cpu available for parallelizing', type = int, required = True) +parser.add_argument('--erase_intermediates',help='whether to erase intermediate files to save space', type = str, default = 'True') + + +args = parser.parse_args() + + +sample_name=args.sample_name +bam_path=args.bam +GC_bias_path=args.GC_bias +background_normalization = args.background_normalization +ref_seq_path = args.reference_genome + +sites_yaml=args.sites_yaml +results_dir=args.results_dir + +chrom_col=args.chrom_column +chroms = args.chroms +norm_window =args.norm_window +plot_window=args.plot_window +fragment_length=args.fragment_length + +step=args.step +sz_range=args.size_range +map_q=args.map_quality +strand_col=args.strand_column + +individual=args.individual +smoothing = args.smoothing + +number_of_sites=args.num_sites +sort_by=args.sort_by +ascending=args.ascending + +CPU = args.cpu +erase_intermediates = args.erase_intermediates + +debugging = False + + +# In[ ]: + + +if ascending.lower()=='false': + ascending=False +elif ascending.lower()=='true': + ascending=True +else: + ascending='none' + +print('\narguments provided:') + +print('\tsample_name = "'+sample_name+'"') +print('\tbam_path = "'+bam_path+'"') +print('\tGC_bias_path = "'+GC_bias_path+'"') +print('\tbackground_normalization = "'+background_normalization+'"') +print('\tref_seq_path = "'+ref_seq_path+'"') +#print('\ttumor_fraction =',tumor_fraction) + +print('\tsites_yaml = "'+sites_yaml+'"') +print('\tresults_dir = "'+results_dir+'"') + +print('\tchrom_col = "'+chrom_col+'"') +print('\tchroms = ',chroms) +print('\tnorm_window = ', norm_window) +norm_window=[int(np.ceil(norm_window[0]/step)*step),int(np.floor(norm_window[1]/step)*step)] #round to the nearest step inside the window +print('\t#norm_window rounded to step',norm_window) +print('\tplot_window = '+str(plot_window)) +plot_window=[int(np.ceil(plot_window[0]/step)*step),int(np.floor(plot_window[1]/step)*step)] #round to the nearest step inside the window +print('\t#plot_window rounded to step:',plot_window) +print('\tfragment_length =',fragment_length) +fragment_length=int(np.ceil(fragment_length/step)*step) #round fragment length to the nearest step +print('\t#fragment_length_rounded_up_to_step:',fragment_length) + +print('\tstep =',step) +print('\tsz_range =',sz_range) +print('\tmap_q =',map_q) +print('\tstrand_col = "'+strand_col+'"') + +print('\tindividual = "'+individual+'"') +if smoothing.lower()=='true' or smoothing.lower()=='false': + print('\tsmoothing = "'+smoothing+'"') +else: + print('smoothing must be True or False! 
:',smoothing) + sys.exit() + +print('\tnumber_of_sites = "'+str(number_of_sites)+'"') +print('\tsort_by = "'+sort_by+'"') +print('\tascending = "'+str(ascending)+'"') + +print('\tCPU =',CPU) +print('\n') +sys.stdout.flush() + + +# In[ ]: + + +#define global parameters and open global files + +#set up global variables +norm_columns = np.arange(norm_window[0],norm_window[1],step) +plot_columns = np.arange(plot_window[0],plot_window[1],step) + +# #import the site files +with open(sites_yaml,'r') as f: + site_files = yaml.safe_load(f) +site_files = site_files['site_files'] +site_files = [[key,site_files[key]] for key in site_files.keys()] +to_do_list = site_files + +######################################## +#GET GC BIAS +######################################## +#open the GC_bias file +GC_bias = pd.read_csv(GC_bias_path, sep='\t') + +#get rid of extremely low GC bias values +#these fragments will now be excluded +#these fragments are extremely rare so it is difficult to get a good estimate of GC bias +GC_bias['smoothed_GC_bias'] = np.where(GC_bias['smoothed_GC_bias']<0.05,np.nan,GC_bias['smoothed_GC_bias']) + +GC_bias = GC_bias[['length','num_GC','smoothed_GC_bias']] +GC_bias = GC_bias.set_index(['num_GC','length']).unstack() + +#convert to a dictionary +GC_bias = GC_bias.to_dict() + +#get rid of values where the num_GC is greater than the length (included due to the way I made the dict) +GC_bias2 = {} +for key in GC_bias.keys(): + length = key[1] + GC_bias2[length] = {} + for num_GC in range(0,length+1): + bias = GC_bias[key][num_GC] + GC_bias2[length][num_GC]=bias +GC_bias = GC_bias2 +del(GC_bias2) + + +# step_1_import_sites + +# In[ ]: + + +def step_1_import_sites(sites_file,site_name): #number of sites needed so it can be changed to an int + all_sites=pd.read_csv(sites_file,sep='\t') + print(site_name,'total sites',len(all_sites)) + + #throw out sites that aren't on the selected chroms + all_sites = all_sites[all_sites[chrom_col].isin(chroms)] + print(site_name,'sites on selected chromosomes',len(all_sites)) + + #select the sites to use if specified + if sort_by.lower()=='none': #if using all sites + print(site_name,'processing all '+str(len(all_sites))+' sites') + + else: #othewise sort by the specified column + print(site_name,'sorting by',sort_by,'and selecting the top',number_of_sites,',ascending:',ascending) + print(site_name,sort_by,'initial range: ',min(all_sites[sort_by]),'to',max(all_sites[sort_by])) + all_sites=all_sites.sort_values(by=sort_by,ascending=ascending).reset_index(drop=True)#sort and reset index + all_sites=all_sites.iloc[0:int(number_of_sites)] + print(site_name,sort_by,'range after sorting: ',min(all_sites[sort_by]),'to',max(all_sites[sort_by])) + + #add a site_name column + all_sites['site_name']=site_name + #add a sample column + all_sites['sample']=sample_name + #add background normalizatino + all_sites['background_normalization']=background_normalization + + #split the list of all sites into groups for quicker processing + max_chunk_size=500 + n_chunks=np.ceil(len(all_sites)/max_chunk_size) + site_groups=np.array_split(all_sites.index.values,n_chunks) + + return(all_sites, site_groups) + + +# step_2_collect_coverage + +# In[ ]: + + +def collect_fragments(sites,window_start,window_end,direction): + #open the bam file for each pool worker (otherwise individual pool workers can close it) + bam_file = pysam.AlignmentFile(bam_path) + + #open the ref seq + ref_seq=pysam.FastaFile(ref_seq_path) + + #extend the window by half the max fragment length in each direction + 
max_adjustment = int(np.ceil(sz_range[1]/2)) + adjusted_start = window_start-max_adjustment + adjusted_end = window_end+max_adjustment + + window_columns = np.arange(window_start,window_end,step) + + #make sure Chrom is a string + sites[chrom_col]=sites[chrom_col].astype(str) + + cov_pd={} #set up dictionary to hold data + GC_cov_pd = {} #set up a dictionary to hold GC corrected data + + + for endpoint in ['start','end']: + cov_pd[endpoint]=pd.DataFrame(columns = window_columns) + GC_cov_pd[endpoint] = pd.DataFrame(columns = window_columns) + + #workaround for NCBI style formatted bams (1,2,3 etc) with UCSC sites + if len(sites)>0: + test_chrom=sites.iloc[0][chrom_col] + try: + bam_file.get_reference_length(test_chrom) + NCBI=False + except: + bam_file.get_reference_length(test_chrom.split('chr')[-1]) + NCBI=True + + #run analysis on each site + for i in range(len(sites)): #for each location in the genome + if i%100==0: + print (sites.iloc[0]['site_name'],i,time.time()-start_time) + sys.stdout.flush() + + ############# + #make dicts to hold output for this individual site + ############# + #for now, catch all reads that start or end within a fragment length of the window + cov_dict = {} + GC_cov_dict = {} + for item in ['start','end']: + cov_dict[item]={m:0 for m in range(adjusted_start,adjusted_end)} + GC_cov_dict[item]={m:0 for m in range(adjusted_start,adjusted_end)} + + #################### + #fetch reads + #################### + #identify the analysis window for that site + chrom = sites.iloc[i][chrom_col] + position = sites.iloc[i]['position'] + + if NCBI: + chrom=chrom.split('chr')[-1] + + #these regions have been filtered so they should all be fetchable + fetched=bam_file.fetch(contig=chrom, start=adjusted_start+position, stop=adjusted_end+position) #fetch reads that map to the region of interest + + ######################## + #count coverage + ######################## + for read in fetched: + #filter out reads + if abs(read.template_length)>=sz_range[0] and abs(read.template_length)<=sz_range[1] and read.is_paired==True and read.mapping_quality>=map_q and read.is_duplicate==False and read.is_qcfail==False: + read_start=read.reference_start-position #read start (with respect to the position of the current region of interest) + + #find the place where the fragment starts or ends with respect to the window + if read.is_reverse==False and read.template_length>0: + fragment_start = read_start + fragment_end = read_start+read.template_length + read_type='start' + + #adjusted_loc = fragment_start+int(np.ceil(fragment_length/2)) #where to count the coverage + midpoint = int(np.floor((fragment_start+fragment_end)/2)) + + elif read.is_reverse==True and read.template_length<0: + read_len=read.reference_length #this is the read length only (for adjusting the start position) + fragment_start = read_start+read_len+read.template_length + fragment_end = read_start+read_len + read_type='end' + + #adjusted_loc = fragment_end-int(np.ceil(fragment_length/2)) #where to count the coverage + midpoint = int(np.floor((fragment_start+fragment_end)/2)) + + else: + continue + + #get the fragment seq for GC content + #seq_string = (ref_seq.fetch(chrom,fragment_start+position,fragment_end+position)).upper() #for printing + + fragment_seq = (ref_seq.fetch(chrom,fragment_start+position,fragment_end+position)).upper() + fragment_seq = list(fragment_seq.replace('T','0').replace('A','0').replace('C','1').replace('G','1').replace('N',str(np.random.randint(0,2)))) + fragment_seq = [int(m) for m in fragment_seq] + + 
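+
+                    #at this point fragment_seq is a list of per-base flags (1 for G or C, 0 for A or T,
+                    #and a random 0/1 for any N), so sum(fragment_seq) gives the fragment GC count;
+                    #this is the same encoding used in griffin_GC_counts.py when building the GC bias table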
##check work + ############ + #print(read_type,read.query_sequence) + #if read.is_reverse == False: + # print(read_type,seq_string[0:read.reference_length]) + #elif read.is_reverse == True: + # print(read_type,seq_string[-read.reference_length:]) + #print(sum(fragment_seq),len(fragment_seq),read.template_length) + #print('\n') + ########### + + #check that the site is in the window + if midpoint>=adjusted_start and midpoint0: + #take the means (ignore nans) + mean_value = np.nanmean(out_data[data_type][endpoint].astype(float).values) + #normalize to 1 for an average site + out_data[data_type][endpoint] = out_data[data_type][endpoint]/mean_value + + #normalize individual sites to 1 if background_normalization == 'local' + if background_normalization == 'local': + print(site_name,'normalizing individual sites to one') + for data_type in ['reads','GC_corr']: + for endpoint in ['start','end']: + mean_data = out_data[data_type][endpoint].values.mean(axis=1,keepdims=True) + #replace zero with nan + mean_data = np.where(mean_data==0,np.nan,mean_data) + out_data[data_type][endpoint] = out_data[data_type][endpoint]/mean_data + + + #if not saving individual sites, take the mean + if individual.lower()=='false': + print(site_name,'taking means') + for data_type in ['reads','GC_corr']: + for endpoint in ['start','end']: + #not sure why 'astype' is needed but it seems to be required + #number of sites will be the same for all data types and endpoints + number_of_sites_used = len(out_data[data_type][endpoint][~(np.isnan(out_data[data_type][endpoint].values.mean(axis=1).astype(float)))]) + + out_data[data_type][endpoint] = out_data[data_type][endpoint].mean(axis=0) + #convert back to dataframe for further processing + out_data[data_type][endpoint] = pd.DataFrame(out_data[data_type][endpoint]).T + print(site_name,'mean of',number_of_sites_used, 'sites') + + ################# + #smooth data + ################# + if smoothing.lower()=='true': + #savgol window should be approx one fragment length but it must be odd + savgol_window=np.floor(fragment_length/step) + if savgol_window%2==0: + savgol_window=savgol_window+1 + savgol_window=int(savgol_window) + + print(site_name,'smoothing') + sys.stdout.flush() + + for data_type in ['reads','GC_corr']: + for endpoint in ['start','end']: + out_data[data_type][endpoint][norm_columns]=savgol_filter(out_data[data_type][endpoint][norm_columns], savgol_window, 3) + ################# + + #keep only the plotting columns + for data_type in ['reads','GC_corr']: + out_data[data_type]['start'] = out_data[data_type]['start'][plot_columns] + out_data[data_type]['end'] = out_data[data_type]['end'][plot_columns] + + + #if taking the mean of all sites, get metadata for a mean site + if individual.lower()=='false': + print(site_name,'taking metadata means') + metadata = pd.DataFrame(all_sites.iloc[0][['site_name','sample','background_normalization']]).T + metadata['total_read_starts'] = np.mean(all_sites['total_read_starts']) + metadata['total_read_ends'] = np.mean(all_sites['total_read_ends']) + metadata['GC_corrected_total_read_starts'] = np.mean(all_sites['GC_corrected_total_read_starts']) + metadata['GC_corrected_total_read_ends'] = np.mean(all_sites['GC_corrected_total_read_ends']) + metadata['number_of_sites'] = number_of_sites_used + all_sites = metadata + + #merge all the different dataframes + final_data = pd.DataFrame() + for data_type in ['reads','GC_corr']: + for endpoint in ['start','end']: + current_data = all_sites.merge(out_data[data_type][endpoint], left_index=True 
,right_index=True) + + if data_type == 'GC_corr': + current_data['GC_correction']='GC_corrected' + elif data_type == 'reads': + current_data['GC_correction']='none' + current_data['endpoint']=endpoint + + final_data = final_data.append(current_data,ignore_index=True, sort=True) + + + #rearrange the columns back into a logical order + metadata_columns = list(all_sites.columns) + ['GC_correction','endpoint'] + + final_data = final_data[metadata_columns+list(plot_columns)] + + #start and end traces should be nearly the same (because they are being used to calculate midpoints) + #keep only start for export + final_data = final_data[final_data['endpoint']=='start'] + final_data['endpoint']='midpoint' + + #export + print(site_name,'exporting') + sys.stdout.flush() + final_data.to_csv(current_out_file,sep='\t',float_format='%.5f', index=False) + + print('done',site_name) + sys.stdout.flush() + + if debugging == True: + print('debugging!') + return(final_data) + elif debugging == False: + pass + + +# In[ ]: + + +#run the analysis + +#start the timer +start_time=time.time() + +p = Pool(processes=CPU) #use the specified number of processes +p.map(run_full_analysis, to_do_list, 1) #run the analysis on all TFs in TFs_list. Send only one item to each processor at a time. + + +# In[ ]: + + +print('merging all sites') +start_time = time.time() + +# merge results together and export +merged_out_file = results_dir+'/coverage/all_sites/'+sample_name+'.all_sites.coverage.txt' +if not os.path.exists(results_dir+'/coverage/all_sites'): #make any necessary directories for output + os.mkdir(results_dir+'/coverage/all_sites') + +merged_output=pd.DataFrame() +for j,line in enumerate(to_do_list): + if j%50 == 0: + print(line[0],time.time()-start_time) + sys.stdout.flush() + site_name = line[0] + indiv_out_file = results_dir+'/coverage/'+site_name+'/'+sample_name+'.'+site_name+'.coverage.txt' + current_data = pd.read_csv(indiv_out_file,sep='\t') + merged_output = merged_output.append(current_data,ignore_index=True,sort=True) + +merged_output.columns +merged_output.to_csv(merged_out_file,sep='\t',index=False) + +if erase_intermediates.lower()=='true': + for j,line in enumerate(to_do_list): + site_name = line[0] + indiv_out_file = results_dir+'/coverage/'+site_name+'/'+sample_name+'.'+site_name+'.coverage.txt' + os.remove(indiv_out_file) + +print('done with merge') + + +# In[ ]: + + +# #cell for testing +# #start the timer +# start_time=time.time() + +# #for testing use a single CPU +# final_data=run_full_analysis(to_do_list[0]) + + +# In[ ]: + + +# ##plot for testing +# current = final_data[(final_data['endpoint']=='midpoint') & (final_data['GC_correction']=='GC_corrected')][plot_columns].mean() +# plt.plot(plot_columns,current) + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# In[ ]: + + + + diff --git a/griffin/scripts/griffin_filter_sites.py b/griffin/scripts/griffin_filter_sites.py new file mode 100755 index 0000000..7d1a2ca --- /dev/null +++ b/griffin/scripts/griffin_filter_sites.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +#import stuff +import pandas as pd +import numpy as np +import math +import time +import pyBigWig +import sys +import argparse + + +# In[ ]: + + +# #TSS panel test +# # #define paths and parameters +# in_file='/fh/fast/ha_g/user/adoebley/data/SCLC_targeted_panel_sites/all_sites/TSS_targets.bed' +# in_file_name = 'TSS_targets' +# out_dir='./' +# 
mapability_file='../../downloads/genome/k50.Umap.MultiTrackMappability.hg38.bw' + +# # #define the columns in the TFBS files +# chrom_col='Chrom' +# start_col='TSS' +# end_col='TSS' +# strand_col='Strand' +# chroms = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22','chrX','chrY'] + +# window_values=(-100,255) #norm window +# targeted_window_columns=('window_start','window_end') +# targeted_panel = 'True' + +# threshold = 0.95 + + +# In[ ]: + + + + + +# In[ ]: + + +parser = argparse.ArgumentParser() + +parser.add_argument('-i','--in_file', help='file of TFBS to annotate', required=True) +parser.add_argument('--name', help='in file name', required=True) +parser.add_argument('-o','--out_dir', help='directory for output files', required=True) +parser.add_argument('-m','--mapability_file',help='.bw file with mapability values e.g. UCSC hg38 k50.Umap.MultiTrackMappability.bw', required=True) + +parser.add_argument('-c','--chrom_column',help='name of column containing chromosome number', default='Chrom') +parser.add_argument('-s','--start_column',help='name of column containing the start of the TFBS (this will be averaged with the end to identify the TFBS center)', default='Start') +parser.add_argument('-e','--end_column',help='name of column containing the end of the TFBS (this will be averaged with the start to identify the TFBS center)', default='End') +parser.add_argument('--strand_column',help='name of column containing the strand (+ or -)', default='Strand') +parser.add_argument('--chroms', help='chromosomes to include when selecting sites', nargs='*') +parser.add_argument('--window_values',help='start and end of window to be analyzed around each TFBS',nargs=2, type=int, required=True) +parser.add_argument('--targeted_panel',help="whether the sites are from a targeted panel",default='False') +parser.add_argument('--targeted_window_columns',help='column names that specify the start and end of the window for a targeted region',nargs=2,default=('NA','NA')) + +parser.add_argument('--threshold',help='define cutoff for high mapability', default=0.95, type=float) + +args = parser.parse_args() + +in_file=args.in_file +in_file_name = args.name +out_dir=args.out_dir.rstrip('/')+'/' +mapability_file=args.mapability_file + +chrom_col=args.chrom_column +start_col=args.start_column +end_col=args.end_column +strand_col = args.strand_column +chroms = args.chroms + +window_values=args.window_values +targeted_panel=args.targeted_panel +targeted_window_columns=args.targeted_window_columns + +threshold=args.threshold + + +# In[ ]: + + +#set up parameters +print('\narguments provided:') +print('\tin_file = "'+in_file+'"') +print('\tin_file_name = "'+in_file_name+'"') +print('\tout_dir = "'+out_dir+'"') +print('\tmapability_file = "'+mapability_file+'"') + +print('\tchrom_col = "'+chrom_col+'"') +print('\tstart_col = "'+start_col+'"') +print('\tend_col = "'+end_col+'"') +print('\tstrand_col = "'+strand_col+'"') +print('\tchroms = ',chroms) + +print('\twindow_values=',window_values) +print('\ttargeted_panel = "'+targeted_panel+'"') + +window_start_column=targeted_window_columns[0] +window_end_column=targeted_window_columns[1] +print('\ttargeted_window_columns = ',targeted_window_columns) + +print('\tthreshold = '+str(threshold)) + +print('\n') + + +# In[ ]: + + +#import info about the sites +sites=pd.read_csv(in_file, sep='\t') +print('number_of_sites:',len(sites)) 
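+#sites is expected to be a tab-separated table containing, at minimum, the chromosome, start,
+#and end columns named by --chrom_column/--start_column/--end_column (defaults Chrom/Start/End);
+#a strand column (default Strand) is optional and, when present, is used below to flip the
+#normalization window around '-' strand sites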
+sys.stdout.flush() + +#identify the TF position +sites['position'] = (sites[start_col]+sites[end_col])/2 +sites['position'] = np.floor(sites['position']) +sites['position'] = sites['position'].astype(int) + +#identify the window around each site to be used for mapability +sites['norm_window_start']=sites['position']+window_values[0] +sites['norm_window_end']=sites['position']+window_values[1] + +#identify the window around reverse sites if direction is specified +if strand_col in sites.columns: + print('flipping_reverse_sites') + rv_sites = sites[sites[strand_col]=='-'].copy() + rv_sites['norm_window_start']=rv_sites['position']-window_values[1] + rv_sites['norm_window_end']=rv_sites['position']-window_values[0] + sites[sites[strand_col]=='-'] = rv_sites.copy() #replace the rv sites with the flipped sites + del(rv_sites) + + +#drop any sites that don't span the full window +if targeted_panel.lower()=='true': + + sites['relative_window_start']=sites[window_start_column]-sites['position'] + sites['relative_window_end']=sites[window_end_column]-sites['position'] + + if strand_col in sites.columns: #flip the orientation of the window for reverse sites + print('flipping_reverse_sites') + rv_sites = sites[sites[strand_col]=='-'].copy() + rv_sites[['relative_window_start','relative_window_end']]=rv_sites[['relative_window_end','relative_window_start']]*-1 + sites[sites[strand_col]=='-'] = rv_sites #replace the rv sites with the flipped sites + + sites=sites[(sites['relative_window_start']<=window_values[0]) & (sites['relative_window_end']>=window_values[1])] + #drop the new columns as they are no longer needed + sites = sites.drop(columns=['relative_window_start','relative_window_end']) + + print('sites that span the window:',len(sites)) + + + +# In[ ]: + + +if strand_col in sites.columns: + print('fw_sites:',len(sites[sites[strand_col]=='+'])) + print('rv_sites:',len(sites[sites[strand_col]=='-'])) + + +# In[ ]: + + +sites = sites[sites[chrom_col].isin(chroms)] +print('sites_after_removing_non_specified_chroms:',len(sites)) + + +# In[ ]: + + +mapability = pyBigWig.open(mapability_file) +chroms = sites[chrom_col].unique() +for chrom in chroms: + try: + mapability.values(chrom,100000,100010) + except: + print('###\n###\nChromosome names dont match chromosomes in mapability file, check chromosome formatting\n###\n###\n') + sys.exit(1) + + + +# In[ ]: + + +#run analysis +start_time=time.time() + +print(in_file_name) +sys.stdout.flush() + +#import the mapability data +mapability = pyBigWig.open(mapability_file) + +#make list to hold mean values +mean_values=[] + +#get 1% intervals for tracking progress +one_percent=int(len(sites)/100) +if one_percent==0: #if there are less than 100 sites + one_percent=1 + +window_len = window_values[1]-window_values[0] + +for i in range(len(sites)): + if i%one_percent==0: + print(i,time.time()-start_time) + sys.stdout.flush() + #get the location of the site (chromosome and center of site) + chrom = sites.iloc[i][chrom_col] + + position=sites.iloc[i]['position'] + start=sites.iloc[i]['norm_window_start'] + end=sites.iloc[i]['norm_window_end'] + +# print(sites.iloc[i][strand_col],start-position) + #fetch the values for that site + try: + fetched=mapability.values(chrom, start, end) + #convert nan values in the data to zeros and convert fetched to np array + fetched=np.nan_to_num(fetched) + + except: #if the site can't be fetched + fetched=[0 for m in range(window_len)] + #convert the data to an np array + fetched=np.array(fetched) + + #reshape the data for adding to 
the array for all sites + try: + fetched=fetched.reshape(1,window_len) + + except: #if the full site wasn't fetched + fetched=np.array([0 for m in range(window_len)]) + fetched=fetched.reshape(1,window_len) + + #add the fetched data to the array of data for all sites + mean_values.append(fetched.mean()) + + del (fetched,position,chrom) + +#drop the start and end columns because these will be recalculated based on step in future analyses +sites = sites.drop(columns=['norm_window_start','norm_window_end']) +print(time.time()-start_time) +sys.stdout.flush() + + +# In[ ]: + + +#calculate the mean value per site +sites['mean_mapability']=mean_values + +overall_TF_mapability={'total_sites':len(sites)} + +#split_list_of_sites for export and count list lengths + +high_sites=sites[(sites['mean_mapability']>=threshold)] +low_sites=sites[(sites['mean_mapability']