From 9517f583f6db22a54bf6c86fd2857173674e3cde Mon Sep 17 00:00:00 2001 From: Thomas Deckers Date: Mon, 20 Sep 2021 12:11:04 -0700 Subject: [PATCH] Updated to ERVmap 1.1. Implemented ERVmap auto, introduced argument checking to erv_genome.pl, modified erv_genome.pl and run_clean_htseq.pl so that ERVmap no longer needs to be added to the path variable, improved documentation, cleaned up folder structure, fixed bugs. --- ERVmap_auto.sh | 113 ++++++++++ README.md | 143 ++++++++++--- ERVmap.bed => ref/ERVmap.bed | 0 ref/GRCh38.genome_file.txt | 194 ++++++++++++++++++ ref/illumina_adapter.txt | 17 ++ clean_htseq.pl => scripts/clean_htseq.pl | 0 erv_genome.pl => scripts/erv_genome.pl | 82 ++++++-- interleaved.pl => scripts/interleaved.pl | 0 merge.pl => scripts/merge_count.pl | 0 .../normalize_deseq.r | 0 .../normalize_with_file.pl | 0 parse_bam.pl => scripts/parse_bam.pl | 0 .../run_clean_htseq.pl | 3 +- 13 files changed, 496 insertions(+), 56 deletions(-) create mode 100644 ERVmap_auto.sh rename ERVmap.bed => ref/ERVmap.bed (100%) create mode 100644 ref/GRCh38.genome_file.txt create mode 100644 ref/illumina_adapter.txt rename clean_htseq.pl => scripts/clean_htseq.pl (100%) rename erv_genome.pl => scripts/erv_genome.pl (60%) rename interleaved.pl => scripts/interleaved.pl (100%) rename merge.pl => scripts/merge_count.pl (100%) rename normalize_deseq.r => scripts/normalize_deseq.r (100%) rename normalize_with_file.pl => scripts/normalize_with_file.pl (100%) rename parse_bam.pl => scripts/parse_bam.pl (100%) rename run_clean_htseq.pl => scripts/run_clean_htseq.pl (86%) diff --git a/ERVmap_auto.sh b/ERVmap_auto.sh new file mode 100644 index 0000000..8ea7832 --- /dev/null +++ b/ERVmap_auto.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +shopt -s extglob +shopt -s nullglob + +ref=~/ERVmap/ref #folder containing required reference files (see below) +scripts=~/ERVmap/scripts #folder containing included perl and R scripts + +#Main mapping function. Change paths as necessary to match your file structure +map_data () { + $scripts/erv_genome.pl -start_stage 1 -end_stage 6 \ + --fastq $1 \ + --genome $ref/bwa_genome/genome \ + --genome_Bowtie2 $ref/Bowtie2_genome/genome \ + --bed $ref/ERVmap.bed \ + --genomefile $ref/GRCh38.genome_file.txt \ + --gtf $ref/genes.gtf \ + --transcriptome $ref/Bowtie2_genome/known \ + --adaptor $ref/illumina_adapter.txt \ + --filter $scripts/parse_bam.pl \ + --cell ${workdir}_working + + output_folder=./output/${workdir##*/} + + mkdir -p $output_folder/erv $output_folder/cellular + + + mv ${workdir}_working/herv_coverage_GRCh38_genome.txt $output_folder/erv/${1##*/}.e + mv ${workdir}_working/GRCh38/htseq.cnt $output_folder/cellular/${1##*/}.c +} + +#####Argument valdation##### + +if [[ "$#" -ne 1 ]]; then + echo "usage: bash ERVmap_auto.sh input_dir" >&2 + exit 1 +fi + +workdir=$(readlink -e $1) + +#Check if the provided argument is a valid directory +if [ -d "${workdir}" ] ; then + echo "starting ERVmap on $workdir"; +else + if [ -f "${workdir}" ]; then + echo "${workdir} is a file. Please provide the path to a directory"\ + "containing the files you'd like to process.">&2 + + exit 1 + else + echo "${workdir} not found. Please provide a path to a directory"\ + "containing the files you'd like to process.">&2 + exit 1 + fi +fi + +cd $workdir + +#Check if appropraitely named files exist +if ! [[ $(ls *@(_SS|_R1|_R2).fastq?(.gz))2>/dev/null ]]; then + echo "No appropriately named files found. Please name all single-end reads as"\ + "_SS.fastq.gz and all pair end reads as _R1.fastq.gz or"\ + " _R2.fastq.gz for the first and second reads, respectively.">&2 + exit 1 +fi + + +if [[ $(ls *@(_R1|_R2).fastq?(.gz))2>/dev/null ]]; then +#Check if pair end files are matched + failed=0 + #check if each R1 has an R2 + for i in *_R1.fastq?(.gz); do + filename=${i%_*} + if ! [ -e ${filename}_R2.fastq?(.gz) ]; then + echo "${filename}_R2 not found">&2 + failed=1 + fi + done + #Check if each R2 has an R1 + for i in *_R2.fastq?(.gz); do + filename=${i%_*} + if ! [ -e ${filename}_R1.fastq?(.gz) ]; then + echo "${filename}_R1 not found">&2 + failed=1 + fi + done + + if [ $failed -eq 1 ]; then + echo "Ensure that all pair end reads have both reads present."\ + "Name all single end reads as _SS.fastq.gz">&2 + exit 1; fi +fi + +cd - + + + +#####running ERVmap##### +for i in $workdir/*_SS.fastq?(.gz); do + map_data $i +done + +for i in $workdir/*_R1.fastq?(.gz); do + $scripts/interleaved.pl --read1 ${i} --read2 ${i/R1.fastq?(.gz)/R2.fastq?(.gz)} > "${i/_R1.fastq?(.gz)/.fastq}" + map_data ${i/_R1.fastq?(.gz)/.fastq} +done + +$scripts/run_clean_htseq.pl $output_folder/cellular c c2 __ $scripts +$scripts/merge_count.pl 3 6 e $output_folder/erv > $output_folder/erv/merged_erv.txt +$scripts/merge_count.pl 0 1 c2 $output_folder/cellular > $output_folder/cellular/merged_cellular.txt +$scripts/normalize_deseq.r $output_folder/cellular/merged_cellular.txt $output_folder/cellular/normalized_cellular.txt $output_folder/cellular/normalized_factors.txt +$scripts/normalize_with_file.pl $output_folder/cellular/normalized_factors.txt $output_folder/erv/merged_erv.txt > $output_folder/full_normalized_erv_counts.txt + diff --git a/README.md b/README.md index 6da79b9..eadb35a 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,37 @@ # ERVmap -ERVmap is one part curated database of human proviral ERV loci and one part a stringent algorithm to determine which ERVs are transcribed in their RNA seq data. +ERVmap is one part curated database of human proviral ERV loci and one part a stringent algorithm to determine which ERVs are transcribed in RNA seq data. -# **Citation** +# **Citation** Tokuyama M. et. al., ERVmap analysis reveals genome-wide transcription of human endogenous retroviruses. Proc Natl Acad Sci U S A. 2018 Dec 11;115(50):12565-12572. doi: 10.1073/pnas.1814589115. +# **What's new?** +ERVmap 1.1 includes: +* The introduction of ERVmap auto, to allow easy processing of multiple files +* Descriptive error catching for incorrect arguments +* Thorough documentation for easier setup and use +* Improved folder structure +* Bug fixes + +Also keep an eye out for ERVmap 2, featuring updated dependencies and cleaner architecture. Coming soon. # **Installing** ### Install dependencies -``` -bedtools2 -cufflinks -bwa-0.7.17 -cufflinks-2.2.1.Linux_x86_64 -python -samtools-1.8 -tophat-2.1.1.Linux_x86_64 -tophat2 -trim (http://graphics.med.yale.edu/trim/) +``` +bedtools 2.29.2 +btrim 0.3.0 (http://graphics.med.yale.edu/trim/) +bwa 0.7.17 +cufflinks 2.2.1 +perl 5.26 +python 2.7.16 +R 3.6.2 +samtools 1.9 +tophat 2.1.1 +``` +### Install required libraries +``` +htseq 0.11.3 for Python +File::Type 0.22 for Perl +DESeq 1.38.0 for R (via BiocManager) ``` ### Install .pl and r files @@ -29,46 +44,106 @@ merge_count.pl normalize_with_file.pl normalize_deseq.r ``` - +### Gather and generate reference files +|File|Possible source| Suggested file location| +|----------------------------------------|------------------------------------------------------|-----------------------------------------------------------| +|HG38 primary assembly fasta|[ensembl](https://ensembl.org/Homo_sapiens/Info/Index)| ref/Bowtie2_genome/genome.fa AND ref/bwa_genome/genome.fa| +|HG38 comprehensive gene annotation (gtf)|[ensembl](https://ensembl.org/Homo_sapiens/Info/Index)| ref/genes.gtf| +|ERV bed |Included|ref/ERVmap.bed| |Bowtie and BWA index files|**Manually generated** (see below)|ref/Bowtie2_genome, ref/bwa_genome| +|Illumina Adapter|Included*|ref/illumina_adapter.txt| +|Genome file|Included|ref/GRCh38.genome_file.txt| +|Bowtie transcriptome files|Autogenerated (see below)|ref/Bowtie2_genome/known| + +*Oligonucleotide sequences © 2021 Illumina, Inc. All rights reserved. + +Index files must be generated manually before the first run if not already generated: +* **BWA index files**: `bwa index -p ref/bwa_genome/genome ref/bwa_genome/genome.fa` +* **Bowtie index files**: `bowtie2-build ref/Bowtie2_genome/genome.fa ref/Bowtie2_genome/genome` + +Transcriptome files are normally auto-generated on the first run of ERVmap. However, if ERVmap is unable to generate the transcriptome files (e.g. on a server with limited write permissions for computational nodes), the transcriptome can be manually generated with the following code after generating the Bowtie index files: +`tophat2 -G ref/genes.gtf --transcriptome-index=genome_Bowtie2/known genome_Bowtie2/genome` # **Map data to human genome (hg38)** +Running the below blocks of code in the terminal will yield raw counts for cellular genes and ERVmap loci as separate files. -This step will yield raw counts for cellular genes and ERVmap loci as separate files. - -### For single-end sequences: +### Setup for pair-end +Skip the following step for single end data ``` -erv_genome.pl -stage 1 -stage2 6 -fastq /${i}_SS.fastq.gz +interleaved.pl --read1 my_file_R1.fastq.gz --read2 my_file_R2.fastq.gz > my_file.fastq ``` +The above will convert `my_file_R1.fastq.gz` and `my_file_R2.fastq.gz` into one file, `my_file.fastq` +### Define file locations +Modify the below lines of code for your particular data and folder structure +``` +dataset_name=my_data #used for creating unique directory names + +input_file=/path/to/my_file.fastq.gz #absolute path to file -### For pair-end sequences: +ref=~/ERVmap/ref #folder containing required reference files (see below) +scripts=~/ERVmap/scripts #folder containing included perl and R scripts ``` -interleaved.pl --read1 ${i}_R1.fastq.gz --read2 ${i}_R2.fastq.gz > ${i}.fastq.gz -erv_genome.pl -stage 1 -stage2 6 -fastq /${i}.fastq.gz +### Map data ``` - -### Store output files +$scripts/erv_genome.pl -start_stage 1 -end_stage 6 \ +--fastq $input_file \ +--genome $ref/bwa_genome/genome \ +--genome_Bowtie2 $ref/Bowtie2_genome/genome \ +--bed $ref/ERVmap.bed \ +--genomefile $ref/GRCh38.genome_file.txt \ +--gtf $ref/genes.gtf \ +--transcriptome $ref/Bowtie2_genome/known \ +--adaptor $ref/illumina_adapter.txt \ +--filter $scripts/parse_bam.pl \ +--cell ${dataset_name}_working ``` -mkdir -p output -mv ./sample/herv_coverage_GRCh38_genome.txt ./output/erv/${i}.e -mv ./sample/GRCh38/htseq.cnt ./output/cellular/${i}.c +Note that if btrim, tophat2, bwa, samtools, or bedtools are not in your path variable, they will have to be specified as arguments as well. (if unsure, check with `type `) +### Store output files ``` +output_folder=./output/$dataset_name -# **Clean up data, merge, and normalize** +mkdir -p $output_folder/erv $output_folder/cellular + +input_name=${input_file##*/} +input_name=${input_name%%.*} -These steps will yield normalized ERV read counts based on size factors obtained through DESeq2 analysis. -Use the output files from above. +mv ./${dataset_name}_working/herv_coverage_GRCh38_genome.txt $output_folder/erv/${input_name}.e +mv ./${dataset_name}_working/GRCh38/htseq.cnt $output_folder/cellular/${input_name}.c +``` +**Repeat the above steps for all the input samples.** +# **Clean up data, merge, and normalize** +These steps will yield normalized ERV read counts based on size factors obtained through DESeq analysis. +Use the output files from above. ``` -run_clean_htseq.pl ./output/cellular c c2 __ -merge_count.pl 3 6 e ./output/erv > ./output/erv/merged_erv.txt -merge_count.pl 0 1 c2 ./output/cellular > ./output/cellular/merged_cellular.txt -normalize_deseq.r ./output/cellular/merged_cellular.txt ./output/cellular/normalized_cellular ./output/cellular/normalized_factors -normalize_with_file.pl ./output/cellular/normalized_factors ./output/erv/merged_erv.txt > ./output/$folder_name.txt +$scripts/run_clean_htseq.pl $output_folder/cellular c c2 __ $scripts +$scripts/merge_count.pl 3 6 e $output_folder/erv > $output_folder/erv/merged_erv.txt +$scripts/merge_count.pl 0 1 c2 $output_folder/cellular > $output_folder/cellular/merged_cellular.txt +$scripts/normalize_deseq.r $output_folder/cellular/merged_cellular.txt $output_folder/cellular/normalized_cellular.txt $output_folder/cellular/normalized_factors.txt +$scripts/normalize_with_file.pl $output_folder/cellular/normalized_factors.txt $output_folder/erv/merged_erv.txt > $output_folder/full_normalized_erv_counts.txt ``` +# **ERVmap auto** +Also included in this github is ERVmap auto. After installing dependencies and gathering reference files, ERVmap auto allows you to run the above code on all appropriately named fastq files in a specified directory. +If installation is completed with files and directories named as specified above, and with the ERVmap base directory in your home folder (`~`) this will work as-is. **Otherwise, you will need to edit ERVmap_auto.sh to work with your folder structure.** + +To use, place all fastq files in an input folder, with the following names: +* Single end data as \_SS.fastq.gz +* First reads of pair-end data as \_R1.fastq.gz +* Second reads of pair-end data as \_R2.fastq.gz + +Then run `bash ERVmap_auto.sh ` +# **Interpreting results** +This will output the following files of interest in the specified output folder: +* **full_normalized_erv_counts.txt**: a tab-separated table of ERV counts across all samples, normalized to size factors based on cellular gene count using DEseq +* **merged_erv.txt**: a tab-separated table of un-normalized ERV counts across all samples +* **normalized_cellular.txt**: a space-separated table of cellular gene counts across all samples, normalized using DEseq +* **merged_cellular.txt**: a space-separated table of un-normalized cellular gene counts across all samples +* **normalized_factors.txt**: a tab-separated table containing the factors used to normalize each of the input samples + # **Authors** * Maria Tokuyama +* Thomas Deckers +* Eric Liu * Yong Kong - diff --git a/ERVmap.bed b/ref/ERVmap.bed similarity index 100% rename from ERVmap.bed rename to ref/ERVmap.bed diff --git a/ref/GRCh38.genome_file.txt b/ref/GRCh38.genome_file.txt new file mode 100644 index 0000000..4e1bf2c --- /dev/null +++ b/ref/GRCh38.genome_file.txt @@ -0,0 +1,194 @@ +1 248956422 +10 133797422 +11 135086622 +12 133275309 +13 114364328 +14 107043718 +15 101991189 +16 90338345 +17 83257441 +18 80373285 +19 58617616 +2 242193529 +20 64444167 +21 46709983 +22 50818468 +3 198295559 +4 190214555 +5 181538259 +6 170805979 +7 159345973 +8 145138636 +9 138394717 +MT 16569 +X 156040895 +Y 57227415 +KI270728.1 1872759 +KI270727.1 448248 +KI270442.1 392061 +KI270729.1 280839 +GL000225.1 211173 +KI270743.1 210658 +GL000008.2 209709 +GL000009.2 201709 +KI270747.1 198735 +KI270722.1 194050 +GL000194.1 191469 +KI270742.1 186739 +GL000205.2 185591 +GL000195.1 182896 +KI270736.1 181920 +KI270733.1 179772 +GL000224.1 179693 +GL000219.1 179198 +KI270719.1 176845 +GL000216.2 176608 +KI270712.1 176043 +KI270706.1 175055 +KI270725.1 172810 +KI270744.1 168472 +KI270734.1 165050 +GL000213.1 164239 +GL000220.1 161802 +KI270715.1 161471 +GL000218.1 161147 +KI270749.1 158759 +KI270741.1 157432 +GL000221.1 155397 +KI270716.1 153799 +KI270731.1 150754 +KI270751.1 150742 +KI270750.1 148850 +KI270519.1 138126 +GL000214.1 137718 +KI270708.1 127682 +KI270730.1 112551 +KI270438.1 112505 +KI270737.1 103838 +KI270721.1 100316 +KI270738.1 99375 +KI270748.1 93321 +KI270435.1 92983 +GL000208.1 92689 +KI270538.1 91309 +KI270756.1 79590 +KI270739.1 73985 +KI270757.1 71251 +KI270709.1 66860 +KI270746.1 66486 +KI270753.1 62944 +KI270589.1 44474 +KI270726.1 43739 +KI270735.1 42811 +KI270711.1 42210 +KI270745.1 41891 +KI270714.1 41717 +KI270732.1 41543 +KI270713.1 40745 +KI270754.1 40191 +KI270710.1 40176 +KI270717.1 40062 +KI270724.1 39555 +KI270720.1 39050 +KI270723.1 38115 +KI270718.1 38054 +KI270317.1 37690 +KI270740.1 37240 +KI270755.1 36723 +KI270707.1 32032 +KI270579.1 31033 +KI270752.1 27745 +KI270512.1 22689 +KI270322.1 21476 +GL000226.1 15008 +KI270311.1 12399 +KI270366.1 8320 +KI270511.1 8127 +KI270448.1 7992 +KI270521.1 7642 +KI270581.1 7046 +KI270582.1 6504 +KI270515.1 6361 +KI270588.1 6158 +KI270591.1 5796 +KI270522.1 5674 +KI270507.1 5353 +KI270590.1 4685 +KI270584.1 4513 +KI270320.1 4416 +KI270382.1 4215 +KI270468.1 4055 +KI270467.1 3920 +KI270362.1 3530 +KI270517.1 3253 +KI270593.1 3041 +KI270528.1 2983 +KI270587.1 2969 +KI270364.1 2855 +KI270371.1 2805 +KI270333.1 2699 +KI270374.1 2656 +KI270411.1 2646 +KI270414.1 2489 +KI270510.1 2415 +KI270390.1 2387 +KI270375.1 2378 +KI270420.1 2321 +KI270509.1 2318 +KI270315.1 2276 +KI270302.1 2274 +KI270518.1 2186 +KI270530.1 2168 +KI270304.1 2165 +KI270418.1 2145 +KI270424.1 2140 +KI270417.1 2043 +KI270508.1 1951 +KI270303.1 1942 +KI270381.1 1930 +KI270529.1 1899 +KI270425.1 1884 +KI270396.1 1880 +KI270363.1 1803 +KI270386.1 1788 +KI270465.1 1774 +KI270383.1 1750 +KI270384.1 1658 +KI270330.1 1652 +KI270372.1 1650 +KI270548.1 1599 +KI270580.1 1553 +KI270387.1 1537 +KI270391.1 1484 +KI270305.1 1472 +KI270373.1 1451 +KI270422.1 1445 +KI270316.1 1444 +KI270340.1 1428 +KI270338.1 1428 +KI270583.1 1400 +KI270334.1 1368 +KI270429.1 1361 +KI270393.1 1308 +KI270516.1 1300 +KI270389.1 1298 +KI270466.1 1233 +KI270388.1 1216 +KI270544.1 1202 +KI270310.1 1201 +KI270412.1 1179 +KI270395.1 1143 +KI270376.1 1136 +KI270337.1 1121 +KI270335.1 1048 +KI270378.1 1048 +KI270379.1 1045 +KI270329.1 1040 +KI270419.1 1029 +KI270336.1 1026 +KI270312.1 998 +KI270539.1 993 +KI270385.1 990 +KI270423.1 981 +KI270392.1 971 +KI270394.1 970 diff --git a/ref/illumina_adapter.txt b/ref/illumina_adapter.txt new file mode 100644 index 0000000..08dec34 --- /dev/null +++ b/ref/illumina_adapter.txt @@ -0,0 +1,17 @@ +ZZZZZZZZZZ AGATCGGAAGAGC 0 2 0 0 +ZZZZZZZZZZ AGATCGGAAGAG 0 1 0 0 +ZZZZZZZZZZ AGATCGGAAGA 0 1 0 0 +ZZZZZZZZZZ AGATCGGAAG 0 1 0 -12 +ZZZZZZZZZZ AGATCGGAA 0 1 0 -11 +ZZZZZZZZZZ AGATCGGA 0 0 0 -10 +ZZZZZZZZZZ AGATCGG 0 0 0 -10 +ZZZZZZZZZZ AGATCG 0 0 0 -10 +ZZZZZZZZZZ AGATC 0 0 0 -10 +ZZZZZZZZZZ CTGTCTCTTATA 0 1 0 0 +ZZZZZZZZZZ CTGTCTCTTAT 0 1 0 0 +ZZZZZZZZZZ CTGTCTCTTA 0 1 0 -12 +ZZZZZZZZZZ CTGTCTCTT 0 1 0 -11 +ZZZZZZZZZZ CTGTCTCT 0 1 0 -10 +ZZZZZZZZZZ CTGTCTC 0 0 0 -10 +ZZZZZZZZZZ CTGTCT 0 0 0 -10 +ZZZZZZZZZZ CTGTC 0 0 0 -10 diff --git a/clean_htseq.pl b/scripts/clean_htseq.pl similarity index 100% rename from clean_htseq.pl rename to scripts/clean_htseq.pl diff --git a/erv_genome.pl b/scripts/erv_genome.pl similarity index 60% rename from erv_genome.pl rename to scripts/erv_genome.pl index e6819f9..a29b934 100644 --- a/erv_genome.pl +++ b/scripts/erv_genome.pl @@ -16,9 +16,11 @@ my $help = 0; my $man = 0; -my ($btrim, $aligner, $bwa, $samtools, $filter, $bedtools,); my ($genome, $genome_Bowtie2, $bed, $genomefile, $gtf, $transcriptome, $adaptor, ); +my ($btrim, $aligner, $bwa, $samtools, $bedtools, $filter, ); +my %defaults = ("btrim" => "btrim", "aligner" => "tophat2", "bwa" => "bwa", + "samtools" => "samtools", "bedtools" => "bedtools", ); my $cell = "sample"; # sample name my $workdir = "."; # working dir "$cell" @@ -48,8 +50,8 @@ "workdir=s" => \$workdir, "outdir=s" => \$outdir, - "stage=i" => \$stage, - "stage2=i" => \$stage2, + "start_stage=i" => \$stage, + "end_stage=i" => \$stage2, "length=i" => \$length, "score_offset=i" => \$score_offset, @@ -80,11 +82,45 @@ pod2usage(1) if $help; pod2usage(2) if $man; -pod2usage(1) unless ($btrim && $aligner && $bwa && $samtools && $filter && $bedtools); -pod2usage(1) unless (-x $btrim && -x $aligner && -x $bwa && -x $samtools && -x $filter && -x $bedtools); -pod2usage(1) unless (-e $genome && -e $genome_Bowtie2 && -e $bed && -e $genomefile && -e $gtf && -e $transcriptome && -e $adaptor); +###checking specified arguments and error checking +my %dependencies = ("btrim" => \$btrim, "aligner" => \$aligner, "bwa" => \$bwa, + "samtools" => \$samtools, "bedtools" => \$bedtools); +my %references = ("genome" => $genome . (defined $genome ? ".fa" : ''), "genome_Bowtie2" => $genome_Bowtie2 . (defined $genome_Bowtie2 ? ".fa" : ''), "bed" => $bed, #note concatentaion of .fa to genomes + "genomefile" => $genomefile, "gtf" => $gtf, "adaptor" => $adaptor, "fastq" => $fastq, "filter" => $filter); +my $failed = 0; + +foreach my $dependency (keys %dependencies) { + if (!(${ $dependencies{$dependency} })){ + ${ $dependencies{$dependency} } = $defaults{$dependency} + } + elsif (!(-x ${ $dependencies{$dependency} })){ + print STDERR "ERROR: please verify specified --$dependency. Unable to execute ${ $dependencies{$dependency} }\n"; + $failed = 1; + } +} + +foreach my $reference (keys %references) { + if (!($references{$reference})){ + print STDERR "ERROR: please specify --$reference\n"; + $failed = 1; + } + elsif (!(-e $references{$reference})){ + print STDERR "ERROR: Please verify specified --$reference. Could not find $references{$reference}\n"; + $failed = 1; + } +} -pod2usage(1) unless ($fastq && $stage > 0 && $stage2 > 0); +if ($stage < 1 || $stage > 6 || $stage2 < 1 || $stage > $stage2){ + print STDERR "ERROR: please ensure that --stage and --stage2 are between 1 and 6, and that --stage2 > --stage\n"; + $failed = 1; +} + +pod2usage(1) if $failed; +#pod2usage(1) unless ($btrim && $aligner && $bwa && $samtools && $filter && $bedtools); +#pod2usage(1) unless (-x $btrim && -x $aligner && -x $bwa && -x $samtools && -x $filter && -x $bedtools); +#pod2usage(1) unless (-e $genome && -e $genome_Bowtie2 && -e $bed && -e $genomefile && -e $gtf && -e $transcriptome && -e $adaptor); + +#pod2usage(1) unless ($fastq && $stage > 0 && $stage2 > 0); @@ -102,15 +138,15 @@ my $btrim_cmd; if ($score_offset == 33) { if ($zipped) { - $btrim_cmd = "/bin/bash -c '$btrim -l $length -w 10 -a 25 -p $adaptor -3 -P -o $btrimout -t <(gunzip -c $fastq) -C > btrim.log 2> btrim.log'"; + $btrim_cmd = "/bin/bash -c '$btrim -l $length -w 10 -a $score -p $adaptor -3 -P -o $btrimout -t <(gunzip -c $fastq) -C > btrim.log 2> btrim.log'"; } else { - $btrim_cmd = "/bin/bash -c '$btrim -l $length -w 10 -a 25 -p $adaptor -3 -P -o $btrimout -t <(cat $fastq) -C > btrim.log 2> btrim.log'"; + $btrim_cmd = "/bin/bash -c '$btrim -l $length -w 10 -a $score -p $adaptor -3 -P -o $btrimout -t <(cat $fastq) -C > btrim.log 2> btrim.log'"; } } else { if ($zipped) { - $btrim_cmd = "/bin/bash -c '$btrim -i -l $length -w 10 -a 25 -p $adaptor -3 -P -o $btrimout -t <(gunzip -c $fastq) -C > btrim.log 2> btrim.log'"; + $btrim_cmd = "/bin/bash -c '$btrim -i -l $length -w 10 -a $score -p $adaptor -3 -P -o $btrimout -t <(gunzip -c $fastq) -C > btrim.log 2> btrim.log'"; } else { - $btrim_cmd = "/bin/bash -c '$btrim -i -l $length -w 10 -a 25 -p $adaptor -3 -P -o $btrimout -t <(cat $fastq) -C > btrim.log 2> btrim.log'"; + $btrim_cmd = "/bin/bash -c '$btrim -i -l $length -w 10 -a $score -p $adaptor -3 -P -o $btrimout -t <(cat $fastq) -C > btrim.log 2> btrim.log'"; } } @@ -212,22 +248,17 @@ sub cmd { __END__ =head1 NAME -erv_se_genome_v2.pl - Script to run ervmap +erv_genome.pl - Script to run ervmap =head1 SYNOPSIS -erv_se_genome_v2.pl [options] +erv_genome.pl [options] Options: -h, -? --help brief help message -t, --test print out commands without run - --btrim path to btrim (http://graphics.med.yale.edu/trim/) - --tophat path to tophat - --bwa path to bwa - --samtools path to samtools - --filter path to filter (parse_bam.pl) - --bedtools path to bedtools + ***Reference Files*** --genome path to bwa human genome index --genome_Bowtie2 path to Bowtie2 human genome index --bed path to bed file of ERVs @@ -235,10 +266,19 @@ =head1 SYNOPSIS --gtf path to gtf file of human gene annotation --transcriptome path to known transcriptome, used by tophat2 --adaptor path to adaptor files, used by btrim (http://graphics.med.yale.edu/trim/) + --filter path to filter (parse_bam.pl) + + ***Dependencies (defaults use software versions in your path variable)*** + --btrim path to btrim (http://graphics.med.yale.edu/trim/) [default=btrim] + --tophat path to tophat [default=tophat2] + --bwa path to bwa [default=bwa] + --samtools path to samtools [default=samtools] + --bedtools path to bedtools [default=bedtools] + ***Run info*** --fastq fastq file - --stage start stage (see below) - --stage2 end stage (see below) + --start_stage start stage (see below) + --end_stage end stage (see below) Stages: diff --git a/interleaved.pl b/scripts/interleaved.pl similarity index 100% rename from interleaved.pl rename to scripts/interleaved.pl diff --git a/merge.pl b/scripts/merge_count.pl similarity index 100% rename from merge.pl rename to scripts/merge_count.pl diff --git a/normalize_deseq.r b/scripts/normalize_deseq.r similarity index 100% rename from normalize_deseq.r rename to scripts/normalize_deseq.r diff --git a/normalize_with_file.pl b/scripts/normalize_with_file.pl similarity index 100% rename from normalize_with_file.pl rename to scripts/normalize_with_file.pl diff --git a/parse_bam.pl b/scripts/parse_bam.pl similarity index 100% rename from parse_bam.pl rename to scripts/parse_bam.pl diff --git a/run_clean_htseq.pl b/scripts/run_clean_htseq.pl similarity index 86% rename from run_clean_htseq.pl rename to scripts/run_clean_htseq.pl index e5b15aa..d12f790 100644 --- a/run_clean_htseq.pl +++ b/scripts/run_clean_htseq.pl @@ -14,6 +14,7 @@ my $e1 = shift; my $e2 = shift; my $stop = shift; +my $scrdir = shift; die "$e1 eq $e2" if ($e1 eq $e2); @@ -27,7 +28,7 @@ for my $f (@files) { my $o = $f; $o =~ s/${e1}$/$e2/; - my $cmd = "clean_htseq.pl $stop $f > $o"; + my $cmd = "$scrdir/clean_htseq.pl $stop $f > $o"; print "$cmd\n"; system($cmd); }