cd ~
mkdir lesson_18
cd lesson_18
mkdir raw_data
mkdir annotation
cp /data/home/mchen33/RNASeq_trinity_assembly/trinity_out_dir_copy/Trinity.fasta ./raw_data/
cd ./annotation
ln -s ../raw_data/Trinity.fasta .
-
Get transdecoder ready
-
(option 1) If you have anaconda installed, install transdecoder with conda
conda install transdecoder -y TransDecoder.LongOrfs
-
(option 2) If you don't have anaconda
module load transdecoder
-
-
Run
TransDecoder.LongOrfs -t Trinity.fasta
-
Outputs
ls Trinity.fasta.transdecoder_dir/
longest_orfs.pep : all ORFs meeting the minimum length criteria, regardless of coding potential. longest_orfs.gff3 : positions of all ORFs as found in the target transcripts longest_orfs.cds : the nucleotide coding sequence for all detected ORFs
-
How many sequences have ORF?
grep '^>' Trinity.fasta.transdecoder_dir/*pep | wc -l
-
Pfam search: search the peptides for protein domains using Pfam.
-
Get software and Pfam database ready
module load hmmer mkdir Pfam_search ## create a directory for the Pfam database cd Pfam_search wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz gunzip Pfam-A.hmm.gz hmmpress Pfam-A.hmm ## prepare an HMM database for faster hmmscan searches
-
Run
hmmscan -o my_hmmscan.out --tblout my_hmmscan.SeqHits.tblr \ --domtblout my_hmmscan.DomainHits.tblr \ -E 1e-5 \ --cpu 2 \ ./Pfam-A.hmm ../Trinity.fasta.transdecoder_dir/longest_orfs.pep cd ../ ## change directory back to the annotation/ directory
-
-
blastP search: search a protein database (Open a new terminal)
-
Get software and protein database ready
module load blast module switch blast mkdir uniprot_blastp_search cd uniprot_blastp_search wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz gunzip uniprot_sprot.fasta.gz makeblastdb -in uniprot_sprot.fasta -dbtype prot
-
Run
blastp -query ../Trinity.fasta.transdecoder_dir/longest_orfs.pep \ -db uniprot_sprot.fasta \ -max_target_seqs 1 \ -outfmt 6 \ -evalue 1e-5 -num_threads 10 > blastp.outfmt6 cd ../ ## change directory back to the annotation/ directory
-
-
We can include homology searches as ORF retention criteria.
-
Create a new directory
mkdir improved_ORF cd improved_ORF/
-
Create a softlink to the Trinity.fasta.transdecoder_dir directory in our current directory
ln -s ../Trinity.fasta.transdecoder_dir/ .
-
Run
TransDecoder.Predict -t ../Trinity.fasta \ --retain_pfam_hits ../Pfam_search/my_hmmscan.DomainHits.tblr \ --retain_blastp_hits ../uniprot_blastp_search/blastp.outfmt6
-
Run this if the Pfam search takes too long to finish!
TransDecoder.Predict -t ../Trinity.fasta \ --retain_blastp_hits ../uniprot_blastp_search/blastp.outfmt6
-
-
Results
-
How many sequences have ORF?
grep '^>' *pep | wc -l ## improved grep '^>' ../Trinity.fasta.transdecoder_dir/*pep | wc -l ## unimproved
-