-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline.sh
107 lines (78 loc) · 3.27 KB
/
pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
INPUT="../raw_data/NGI_files_to_analyze/glycocapture_6_TG_C_by_Reid"
EXPNUM="62"
PEPS="Glycocap-6TG(C)_peptide report.csv"
SPECS="Glycocap-6TG(C)_spectrareport.csv"
# goto the GlycoMadness folder ...
# and execute from there:
echo "stage 0"
# stage 0 is simply checking the consistency between spectrum report and peptide summary
# it produces readable outputs: unique peptides in both files, their intersections along with
# unique proteins in both files and their intersection ...
# Proteins and peptides are compared independently, using "Protein name" and "Peptide sequence" columns ...
python2.7 stage0_spec_n_summary_check.py \
--prefix "$INPUT" \
--verbose \
--separator comma \
-p "$PEPS" \
-s "$SPECS"
# NOT_IN kinda files are also produced by this script ...
# check exit status and exit if it's not 0
rc=$?; if (( $rc != 0 )); then exit $rc; fi
echo "stage 1a and 1b ..."
# stage 1 performs protein records fetching from NCBI, using parsed Protein name from Peptide summary
# it identifies fetchid-s first, and then it downloads corresponding records in genbank format ...
# not all proteins might end up with the fetchids, moreso,
# there might be discrepancies between spectrum and pept_sum in terms of # of proteins these files are reffering to.
python2.7 stage1a_aquire_fetchid_NCBI.py \
--prefix "$INPUT" \
--separator comma \
-p "$PEPS" \
--email sergey.venev@umassmed.edu \
--swiss
# pull em ...
python2.7 stage1b_pull_proteins_NCBI.py \
--prefix "$INPUT" \
-f peptides_with_fetch.csv \
--email sergey.venev@umassmed.edu
# check exit status and exit if it's not 0
rc=$?; if (( $rc != 0 )); then exit $rc; fi
echo "stage 2"
# Stage2 is simply assigning pulled protein sequences to the peptides listed in summary...
# We're trying to choose best peptide-protein combinations according to some rules and criteria.
# Ambiguous and/or underachieving are reported in the BAD file ...
python2.7 stage2_assign_pulled_proteins.py \
--prefix "$INPUT" \
-f peptides_with_fetch.csv \
-g pulled_proteins.gb
# check exit status and exit if it's not 0
rc=$?; if (( $rc != 0 )); then exit $rc; fi
echo "stage 3"
# Stage 3 is the place where we are finally touching upon actual Peptide spectrum(!) ...
# we're extracting glycosilation information and MERGING already processed peptide summary with spectrum report.
# Spec and pep are merged using peptides, so if there were any mismatches between 2 files on the "Protein" level,
# than Peptide summary associated nomenclature would end up in the FINAL file ...
python2.7 stage3_gsites_catalog.py \
-e "$EXPNUM" \
--prefix "$INPUT" \
--separator comma \
-m pept_prot_map.csv \
-g pulled_proteins.gb \
-s "$SPECS"
# check exit status and exit if it's not 0
rc=$?; if (( $rc != 0 )); then exit $rc; fi
echo "stage 4"
# combine BAD and GOOD in one excel spreadsheet ...
python2.7 stage4_eval_bad_combine_out.py \
--prefix "$INPUT" \
-e "$EXPNUM" \
-b BAD_pept_prot.csv \
-o FINAL_gsite_anthology.csv \
-g pulled_proteins.gb \
-s "$SPECS" \
--separator comma
# check exit status and exit if it's not 0
rc=$?; if (( $rc != 0 )); then exit $rc; fi
# TO DO
# adadress 'enzyme' issue ...
# discrepancies between pepts and spec summay - what to do?!
# END OF FILE?