-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_saf_mstrg.sh
73 lines (66 loc) · 2.55 KB
/
generate_saf_mstrg.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/bin/bash
#
# generate_saf_mstrg.sh
#
# Collects selected transcript records from a GTF plus every "gene" record of a
# GFF, and appends them to two ';'-separated tables: one "SAF" layout
# (ID;chr;start;end;strand) and one "BED" layout (chr;start;end;ID;strand).
#
# Usage:
#   generate_saf_mstrg.sh <gtf> <non_coding_list> <label> <out_saf> <out_bed> <gff>
#
# Arguments:
#   $1  GTF file; only rows whose 3rd column is "transcript" are considered.
#   $2  List file; the first whitespace-separated word of each line is a
#       transcript ID that may be kept.
#   $3  Label copied verbatim into the last '|' field of every transcript ID.
#   $4  SAF-layout output file (records are APPENDED).
#   $5  BED-layout output file (records are APPENDED).
#   $6  GFF file; every row whose 3rd column is "gene" is added.
#
# Requires bash >= 4 (associative arrays).
# Returns 0 on success, 1 on unreadable input / write failure, 2 on bad usage.

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Print the usage line on stderr.
usage() {
  warn "Usage: ${0##*/} <gtf> <non_coding_list> <label> <out_saf> <out_bed> <gff>"
}

# Succeed when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Print the first whitespace-separated word of attribute column $1 with its
# first and last character removed (e.g. surrounding quotes).
first_attr_value() {
  awk '{print substr($1, 2, length($1)-2)}' <<<"$1"
}

# Print the class code of attribute column $1: the last word of the last
# ';'-separated field, with its first and last character removed.
# NOTE(review): this assumes the class code is the final attribute and that the
# column does not end with a trailing ';' - confirm against the GTF producer.
class_code_of() {
  awk -F';' '{print $NF}' <<<"$1" \
    | awk '{print substr($NF, 2, length($NF)-2)}'
}

# Entry point; see the file header for the meaning of the six arguments.
main() {
  if (( $# < 6 )); then
    usage
    return 2
  fi

  local gtf_file=$1
  local non_coding_list=$2
  local label=$3
  local output_saf=$4
  local output_bed=$5
  local original_gff=$6

  local input
  for input in "$gtf_file" "$non_coding_list" "$original_gff"; do
    if [[ ! -r "$input" ]]; then
      warn "error: cannot read input file: $input"
      return 1
    fi
  done

  # Both lookups are keyed by arbitrary strings, so they must be associative:
  # an indexed array would evaluate the ID as an arithmetic expression.
  local -A non_coding_ids=()
  local -A seen_ids=()
  local -a id_table=()
  local -a fields=()
  local line mstrg col9 class_code gene_name length id attr gene_id entry

  # Pass 1: remember the IDs listed in the non-coding file.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # First word only; 'read' splits without glob-expanding the line.
    read -r mstrg _ <<<"$line"
    [[ -n "$mstrg" ]] || continue
    non_coding_ids["$mstrg"]=$line
  done < "$non_coding_list"

  # Pass 2: keep listed transcripts whose class code is "u" or "x".
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ "$line" == \#* ]] && continue
    IFS=$'\t' read -r -a fields <<<"$line"
    [[ "${fields[2]:-}" == "transcript" ]] || continue

    col9=${fields[8]:-}
    class_code=$(class_code_of "$col9")
    # Every other class code (including "=") is skipped.
    [[ "$class_code" == "u" || "$class_code" == "x" ]] || continue

    mstrg=$(first_attr_value "$col9")
    [[ -n "$mstrg" && -n "${non_coding_ids[$mstrg]:-}" ]] || continue

    if [[ "$class_code" == "x" ]]; then
      # NOTE(review): kept exactly as originally written. The second awk only
      # ever receives a single word, so $3 is empty and gene_name is always "".
      # The intended attribute cannot be determined from this file - confirm
      # and fix against the real GTF layout.
      gene_name=$(awk '{print $5}' <<<"$col9" \
        | awk '{print substr($3, 2, length($3)-2)}')
    else
      gene_name="NA"
    fi

    if ! is_uint "${fields[3]:-}" || ! is_uint "${fields[4]:-}"; then
      warn "warning: skipping transcript with non-numeric coordinates: $line"
      continue
    fi
    # 10# forces base 10 so a leading zero is not read as octal.
    length=$(( 10#${fields[4]} - 10#${fields[3]} + 1 ))

    id="${mstrg}|${length}|${class_code}|${gene_name}|${fields[0]}|${label}"
    # Emit each transcript ID once.
    [[ -n "${seen_ids[$id]:-}" ]] && continue
    seen_ids["$id"]=1
    id_table+=("${id};${fields[0]};${fields[3]};${fields[4]};${fields[6]:-}")
  done < "$gtf_file"

  # Pass 3: add every gene record of the GFF.
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ "$line" == \#* ]] && continue
    IFS=$'\t' read -r -a fields <<<"$line"
    [[ "${fields[2]:-}" == "gene" ]] || continue

    if ! is_uint "${fields[3]:-}" || ! is_uint "${fields[4]:-}"; then
      warn "warning: skipping gene with non-numeric coordinates: $line"
      continue
    fi
    length=$(( 10#${fields[4]} - 10#${fields[3]} + 1 ))

    # First ';'-separated attribute, with the first "ID=" removed.
    attr=${fields[8]:-}
    gene_id=${attr%%;*}
    gene_id=${gene_id/ID=/}

    id_table+=("${gene_id}|${length};${fields[0]};${fields[3]};${fields[4]};${fields[6]:-}")
  done < "$original_gff"

  # Nothing collected: leave the output files untouched.
  (( ${#id_table[@]} > 0 )) || return 0

  # SAF layout: ID;chr;start;end;strand (appended, opened once).
  if ! printf '%s\n' "${id_table[@]}" >> "$output_saf"; then
    warn "error: cannot write to $output_saf"
    return 1
  fi

  # BED layout: chr;start;end;ID;strand (appended, opened once).
  if ! for entry in "${id_table[@]}"; do
    IFS=';' read -r -a fields <<<"$entry"
    printf '%s;%s;%s;%s;%s\n' \
      "${fields[1]:-}" "${fields[2]:-}" "${fields[3]:-}" \
      "${fields[0]:-}" "${fields[4]:-}"
  done >> "$output_bed"; then
    warn "error: cannot write to $output_bed"
    return 1
  fi

  return 0
}

main "$@"