-
Notifications
You must be signed in to change notification settings - Fork 1
/
size_sort_gff.py
executable file
·70 lines (53 loc) · 1.46 KB
/
size_sort_gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
# Script metadata; these three values are interpolated into the usage text.
name = "size_sort_gff.py"
version = "0.1.1"
updated = "2023-05-20"

# Help text describing the script's purpose, an example invocation, and the
# command-line options. It is an f-string, so {name}/{version}/{updated} are
# substituted here, and each "\\" renders as a single backslash (a shell
# line-continuation in the example command).
usage = f"""
NAME {name}
VERSION {version}
UPDATED {updated}
SYNOPSIS Separates gene predictions into two files based on coded protein length.
USAGE {name} \\
-g 50507.apollo.gff \\
-a 100 \\
-o SIZE_SORT
OPTIONS
-g (--gff) gff file from gene prediction tool
-a (--aa_len) Protein length in amino-acids [Default: 60]
-o (--outdir) Output directory [Default = SIZE_SORTED_PROTEINS]
"""
from sys import argv
# When the script is invoked with no command-line arguments, show the help
# text and stop before argument parsing rejects the missing required -g option.
if len(argv) < 2:
    # Interpolate the `usage` variable: without the braces the f-string
    # printed the literal word "usage" rather than the help message.
    print(f"\n{usage}")
    # Exit status 0, the same status the bare exit() call produced; raising
    # SystemExit directly does not depend on the `site` module's helper.
    raise SystemExit(0)
from argparse import ArgumentParser
from os import makedirs
from os.path import isdir,basename
# Command-line options. The help strings repeat the descriptions from the
# usage text so that `-h` output is informative as well.
GetOptions = ArgumentParser()
GetOptions.add_argument(
    "-g", "--gff", required=True,
    help="gff file from gene prediction tool",
)
GetOptions.add_argument(
    "-a", "--aa_len", default=60, type=int,
    help="Protein length in amino-acids [Default: 60]",
)
GetOptions.add_argument(
    "-o", "--outdir", default="SIZE_SORTED_PROTEINS",
    help="Output directory [Default = SIZE_SORTED_PROTEINS]",
)
args = GetOptions.parse_args()

gff_file = args.gff
aa_len = args.aa_len
outdir = args.outdir

# Create the output directory if needed. exist_ok=True replaces the separate
# isdir() check, so there is no window between checking and creating; it still
# raises if the path exists but is not a directory.
makedirs(outdir, mode=0o755, exist_ok=True)

# Output files are named after the part of the input file name that precedes
# its first dot (e.g. "50507.apollo.gff" -> "50507").
filename = basename(gff_file).split(".")[0]
# Split the input GFF into two files in the output directory:
#   <filename>.long.gff   records whose estimated length is >= aa_len
#   <filename>.short.gff  all other records
# Lines starting with "#" are copied to both files. The context manager closes
# all three handles even if a malformed record raises part-way through.
with open(gff_file, 'r') as GFF, \
        open(f"{outdir}/{filename}.long.gff", 'w') as LONG, \
        open(f"{outdir}/{filename}.short.gff", 'w') as SHORT:
    for line in GFF:
        line = line.strip()

        # Skip blank lines; indexing line[0] on an empty string used to
        # raise IndexError.
        if not line:
            continue

        if line.startswith("#"):
            LONG.write(f"{line}\n")
            SHORT.write(f"{line}\n")
            continue

        # Columns 4 and 5 (0-based 3 and 4) of a tab-separated record hold the
        # start and end coordinates. A record with fewer columns or
        # non-integer coordinates raises ValueError here.
        start, end = line.split("\t")[3:5]

        # Estimated length in amino acids: nucleotide span divided by 3,
        # rounded down.
        # NOTE(review): GFF coordinates are normally 1-based and inclusive, in
        # which case the span would be end - start + 1; the +1 is not applied
        # here -- confirm whether that is intended.
        if abs(int(end) - int(start)) // 3 >= aa_len:
            LONG.write(f"{line}\n")
        else:
            SHORT.write(f"{line}\n")