-
Notifications
You must be signed in to change notification settings - Fork 6
/
combine_and_parse_gff_per_genus.R
executable file
·72 lines (56 loc) · 2.16 KB
/
combine_and_parse_gff_per_genus.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env Rscript
library(tidyverse)
# command line args -------------------------------------------------------
args <- commandArgs(trailingOnly = TRUE)
out_tsv <- args[1]
in_gffs <- args[2:length(args)]
# function ----------------------------------------------------------------
parse_gff_attribute_field <- function(gff) {
# GFF attribute fields encode a lot of information
# each field is named and separated from its value by an equals ("=") sign
# fields are separated from each other by a semi colon (";").
# this function separates each attribute into a column in a data frame.
# note that the column must be named "attribute"
# split the attributes column by semicolon
gff_split <- gff %>%
mutate(split_attr = str_split(string = .data$attribute, pattern = ";")) %>%
unnest(split_attr)
# Separate the key-value pairs
gff_key_value <- gff_split %>%
separate(split_attr, into = c("key", "value"), sep = "=", extra = "merge") %>%
spread(key, value) # Spread the key-value pairs to wide format
return(gff_key_value)
}
# read and parse ----------------------------------------------------------
gff <- in_gffs %>%
map_dfr(read_tsv, comment = "#",
col_names = c("seqid", "source", "feature", "start", "end",
"score", "strand", "frame", "attribute"),
col_types = "cccddcccc")
# make contig/chr length a column
seq_length <- gff %>%
filter(feature == "region") %>%
select(seqid, seqid_length = end)
# add column back to gff df
gff <- gff %>%
filter(feature == "CDS") %>%
left_join(seq_length, by = "seqid")
# parse the gff attribute field into columns in a data frame
gff <- parse_gff_attribute_field(gff)
# rename columns to label source of info as "gff"
gff <- gff %>%
rename_with( ~ paste0("gff_", .x))
# count how many frames a protein has
gff_frame_tally <- gff %>%
group_by(gff_protein_id) %>%
tally() %>%
select(gff_protein_id, gff_frame_tally = n)
# select only the first frame
gff <- gff %>%
left_join(gff_frame_tally, by = "gff_protein_id") %>%
group_by(gff_protein_id) %>%
slice_head(n = 1) %>%
ungroup() %>%
distinct()
# write out info
write_tsv(gff, out_tsv)