-
Notifications
You must be signed in to change notification settings - Fork 2
/
Metadata_summary.R
197 lines (157 loc) · 7.09 KB
/
Metadata_summary.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
### Calculate summary statistics of metadata file ###
# Creator: Paula Sureda | Arnau Vich
# Year: 2017
## USAGE ##
# Copy or import function to R #
#Input files
#1.File containing categorical or numerical phenotypes
#2.(optional) File containing the sample_ID in the first column and the category name in the second column
#Output files
#If a category table is provided, the script creates one summary table per category
#If not, the script creates a global summary table (for the whole input table)
#Example metadata_input
#
#SID factor1 factor2 factor3
#Sample1 23.5 0.9 yes
#Sample2 10.9 0.01 no
#Sample3 50 0.3 no
#Example category_table
#
#SID category
#Sample1 cat1
#Sample2 cat1
#Sample3 cat2
#Example output
#
#factors Type Categories/Median Counts/Mean %/SD Number_non-zeros(n) Number_NA
#factor1 numerical 4.5 4.32 1.22 34 6
#factor2 categorical UC,CD,IBDU 4,20,5 13.8,69,17.2 29 11
#factor3 numerical 10 9.34 0.38 38 2
# Summarise every column of a metadata table and write the result to disk.
#
# Numeric columns are described by their median, mean, standard deviation,
# number of non-missing values and number of NA. Every other column is treated
# as categorical and described by its distinct values, their counts and
# percentages (computed over the non-missing values), the number of
# non-missing values and the number of NA.
#
# Arguments:
#   metadata_input  Table with one column per phenotype. When category_table
#                   is supplied, both tables are merged by row names, so the
#                   row names must hold the sample IDs.
#   category_table  Optional table whose row names are the sample IDs and whose
#                   first column holds the category of each sample. Omitting it
#                   (or passing NULL) produces one global summary.
#
# Output:
#   Without category_table: writes "./total_metadata_table.txt" and invisibly
#   returns the summary as a character matrix (one row per input column).
#   With category_table: writes "./<category>_metadata_table1.txt" for every
#   category and invisibly returns a list of summary matrices named by
#   category. The first row of each of those tables describes the category
#   column itself.
summary_statistics_metadata <- function(metadata_input, category_table) {
  # Column headers of every summary table. The labels are part of the output
  # file format and are therefore kept exactly as they have always been.
  summary_columns <- c(
    "Type", "Categories/Median", "Counts/Mean", "%/SD",
    "Number_non-zeros(n)", "Number_NA"
  )

  # Describe a single column as six character fields, in the order of
  # summary_columns.
  summarise_column <- function(values) {
    missing_count <- sum(is.na(values))
    if (is.numeric(values)) {
      # Base R statistics with NA removed; no extra package has to be attached
      # for these four numbers.
      return(c(
        "numerical",
        median(values, na.rm = TRUE),
        mean(values, na.rm = TRUE),
        sd(values, na.rm = TRUE),
        sum(!is.na(values)),
        missing_count
      ))
    }
    # table() ignores NA, so counts and percentages refer to observed values.
    counts <- table(values)
    percentages <- round(prop.table(counts) * 100, 2)
    c(
      "categorical",
      # toString keeps all categories of one variable in a single cell
      toString(names(counts)),
      toString(as.vector(counts)),
      toString(as.vector(percentages)),
      sum(counts),
      missing_count
    )
  }

  # Build the summary matrix of a whole table: one row per column of `data`.
  build_summary_table <- function(data) {
    summary_table <- matrix(
      NA_character_,
      nrow = ncol(data),
      ncol = length(summary_columns),
      dimnames = list(colnames(data), summary_columns)
    )
    # seq_len() instead of 1:ncol() so a table without columns is not iterated
    for (k in seq_len(ncol(data))) {
      summary_table[k, ] <- summarise_column(data[, k])
    }
    summary_table
  }

  if (missing(category_table) || is.null(category_table)) {
    # Global summary over the whole input table
    summary_table <- build_summary_table(metadata_input)
    write.table(
      summary_table,
      file = "./total_metadata_table.txt", quote = FALSE, sep = "\t"
    )
    return(invisible(summary_table))
  }

  # Attach the category of every sample; only samples present in both tables
  # survive the merge.
  merged <- merge(category_table, metadata_input, by = "row.names")
  # merge() puts the shared row names into the first column: restore them
  rownames(merged) <- merged[, 1]
  merged <- merged[, -1, drop = FALSE]

  # Build the grouping factor explicitly. The category column is a plain
  # character vector unless the caller converted it, and nlevels() of a
  # character vector is 0. Keeping the factor outside the data frame also
  # avoids overwriting a column that happens to be called "category" and keeps
  # a helper column out of the summaries.
  groups <- factor(merged[, 1])
  if (nlevels(groups) == 0) {
    stop(
      "No samples are shared between 'metadata_input' and 'category_table' ",
      "(tables are matched by row names).",
      call. = FALSE
    )
  }

  summaries <- lapply(split(merged, groups), build_summary_table)

  # One output file per category, named after the category
  for (category_name in names(summaries)) {
    write.table(
      summaries[[category_name]],
      file = paste0("./", category_name, "_metadata_table1.txt"),
      quote = FALSE, sep = "\t"
    )
  }
  invisible(summaries)
}