#!/usr/bin/env python
import tensorflow as tf
from tensorflow.keras import Model
import numpy as np
def parse_cfg(cfgfile):
    """Parse a Darknet .cfg file into a list of dicts, one dict per block."""
    with open(cfgfile, 'r') as file:
        # drop blank (or whitespace-only) lines and comment lines
        lines = [line.rstrip('\n') for line in file if line.strip() and line[0] != '#']
    holder = {}
    blocks = []
    for line in lines:
        if line[0] == '[':
            # a section header such as [convolutional] opens a new block
            line = 'type=' + line[1:-1].rstrip()
            if len(holder) != 0:
                blocks.append(holder)
                holder = {}
        key, value = line.split("=", 1)
        holder[key.rstrip()] = value.lstrip()
    blocks.append(holder)
    return blocks
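
# For illustration: the first network block of the stock yolov3.cfg is
#
#   [convolutional]
#   batch_normalize=1
#   filters=32
#   size=3
#   stride=1
#   pad=1
#   activation=leaky
#
# which parse_cfg returns as
#   {'type': 'convolutional', 'batch_normalize': '1', 'filters': '32',
#    'size': '3', 'stride': '1', 'pad': '1', 'activation': 'leaky'}
# Every value stays a string; the code below casts to int where needed.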

def YOLOv3Net(cfgfile, num_classes=80, model_size=(416, 416, 3), max_total_size=100,
              max_output_size_per_class=100, iou_threshold=0.5, score_threshold=0.5):
    blocks = parse_cfg(cfgfile)
    output_each_layer = []
    scale = 1
    input_img = tf.keras.Input(shape=model_size)
    inputs = input_img
    # blocks[0] is the [net] block, which only stores training hyperparameters,
    # so the network itself is built from blocks[1:]
    conv_count = 0
    norm_count = 0
    for i, block in enumerate(blocks[1:]):
if block["type"] == "convolutional":
activation = block["activation"]
filters = int(block["filters"])
kernel_size = int(block["size"])
strides = int(block["stride"])
if strides > 1:
pad_layer = tf.keras.layers.ZeroPadding2D(((1, 1), (1, 1)), name='pad_' + str(i+1))
inputs = pad_layer(inputs)
conv2D_layer = tf.keras.layers.Conv2D(filters, kernel_size, strides=strides,
padding='valid' if strides > 1 else 'same', name='conv2d_' + str(conv_count),
use_bias=False if (activation == 'leaky') else True)
conv_count +=1
inputs = conv2D_layer(inputs)
if activation == 'leaky':
inputs = tf.keras.layers.BatchNormalization(name='bnorm_' + str(norm_count))(inputs)
norm_count +=1
inputs = tf.keras.layers.LeakyReLU(alpha=0.1, name='leaky_' + str(i+1))(inputs)
elif block["type"] == "upsample":
stride = int(block["stride"])
upsampling2D_layer = tf.keras.layers.UpSampling2D(stride, name='upsample_' + str(i+1))
inputs = upsampling2D_layer(inputs)
elif block["type"] == "route":
'''
the attribute 'layers' holds a value of -4 which means that if we are in route block,
we need to move backward 4 layers in the cfg file and use the feature map from that
layer. However,for the case of the route block whose attribute 'layers' in cfg file has
2 values like in lines 633-634, layers contains -1 and 61, we need to concatenate the
feature map from a previous layer (-1) and the feature map from layer 61
'''
block["layers"] = block["layers"].split(',')
start = int(block["layers"][0])
if len(block["layers"]) > 1:
end = int(block["layers"][1])
# For ex: start = -1, and the network have 4 blocks executed, and 5th block (current)
# is 'route' block. Thus we need "inputs = 4rd block output" i.e. 5-1
# For index correction by '-1', inputs = idx[3] block output
# Also since out 'output_filter' variable is behind the current position by 1
# that is output_filter has data upto 4th block (or idx[3] block), then
# idx[3] = idx[-1] = idx[staart], thus inputs = output_filtes[start]
inputs = tf.keras.layers.concatenate([output_each_layer[start], output_each_layer[end]], axis=-1, name='route_' + str(i+1))
else:
inputs = output_each_layer[start]
elif block["type"] == "shortcut":
from_ = int(block["from"])
# "1+from" as we want to move from[0] backwards from current position and it is already negative
# and add it to the previous layer feature map i.e output_each_layer[-1]
inputs = tf.keras.layers.add([output_each_layer[-1], output_each_layer[from_]], name='shortcut_' + str(i+1))
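            # Worked example: with from = -3 at block i, output_each_layer holds the
            # outputs of blocks 0..i-1, so output_each_layer[-3] is the feature map
            # from three layers back; adding it element-wise to output_each_layer[-1]
            # forms a residual (skip) connection.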
elif block["type"] == "yolo":
inputs = decode(inputs, block, model_size, num_classes)
if scale == 1:
boxes_0 = inputs
scale +=1
elif scale == 2:
boxes_1 = inputs
scale +=1
elif scale == 3:
boxes_2 = inputs
scale +=1
output_each_layer.append(inputs)
best_boxes = get_box_nms([boxes_0, boxes_1, boxes_2], max_total_size, max_output_size_per_class,\
iou_threshold, score_threshold)
YOLO_v3_Model = Model(input_img, outputs = best_boxes)
return YOLO_v3_Model

def get_box_nms(preds, max_total_size=100, max_output_size_per_class=100, iou_threshold=0.5, score_threshold=0.5):
    bbox = []
    objectness = []
    class_prob = []
    # flatten each scale's grid into a single axis of candidate boxes
    for pred in preds:
        box, obj, prob = pred
        bbox.append(tf.reshape(box, (tf.shape(box)[0], -1, tf.shape(box)[-1])))
        objectness.append(tf.reshape(obj, (tf.shape(obj)[0], -1, tf.shape(obj)[-1])))
        class_prob.append(tf.reshape(prob, (tf.shape(prob)[0], -1, tf.shape(prob)[-1])))
    # merge the three scales, then score each box per class
    bbox = tf.concat(bbox, axis=-2)
    objectness = tf.concat(objectness, axis=-2)
    class_prob = tf.concat(class_prob, axis=-2)
    scores = objectness * class_prob
    boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
        boxes=tf.reshape(bbox, (tf.shape(bbox)[0], -1, 1, 4)),
        scores=tf.reshape(scores, (tf.shape(scores)[0], -1, tf.shape(scores)[-1])),
        max_output_size_per_class=max_output_size_per_class,
        max_total_size=max_total_size,
        iou_threshold=iou_threshold,
        score_threshold=score_threshold
    )
    return boxes, scores, classes, valid_detections
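
# For reference, per the TF docs combined_non_max_suppression returns:
#   boxes:            (batch, max_total_size, 4)
#   scores:           (batch, max_total_size)
#   classes:          (batch, max_total_size)
#   valid_detections: (batch,)  -- number of valid rows in the tensors above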

def decode(inputs, block, model_size, num_classes):
    # select this scale's anchors via the 'mask' attribute and normalise them
    # by the input resolution
    mask = [int(x) for x in block["mask"].split(",")]
    anchors = [int(a) for a in block["anchors"].split(",")]
    anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
    anchors = (np.array([anchors[k] for k in mask]) / model_size[0]).astype(np.float32)
    n_anchors = len(anchors)
    out_shape = inputs.get_shape().as_list()
    batch_size = 1 if out_shape[0] is None else out_shape[0]
    # split the raw prediction into offsets, sizes, objectness, and class logits
    inputs = tf.reshape(inputs, (-1, out_shape[1], out_shape[2], n_anchors, 5 + num_classes))
    conv_raw_dxdy = inputs[:, :, :, :, 0:2]
    conv_raw_dwdh = inputs[:, :, :, :, 2:4]
    conv_raw_conf = inputs[:, :, :, :, 4:5]
    conv_raw_prob = inputs[:, :, :, :, 5:num_classes + 5]
    # build a grid of cell offsets (cx, cy), one per cell and anchor
    y = tf.tile(tf.range(out_shape[1], dtype=tf.int32)[:, tf.newaxis], [1, out_shape[2]])
    x = tf.tile(tf.range(out_shape[2], dtype=tf.int32)[tf.newaxis, :], [out_shape[1], 1])
    cx_cy_grid = tf.concat([x[:, :, tf.newaxis], y[:, :, tf.newaxis]], axis=-1)
    cx_cy_grid = tf.tile(cx_cy_grid[tf.newaxis, :, :, tf.newaxis, :], [batch_size, 1, 1, n_anchors, 1])
    cx_cy_grid = tf.cast(cx_cy_grid, tf.float32)
    # squash the raw offsets into [0, 1] with a sigmoid, add the cell offsets, and
    # normalise by the grid size: this is the paper's bx = sigmoid(tx) + cx (and by),
    # expressed relative to the whole image. The divisor is ordered (grid_w, grid_h)
    # to match the (x, y) layout; the grids here are square, so the two coincide.
    pred_xy = (tf.sigmoid(conv_raw_dxdy) + cx_cy_grid) / [out_shape[2], out_shape[1]]
    pred_wh = tf.exp(conv_raw_dwdh) * anchors
    pred_x1y1 = pred_xy - pred_wh / 2
    pred_x2y2 = pred_xy + pred_wh / 2
    pred_xy1xy2 = tf.concat([pred_x1y1, pred_x2y2], axis=-1)
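    # Worked example with illustrative numbers: on a 13x13 grid, a cell at
    # (cx, cy) = (6, 6) with raw offset tx = 0.0 yields
    #   bx = (sigmoid(0.0) + 6) / 13 = 6.5 / 13 = 0.5,
    # i.e. a box centred halfway across the image; pred_wh is anchor * exp(raw),
    # with the anchors already normalised by model_size above.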
    # sigmoid rather than softmax for classification: softmax assumes mutually
    # exclusive classes (if a box belongs to one class it cannot belong to another,
    # so the class is the argmax of the probabilities), whereas independent sigmoids
    # give every class its own probability to verify against a threshold. That suits
    # real-world data with overlapping labels such as 'dog' and 'animal'.
    pred_conf = tf.sigmoid(conv_raw_conf)
    pred_prob = tf.sigmoid(conv_raw_prob)
    return pred_xy1xy2, pred_conf, pred_prob
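

# Minimal usage sketch: 'yolov3.cfg' is an assumed local path to a Darknet YOLOv3
# cfg file, not fixed by this module. A random image is fed only to confirm the
# graph builds and to show the output shapes.
if __name__ == '__main__':
    model = YOLOv3Net('yolov3.cfg')  # assumed path; adjust to your cfg location
    dummy = tf.random.uniform((1, 416, 416, 3))  # one 416x416 RGB image
    boxes, scores, classes, valid_detections = model(dummy)
    print(boxes.shape, scores.shape, classes.shape, valid_detections.shape)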