From 2ed82ff4d819297049c42f1e96b56ce5f73a43af Mon Sep 17 00:00:00 2001
From: Javi Ribera
Date: Thu, 29 Mar 2018 19:44:14 -0400
Subject: [PATCH] use size before resize to normalize Euclidean distance in WHD

Former-commit-id: c6b28714559909ef9879efe7f713bc106af64acf
---
 object-locator/data.py           |  18 +-
 object-locator/get_image_size.py | 389 +++++++++++++++++++++++++++++++
 object-locator/locate.py         |   4 +-
 object-locator/losses.py         |  28 ++-
 object-locator/train.py          |  22 +-
 5 files changed, 441 insertions(+), 20 deletions(-)
 create mode 100644 object-locator/get_image_size.py

diff --git a/object-locator/data.py b/object-locator/data.py
index 981b37e..861266b 100644
--- a/object-locator/data.py
+++ b/object-locator/data.py
@@ -11,6 +11,7 @@
 from torchvision import transforms
 import xmltodict
 from parse import parse
+from . import get_image_size
 
 
 class CSVDataset(data.Dataset):
@@ -221,6 +222,10 @@ def __call__(self, img, dictionary):
                             xs.view(-1, 1)), 1)
 
+        # Indicate new size in dictionary
+        dictionary['resized_height'] = self.size[0]
+        dictionary['resized_width'] = self.size[1]
+
         return img, dictionary
 
 
@@ -346,8 +351,13 @@ def __init__(self,
         locations = []
         for plant in plot['plants']['plant']:
             locations.append(eval(plant['location_wrt_plot']))
+        img_abspath = os.path.join(self.root_dir, filename)
+        orig_width, orig_height = \
+            get_image_size.get_image_size(img_abspath)
         self.dict[filename] = {'count': count,
-                               'locations': locations}
+                               'locations': locations,
+                               'orig_width': orig_width,
+                               'orig_height': orig_height}
 
         # Use an Ordered Dictionary to allow random access
         self.dict = OrderedDict(self.dict.items())
@@ -380,7 +390,11 @@ def __getitem__(self, idx):
             filename, dictionary = self.dict_list[idx]
         else:
             filename = self.listfiles[idx]
-            dictionary = {'filename': self.listfiles[idx]}
+            orig_width, orig_height = get_image_size.get_image_size(
+                os.path.join(self.root_dir, filename))
+            dictionary = {'filename': self.listfiles[idx],
+                          'orig_width': orig_width,
+                          'orig_height': orig_height}
 
         img_abspath = os.path.join(self.root_dir, filename)
         img = Image.open(img_abspath)
diff --git a/object-locator/get_image_size.py b/object-locator/get_image_size.py
new file mode 100644
index 0000000..ec1ef74
--- /dev/null
+++ b/object-locator/get_image_size.py
@@ -0,0 +1,389 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+"""
+
+get_image_size.py
+====================
+
+    :Name:        get_image_size
+    :Purpose:     extract image dimensions given a file path
+
+    :Author:      Paulo Scardine (based on code from Emmanuel VAÏSSE)
+
+    :Created:     26/09/2013
+    :Copyright:   (c) Paulo Scardine 2013
+    :Licence:     MIT
+
+"""
+import collections
+import json
+import os
+import struct
+
+FILE_UNKNOWN = "Sorry, don't know how to get size for this file."
+
+
+class UnknownImageFormat(Exception):
+    pass
+
+
+types = collections.OrderedDict()
+BMP = types['BMP'] = 'BMP'
+GIF = types['GIF'] = 'GIF'
+ICO = types['ICO'] = 'ICO'
+JPEG = types['JPEG'] = 'JPEG'
+PNG = types['PNG'] = 'PNG'
+TIFF = types['TIFF'] = 'TIFF'
+
+image_fields = ['path', 'type', 'file_size', 'width', 'height']
+
+
+class Image(collections.namedtuple('Image', image_fields)):
+
+    def to_str_row(self):
+        return ("%d\t%d\t%d\t%s\t%s" % (
+            self.width,
+            self.height,
+            self.file_size,
+            self.type,
+            self.path.replace('\t', '\\t'),
+        ))
+
+    def to_str_row_verbose(self):
+        return ("%d\t%d\t%d\t%s\t%s\t##%s" % (
+            self.width,
+            self.height,
+            self.file_size,
+            self.type,
+            self.path.replace('\t', '\\t'),
+            self))
+
+    def to_str_json(self, indent=None):
+        return json.dumps(self._asdict(), indent=indent)
+
+
+def get_image_size(file_path):
+    """
+    Return (width, height) for a given img file content - no external
+    dependencies except the os and struct builtin modules
+    """
+    img = get_image_metadata(file_path)
+    return (img.width, img.height)
+
+
+def get_image_metadata(file_path):
+    """
+    Return an `Image` object for a given img file content - no external
+    dependencies except the os and struct builtin modules
+
+    Args:
+        file_path (str): path to an image file
+
+    Returns:
+        Image: (path, type, file_size, width, height)
+    """
+    size = os.path.getsize(file_path)
+
+    # be explicit with open arguments - we need binary mode
+    with open(file_path, "rb") as input:
+        height = -1
+        width = -1
+        data = input.read(26)
+        msg = " raised while trying to decode as JPEG."
+
+        if (size >= 10) and data[:6] in (b'GIF87a', b'GIF89a'):
+            # GIFs
+            imgtype = GIF
+            w, h = struct.unpack("<HH", data[6:10])
+            width = int(w)
+            height = int(h)
+        elif ((size >= 24) and data.startswith(b'\211PNG\r\n\032\n')
+              and (data[12:16] == b'IHDR')):
+            # PNGs
+            imgtype = PNG
+            w, h = struct.unpack(">LL", data[16:24])
+            width = int(w)
+            height = int(h)
+        elif (size >= 16) and data.startswith(b'\211PNG\r\n\032\n'):
+            # older PNGs
+            imgtype = PNG
+            w, h = struct.unpack(">LL", data[8:16])
+            width = int(w)
+            height = int(h)
+        elif (size >= 2) and data.startswith(b'\377\330'):
+            # JPEG
+            imgtype = JPEG
+            input.seek(0)
+            input.read(2)
+            b = input.read(1)
+            try:
+                while (b and ord(b) != 0xDA):
+                    while (ord(b) != 0xFF):
+                        b = input.read(1)
+                    while (ord(b) == 0xFF):
+                        b = input.read(1)
+                    if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
+                        input.read(3)
+                        h, w = struct.unpack(">HH", input.read(4))
+                        break
+                    else:
+                        input.read(
+                            int(struct.unpack(">H", input.read(2))[0]) - 2)
+                    b = input.read(1)
+                width = int(w)
+                height = int(h)
+            except struct.error:
+                raise UnknownImageFormat("StructError" + msg)
+            except ValueError:
+                raise UnknownImageFormat("ValueError" + msg)
+            except Exception as e:
+                raise UnknownImageFormat(e.__class__.__name__ + msg)
+        elif (size >= 26) and data.startswith(b'BM'):
+            # BMP
+            imgtype = 'BMP'
+            headersize = struct.unpack("<I", data[14:18])[0]
+            if headersize == 12:
+                w, h = struct.unpack("<HH", data[18:22])
+                width = int(w)
+                height = int(h)
+            elif headersize >= 40:
+                w, h = struct.unpack("<ii", data[18:26])
+                width = int(w)
+                # as h is negative when stored upside down
+                height = abs(int(h))
+            else:
+                raise UnknownImageFormat(
+                    "Unknown DIB header size:" + str(headersize))
+        elif (size >= 8) and data[:4] in (b"II\052\000", b"MM\000\052"):
+            # Standard TIFF, big- or little-endian
+            # BigTIFF and other different but TIFF-like formats are not
+            # supported currently
+            imgtype = TIFF
+            byteOrder = data[:2]
+            boChar = ">" if byteOrder == "MM" else "<"
+            # maps TIFF type id to size (in bytes)
+            # and python format char for struct
+            tiffTypes = {
+                1: (1, boChar + "B"),   # BYTE
+                2: (1, boChar + "c"),   # ASCII
+                3: (2, boChar + "H"),   # SHORT
+                4: (4, boChar + "L"),   # LONG
+                5: (8, boChar + "LL"),  # RATIONAL
+                6: (1, boChar + "b"),   # SBYTE
+                7: (1, boChar + "c"),   # UNDEFINED
+                8: (2, boChar + "h"),   # SSHORT
+                9: (4, boChar + "l"),   # SLONG
"l"), # SLONG + 10: (8, boChar + "ll"), # SRATIONAL + 11: (4, boChar + "f"), # FLOAT + 12: (8, boChar + "d") # DOUBLE + } + ifdOffset = struct.unpack(boChar + "L", data[4:8])[0] + try: + countSize = 2 + input.seek(ifdOffset) + ec = input.read(countSize) + ifdEntryCount = struct.unpack(boChar + "H", ec)[0] + # 2 bytes: TagId + 2 bytes: type + 4 bytes: count of values + 4 + # bytes: value offset + ifdEntrySize = 12 + for i in range(ifdEntryCount): + entryOffset = ifdOffset + countSize + i * ifdEntrySize + input.seek(entryOffset) + tag = input.read(2) + tag = struct.unpack(boChar + "H", tag)[0] + if(tag == 256 or tag == 257): + # if type indicates that value fits into 4 bytes, value + # offset is not an offset but value itself + type = input.read(2) + type = struct.unpack(boChar + "H", type)[0] + if type not in tiffTypes: + raise UnknownImageFormat( + "Unkown TIFF field type:" + + str(type)) + typeSize = tiffTypes[type][0] + typeChar = tiffTypes[type][1] + input.seek(entryOffset + 8) + value = input.read(typeSize) + value = int(struct.unpack(typeChar, value)[0]) + if tag == 256: + width = value + else: + height = value + if width > -1 and height > -1: + break + except Exception as e: + raise UnknownImageFormat(str(e)) + elif size >= 2: + # see http://en.wikipedia.org/wiki/ICO_(file_format) + imgtype = 'ICO' + input.seek(0) + reserved = input.read(2) + if 0 != struct.unpack(" 1: + import warnings + warnings.warn("ICO File contains more than one image") + # http://msdn.microsoft.com/en-us/library/ms997538.aspx + w = input.read(1) + h = input.read(1) + width = ord(w) + height = ord(h) + else: + raise UnknownImageFormat(FILE_UNKNOWN) + + return Image(path=file_path, + type=imgtype, + file_size=size, + width=width, + height=height) + + +import unittest + + +class Test_get_image_size(unittest.TestCase): + data = [{ + 'path': 'lookmanodeps.png', + 'width': 251, + 'height': 208, + 'file_size': 22228, + 'type': 'PNG'}] + + def setUp(self): + pass + + def test_get_image_metadata(self): + img = self.data[0] + output = get_image_metadata(img['path']) + self.assertTrue(output) + self.assertEqual(output.path, img['path']) + self.assertEqual(output.width, img['width']) + self.assertEqual(output.height, img['height']) + self.assertEqual(output.type, img['type']) + self.assertEqual(output.file_size, img['file_size']) + for field in image_fields: + self.assertEqual(getattr(output, field), img[field]) + + def test_get_image_metadata__ENOENT_OSError(self): + with self.assertRaises(OSError): + get_image_metadata('THIS_DOES_NOT_EXIST') + + def test_get_image_metadata__not_an_image_UnknownImageFormat(self): + with self.assertRaises(UnknownImageFormat): + get_image_metadata('README.rst') + + def test_get_image_size(self): + img = self.data[0] + output = get_image_size(img['path']) + self.assertTrue(output) + self.assertEqual(output, + (img['width'], + img['height'])) + + def tearDown(self): + pass + + +def main(argv=None): + """ + Print image metadata fields for the given file path. + + Keyword Arguments: + argv (list): commandline arguments (e.g. 
+
+    Returns:
+        int: zero for OK
+    """
+    import logging
+    import optparse
+    import sys
+
+    prs = optparse.OptionParser(
+        usage="%prog [-v|--verbose] [--json|--json-indent] [<path>]",
+        description="Print metadata for the given image paths "
+                    "(without image library bindings).")
+
+    prs.add_option('--json',
+                   dest='json',
+                   action='store_true')
+    prs.add_option('--json-indent',
+                   dest='json_indent',
+                   action='store_true')
+
+    prs.add_option('-v', '--verbose',
+                   dest='verbose',
+                   action='store_true',)
+    prs.add_option('-q', '--quiet',
+                   dest='quiet',
+                   action='store_true',)
+    prs.add_option('-t', '--test',
+                   dest='run_tests',
+                   action='store_true',)
+
+    argv = list(argv) if argv is not None else sys.argv[1:]
+    (opts, args) = prs.parse_args(args=argv)
+    loglevel = logging.INFO
+    if opts.verbose:
+        loglevel = logging.DEBUG
+    elif opts.quiet:
+        loglevel = logging.ERROR
+    logging.basicConfig(level=loglevel)
+    log = logging.getLogger()
+    log.debug('argv: %r', argv)
+    log.debug('opts: %r', opts)
+    log.debug('args: %r', args)
+
+    if opts.run_tests:
+        import sys
+        sys.argv = [sys.argv[0]] + args
+        import unittest
+        return unittest.main()
+
+    output_func = Image.to_str_row
+    if opts.json_indent:
+        import functools
+        output_func = functools.partial(Image.to_str_json, indent=2)
+    elif opts.json:
+        output_func = Image.to_str_json
+    elif opts.verbose:
+        output_func = Image.to_str_row_verbose
+
+    EX_OK = 0
+    EX_NOT_OK = 2
+
+    if len(args) < 1:
+        prs.print_help()
+        print('')
+        prs.error("You must specify one or more paths to image files")
+
+    errors = []
+    for path_arg in args:
+        try:
+            img = get_image_metadata(path_arg)
+            print(output_func(img))
+        except KeyboardInterrupt:
+            raise
+        except OSError as e:
+            log.error((path_arg, e))
+            errors.append((path_arg, e))
+        except Exception as e:
+            log.exception(e)
+            errors.append((path_arg, e))
+            pass
+    if len(errors):
+        import pprint
+        print("ERRORS", file=sys.stderr)
+        print("======", file=sys.stderr)
+        print(pprint.pformat(errors, indent=2), file=sys.stderr)
+        return EX_NOT_OK
+    return EX_OK
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main(argv=sys.argv[1:]))
diff --git a/object-locator/locate.py b/object-locator/locate.py
index e1428f7..14c7ebc 100644
--- a/object-locator/locate.py
+++ b/object-locator/locate.py
@@ -66,8 +66,8 @@
 
 # Loss function
 l1_loss = nn.L1Loss(reduce=False)
 mse_loss = nn.MSELoss(reduce=False)
-criterion_training = losses.WeightedHausdorffDistance(height=args.height,
-                                                      width=args.width,
+criterion_training = losses.WeightedHausdorffDistance(resized_height=args.height,
+                                                      resized_width=args.width,
                                                       return_2_terms=True,
                                                       tensortype=tensortype)
diff --git a/object-locator/losses.py b/object-locator/losses.py
index f9fb1d8..ed4dea9 100644
--- a/object-locator/losses.py
+++ b/object-locator/losses.py
@@ -95,12 +95,12 @@ def forward(self, set1, set2):
 
 class WeightedHausdorffDistance(nn.Module):
     def __init__(self,
-                 height, width,
+                 resized_height, resized_width,
                  return_2_terms=False,
                  tensortype=torch.FloatTensor):
         """
-        :param height: Number of rows in the image.
-        :param width: Number of columns in the image.
+        :param resized_height: Number of rows in the resized image.
+        :param resized_width: Number of columns in the resized image.
         :param return_2_terms: Whether to return the 2 terms of the CD instead
                                of their sum. Default: False.
         :param tensortype: The result will be in this Tensor type.
""" @@ -109,17 +109,18 @@ def __init__(self, self.tensortype = tensortype # Prepare all possible (row, col) locations in the image - self.height, self.width = height, width - self.max_dist = math.sqrt(height**2 + width**2) - self.n_pixels = height * width - self.all_img_locations = torch.from_numpy(cartesian([np.arange(height), - np.arange(width)])) + self.height, self.width = resized_height, resized_width + self.resized_size = Variable(tensortype([resized_height, resized_width])) + self.max_dist = math.sqrt(resized_height**2 + resized_width**2) + self.n_pixels = resized_height * resized_width + self.all_img_locations = torch.from_numpy(cartesian([np.arange(resized_height), + np.arange(resized_width)])) self.all_img_locations = self.all_img_locations.type(tensortype) self.all_img_locations = Variable(self.all_img_locations) self.return_2_terms = return_2_terms - def forward(self, prob_map, gt): + def forward(self, prob_map, gt, orig_sizes): """ Compute the Weighted Hausdorff Distance function between the estimated probability map and ground truth points. @@ -132,6 +133,9 @@ def forward(self, prob_map, gt): Must be of size B as in prob_map. Each element in the list must be a 2D Tensor, where each row is the (y, x), i.e, (row, col) of a GT point. + :param orig_sizes: Bx2 Tensor containing the size of the original images. + B is batch size. The size must be in (height, width) format. + :param orig_widths: List of the original width for each image in the batch. :return: Single-scalar Tensor with the Weighted Hausdorff Distance. If self.return_2_terms=True, then return a tuple containing the two terms of the Weighted Hausdorff Distance. @@ -155,10 +159,14 @@ def forward(self, prob_map, gt): # One by one prob_map_b = prob_map[b, :, :] gt_b = gt[b] + orig_size_b = orig_sizes[b, :] + norm_factor = (orig_size_b/self.resized_size).unsqueeze(0) # Pairwise distances between all possible locations and the GTed locations n_gt_pts = gt_b.size()[0] - d_matrix = cdist(self.all_img_locations, gt_b) + normalized_x = norm_factor.repeat(self.n_pixels, 1)*self.all_img_locations + normalized_y = norm_factor.repeat(len(gt_b), 1)*gt_b + d_matrix = cdist(normalized_x, normalized_y) # Reshape probability map as a long column vector, # and prepare it for multiplication diff --git a/object-locator/train.py b/object-locator/train.py index c110d17..6367135 100644 --- a/object-locator/train.py +++ b/object-locator/train.py @@ -103,10 +103,10 @@ # Loss function loss_regress = nn.SmoothL1Loss() -loss_loc = losses.WeightedHausdorffDistance(height=args.height, - width=args.width, - return_2_terms=True, - tensortype=tensortype) +loss_loc = losses.WeightedHausdorffDistance(resized_height=args.height, + resized_width=args.width, + return_2_terms=True, + tensortype=tensortype) l1_loss = nn.L1Loss(size_average=False) mse_loss = nn.MSELoss(reduce=False) @@ -160,16 +160,21 @@ target_locations = [dictt['locations'] for dictt in dictionaries] target_count = torch.stack([dictt['count'] for dictt in dictionaries]) + target_orig_heights = [dictt['orig_height'] for dictt in dictionaries] + target_orig_widths = [dictt['orig_width'] for dictt in dictionaries] imgs = Variable(imgs.type(tensortype)) target_locations = [Variable(t.type(tensortype)) for t in target_locations] target_count = Variable(target_count.type(tensortype)) + target_orig_heights = Variable(tensortype(target_orig_heights)) + target_orig_widths = Variable(tensortype(target_orig_widths)) + target_orig_sizes = torch.stack((target_orig_heights, 
 
         # One training step
         optimizer.zero_grad()
         est_map, est_count = model.forward(imgs)
-        term1, term2 = loss_loc.forward(est_map, target_locations)
+        term1, term2 = loss_loc.forward(est_map, target_locations, target_orig_sizes)
         term3 = loss_regress.forward(est_count, target_count) #\
 #            / torch.sum(target_count)
         term3 *= args.lambdaa
@@ -254,6 +259,8 @@
         target_locations = [dictt['locations'] for dictt in dictionaries]
         target_count = torch.stack([dictt['count']
                                     for dictt in dictionaries])
+        target_orig_heights = [dictt['orig_height'] for dictt in dictionaries]
+        target_orig_widths = [dictt['orig_width'] for dictt in dictionaries]
 
         if bool((target_count==0).cpu().numpy()[0]):
             continue
@@ -262,12 +269,15 @@
         imgs = Variable(imgs.type(tensortype), volatile=True)
         target_locations = [Variable(t.type(tensortype), volatile=True)
                             for t in target_locations]
         target_count = Variable(target_count.type(tensortype), volatile=True)
+        target_orig_heights = Variable(tensortype(target_orig_heights), volatile=True)
+        target_orig_widths = Variable(tensortype(target_orig_widths), volatile=True)
+        target_orig_sizes = torch.stack((target_orig_heights, target_orig_widths)).transpose(0, 1)
 
         # Feed-forward
         est_map, est_count = model.forward(imgs)
 
         # The 3 terms
-        term1, term2 = loss_loc.forward(est_map, target_locations)
+        term1, term2 = loss_loc.forward(est_map, target_locations, target_orig_sizes)
 #        if bool((torch.sum(target_count)==0).data.cpu().numpy()[0]):
         term3 = loss_regress.forward(est_count, target_count) # / torch.sum(target_count)
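
The core of this patch is the normalization inside WeightedHausdorffDistance.forward:
every (row, col) coordinate is multiplied by orig_size / resized_size before the
pairwise-distance computation, so the Weighted Hausdorff Distance is measured in
pixels of the original image rather than pixels of the resized probability map,
and is therefore comparable across images whose original resolutions differ.
Below is a minimal, self-contained NumPy sketch of that normalization; the function
and variable names are illustrative only and are not part of the repository's API:

    import numpy as np

    def normalized_distance_matrix(all_img_locations, gt_points,
                                   orig_size, resized_size):
        """Pairwise Euclidean distances measured in original-image pixels.

        all_img_locations: (P, 2) (row, col) pixel coords of the resized map.
        gt_points:         (N, 2) (row, col) GT points, in resized-map coords.
        orig_size:         (height, width) of the image before resizing.
        resized_size:      (height, width) of the image after resizing.
        """
        # Same role as norm_factor = orig_size_b / self.resized_size above.
        norm_factor = (np.asarray(orig_size, dtype=float)
                       / np.asarray(resized_size, dtype=float))
        x = all_img_locations * norm_factor  # map pixels -> original scale
        y = gt_points * norm_factor          # GT points  -> original scale
        # (P, N) matrix of Euclidean distances, like cdist(x, y)
        diff = x[:, None, :] - y[None, :, :]
        return np.sqrt((diff ** 2).sum(axis=-1))

    # Example: a 4x4 probability map that came from an 8x16 original image.
    locs = np.stack(np.meshgrid(np.arange(4), np.arange(4), indexing='ij'),
                    axis=-1).reshape(-1, 2)
    d = normalized_distance_matrix(locs, np.array([[1, 2]]),
                                   orig_size=(8, 16), resized_size=(4, 4))
    print(d.shape)  # (16, 1): distances are now in 8x16-pixel units

With norm_factor == (2, 4) in this example, a one-row step in the 4x4 map counts
as 2 original pixels and a one-column step as 4, matching how far apart those
locations actually were before the image was resized. This is why the dataset
classes in data.py now record orig_height and orig_width and the training and
validation loops pass target_orig_sizes into the loss.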