From 2ed82ff4d819297049c42f1e96b56ce5f73a43af Mon Sep 17 00:00:00 2001
From: Javi Ribera
Date: Thu, 29 Mar 2018 19:44:14 -0400
Subject: [PATCH] use size before resize to normalize Euclidean distance in WHD

Former-commit-id: c6b28714559909ef9879efe7f713bc106af64acf
---
 object-locator/data.py           |  18 +-
 object-locator/get_image_size.py | 389 +++++++++++++++++++++++++++++++
 object-locator/locate.py         |   4 +-
 object-locator/losses.py         |  28 ++-
 object-locator/train.py          |  22 +-
 5 files changed, 441 insertions(+), 20 deletions(-)
 create mode 100644 object-locator/get_image_size.py

diff --git a/object-locator/data.py b/object-locator/data.py
index 981b37e..861266b 100644
--- a/object-locator/data.py
+++ b/object-locator/data.py
@@ -11,6 +11,7 @@
 from torchvision import transforms
 import xmltodict
 from parse import parse
+from . import get_image_size
 
 
 class CSVDataset(data.Dataset):
@@ -221,6 +222,10 @@ def __call__(self, img, dictionary):
                             xs.view(-1, 1)), 1)
 
+        # Indicate new size in dictionary
+        dictionary['resized_height'] = self.size[0]
+        dictionary['resized_width'] = self.size[1]
+
         return img, dictionary
 
 
@@ -346,8 +351,13 @@ def __init__(self,
         locations = []
         for plant in plot['plants']['plant']:
             locations.append(eval(plant['location_wrt_plot']))
+        img_abspath = os.path.join(self.root_dir, filename)
+        orig_width, orig_height = \
+            get_image_size.get_image_size(img_abspath)
         self.dict[filename] = {'count': count,
-                               'locations': locations}
+                               'locations': locations,
+                               'orig_width': orig_width,
+                               'orig_height': orig_height}
 
         # Use an Ordered Dictionary to allow random access
         self.dict = OrderedDict(self.dict.items())
@@ -380,7 +390,11 @@ def __getitem__(self, idx):
             filename, dictionary = self.dict_list[idx]
         else:
             filename = self.listfiles[idx]
-            dictionary = {'filename': self.listfiles[idx]}
+            orig_width, orig_height = get_image_size.get_image_size(
+                os.path.join(self.root_dir, filename))
+            dictionary = {'filename': self.listfiles[idx],
+                          'orig_width': orig_width,
+                          'orig_height': orig_height}
 
         img_abspath = os.path.join(self.root_dir, filename)
         img = Image.open(img_abspath)
diff --git a/object-locator/get_image_size.py b/object-locator/get_image_size.py
new file mode 100644
index 0000000..ec1ef74
--- /dev/null
+++ b/object-locator/get_image_size.py
@@ -0,0 +1,389 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+"""
+
+get_image_size.py
+====================
+
+    :Name:        get_image_size
+    :Purpose:     extract image dimensions given a file path
+
+    :Author:      Paulo Scardine (based on code from Emmanuel VAÏSSE)
+
+    :Created:     26/09/2013
+    :Copyright:   (c) Paulo Scardine 2013
+    :Licence:     MIT
+
+"""
+import collections
+import json
+import os
+import struct
+
+FILE_UNKNOWN = "Sorry, don't know how to get size for this file."
+
+
+class UnknownImageFormat(Exception):
+    pass
+
+
+types = collections.OrderedDict()
+BMP = types['BMP'] = 'BMP'
+GIF = types['GIF'] = 'GIF'
+ICO = types['ICO'] = 'ICO'
+JPEG = types['JPEG'] = 'JPEG'
+PNG = types['PNG'] = 'PNG'
+TIFF = types['TIFF'] = 'TIFF'
+
+image_fields = ['path', 'type', 'file_size', 'width', 'height']
+
+
+class Image(collections.namedtuple('Image', image_fields)):
+
+    def to_str_row(self):
+        return ("%d\t%d\t%d\t%s\t%s" % (
+            self.width,
+            self.height,
+            self.file_size,
+            self.type,
+            self.path.replace('\t', '\\t'),
+        ))
+
+    def to_str_row_verbose(self):
+        return ("%d\t%d\t%d\t%s\t%s\t##%s" % (
+            self.width,
+            self.height,
+            self.file_size,
+            self.type,
+            self.path.replace('\t', '\\t'),
+            self))
+
+    def to_str_json(self, indent=None):
+        return json.dumps(self._asdict(), indent=indent)
+
+
+def get_image_size(file_path):
+    """
+    Return (width, height) for a given img file content - no external
+    dependencies except the os and struct builtin modules
+    """
+    img = get_image_metadata(file_path)
+    return (img.width, img.height)
+
+
+def get_image_metadata(file_path):
+    """
+    Return an `Image` object for a given img file content - no external
+    dependencies except the os and struct builtin modules
+
+    Args:
+        file_path (str): path to an image file
+
+    Returns:
+        Image: (path, type, file_size, width, height)
+    """
+    size = os.path.getsize(file_path)
+
+    # be explicit with open arguments - we need binary mode
+    with open(file_path, "rb") as input:
+        height = -1
+        width = -1
+        data = input.read(26)
+        msg = " raised while trying to decode as JPEG."
+
+        if (size >= 10) and data[:6] in (b'GIF87a', b'GIF89a'):
+            # GIFs
+            imgtype = GIF
+            w, h = struct.unpack("<HH", data[6:10])
+            width = int(w)
+            height = int(h)
+        elif ((size >= 24) and data.startswith(b'\211PNG\r\n\032\n')
+              and (data[12:16] == b'IHDR')):
+            # PNGs
+            imgtype = PNG
+            w, h = struct.unpack(">LL", data[16:24])
+            width = int(w)
+            height = int(h)
+        elif (size >= 16) and data.startswith(b'\211PNG\r\n\032\n'):
+            # older PNGs
+            imgtype = PNG
+            w, h = struct.unpack(">LL", data[8:16])
+            width = int(w)
+            height = int(h)
+        elif (size >= 2) and data.startswith(b'\377\330'):
+            # JPEG
+            imgtype = JPEG
+            input.seek(0)
+            input.read(2)
+            b = input.read(1)
+            try:
+                while (b and ord(b) != 0xDA):
+                    while (ord(b) != 0xFF):
+                        b = input.read(1)
+                    while (ord(b) == 0xFF):
+                        b = input.read(1)
+                    if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
+                        input.read(3)
+                        h, w = struct.unpack(">HH", input.read(4))
+                        break
+                    else:
+                        input.read(
+                            int(struct.unpack(">H", input.read(2))[0]) - 2)
+                    b = input.read(1)
+                width = int(w)
+                height = int(h)
+            except struct.error:
+                raise UnknownImageFormat("StructError" + msg)
+            except ValueError:
+                raise UnknownImageFormat("ValueError" + msg)
+            except Exception as e:
+                raise UnknownImageFormat(e.__class__.__name__ + msg)
+        elif (size >= 26) and data.startswith(b'BM'):
+            # BMP
+            imgtype = 'BMP'
+            headersize = struct.unpack("<I", data[14:18])[0]
+            if headersize == 12:
+                w, h = struct.unpack("<HH", data[18:22])
+                width = int(w)
+                height = int(h)
+            elif headersize >= 40:
+                w, h = struct.unpack("<ii", data[18:26])
+                width = int(w)
+                # as h is negative when stored upside down
+                height = abs(int(h))
+            else:
+                raise UnknownImageFormat(
+                    "Unknown DIB header size:" + str(headersize))
+        elif (size >= 8) and data[:4] in (b"II\052\000", b"MM\000\052"):
+            # Standard TIFF, big- or little-endian
+            # BigTIFF and other different but TIFF-like formats are not
+            # supported currently
+            imgtype = TIFF
+            byteOrder = data[:2]
+            boChar = ">" if byteOrder == "MM" else "<"
+            # maps TIFF type id to size (in bytes)
+            # and python format char for struct
+            tiffTypes = {
+                1: (1, boChar + "B"),   # BYTE
+                2: (1, boChar + "c"),   # ASCII
+                3: (2, boChar + "H"),   # SHORT
+                4: (4, boChar + "L"),   # LONG
+                5: (8, boChar + "LL"),  # RATIONAL
+                6: (1, boChar + "b"),   # SBYTE
+                7: (1, boChar + "c"),   # UNDEFINED
+                8: (2, boChar + "h"),   # SSHORT
+                9: (4, boChar + "l"),   # SLONG
"l"), # SLONG + 10: (8, boChar + "ll"), # SRATIONAL + 11: (4, boChar + "f"), # FLOAT + 12: (8, boChar + "d") # DOUBLE + } + ifdOffset = struct.unpack(boChar + "L", data[4:8])[0] + try: + countSize = 2 + input.seek(ifdOffset) + ec = input.read(countSize) + ifdEntryCount = struct.unpack(boChar + "H", ec)[0] + # 2 bytes: TagId + 2 bytes: type + 4 bytes: count of values + 4 + # bytes: value offset + ifdEntrySize = 12 + for i in range(ifdEntryCount): + entryOffset = ifdOffset + countSize + i * ifdEntrySize + input.seek(entryOffset) + tag = input.read(2) + tag = struct.unpack(boChar + "H", tag)[0] + if(tag == 256 or tag == 257): + # if type indicates that value fits into 4 bytes, value + # offset is not an offset but value itself + type = input.read(2) + type = struct.unpack(boChar + "H", type)[0] + if type not in tiffTypes: + raise UnknownImageFormat( + "Unkown TIFF field type:" + + str(type)) + typeSize = tiffTypes[type][0] + typeChar = tiffTypes[type][1] + input.seek(entryOffset + 8) + value = input.read(typeSize) + value = int(struct.unpack(typeChar, value)[0]) + if tag == 256: + width = value + else: + height = value + if width > -1 and height > -1: + break + except Exception as e: + raise UnknownImageFormat(str(e)) + elif size >= 2: + # see http://en.wikipedia.org/wiki/ICO_(file_format) + imgtype = 'ICO' + input.seek(0) + reserved = input.read(2) + if 0 != struct.unpack(" 1: + import warnings + warnings.warn("ICO File contains more than one image") + # http://msdn.microsoft.com/en-us/library/ms997538.aspx + w = input.read(1) + h = input.read(1) + width = ord(w) + height = ord(h) + else: + raise UnknownImageFormat(FILE_UNKNOWN) + + return Image(path=file_path, + type=imgtype, + file_size=size, + width=width, + height=height) + + +import unittest + + +class Test_get_image_size(unittest.TestCase): + data = [{ + 'path': 'lookmanodeps.png', + 'width': 251, + 'height': 208, + 'file_size': 22228, + 'type': 'PNG'}] + + def setUp(self): + pass + + def test_get_image_metadata(self): + img = self.data[0] + output = get_image_metadata(img['path']) + self.assertTrue(output) + self.assertEqual(output.path, img['path']) + self.assertEqual(output.width, img['width']) + self.assertEqual(output.height, img['height']) + self.assertEqual(output.type, img['type']) + self.assertEqual(output.file_size, img['file_size']) + for field in image_fields: + self.assertEqual(getattr(output, field), img[field]) + + def test_get_image_metadata__ENOENT_OSError(self): + with self.assertRaises(OSError): + get_image_metadata('THIS_DOES_NOT_EXIST') + + def test_get_image_metadata__not_an_image_UnknownImageFormat(self): + with self.assertRaises(UnknownImageFormat): + get_image_metadata('README.rst') + + def test_get_image_size(self): + img = self.data[0] + output = get_image_size(img['path']) + self.assertTrue(output) + self.assertEqual(output, + (img['width'], + img['height'])) + + def tearDown(self): + pass + + +def main(argv=None): + """ + Print image metadata fields for the given file path. + + Keyword Arguments: + argv (list): commandline arguments (e.g. 
+
+    Returns:
+        int: zero for OK
+    """
+    import logging
+    import optparse
+    import sys
+
+    prs = optparse.OptionParser(
+        usage="%prog [-v|--verbose] [--json|--json-indent] [<path>]",
+        description="Print metadata for the given image paths "
+                    "(without image library bindings).")
+
+    prs.add_option('--json',
+                   dest='json',
+                   action='store_true')
+    prs.add_option('--json-indent',
+                   dest='json_indent',
+                   action='store_true')
+
+    prs.add_option('-v', '--verbose',
+                   dest='verbose',
+                   action='store_true',)
+    prs.add_option('-q', '--quiet',
+                   dest='quiet',
+                   action='store_true',)
+    prs.add_option('-t', '--test',
+                   dest='run_tests',
+                   action='store_true',)
+
+    argv = list(argv) if argv is not None else sys.argv[1:]
+    (opts, args) = prs.parse_args(args=argv)
+    loglevel = logging.INFO
+    if opts.verbose:
+        loglevel = logging.DEBUG
+    elif opts.quiet:
+        loglevel = logging.ERROR
+    logging.basicConfig(level=loglevel)
+    log = logging.getLogger()
+    log.debug('argv: %r', argv)
+    log.debug('opts: %r', opts)
+    log.debug('args: %r', args)
+
+    if opts.run_tests:
+        import sys
+        sys.argv = [sys.argv[0]] + args
+        import unittest
+        return unittest.main()
+
+    output_func = Image.to_str_row
+    if opts.json_indent:
+        import functools
+        output_func = functools.partial(Image.to_str_json, indent=2)
+    elif opts.json:
+        output_func = Image.to_str_json
+    elif opts.verbose:
+        output_func = Image.to_str_row_verbose
+
+    EX_OK = 0
+    EX_NOT_OK = 2
+
+    if len(args) < 1:
+        prs.print_help()
+        print('')
+        prs.error("You must specify one or more paths to image files")
+
+    errors = []
+    for path_arg in args:
+        try:
+            img = get_image_metadata(path_arg)
+            print(output_func(img))
+        except KeyboardInterrupt:
+            raise
+        except OSError as e:
+            log.error((path_arg, e))
+            errors.append((path_arg, e))
+        except Exception as e:
+            log.exception(e)
+            errors.append((path_arg, e))
+            pass
+    if len(errors):
+        import pprint
+        print("ERRORS", file=sys.stderr)
+        print("======", file=sys.stderr)
+        print(pprint.pformat(errors, indent=2), file=sys.stderr)
+        return EX_NOT_OK
+    return EX_OK
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main(argv=sys.argv[1:]))
diff --git a/object-locator/locate.py b/object-locator/locate.py
index e1428f7..14c7ebc 100644
--- a/object-locator/locate.py
+++ b/object-locator/locate.py
@@ -66,8 +66,8 @@
 
 # Loss function
 l1_loss = nn.L1Loss(reduce=False)
 mse_loss = nn.MSELoss(reduce=False)
-criterion_training = losses.WeightedHausdorffDistance(height=args.height,
-                                                      width=args.width,
+criterion_training = losses.WeightedHausdorffDistance(resized_height=args.height,
+                                                      resized_width=args.width,
                                                       return_2_terms=True,
                                                       tensortype=tensortype)
diff --git a/object-locator/losses.py b/object-locator/losses.py
index f9fb1d8..ed4dea9 100644
--- a/object-locator/losses.py
+++ b/object-locator/losses.py
@@ -95,12 +95,12 @@ def forward(self, set1, set2):
 
 class WeightedHausdorffDistance(nn.Module):
     def __init__(self,
-                 height, width,
+                 resized_height, resized_width,
                  return_2_terms=False,
                  tensortype=torch.FloatTensor):
         """
-        :param height: Number of rows in the image.
-        :param width: Number of columns in the image.
+        :param resized_height: Number of rows in the resized image.
+        :param resized_width: Number of columns in the resized image.
         :param return_2_terms: Whether to return the 2 terms of the CD instead
                                of their sum. Default: False.
         :param tensortype: The result will be in this Tensor type.
""" @@ -109,17 +109,18 @@ def __init__(self, self.tensortype = tensortype # Prepare all possible (row, col) locations in the image - self.height, self.width = height, width - self.max_dist = math.sqrt(height**2 + width**2) - self.n_pixels = height * width - self.all_img_locations = torch.from_numpy(cartesian([np.arange(height), - np.arange(width)])) + self.height, self.width = resized_height, resized_width + self.resized_size = Variable(tensortype([resized_height, resized_width])) + self.max_dist = math.sqrt(resized_height**2 + resized_width**2) + self.n_pixels = resized_height * resized_width + self.all_img_locations = torch.from_numpy(cartesian([np.arange(resized_height), + np.arange(resized_width)])) self.all_img_locations = self.all_img_locations.type(tensortype) self.all_img_locations = Variable(self.all_img_locations) self.return_2_terms = return_2_terms - def forward(self, prob_map, gt): + def forward(self, prob_map, gt, orig_sizes): """ Compute the Weighted Hausdorff Distance function between the estimated probability map and ground truth points. @@ -132,6 +133,9 @@ def forward(self, prob_map, gt): Must be of size B as in prob_map. Each element in the list must be a 2D Tensor, where each row is the (y, x), i.e, (row, col) of a GT point. + :param orig_sizes: Bx2 Tensor containing the size of the original images. + B is batch size. The size must be in (height, width) format. + :param orig_widths: List of the original width for each image in the batch. :return: Single-scalar Tensor with the Weighted Hausdorff Distance. If self.return_2_terms=True, then return a tuple containing the two terms of the Weighted Hausdorff Distance. @@ -155,10 +159,14 @@ def forward(self, prob_map, gt): # One by one prob_map_b = prob_map[b, :, :] gt_b = gt[b] + orig_size_b = orig_sizes[b, :] + norm_factor = (orig_size_b/self.resized_size).unsqueeze(0) # Pairwise distances between all possible locations and the GTed locations n_gt_pts = gt_b.size()[0] - d_matrix = cdist(self.all_img_locations, gt_b) + normalized_x = norm_factor.repeat(self.n_pixels, 1)*self.all_img_locations + normalized_y = norm_factor.repeat(len(gt_b), 1)*gt_b + d_matrix = cdist(normalized_x, normalized_y) # Reshape probability map as a long column vector, # and prepare it for multiplication diff --git a/object-locator/train.py b/object-locator/train.py index c110d17..6367135 100644 --- a/object-locator/train.py +++ b/object-locator/train.py @@ -103,10 +103,10 @@ # Loss function loss_regress = nn.SmoothL1Loss() -loss_loc = losses.WeightedHausdorffDistance(height=args.height, - width=args.width, - return_2_terms=True, - tensortype=tensortype) +loss_loc = losses.WeightedHausdorffDistance(resized_height=args.height, + resized_width=args.width, + return_2_terms=True, + tensortype=tensortype) l1_loss = nn.L1Loss(size_average=False) mse_loss = nn.MSELoss(reduce=False) @@ -160,16 +160,21 @@ target_locations = [dictt['locations'] for dictt in dictionaries] target_count = torch.stack([dictt['count'] for dictt in dictionaries]) + target_orig_heights = [dictt['orig_height'] for dictt in dictionaries] + target_orig_widths = [dictt['orig_width'] for dictt in dictionaries] imgs = Variable(imgs.type(tensortype)) target_locations = [Variable(t.type(tensortype)) for t in target_locations] target_count = Variable(target_count.type(tensortype)) + target_orig_heights = Variable(tensortype(target_orig_heights)) + target_orig_widths = Variable(tensortype(target_orig_widths)) + target_orig_sizes = torch.stack((target_orig_heights, 
 
         # One training step
         optimizer.zero_grad()
         est_map, est_count = model.forward(imgs)
-        term1, term2 = loss_loc.forward(est_map, target_locations)
+        term1, term2 = loss_loc.forward(est_map, target_locations, target_orig_sizes)
         term3 = loss_regress.forward(est_count, target_count) #\
 #            / torch.sum(target_count)
         term3 *= args.lambdaa
@@ -254,6 +259,8 @@
         target_locations = [dictt['locations'] for dictt in dictionaries]
         target_count = torch.stack([dictt['count']
                                     for dictt in dictionaries])
+        target_orig_heights = [dictt['orig_height'] for dictt in dictionaries]
+        target_orig_widths = [dictt['orig_width'] for dictt in dictionaries]
 
         if bool((target_count==0).cpu().numpy()[0]):
             continue
@@ -262,12 +269,15 @@
         imgs = Variable(imgs.type(tensortype), volatile=True)
         target_locations = [Variable(t.type(tensortype), volatile=True)
                             for t in target_locations]
         target_count = Variable(target_count.type(tensortype), volatile=True)
+        target_orig_heights = Variable(tensortype(target_orig_heights), volatile=True)
+        target_orig_widths = Variable(tensortype(target_orig_widths), volatile=True)
+        target_orig_sizes = torch.stack((target_orig_heights, target_orig_widths)).transpose(0, 1)
 
         # Feed-forward
         est_map, est_count = model.forward(imgs)
 
         # The 3 terms
-        term1, term2 = loss_loc.forward(est_map, target_locations)
+        term1, term2 = loss_loc.forward(est_map, target_locations, target_orig_sizes)
 #        if bool((torch.sum(target_count)==0).data.cpu().numpy()[0]):
         term3 = loss_regress.forward(est_count, target_count) # / torch.sum(target_count)
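
The core of this patch is the normalization inside WeightedHausdorffDistance.forward:
every (row, col) coordinate is multiplied by orig_size / resized_size before the
pairwise-distance computation, so the Weighted Hausdorff Distance is measured in
pixels of the original image rather than pixels of the resized probability map,
and is therefore comparable across images whose original resolutions differ.
Below is a minimal, self-contained NumPy sketch of that normalization; the function
and variable names are illustrative only and are not part of the repository's API:

    import numpy as np

    def normalized_distance_matrix(all_img_locations, gt_points,
                                   orig_size, resized_size):
        """Pairwise Euclidean distances measured in original-image pixels.

        all_img_locations: (P, 2) (row, col) pixel coords of the resized map.
        gt_points:         (N, 2) (row, col) GT points, in resized-map coords.
        orig_size:         (height, width) of the image before resizing.
        resized_size:      (height, width) of the image after resizing.
        """
        # Same role as norm_factor = orig_size_b / self.resized_size above.
        norm_factor = (np.asarray(orig_size, dtype=float)
                       / np.asarray(resized_size, dtype=float))
        x = all_img_locations * norm_factor  # map pixels -> original scale
        y = gt_points * norm_factor          # GT points  -> original scale
        # (P, N) matrix of Euclidean distances, like cdist(x, y)
        diff = x[:, None, :] - y[None, :, :]
        return np.sqrt((diff ** 2).sum(axis=-1))

    # Example: a 4x4 probability map that came from an 8x16 original image.
    locs = np.stack(np.meshgrid(np.arange(4), np.arange(4), indexing='ij'),
                    axis=-1).reshape(-1, 2)
    d = normalized_distance_matrix(locs, np.array([[1, 2]]),
                                   orig_size=(8, 16), resized_size=(4, 4))
    print(d.shape)  # (16, 1): distances are now in 8x16-pixel units

With norm_factor == (2, 4) in this example, a one-row step in the 4x4 map counts
as 2 original pixels and a one-column step as 4, matching how far apart those
locations actually were before the image was resized. This is why the dataset
classes in data.py now record orig_height and orig_width and the training and
validation loops pass target_orig_sizes into the loss.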