microsoft · vkrd · Aug 13, 2024 · Aug 13, 2024
diff --git a/scripts/data_utils.py b/scripts/data_utils.py
@@ -674,11 +674,14 @@ def extract_pdf_content(file_path, form_recognizer_client, use_layout=False):
             page_number = bounding_box['pageNumber'] - 1  # Page numbers in PyMuPDF start from 0
             x0, y0, x1, y1 = polygon_to_bbox(bounding_box['polygon'])
 
-            # Select the figure and upscale it by 200% for higher resolution
             page = document.load_page(page_number)
             bbox = fitz.Rect(x0, y0, x1, y1)
 
-            zoom = 2.0 
+            # If either the width or height of the bounding box is less than 3 inches, we upscale by 2x
+            if bbox.width < 72*3 or bbox.height < 72*3:
+                zoom = 2.0
+            else:
+                zoom = 1.0 
             mat = fitz.Matrix(zoom, zoom)
             image = page.get_pixmap(matrix=mat, clip=bbox)
 
@@ -687,17 +690,24 @@ def extract_pdf_content(file_path, form_recognizer_client, use_layout=False):
             image_base64 = base64.b64encode(image_data).decode("utf-8")
             image_base64 = f"data:image/jpg;base64,{image_base64}"
 
-            # Add the image tag to the full text
+            # Identify the text that corresponds to the figure
             replace_start = figure["spans"][0]["offset"]
             replace_end = figure["spans"][0]["offset"] + figure["spans"][0]["length"]
+
+            # Sometimes the figure doesn't correspond to any text, in which case we skip it
+            if replace_start == replace_end:
+                continue
+
+            # Now we get the image tag
             original_text = form_recognizer_results.content[replace_start:replace_end]
 
             if original_text not in full_text:
                 continue
 
             img_tag = image_content_to_tag(original_text)
 
-            full_text = full_text.replace(original_text, img_tag)
+            # We replace only the first occurrence of the original text
+            full_text = full_text.replace(original_text, img_tag, 1)
             image_mapping[img_tag] = image_base64
 
     return full_text, image_mapping