Skip to content

Commit

Permalink
Refactor the isUnwantedText (#12369)
Browse files Browse the repository at this point in the history
* Refactor the isFarAway

1. Add comment
2. minor refactor

* remove extra space

remove extra space

* remove unnecessary quote

remove unnecessary quote

---------

Co-authored-by: Christoph <siedlerkiller@gmail.com>
  • Loading branch information
leaf-soba and Siedlerchr authored Jan 10, 2025
1 parent 94f0f4e commit 2cd83df
Showing 1 changed file with 13 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -252,16 +252,24 @@ private boolean isFarAway(TextPosition previous, TextPosition current) {
return Math.abs(Xgap) > XspaceThreshold && Math.abs(Ygap) > YspaceThreshold;
}

/**
 * Heuristic filter used while collecting title text: reports whether the character at
 * {@code textPosition} should be skipped by the caller.
 *
 * <p>Checks run in order: a null position yields {@code false}; a blank character yields
 * {@code true}; a character in the bottom 10% of the page yields {@code true}; otherwise the
 * result is {@code true} only if {@code lastPositionMap} has an entry for {@code fontSize}
 * and {@code isFarAway} reports that entry as distant from {@code textPosition}.
 *
 * <p>NOTE(review): this is a diff view without +/- markers, so old and new lines are
 * interleaved. The one-line signature directly below and the bare
 * {@code return (...) < (...)} statement further down appear to be the removed versions of
 * the lines that follow them - confirm against the repository.
 *
 * @param previousTextPosition presumably the position handled before {@code textPosition};
 *                             only tested for null here
 * @param textPosition         the character position being evaluated
 * @param lastPositionMap      map from font size to a text position; presumably the last
 *                             accepted position for that size - confirm against the caller
 * @param fontSize             key used for the map lookup; the float is boxed to
 *                             {@code Float}, so the lookup needs an exactly equal value
 * @return {@code true} if the character should be excluded, {@code false} otherwise
 */
private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition textPosition) {
private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition textPosition,
Map<Float, TextPosition> lastPositionMap, float fontSize) {
// A null position indicates that the text is at the start of the line, so it is needed
// and must not be reported as unwanted.
if (textPosition == null || previousTextPosition == null) {
return false;
}
// We use the font size to identify titles. Blank characters don't have a font size, so we discard them.
// The space will be added back in the final result, but not in this method.
if (StringUtil.isBlank(textPosition.getUnicode())) {
return true;
}
// The title usually don't in the bottom 10% of a page.
return (textPosition.getPageHeight() - textPosition.getYDirAdj())
< (textPosition.getPageHeight() * 0.1);
// Titles are generally not located in the bottom 10% of a page.
if ((textPosition.getPageHeight() - textPosition.getYDirAdj()) < (textPosition.getPageHeight() * 0.1)) {
return true;
}
// Characters in a title typically remain close together,
// so a distant character is unlikely to be part of the title.
return lastPositionMap.containsKey(fontSize) && isFarAway(lastPositionMap.get(fontSize), textPosition);
}

private Optional<String> findLargestFontText(List<TextPosition> textPositions) {
Expand All @@ -271,8 +279,7 @@ private Optional<String> findLargestFontText(List<TextPosition> textPositions) {
for (TextPosition textPosition : textPositions) {
float fontSize = textPosition.getFontSizeInPt();
// Exclude unwanted text based on heuristics
if (isUnwantedText(previousTextPosition, textPosition) ||
(lastPositionMap.containsKey(fontSize) && isFarAway(lastPositionMap.get(fontSize), textPosition))) {
if (isUnwantedText(previousTextPosition, textPosition, lastPositionMap, fontSize)) {
continue;
}
fontSizeTextMap.putIfAbsent(fontSize, new StringBuilder());
Expand Down

0 comments on commit 2cd83df

Please sign in to comment.