diff --git a/CHANGELOG.md b/CHANGELOG.md index b00ed925c31..d55efe3474b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv ### Changed +- We improved the offline parsing of BibTeX data from PDF documents. [#12278](https://github.com/JabRef/jabref/issues/12278) + ### Fixed - We fixed an issue where a bib file with UFF-8 charset was wrongly loaded with a different charset [forum#5369](https://discourse.jabref.org/t/jabref-5-15-opens-bib-files-with-shift-jis-encoding-instead-of-utf-8/5369/) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 2c9c94a4745..14a5b1e6c21 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -243,11 +243,13 @@ protected void writeString(String text, List textPositions) { } private boolean isFarAway(TextPosition previous, TextPosition current) { - float XspaceThreshold = 3.0F; - float YspaceThreshold = previous.getFontSizeInPt() * 1.5F; + float XspaceThreshold = previous.getFontSizeInPt() * 3.0F; + float YspaceThreshold = previous.getFontSizeInPt() * 3.0F; float Xgap = current.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()); - float Ygap = current.getYDirAdj() - (previous.getYDirAdj() - previous.getHeightDir()); - return Xgap > XspaceThreshold && Ygap > YspaceThreshold; + float Ygap = current.getYDirAdj() - previous.getYDirAdj(); + // For cases like paper titles spanning two or more lines, both X and Y gaps must exceed thresholds, + // so "&&" is used instead of "||".
+ return Math.abs(Xgap) > XspaceThreshold && Math.abs(Ygap) > YspaceThreshold; } private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition textPosition) { @@ -258,28 +260,27 @@ private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition t return true; } // The title usually don't in the bottom 10% of a page. - if ((textPosition.getPageHeight() - textPosition.getYDirAdj()) - < (textPosition.getPageHeight() * 0.1)) { - return true; - } - // The title character usually stay together. - return isFarAway(previousTextPosition, textPosition); + return (textPosition.getPageHeight() - textPosition.getYDirAdj()) + < (textPosition.getPageHeight() * 0.1); } private Optional findLargestFontText(List textPositions) { Map fontSizeTextMap = new TreeMap<>(Collections.reverseOrder()); + Map lastPositionMap = new TreeMap<>(Collections.reverseOrder()); TextPosition previousTextPosition = null; for (TextPosition textPosition : textPositions) { + float fontSize = textPosition.getFontSizeInPt(); // Exclude unwanted text based on heuristics - if (isUnwantedText(previousTextPosition, textPosition)) { + if (isUnwantedText(previousTextPosition, textPosition) || + (lastPositionMap.containsKey(fontSize) && isFarAway(lastPositionMap.get(fontSize), textPosition))) { continue; } - float fontSize = textPosition.getFontSizeInPt(); fontSizeTextMap.putIfAbsent(fontSize, new StringBuilder()); if (previousTextPosition != null && isThereSpace(previousTextPosition, textPosition)) { fontSizeTextMap.get(fontSize).append(" "); } fontSizeTextMap.get(fontSize).append(textPosition.getUnicode()); + lastPositionMap.put(fontSize, textPosition); previousTextPosition = textPosition; } for (Map.Entry entry : fontSizeTextMap.entrySet()) { diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 43a551a8634..80d6c6cf092 100644 --- 
a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -150,7 +150,8 @@ private static Stream providePdfData() { Arguments.of("On the impact of service-oriented patterns on software evolvability: a controlled experiment and metric-based analysis", "/pdfs/PdfContentImporter/Bogner2019.pdf"), Arguments.of("Pandemic programming", "/pdfs/PdfContentImporter/Ralph2020.pdf"), Arguments.of("Do RESTful API design rules have an impact on the understandability of Web APIs?", "/pdfs/PdfContentImporter/Bogner2023.pdf"), - Arguments.of("Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study", "/pdfs/PdfContentImporter/Fritzsch2022.pdf") + Arguments.of("Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study", "/pdfs/PdfContentImporter/Fritzsch2022.pdf"), + Arguments.of("OPIUM: Optimal Package Install/Uninstall Manager", "/pdfs/PdfContentImporter/opium.pdf") ); } } diff --git a/src/test/resources/pdfs/PdfContentImporter/opium.pdf b/src/test/resources/pdfs/PdfContentImporter/opium.pdf new file mode 100644 index 00000000000..b34c19a48a5 Binary files /dev/null and b/src/test/resources/pdfs/PdfContentImporter/opium.pdf differ