Skip to content

Commit

Permalink
improve nav handling
Browse files Browse the repository at this point in the history
  • Loading branch information
Carmine DiMascio committed Dec 20, 2018
1 parent a1362a4 commit f9d05aa
Show file tree
Hide file tree
Showing 11 changed files with 1,149 additions and 37 deletions.
24 changes: 13 additions & 11 deletions src/main/kotlin/io/github/cdimascio/essence/cleaners/Cleaner.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Cleaner(private val doc: Document) {
removeScriptsStyles()
Traverse(
nodeRemovalRules = listOf(
// Rule::removeNonTextNodes,
Rule::removeCommentsTravRule,
Rule::removeBadTagsTravRule,
Rule::removeNavigationElements,
Expand Down Expand Up @@ -95,7 +96,7 @@ class Cleaner(private val doc: Document) {
}

private fun elementToParagraph(doc: Document, tagNames: List<String>) {
val elements = doc.select(tagNames.joinToString(","))
val elements = doc.select(tagNames.joinToString(",")) //\.reversed()
val tags = listOf("a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul")
for (element in elements) {
val items = element.matchFirstElementTags(tags, 1)
Expand All @@ -108,12 +109,8 @@ class Cleaner(private val doc: Document) {
val replaceNodes = getReplacementNodes(element)
val pReplacementElements = mutableListOf<Element>()
for (rNode in replaceNodes) {
if (rNode is TextNode && rNode.text().isNotEmpty()) {
pReplacementElements.add(Element("p").html(rNode.text()))
} else if (rNode is Element) {
if (rNode.html().isNotEmpty()) {
pReplacementElements.add(Element("p").html(rNode.html()))
}
if (rNode.html().isNotEmpty()) {
pReplacementElements.add(Element("p").html(rNode.html()))
}
}
element.parent().insertChildren(element.siblingIndex(), pReplacementElements)
Expand All @@ -122,9 +119,9 @@ class Cleaner(private val doc: Document) {
}
}

private fun getReplacementNodes(div: Node): List<Node> {
private fun getReplacementNodes(div: Node): List<Element> {
val children = div.childNodes()
val nodesToReturn = mutableListOf<Node>()
val nodesToReturn = mutableListOf<Element>()
val nodesToRemove = mutableListOf<Node>()
val replacmentText = mutableListOf<String>() // TODO: could be string buffer
val isGravityUsed = { e: Element -> e.attr(GRAVITY_USED_ALREADY) == "yes" }
Expand Down Expand Up @@ -162,7 +159,9 @@ class Cleaner(private val doc: Document) {
}
}
} else {
nodesToReturn.add(kid)
if (kid is Element) {
nodesToReturn += kid
}
}
}

Expand All @@ -176,7 +175,10 @@ class Cleaner(private val doc: Document) {
node.remove()
}

return nodesToReturn
val isInteresting = { e: Element ->
!listOf("meta", "head").contains(e.tagName())
}
return nodesToReturn.filter { isInteresting(it) }
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package io.github.cdimascio.essence.cleaners

import io.github.cdimascio.essence.util.NodeHeuristics
import io.github.cdimascio.essence.scorers.ScoredElement
import io.github.cdimascio.essence.util.NodeHeuristics
import io.github.cdimascio.essence.util.TraversalHelpers
import io.github.cdimascio.essence.util.find
import io.github.cdimascio.essence.words.StopWords
Expand All @@ -17,7 +17,8 @@ class ScoreCleaner(private val stopWords: StopWords) {
listOf("p", "a").contains(node.tagName())
}

addSiblingsToTopNode(element)?.let { updatedElement ->
val topNode = skipNonTextualTopNodes(element)
addSiblingsToTopNode(topNode)?.let { updatedElement ->
for (child in updatedElement.children()) {
if (!isParagraphOrAnchor(child)) {
if (NodeHeuristics.hasHighLinkDensity(child) ||
Expand All @@ -28,9 +29,24 @@ class ScoreCleaner(private val stopWords: StopWords) {
}
}
}
// else if (NodeHeuristics.hasFewWordsAndLowFewWordNeighbors(child, stopWords)) {
// if (child.hasParent()) {
// child.remove()
// }
// }
}
}
return element
return topNode
}

/**
 * Descends through wrapper elements that carry no text of their own and have exactly
 * one child node, as long as that child is itself an [Element].
 *
 * @param targetNode the element to start from
 * @return the first element on that single-child chain that either has non-blank own text,
 * does not have exactly one child node, or whose only child is not an [Element]
 */
private fun skipNonTextualTopNodes(targetNode: Element): Element? {
    var current = targetNode
    while (current.ownText().isBlank() && current.childNodeSize() == 1) {
        // Stop as soon as the lone child is not an element (e.g. a text or comment node).
        current = current.childNodes().first() as? Element ?: break
    }
    return current
}

// Why add only previous siblings -- change name of function
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,30 @@ private val REGEX_NAV = """(["#.'\-_]+|^)nav[\-_"']+""".toRegex(RegexOption.IGNO
private val REGEX_SPONSORED = """sponsored|(["#.'\-_]+|^)ad[\-_"']+|adzone""".toRegex(RegexOption.IGNORE_CASE)

object Rule {

/**
 * Removal rule for sponsored/advertising content.
 *
 * @param node the node under inspection
 * @return `true` when [node] is an [Element] whose `class` attribute, or any other
 * attribute value, matches `REGEX_SPONSORED`; `false` otherwise (including non-elements)
 */
fun removeSponsoredContent(node: Node): Boolean {
    val element = node as? Element ?: return false
    if (REGEX_SPONSORED.containsMatchIn(element.attr("class"))) return true
    return element.attributes().any { attribute ->
        REGEX_SPONSORED.containsMatchIn(attribute.value ?: "")
    }
}

/** Removal rule that matches comment nodes, i.e. nodes whose node name is `#comment`. */
fun removeCommentsTravRule(node: Node): Boolean = node.nodeName() == "#comment"

fun removeNavigationElements(node: Node): Boolean {
if (node !is Element) return false
return listOf("div", "li", "ul", "ol").contains(node.tagName()) && (
// remove checks for div nav
return listOf("li", "ul", "ol", "header", "span").contains(node.tagName()) && (
REGEX_NAV.containsMatchIn(node.attr("class")) ||
REGEX_NAV.containsMatchIn(node.attr("id")))
}


fun removeBadTagsTravRule(node: Node) = REGEX_BAD_TAGS.containsMatchIn(node.attr("id")) ||
REGEX_BAD_TAGS.containsMatchIn(node.attr("class")) ||
REGEX_BAD_TAGS.containsMatchIn(node.attr("name"))
REGEX_BAD_TAGS.containsMatchIn(node.attr("class")) ||
REGEX_BAD_TAGS.containsMatchIn(node.attr("name"))

fun removeMatching(re: Regex): (Node) -> Boolean {
return { node: Node ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,30 @@ import org.jsoup.nodes.TextNode
class TextFormatter(private val stopWords: StopWords) : Formatter {

override fun format(node: Element?) = node?.let {
val bestRoot = drillDownToCruxElement(node)
// TODO: Combine the following into a single pass
removeNegativescoresNodes(it)
linksToText(it)
addNewlineToBr(it)
replaceWithText(it)
removeFewwordsParagraphs(it)
convertToText(it)
removeNegativescoresNodes(bestRoot)
superSubScriptToText(bestRoot)
linksToText(bestRoot)
addNewlineToBr(bestRoot)
replaceWithText(bestRoot)
removeFewwordsParagraphs(bestRoot)
// TODO: find proper root
// Look at the children. If there is only one node, look at its children to see if there are many;
// if there are many, use that node as the root.
convertToText(bestRoot)
} ?: ""

/**
 * Drills down through wrapper elements that have no text of their own and exactly one
 * child node, returning the innermost such element.
 *
 * The result of the recursive call must be returned; discarding it makes this function
 * always hand back [node] unchanged, so no drilling ever takes effect.
 *
 * @param node the element to start from
 * @return the deepest element reachable through a chain of text-less, single-child
 * elements, or [node] itself when it has own text, does not have exactly one child node,
 * or its only child is not an [Element]
 */
private fun drillDownToCruxElement(node: Element): Element {
    if (node.ownText().isBlank() && node.childNodeSize() == 1) {
        val onlyChild = node.childNode(0)
        if (onlyChild is Element) {
            return drillDownToCruxElement(onlyChild)
        }
    }
    return node
}

private fun removeNegativescoresNodes(node: Element) {
val gravityElements = node.find("*[gravityScore]")
gravityElements.forEach {
Expand All @@ -33,6 +48,18 @@ class TextFormatter(private val stopWords: StopWords) : Formatter {
}
}

/**
 * Unwraps [node] when it is a `sub`, `sup` or `small` element whose own text parses as a
 * number and it has a parent; in every other case the node is left as it is.
 *
 * NOTE(review): only [node] itself is inspected, not its descendants — confirm that this
 * is the intended scope.
 */
private fun superSubScriptToText(node: Element) {
    if (node.tagName() !in listOf("sub", "sup", "small")) return
    // Non-numeric own text means there is nothing to unwrap.
    if (node.ownText().trim().toDoubleOrNull() == null) return
    if (node.hasParent()) {
        node.unwrap()
    }
}

private fun linksToText(node: Element) {
val nodes = node.find("a")
nodes.forEach {
Expand Down Expand Up @@ -64,7 +91,8 @@ class TextFormatter(private val stopWords: StopWords) : Formatter {
val numStopWords = stopWords.statistics(text).stopWords.size
val hasObject = e.find("object").isNotEmpty()
val hasEmbed = e.find("embed").isNotEmpty()
if ((tag != "br" || text != "\\r") && numStopWords < 3 && !hasObject && !hasEmbed) {
val isEndline = tag == "br" || text == "\\r"
if (!isEndline && numStopWords < 3 && !hasObject && !hasEmbed) {
if (e.parent() != null)
e.remove()
} else {
Expand Down Expand Up @@ -93,6 +121,7 @@ class TextFormatter(private val stopWords: StopWords) : Formatter {
continue
}

// TODO if hanging text is blank here, we should reset the text to empty
if (hangingText.isNotBlank()) {
val text = cleanParagraphText(hangingText.toString())
texts.addAll(text.split("""\r?\n""".toRegex()))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ class DocumentScorer(private val stopWords: StopWords) : Scorer {
nodesWithText.add(node)
}
}

val numNodesWithText = nodesWithText.size
var startingBoost = 1.0
val negativeScoring = 0
Expand Down
48 changes: 48 additions & 0 deletions src/main/kotlin/io/github/cdimascio/essence/util/NodeHeuristics.kt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package io.github.cdimascio.essence.util

import io.github.cdimascio.essence.scorers.Scorer
import io.github.cdimascio.essence.words.StopWords
import org.jsoup.nodes.Element
import org.jsoup.nodes.Node

Expand Down Expand Up @@ -49,4 +50,51 @@ object NodeHeuristics {
}
return false
}

/**
 * Tells whether [node] is a childless element with little text that is also surrounded
 * by siblings with little text.
 *
 * NOTE(review): an element with zero child nodes presumably always has blank own text,
 * which would make the stop-word count below unreachable in practice — confirm.
 *
 * @param node the node to examine; anything that is not an [Element] yields `false`
 * @param stopWords stop-word statistics provider used to measure the text
 * @return `true` when the element has no child nodes, its own text is blank or contains
 * fewer than 5 stop words, and both the previous and the next 2 siblings pass the
 * few-word checks
 */
fun hasFewWordsAndLowFewWordNeighbors(node: Node, stopWords: StopWords): Boolean {
    if (node !is Element || node.childNodeSize() != 0) return false

    val text = node.ownText()
    val hasFewWords = text.isBlank() || stopWords.statistics(text).stopWords.size < 5
    if (!hasFewWords) return false

    val siblingsToCheck = 2
    return hasFewWordPrevSiblings(node, siblingsToCheck, stopWords) &&
        hasFewWordNextSiblings(node, siblingsToCheck, stopWords)
}

/**
 * Checks up to [numSibsToCheck] nodes preceding [node] for text-heavy elements.
 *
 * Every preceding sibling node counts towards the limit, whether or not it is an
 * [Element]; only elements are inspected.
 *
 * @return `false` as soon as an inspected element has non-blank own text containing more
 * than 5 stop words, `true` otherwise
 */
private fun hasFewWordPrevSiblings(node: Node, numSibsToCheck: Int, stopWords: StopWords): Boolean {
    // Clamp so that a negative limit inspects nothing instead of failing in take().
    val limit = maxOf(numSibsToCheck, 0)
    return generateSequence<Node>(node.previousSibling()) { it.previousSibling() }
        .take(limit)
        .filterIsInstance<Element>()
        .none { sibling ->
            val text = sibling.ownText()
            // TODO (from original author): use regular words, not stop words
            !text.isBlank() && stopWords.statistics(text).stopWords.size > 5
        }
}


/**
 * Checks up to [numSibsToCheck] nodes following [node] for text-heavy elements.
 *
 * Every following sibling node counts towards the limit, whether or not it is an
 * [Element]; only elements are inspected.
 *
 * @return `false` as soon as an inspected element has non-blank own text containing more
 * than 5 stop words, `true` otherwise
 */
private fun hasFewWordNextSiblings(node: Node, numSibsToCheck: Int, stopWords: StopWords): Boolean {
    // Clamp so that a negative limit inspects nothing instead of failing in take().
    val limit = maxOf(numSibsToCheck, 0)
    return generateSequence<Node>(node.nextSibling()) { it.nextSibling() }
        .take(limit)
        .filterIsInstance<Element>()
        .none { sibling ->
            val text = sibling.ownText()
            !text.isBlank() && stopWords.statistics(text).stopWords.size > 5
        }
}
}
33 changes: 22 additions & 11 deletions src/test/kotlin/io/github/cdimascio/essence/EssenceSpec.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,27 @@ class EssenceSpec {

@Test
fun readsFavicon() {
checkFixture(site = "aolNews" , fields = listOf("favicon"))
checkFixture(site = "aolNews", fields = listOf("favicon"))
}

@Test
fun readsDescription() {
checkFixture("allnewlyrics1" , listOf("description"))
checkFixture("allnewlyrics1", listOf("description"))
}

@Test
fun readsOpenGraphDescription() {
checkFixture("twitter" , listOf("description"))
checkFixture("twitter", listOf("description"))
}

@Test
fun readsKeywords() {
checkFixture("allnewlyrics1" , listOf("keywords"))
checkFixture("allnewlyrics1", listOf("keywords"))
}

@Test
fun readsLang() {
checkFixture("allnewlyrics1" , listOf("lang"))
checkFixture("allnewlyrics1", listOf("lang"))
}

@Test
Expand Down Expand Up @@ -145,6 +145,16 @@ class EssenceSpec {
checkFixture(site = "cnet", fields = listOf("cleaned_text"))
}

// @Test
// fun getsCleanedTextSch() {
// checkFixture(site = "sch1", fields = listOf("cleaned_text"))
// }

// Runs the "keras" fixture through checkFixture, checking only the "cleaned_text" field.
@Test
fun getsCleanedTextKeras() {
checkFixture(site = "keras", fields = listOf("cleaned_text"))
}

@Test
fun getsCleanedTextYahoo() {
checkFixture(site = "yahoo", fields = listOf("cleaned_text"))
Expand Down Expand Up @@ -190,9 +200,7 @@ class EssenceSpec {
}

private fun cleanTestingTest(newText: String, originalText: String): String {
return newText.
replace("""\n\n""", " ").
replace("""\ \ """, " ")
return newText.replace("""\n\n""", " ").replace("""\ \ """, " ")
.substring(0, Math.min(newText.length, originalText.length))
}

Expand All @@ -215,9 +223,11 @@ class EssenceSpec {
val origText = cleanOrigText(expected["cleaned_text"].asText())
val newText = cleanTestingTest(data.text, origText)
assertNotEquals("text should not be null", "", newText)
assertTrue(data.text.length >= origText.length)

println(origText)
println(newText)
assertTrue(data.text.length >= origText.length)

assertEquals(origText, newText)
}
"link" -> {
Expand All @@ -228,7 +238,7 @@ class EssenceSpec {
}
"description" -> {
assertEquals(expected["meta_description"].asText(), data.description)
}
}
"lang" -> {
assertEquals(expected["meta_lang"].asText(), data.language)
}
Expand All @@ -247,7 +257,8 @@ class EssenceSpec {
}
"links" -> {
val links = data.links.sortedBy { it.text }
val expectedLinks = expected["links"]?.map { Link(it["href"].asText(), it["text"].asText()) } ?: emptyList()
val expectedLinks = expected["links"]?.map { Link(it["href"].asText(), it["text"].asText()) }
?: emptyList()
links.zip(expectedLinks).forEach { (actual, expected) ->
assertEquals(expected.text, actual.text)
assertEquals(expected.href, actual.href)
Expand Down
Loading

0 comments on commit f9d05aa

Please sign in to comment.