diff --git a/src/main/kotlin/io/github/cdimascio/essence/cleaners/Cleaner.kt b/src/main/kotlin/io/github/cdimascio/essence/cleaners/Cleaner.kt index d8f38d3..62c8ee4 100644 --- a/src/main/kotlin/io/github/cdimascio/essence/cleaners/Cleaner.kt +++ b/src/main/kotlin/io/github/cdimascio/essence/cleaners/Cleaner.kt @@ -20,6 +20,7 @@ class Cleaner(private val doc: Document) { removeScriptsStyles() Traverse( nodeRemovalRules = listOf( +// Rule::removeNonTextNodes, Rule::removeCommentsTravRule, Rule::removeBadTagsTravRule, Rule::removeNavigationElements, @@ -95,7 +96,7 @@ class Cleaner(private val doc: Document) { } private fun elementToParagraph(doc: Document, tagNames: List) { - val elements = doc.select(tagNames.joinToString(",")) + val elements = doc.select(tagNames.joinToString(",")) //\.reversed() val tags = listOf("a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul") for (element in elements) { val items = element.matchFirstElementTags(tags, 1) @@ -108,12 +109,8 @@ class Cleaner(private val doc: Document) { val replaceNodes = getReplacementNodes(element) val pReplacementElements = mutableListOf() for (rNode in replaceNodes) { - if (rNode is TextNode && rNode.text().isNotEmpty()) { - pReplacementElements.add(Element("p").html(rNode.text())) - } else if (rNode is Element) { - if (rNode.html().isNotEmpty()) { - pReplacementElements.add(Element("p").html(rNode.html())) - } + if (rNode.html().isNotEmpty()) { + pReplacementElements.add(Element("p").html(rNode.html())) } } element.parent().insertChildren(element.siblingIndex(), pReplacementElements) @@ -122,9 +119,9 @@ class Cleaner(private val doc: Document) { } } - private fun getReplacementNodes(div: Node): List { + private fun getReplacementNodes(div: Node): List { val children = div.childNodes() - val nodesToReturn = mutableListOf() + val nodesToReturn = mutableListOf() val nodesToRemove = mutableListOf() val replacmentText = mutableListOf() // TODO: could be string buffer val isGravityUsed = { e: 
Element -> e.attr(GRAVITY_USED_ALREADY) == "yes" } @@ -162,7 +159,9 @@ class Cleaner(private val doc: Document) { } } } else { - nodesToReturn.add(kid) + if (kid is Element) { + nodesToReturn += kid + } } } @@ -176,7 +175,10 @@ class Cleaner(private val doc: Document) { node.remove() } - return nodesToReturn + val isInteresting = { e: Element -> + !listOf("meta", "head").contains(e.tagName()) + } + return nodesToReturn.filter { isInteresting(it) } } } diff --git a/src/main/kotlin/io/github/cdimascio/essence/cleaners/ScoreCleaner.kt b/src/main/kotlin/io/github/cdimascio/essence/cleaners/ScoreCleaner.kt index c2c2e47..e23f695 100644 --- a/src/main/kotlin/io/github/cdimascio/essence/cleaners/ScoreCleaner.kt +++ b/src/main/kotlin/io/github/cdimascio/essence/cleaners/ScoreCleaner.kt @@ -1,7 +1,7 @@ package io.github.cdimascio.essence.cleaners -import io.github.cdimascio.essence.util.NodeHeuristics import io.github.cdimascio.essence.scorers.ScoredElement +import io.github.cdimascio.essence.util.NodeHeuristics import io.github.cdimascio.essence.util.TraversalHelpers import io.github.cdimascio.essence.util.find import io.github.cdimascio.essence.words.StopWords @@ -17,7 +17,8 @@ class ScoreCleaner(private val stopWords: StopWords) { listOf("p", "a").contains(node.tagName()) } - addSiblingsToTopNode(element)?.let { updatedElement -> + val topNode = skipNonTextualTopNodes(element) + addSiblingsToTopNode(topNode)?.let { updatedElement -> for (child in updatedElement.children()) { if (!isParagraphOrAnchor(child)) { if (NodeHeuristics.hasHighLinkDensity(child) || @@ -28,9 +29,24 @@ class ScoreCleaner(private val stopWords: StopWords) { } } } +// else if (NodeHeuristics.hasFewWordsAndLowFewWordNeighbors(child, stopWords)) { +// if (child.hasParent()) { +// child.remove() +// } +// } } } - return element + return topNode + } + + private fun skipNonTextualTopNodes(targetNode: Element): Element? 
{ + if (targetNode.ownText().isBlank() && targetNode.childNodeSize() == 1) { + val child = targetNode.childNodes()[0] + if (child is Element) { + return skipNonTextualTopNodes(child) + } + } + return targetNode } // Why add only previous siblings -- change name of function diff --git a/src/main/kotlin/io/github/cdimascio/essence/cleaners/rules/Rule.kt b/src/main/kotlin/io/github/cdimascio/essence/cleaners/rules/Rule.kt index b447904..ca9bbe3 100644 --- a/src/main/kotlin/io/github/cdimascio/essence/cleaners/rules/Rule.kt +++ b/src/main/kotlin/io/github/cdimascio/essence/cleaners/rules/Rule.kt @@ -8,7 +8,6 @@ private val REGEX_NAV = """(["#.'\-_]+|^)nav[\-_"']+""".toRegex(RegexOption.IGNO private val REGEX_SPONSORED = """sponsored|(["#.'\-_]+|^)ad[\-_"']+|adzone""".toRegex(RegexOption.IGNORE_CASE) object Rule { - fun removeSponsoredContent(node: Node): Boolean { if (node !is Element) return false return REGEX_SPONSORED.containsMatchIn(node.attr("class")) || @@ -16,21 +15,23 @@ object Rule { REGEX_SPONSORED.containsMatchIn(it.value ?: "") }.isNotEmpty() } + fun removeCommentsTravRule(node: Node): Boolean { return node.nodeName() == "#comment" } fun removeNavigationElements(node: Node): Boolean { if (node !is Element) return false - return listOf("div", "li", "ul", "ol").contains(node.tagName()) && ( + // remove checks for div nav + return listOf("li", "ul", "ol", "header", "span").contains(node.tagName()) && ( REGEX_NAV.containsMatchIn(node.attr("class")) || REGEX_NAV.containsMatchIn(node.attr("id"))) } fun removeBadTagsTravRule(node: Node) = REGEX_BAD_TAGS.containsMatchIn(node.attr("id")) || - REGEX_BAD_TAGS.containsMatchIn(node.attr("class")) || - REGEX_BAD_TAGS.containsMatchIn(node.attr("name")) + REGEX_BAD_TAGS.containsMatchIn(node.attr("class")) || + REGEX_BAD_TAGS.containsMatchIn(node.attr("name")) fun removeMatching(re: Regex): (Node) -> Boolean { return { node: Node -> diff --git a/src/main/kotlin/io/github/cdimascio/essence/formatters/TextFormatter.kt 
b/src/main/kotlin/io/github/cdimascio/essence/formatters/TextFormatter.kt index 3d160e4..29faf45 100644 --- a/src/main/kotlin/io/github/cdimascio/essence/formatters/TextFormatter.kt +++ b/src/main/kotlin/io/github/cdimascio/essence/formatters/TextFormatter.kt @@ -9,15 +9,30 @@ import org.jsoup.nodes.TextNode class TextFormatter(private val stopWords: StopWords) : Formatter { override fun format(node: Element?) = node?.let { + val bestRoot = drillDownToCruxElement(node) // TODO: Combine the following into a single pass - removeNegativescoresNodes(it) - linksToText(it) - addNewlineToBr(it) - replaceWithText(it) - removeFewwordsParagraphs(it) - convertToText(it) + removeNegativescoresNodes(bestRoot) + superSubScriptToText(bestRoot) + linksToText(bestRoot) + addNewlineToBr(bestRoot) + replaceWithText(bestRoot) + removeFewwordsParagraphs(bestRoot) + // TODO: find proper root + // look at the children. if there is one node, look at its children to see if there are many + // if there are many, use that node as the root + convertToText(bestRoot) } ?: "" + private fun drillDownToCruxElement(node: Element): Element { + if (node.ownText().isBlank() && node.childNodeSize() == 1) { + val onlyChild = node.childNode(0) + if (onlyChild is Element) { + return drillDownToCruxElement(onlyChild) // propagate the deepest sole-child element instead of discarding it + } + } + return node + } + private fun removeNegativescoresNodes(node: Element) { val gravityElements = node.find("*[gravityScore]") gravityElements.forEach { @@ -33,6 +48,18 @@ class TextFormatter(private val stopWords: StopWords) : Formatter { } } + private fun superSubScriptToText(node: Element) { + try { + if (listOf("sub", "sup", "small").contains(node.tagName())) { + node.ownText().trim().toDouble() + if (node.hasParent()) { + node.unwrap() + } + } + } catch (e: NumberFormatException) { + } + } + private fun linksToText(node: Element) { val nodes = node.find("a") nodes.forEach { @@ -64,7 +91,8 @@ class TextFormatter(private val stopWords: StopWords) : Formatter { val numStopWords = 
stopWords.statistics(text).stopWords.size val hasObject = e.find("object").isNotEmpty() val hasEmbed = e.find("embed").isNotEmpty() - if ((tag != "br" || text != "\\r") && numStopWords < 3 && !hasObject && !hasEmbed) { + val isEndline = tag == "br" || text == "\\r" + if (!isEndline && numStopWords < 3 && !hasObject && !hasEmbed) { if (e.parent() != null) e.remove() } else { @@ -93,6 +121,7 @@ class TextFormatter(private val stopWords: StopWords) : Formatter { continue } + // TODO if hanging text is blank here, we should reset the text to empty if (hangingText.isNotBlank()) { val text = cleanParagraphText(hangingText.toString()) texts.addAll(text.split("""\r?\n""".toRegex())) diff --git a/src/main/kotlin/io/github/cdimascio/essence/scorers/DocumentScorer.kt b/src/main/kotlin/io/github/cdimascio/essence/scorers/DocumentScorer.kt index 551e5eb..84c4faa 100644 --- a/src/main/kotlin/io/github/cdimascio/essence/scorers/DocumentScorer.kt +++ b/src/main/kotlin/io/github/cdimascio/essence/scorers/DocumentScorer.kt @@ -21,7 +21,6 @@ class DocumentScorer(private val stopWords: StopWords) : Scorer { nodesWithText.add(node) } } - val numNodesWithText = nodesWithText.size var startingBoost = 1.0 val negativeScoring = 0 diff --git a/src/main/kotlin/io/github/cdimascio/essence/util/NodeHeuristics.kt b/src/main/kotlin/io/github/cdimascio/essence/util/NodeHeuristics.kt index 3a2a1ef..611d47e 100644 --- a/src/main/kotlin/io/github/cdimascio/essence/util/NodeHeuristics.kt +++ b/src/main/kotlin/io/github/cdimascio/essence/util/NodeHeuristics.kt @@ -1,6 +1,7 @@ package io.github.cdimascio.essence.util import io.github.cdimascio.essence.scorers.Scorer +import io.github.cdimascio.essence.words.StopWords import org.jsoup.nodes.Element import org.jsoup.nodes.Node @@ -49,4 +50,51 @@ object NodeHeuristics { } return false } + + fun hasFewWordsAndLowFewWordNeighbors(node: Node, stopWords: StopWords): Boolean { + if (node is Element) { + val ownText = node.ownText() + if (node.childNodeSize() 
== 0 && (ownText.isBlank() || stopWords.statistics(ownText).stopWords.size < 5)) { + val n = 2 + if (hasFewWordPrevSiblings(node, n, stopWords) && hasFewWordNextSiblings(node, n, stopWords)) { + return true + } + } + } + return false + } + + private fun hasFewWordPrevSiblings(node: Node, numSibsToCheck: Int, stopWords: StopWords): Boolean { + var count = 0 + var prevSib = node.previousSibling() + while (prevSib != null && count < numSibsToCheck) { + if (prevSib is Element) { + val ownText = prevSib.ownText() + // use regular words not stop words + if (!ownText.isBlank() && stopWords.statistics(ownText).stopWords.size > 5) { + return false + } + } + prevSib = prevSib.previousSibling() + count += 1 + } + return true + } + + + private fun hasFewWordNextSiblings(node: Node, numSibsToCheck: Int, stopWords: StopWords): Boolean { + var count = 0 + var nextSib = node.nextSibling() + while (nextSib != null && count < numSibsToCheck) { + if (nextSib is Element) { + val ownText = nextSib.ownText() + if (!ownText.isBlank() && stopWords.statistics(ownText).stopWords.size > 5) { + return false + } + } + nextSib = nextSib.nextSibling() + count += 1 + } + return true + } } diff --git a/src/test/kotlin/io/github/cdimascio/essence/EssenceSpec.kt b/src/test/kotlin/io/github/cdimascio/essence/EssenceSpec.kt index 945b7bf..7ef21c6 100644 --- a/src/test/kotlin/io/github/cdimascio/essence/EssenceSpec.kt +++ b/src/test/kotlin/io/github/cdimascio/essence/EssenceSpec.kt @@ -11,27 +11,27 @@ class EssenceSpec { @Test fun readsFavicon() { - checkFixture(site = "aolNews" , fields = listOf("favicon")) + checkFixture(site = "aolNews", fields = listOf("favicon")) } @Test fun readsDescription() { - checkFixture("allnewlyrics1" , listOf("description")) + checkFixture("allnewlyrics1", listOf("description")) } @Test fun readsOpenGraphDescription() { - checkFixture("twitter" , listOf("description")) + checkFixture("twitter", listOf("description")) } @Test fun readsKeywords() { - 
checkFixture("allnewlyrics1" , listOf("keywords")) + checkFixture("allnewlyrics1", listOf("keywords")) } @Test fun readsLang() { - checkFixture("allnewlyrics1" , listOf("lang")) + checkFixture("allnewlyrics1", listOf("lang")) } @Test @@ -145,6 +145,16 @@ class EssenceSpec { checkFixture(site = "cnet", fields = listOf("cleaned_text")) } +// @Test +// fun getsCleanedTextSch() { +// checkFixture(site = "sch1", fields = listOf("cleaned_text")) +// } + + @Test + fun getsCleanedTextKeras() { + checkFixture(site = "keras", fields = listOf("cleaned_text")) + } + @Test fun getsCleanedTextYahoo() { checkFixture(site = "yahoo", fields = listOf("cleaned_text")) @@ -190,9 +200,7 @@ class EssenceSpec { } private fun cleanTestingTest(newText: String, originalText: String): String { - return newText. - replace("""\n\n""", " "). - replace("""\ \ """, " ") + return newText.replace("""\n\n""", " ").replace("""\ \ """, " ") .substring(0, Math.min(newText.length, originalText.length)) } @@ -215,9 +223,11 @@ class EssenceSpec { val origText = cleanOrigText(expected["cleaned_text"].asText()) val newText = cleanTestingTest(data.text, origText) assertNotEquals("text should not be null", "", newText) - assertTrue(data.text.length >= origText.length) + println(origText) println(newText) + assertTrue(data.text.length >= origText.length) + assertEquals(origText, newText) } "link" -> { @@ -228,7 +238,7 @@ class EssenceSpec { } "description" -> { assertEquals(expected["meta_description"].asText(), data.description) - } + } "lang" -> { assertEquals(expected["meta_lang"].asText(), data.language) } @@ -247,7 +257,8 @@ class EssenceSpec { } "links" -> { val links = data.links.sortedBy { it.text } - val expectedLinks = expected["links"]?.map { Link(it["href"].asText(), it["text"].asText()) } ?: emptyList() + val expectedLinks = expected["links"]?.map { Link(it["href"].asText(), it["text"].asText()) } + ?: emptyList() links.zip(expectedLinks).forEach { (actual, expected) -> assertEquals(expected.text, 
actual.text) assertEquals(expected.href, actual.href) diff --git a/src/test/resources/fixtures/test_keras.html b/src/test/resources/fixtures/test_keras.html new file mode 100644 index 0000000..63ee49f --- /dev/null +++ b/src/test/resources/fixtures/test_keras.html @@ -0,0 +1,357 @@ + + + + + + + + + + + + + Why use Keras - Keras Documentation + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + + + + +
+
+
+ + +
+
+
+
+ +

Why use Keras?

+

There are countless deep learning frameworks available today. Why use Keras rather than any other? Here are some of the areas in which Keras compares favorably to existing alternatives.

+
+

Keras prioritizes developer experience

+
    +
  • Keras is an API designed for human beings, not machines. Keras follows best practices for reducing cognitive load: it offers consistent & simple APIs, it minimizes the number of user actions required for common use cases, and it provides clear and actionable feedback upon user error.
  • +
  • This makes Keras easy to learn and easy to use. As a Keras user, you are more productive, allowing you to try more ideas than your competition, faster -- which in turn helps you win machine learning competitions.
  • +
  • This ease of use does not come at the cost of reduced flexibility: because Keras integrates with lower-level deep learning languages (in particular TensorFlow), it enables you to implement anything you could have built in the base language. In particular, as tf.keras, the Keras API integrates seamlessly with your TensorFlow workflows.
  • +
+
+

Keras has broad adoption in the industry and the research community

+

+ + +

+ Deep learning frameworks ranking computed by Jeff Hale, based on 11 data sources across 7 categories +

+

With over 250,000 individual users as of mid-2018, Keras has stronger adoption in both the industry and the research community than any other deep learning framework except TensorFlow itself (and the Keras API is the official frontend of TensorFlow, via the tf.keras module).

+

You are already constantly interacting with features built with Keras -- it is in use at Netflix, Uber, Yelp, Instacart, Zocdoc, Square, and many others. It is especially popular among startups that place deep learning at the core of their products.

+

Keras is also a favorite among deep learning researchers, coming in #2 in terms of mentions in scientific papers uploaded to the preprint server arXiv.org. Keras has also been adopted by researchers at large scientific organizations, in particular CERN and NASA.

+
+

Keras makes it easy to turn models into products

+

Your Keras models can be easily deployed across a greater range of platforms than any other deep learning framework:

+ +
+

Keras supports multiple backend engines and does not lock you into one ecosystem

+

Your Keras models can be developed with a range of different deep learning backends. Importantly, any Keras model that only leverages built-in layers will be portable across all these backends: you can train a model with one backend, and load it with another (e.g. for deployment). Available backends include:

+
    +
  • The TensorFlow backend (from Google)
  • +
  • The CNTK backend (from Microsoft)
  • +
  • The Theano backend
  • +
+

Amazon is also currently working on developing a MXNet backend for Keras.

+

As such, your Keras model can be trained on a number of different hardware platforms beyond CPUs:

+ +
+

Keras has strong multi-GPU support and distributed training support

+ +
+

Keras development is backed by key companies in the deep learning ecosystem

+

Keras development is backed primarily by Google, and the Keras API comes packaged in TensorFlow as tf.keras. Additionally, Microsoft maintains the CNTK Keras backend. Amazon AWS is developing MXNet support. Other contributing companies include NVIDIA, Uber, and Apple (with CoreML).

+

+ + +

+ +
+
+ + +
+
+ +
+ +
+ +
+ + + GitHub + + + « Previous + + + Next » + + +
+ + + + + + + diff --git a/src/test/resources/fixtures/test_keras.json b/src/test/resources/fixtures/test_keras.json new file mode 100644 index 0000000..550ebfc --- /dev/null +++ b/src/test/resources/fixtures/test_keras.json @@ -0,0 +1,7 @@ +{ + "url": "http://keras.io/why-use-keras/", + "expected": { + "cleaned_text": "There are countless deep learning frameworks available today. Why use Keras rather than any other? Here are some of the areas in which Keras compares favorably to existing alternatives.", + "meta_lang": null + } +} diff --git a/src/test/resources/fixtures/test_sch1.html b/src/test/resources/fixtures/test_sch1.html new file mode 100644 index 0000000..d3b6a6a --- /dev/null +++ b/src/test/resources/fixtures/test_sch1.html @@ -0,0 +1,633 @@ + + + + + + + + + + + + Scholastic GO! + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + + + + + + + + Menu + Close + + +
+
+
+
+ +
+
+ + + + +
+
+ +
+
+
+

Top of Page

+
+ + +
+
+ +
+
+
+ +

Table of Contents

+
+
    +
    + +
    + +
    + + + +

    Hippopotamus

    + + + +
    + Lexile + Lexile: 820 +
    +

    Cite This Article

    + +

    The hippopotamus is a barrel-shaped beast with a large head and short legs. Its hide is thick, brown, and practically naked. Its tail is short. Its feet are broad, with padded soles. The four toes on each foot end in four little hooves that spread out, helping the hippo to walk across the soft mud of riverbanks.

    "Hippopotamus" means "river horse." The animal got this name partly because it spends much of its time in the water. And it may have been called a horse because of its great size or its wide nostrils or its little horselike ears. But its closest living relatives are the pigs.

    The hippopotamus is far larger than any horse. A big hippo may be only about 5 feet (1.5 meters) tall at the shoulder. But it can be about 12 feet (3.5 meters) long and weigh more than 3 1/2 tons. The hippopotamus has the biggest mouth of any mammal except the whale. It has two tusks in the upper jaw and four in the lower. When it attacks, it can kill a smaller animal with a single bite. Usually, though, the hippopotamus would rather hide than attack. Often it will run to the water to hide.

    A hippo can run as fast as a person. In the water it can drop out of sight like a stone or it can float. When the hippo floats, only its bulging nostrils and eyes and its little ears show above the surface. It is almost hidden, but it can still breathe, smell, see, and hear.

    When it sinks, the hippo closes its nostrils to keep the water out of its nose. It can walk around on the bottom and gather the juicy water plants it likes to eat. It can easily stay under for 8 or 9 minutes. In a zoo a hippo once stayed under for 29 minutes.

    The baby hippo is usually born right in the water. It can swim almost as soon as it is born. Sometimes it nurses under water, coming up now and then for air. If a mother and her calf are going to swim a long way, the baby sometimes rides on the mother's back.

    A million years ago or more, certain kinds of hippos lived in India, Europe, and England, as well as in Africa. Today they live only in Africa. They usually live together in herds.

    The hippo has an appetite to match its size. A big hippo that lives in a zoo may eat about 100 pounds (45 kilograms) of food a day. A herd in the wild will eat many kinds of river plants and grasses. Hippos usually feed at night and rest during the day. Sometimes they raid farms and eat the crops. As a result, many hippos in Africa have been killed or driven away from farming areas. Now they live mostly in tropical central Africa, usually around rivers and lakes in the parks set aside for wild animals.

    Some people say that when the hippopotamus is excited, it sweats blood. It does not, but its skin does give off drops of reddish oil as a kind of sweat. The oil looks somewhat like blood.

    The big hippo has a small cousin called the pygmy hippo. It looks like the big hippo. But its legs are longer for its size. And its body is much smaller. It is about 3 feet (1 meter) tall and weighs under 650 pounds (300 kilograms). Its oily sweat is clear, not red.

    Reviewed by

    Robert M. McClung

    Author, science books for children

    See also: Hoofed Mammals

    + +
    + +

    How to cite this article:

    + +

    MLA (Modern Language Association) style:

    +

    "Hippopotamus." Scholastic GO!, go.scholastic.com/content/schgo/D/article/a20/134/a2013440-h.html. Accessed .

    + + +

    Chicago Manual of Style:

    +

    "Hippopotamus." Scholastic GO!. https://go.scholastic.com/content/schgo/D/article/a20/134/a2013440-h.html (accessed ).

    + + +

    APA (American Psychological Association) style:

    +

    (). Hippopotamus. Retrieved , from Scholastic GO!. https://go.scholastic.com/content/schgo/D/article/a20/134/a2013440-h.html

    +
    +
    + +
    + + + + + + + +
    +
      + +
    • + The hippopotamus, one of the largest land mammals, lives only in Africa. It dwells near rivers and lakes. Hippos spend much of their time in the water, even mating and giving birth there. +
    • + +
    + + + +
    + + +
    + + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/test/resources/fixtures/test_sch1.json b/src/test/resources/fixtures/test_sch1.json new file mode 100644 index 0000000..1561fa6 --- /dev/null +++ b/src/test/resources/fixtures/test_sch1.json @@ -0,0 +1,9 @@ +{ + "url": "https://go.scholastic.com/D/article/a20/134/a2013440-h.html", + "expected": { + "title": "Hippopotamus", + "cleaned_text": "The hippopotamus is a barrel-shaped beast with a large head and short legs. Its hide is thick, brown, and practically naked. Its tail is short. Its feet are broad, with padded soles. The four toes on each foot end in four little hooves that spread out, helping the hippo to walk across the soft mud of riverbanks.\n\n\"Hippopotamus\" means \"river horse.\" The animal got this name partly because it spends much of its time in the water. And it may have been called a horse because of its great size or its wide nostrils or its little horselike ears. But its closest living relatives are the pigs.\n\nThe hippopotamus is far larger than any horse. A big hippo may be only about 5 feet (1.5 meters) tall at the shoulder. But it can be about 12 feet (3.5 meters) long and weigh more than 3 1/2 tons. The hippopotamus has the biggest mouth of any mammal except the whale. It has two tusks in the upper jaw and four in the lower. When it attacks, it can kill a smaller animal with a single bite. Usually, though, the hippopotamus would rather hide than attack. Often it will run to the water to hide.\n\nA hippo can run as fast as a person. In the water it can drop out of sight like a stone or it can float. When the hippo floats, only its bulging nostrils and eyes and its little ears show above the surface. It is almost hidden, but it can still breathe, smell, see, and hear.\n\nWhen it sinks, the hippo closes its nostrils to keep the water out of its nose. It can walk around on the bottom and gather the juicy water plants it likes to eat. 
It can easily stay under for 8 or 9 minutes. In a zoo a hippo once stayed under for 29 minutes.\n\nThe baby hippo is usually born right in the water. It can swim almost as soon as it is born. Sometimes it nurses under water, coming up now and then for air. If a mother and her calf are going to swim a long way, the baby sometimes rides on the mother's back.\n\nA million years ago or more, certain kinds of hippos lived in India, Europe, and England, as well as in Africa. Today they live only in Africa. They usually live together in herds.\n\nThe hippo has an appetite to match its size. A big hippo that lives in a zoo may eat about 100 pounds (45 kilograms) of food a day. A herd in the wild will eat many kinds of river plants and grasses. Hippos usually feed at night and rest during the day. Sometimes they raid farms and eat the crops. As a result, many hippos in Africa have been killed or driven away from farming areas. Now they live mostly in tropical central Africa, usually around rivers and lakes in the parks set aside for wild animals.\n\nSome people say that when the hippopotamus is excited, it sweats blood. It does not, but its skin does give off drops of reddish oil as a kind of sweat. The oil looks somewhat like blood.\n\nThe big hippo has a small cousin called the pygmy hippo. It looks like the big hippo. But its legs are longer for its size. And its body is much smaller. It is about 3 feet (1 meter) tall and weighs under 650 pounds (300 kilograms). Its oily sweat is clear, not red.", + "meta_lang": "en", + "links": [] + } +}