Skip to content

Commit

Permalink
Add two tests for SiteMapParserBolt
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Dinzinger <michael.dinzinger@uni-passau.de>
  • Loading branch information
michaeldinzinger committed Dec 19, 2023
1 parent d67ba6b commit cc2b57d
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ public void setupParserBolt() {
setupParserBolt(bolt);
}

// TODO add a test for a sitemap containing links
// to other sitemap files

@Test
public void testSitemapParsing() throws IOException {

Expand All @@ -62,9 +59,48 @@ public void testSitemapParsing() throws IOException {
Assert.assertEquals(3, fields.size());
}

/**
 * Parses a sitemap index fixture and checks what is emitted on the status stream: every emitted
 * tuple must carry the sitemap flag in its metadata, and exactly five tuples are expected.
 */
@Test
public void testSitemapIndexParsing() throws IOException {
    prepareParserBolt("test.parsefilters.json");

    // The input document is flagged as a sitemap and declared as XML via its content type.
    Metadata indexMetadata = new Metadata();
    indexMetadata.setValue(SiteMapParserBolt.isSitemapKey, "true");
    indexMetadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");

    String indexUrl = "http://www.tripadvisor.com/sitemap-index.xml";
    String indexFixture = "tripadvisor.sitemap.index.xml";
    parse(indexUrl, indexFixture, indexMetadata);

    // Position 1 of each status tuple is read as the Metadata object.
    for (List<Object> statusTuple : output.getEmitted(Constants.StatusStreamName)) {
        Metadata emittedMetadata = (Metadata) statusTuple.get(1);
        String sitemapFlag = emittedMetadata.getFirstValue(SiteMapParserBolt.isSitemapKey);
        Assert.assertEquals("true", sitemapFlag);
    }

    // The fixture lists four <sitemap> entries; presumably the fifth tuple is the index
    // document itself -- TODO confirm against SiteMapParserBolt.
    int emittedCount = output.getEmitted(Constants.StatusStreamName).size();
    Assert.assertEquals(5, emittedCount);
}

/**
 * Parses a gzip-compressed sitemap fixture, flagged as a sitemap but with no content type set,
 * and checks the number of tuples emitted on the status stream.
 */
@Test
public void testGzipSitemapParsing() throws IOException {
    prepareParserBolt("test.parsefilters.json");

    // Only the sitemap flag is provided; no mime-type is set on the metadata here.
    Metadata gzipMetadata = new Metadata();
    gzipMetadata.setValue(SiteMapParserBolt.isSitemapKey, "true");

    String gzipUrl = "https://www.tripadvisor.com/sitemap.xml.gz";
    String gzipFixture = "tripadvisor.sitemap.xml.gz";
    parse(gzipUrl, gzipFixture, gzipMetadata);

    // Presumably 50000 URL entries plus the sitemap document itself -- TODO confirm against
    // the binary fixture.
    int emittedCount = output.getEmitted(Constants.StatusStreamName).size();
    Assert.assertEquals(50001, emittedCount);
}

@Test
public void testSitemapParsingWithImageExtensions() throws IOException {
Map parserConfig = new HashMap();
Map<String, Object> parserConfig = new HashMap<>();
parserConfig.put("sitemap.extensions", Collections.singletonList(Extension.IMAGE.name()));
prepareParserBolt("test.parsefilters.json", parserConfig);

Expand All @@ -85,7 +121,7 @@ public void testSitemapParsingWithImageExtensions() throws IOException {

@Test
public void testSitemapParsingWithMobileExtensions() throws IOException {
Map parserConfig = new HashMap();
Map<String, Object> parserConfig = new HashMap<>();
parserConfig.put("sitemap.extensions", Collections.singletonList(Extension.MOBILE.name()));
prepareParserBolt("test.parsefilters.json", parserConfig);

Expand All @@ -106,7 +142,7 @@ public void testSitemapParsingWithMobileExtensions() throws IOException {

@Test
public void testSitemapParsingWithLinkExtensions() throws IOException {
Map parserConfig = new HashMap();
Map<String, Object> parserConfig = new HashMap<>();
parserConfig.put("sitemap.extensions", Collections.singletonList(Extension.LINKS.name()));
prepareParserBolt("test.parsefilters.json", parserConfig);

Expand All @@ -127,7 +163,7 @@ public void testSitemapParsingWithLinkExtensions() throws IOException {

@Test
public void testSitemapParsingWithNewsExtensions() throws IOException {
Map parserConfig = new HashMap();
Map<String, Object> parserConfig = new HashMap<>();
parserConfig.put("sitemap.extensions", Collections.singletonList(Extension.NEWS.name()));
prepareParserBolt("test.parsefilters.json", parserConfig);

Expand All @@ -148,7 +184,7 @@ public void testSitemapParsingWithNewsExtensions() throws IOException {

@Test
public void testSitemapParsingWithVideoExtensions() throws IOException {
Map parserConfig = new HashMap();
Map<String, Object> parserConfig = new HashMap<>();
parserConfig.put("sitemap.extensions", Collections.singletonList(Extension.VIDEO.name()));
prepareParserBolt("test.parsefilters.json", parserConfig);

Expand All @@ -169,7 +205,7 @@ public void testSitemapParsingWithVideoExtensions() throws IOException {

@Test
public void testSitemapParsingWithAllExtensions() throws IOException {
Map parserConfig = new HashMap();
Map<String, Object> parserConfig = new HashMap<>();

parserConfig.put(
"sitemap.extensions",
Expand Down Expand Up @@ -202,15 +238,15 @@ public void testSitemapParsingWithAllExtensions() throws IOException {

/**
 * Configures "sitemap.extensions" with the value "AUDIONEWSLINKS" and prepares the parser bolt
 * with that configuration; the test passes only if an IllegalArgumentException is thrown
 * while the method runs.
 */
@Test(expected = IllegalArgumentException.class)
public void testSitemapParsingWithIllegalExtensionConfigured() throws IOException {
// NOTE(review): the two consecutive parserConfig declarations below look like the removed
// (raw-type) and added (generic) lines of the same diff hunk -- confirm that only the
// generic declaration exists in the actual file.
Map parserConfig = new HashMap();
Map<String, Object> parserConfig = new HashMap<>();
// Presumably not a recognised extension name, which is what should trigger the exception.
parserConfig.put("sitemap.extensions", Arrays.asList("AUDIONEWSLINKS"));
prepareParserBolt("test.parsefilters.json", parserConfig);
}

@Test
public void testSitemapParsingNoMT() throws IOException {

Map parserConfig = new HashMap();
Map<String, Object> parserConfig = new HashMap<>();
parserConfig.put("sitemap.sniffContent", true);
parserConfig.put("parsefilters.config.file", "test.parsefilters.json");
bolt.prepare(
Expand Down
22 changes: 22 additions & 0 deletions core/src/test/resources/tripadvisor.sitemap.index.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?><sitemapindex
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd">
<sitemap>
<loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806509-en_US-hotel_review-1686849999.xml.gz</loc>
<lastmod>2023-06-15T17:26:39Z</lastmod>
</sitemap>
<sitemap>
<loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806530-en_US-hotel_review-1686850054.xml.gz</loc>
<lastmod>2023-06-15T17:27:34Z</lastmod>
</sitemap>
<sitemap>
<loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806537-en_US-hotel_review-1686850072.xml.gz</loc>
<lastmod>2023-06-15T17:27:52Z</lastmod>
</sitemap>
<sitemap>
<loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1841024-en_US-hotel_review-1694976638.xml.gz</loc>
<lastmod>2023-09-17T18:50:38Z</lastmod>
</sitemap>
</sitemapindex>
Binary file not shown.

0 comments on commit cc2b57d

Please sign in to comment.