diff --git a/cli.js b/cli.js
index ffdd203..ac13a50 100755
--- a/cli.js
+++ b/cli.js
@@ -215,13 +215,14 @@ ${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file
`);
await siteCrawler.produceSiteLinks();
+ const numberOfSiteLinks = siteCrawler.linkSet.size;
if (!mainConfig.useExportedSitemap) {
await log.toConsole(`
- ||-> Site links exported to ${siteCrawler.exportFileName}
+ ||-> ${numberOfSiteLinks} URLs exported to ${siteCrawler.exportFileName}.sitemap.json
`);
} else {
await log.toConsole(`
- ||-> Site links read from ${siteCrawler.exportFileName}
+ ||-> ${numberOfSiteLinks} URLs read from ${siteCrawler.exportFileName}.sitemap.json
`);
}
@@ -247,8 +248,10 @@ ${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file
await outputter.writeDataAsync(formattedResult, outputFileName);
log.endTimer();
+ const { elapsedTime } = log;
+ const friendlyTime = elapsedTime > 300 ? `${(elapsedTime / 60).toFixed(2)}m` : `${elapsedTime}s`;
const endMessage = `
-| Finished after ${log.elapsedTime}s
+| Finished after ${friendlyTime}
| Pages Scanned: ${totalPagesSearched}
| Pages with a Match: ${pagesWithSelector.length}
| Total Results: ${totalMatches}
diff --git a/package-lock.json b/package-lock.json
index 76a4830..afddce2 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
{
"name": "selector-hound",
- "version": "2.0.0",
+ "version": "2.1.0",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "selector-hound",
- "version": "2.0.0",
+ "version": "2.1.0",
"license": "MIT",
"dependencies": {
"axios": "^0.21.1",
diff --git a/package.json b/package.json
index ae022f7..98c4c1d 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "selector-hound",
- "version": "2.0.0",
+ "version": "2.1.0",
"description": "Find an element that matches a particular CSS selector on a website ",
"type": "module",
"keywords": [
diff --git a/src/site-crawler.js b/src/site-crawler.js
index ca9cc4f..dd7f003 100644
--- a/src/site-crawler.js
+++ b/src/site-crawler.js
@@ -158,17 +158,22 @@ export default class SiteCrawler {
const parser = new this.libraries.Parser();
parsedXml = await parser.parseStringPromise(data);
} catch (getSitemapError) {
- await log
- .errorToFileAsync(getSitemapError)
- .errorToConsoleAsync(
- `Couldn't get the sitemap:\n ${getSitemapError}`,
- );
+ await log.errorToFileAsync(getSitemapError);
+ await log.errorToConsoleAsync(
+ `Couldn't get the sitemap:\n ${getSitemapError}`,
+ );
}
return parsedXml;
}
+ /**
+ * @description gets links to pages from a sitemap
+ * @param {Object} sitemapJson
+ * @returns {string[]} an array of href values to sitemaps
+ */
static getLinksFromSitemap(sitemapJson) {
if (!sitemapJson) throw new Error('Sitemap JSON was not provided');
+ if (!sitemapJson.urlset) return [];
const pageLinks = sitemapJson
.urlset
.url // note: each url node in the xml becomes object in an array called url
@@ -177,6 +182,22 @@ export default class SiteCrawler {
return pageLinks;
}
+ /**
+ * @description gets links to sitemaps from a sitemap
+ * @param {object} sitemapJson
+ * @returns {string[]} an array of href values to sitemaps
+ */
+ static getSitemapsFromSitemap(sitemapJson) {
+ if (!sitemapJson) throw new Error('Sitemap JSON was not provided');
+ if (!sitemapJson.sitemapindex) return [];
+ const sitemapLinks = sitemapJson
+ .sitemapindex
+ .sitemap
+ .map((urlObject) => urlObject.loc[0]);
+
+ return sitemapLinks;
+ }
+
/**
* @description Gets only links from a string containing markup
* @param {string} pageMarkup string containing markup
@@ -301,6 +322,32 @@ export default class SiteCrawler {
}
}
+ /**
+ * @description Fetches a sitemap and returns the links from it
+ * @param {string} [sitemapUrl=this.config.startPage]
+ * @returns {string[]} an array of href values
+ */
+ async getSitemapLinks(sitemapUrl = this.config.startPage) {
+ let sitemapUrls = [];
+ let nestedSitemaps = [];
+ try {
+ const sitemapJson = await this.getSitemapAsync(sitemapUrl);
+ sitemapUrls = SiteCrawler.getLinksFromSitemap(sitemapJson);
+ nestedSitemaps = SiteCrawler.getSitemapsFromSitemap(sitemapJson);
+
+ if (nestedSitemaps.length > 0) {
+ await forEachAsync(nestedSitemaps, async (nestedSitemap) => {
+ const nestedSitemapLinks = await this.getSitemapLinks(nestedSitemap);
+ sitemapUrls = [...sitemapUrls, ...nestedSitemapLinks];
+ });
+ }
+ } catch (setSitemapError) {
+ await log.errorToFileAsync(setSitemapError);
+ }
+
+ return sitemapUrls;
+ }
+
/**
* @description Fetches a sitemap and adds links to linkset
* @param {string} [sitemapUrl=this.config.startPage]
@@ -309,8 +356,7 @@ export default class SiteCrawler {
this.config.startPage = sitemapUrl;
try {
- const sitemapJson = await this.getSitemapAsync(sitemapUrl);
- const sitemapUrls = SiteCrawler.getLinksFromSitemap(sitemapJson);
+ const sitemapUrls = await this.getSitemapLinks(sitemapUrl);
this.addLinks(sitemapUrls);
} catch (setSitemapError) {
await this.errorToFileAsync(setSitemapError);
diff --git a/test/site-crawler.test.js b/test/site-crawler.test.js
index 1202562..e98d1ee 100644
--- a/test/site-crawler.test.js
+++ b/test/site-crawler.test.js
@@ -91,6 +91,40 @@ const MOCK_DATA = {
`,
+  otherSitemap: `<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url>
+  <loc>http://frankmtaylor.com/foo.html</loc>
+  <lastmod>2022-01-06T16:36:33.516Z</lastmod>
+  <changefreq>monthly</changefreq>
+</url>
+<url>
+  <loc>http://frankmtaylor.com/bar.html</loc>
+  <lastmod>2022-01-06T16:36:33.618Z</lastmod>
+  <changefreq>monthly</changefreq>
+</url>
+<url>
+  <loc>http://frankmtaylor.com/baz.html</loc>
+  <lastmod>2022-01-06T16:36:33.664Z</lastmod>
+  <changefreq>monthly</changefreq>
+</url>
+<url>
+  <loc>http://frankmtaylor.com/beep.html</loc>
+  <lastmod>2022-01-06T16:36:33.721Z</lastmod>
+  <changefreq>monthly</changefreq>
+</url>
+</urlset>
+  `,
+
+  nestedSitemap: `<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap>
+  <loc>https://frankmtaylor.com/sitemap.xml</loc>
+  <lastmod>2022-01-06T16:36:33.721Z</lastmod>
+</sitemap>
+<sitemap>
+  <loc>https://frankmtaylor.com/other-sitemap.xml</loc>
+  <lastmod>2022-01-06T16:36:33.721Z</lastmod>
+</sitemap>
+</sitemapindex>`,
};
axios.mockImplementation((url) => {
@@ -99,6 +133,14 @@ axios.mockImplementation((url) => {
return Promise.resolve({
data: MOCK_DATA.sitemap,
});
+ case 'https://frankmtaylor.com/nested-sitemap.xml':
+ return Promise.resolve({
+ data: MOCK_DATA.nestedSitemap,
+ });
+ case 'https://frankmtaylor.com/other-sitemap.xml':
+ return Promise.resolve({
+ data: MOCK_DATA.otherSitemap,
+ });
case 'https://frankmtaylor.com/portfolio/':
return Promise.resolve({
data: MOCK_DATA.portfolio,
@@ -132,7 +174,6 @@ describe('getting file', () => {
const siteCrawler = new SiteCrawler();
siteCrawler.libraries.ajax = axios;
-
test('getFileAsync', async () => {
const result = await siteCrawler.getFileAsync('https://frankmtaylor.com/qualified/');
@@ -387,6 +428,32 @@ describe('SiteCrawler: Fetching Sitemap', () => {
expect(sitemapLinks.length).toEqual(7);
});
});
+ describe('static getsitemaps', () => {
+ test('it will create an array from a json object', async () => {
+ const siteCrawler = new SiteCrawler();
+ siteCrawler.libraries.ajax = axios;
+ const siteMapJson = await siteCrawler.getSitemapAsync('https://frankmtaylor.com/nested-sitemap.xml');
+ const sitemapLinks = SiteCrawler.getSitemapsFromSitemap(siteMapJson);
+ expect(sitemapLinks).toBeInstanceOf(Array);
+ expect(sitemapLinks.length).toEqual(2);
+ });
+ });
+ describe('getSitemapLinks', () => {
+ const siteCrawler = new SiteCrawler();
+ siteCrawler.libraries.ajax = axios;
+
+ test('it gets links from a sitemap', async () => {
+ const sitemapLinks = await siteCrawler.getSitemapLinks('https://frankmtaylor.com/sitemap.xml');
+ expect(sitemapLinks).toBeInstanceOf(Array);
+ expect(sitemapLinks.length).toEqual(7);
+ });
+
+ test('it gets links from a nested sitemap', async () => {
+ const sitemapLinks = await siteCrawler.getSitemapLinks('https://frankmtaylor.com/nested-sitemap.xml');
+ expect(sitemapLinks).toBeInstanceOf(Array);
+ expect(sitemapLinks.length).toEqual(11);
+ });
+ });
describe('setSitemap', () => {
test('The linkSet will have the same links from sitemap', async () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
@@ -406,10 +473,20 @@ describe('SiteCrawler: Fetching Sitemap', () => {
describe('produceSiteLinks', () => {
test('when produceSiteLinks is run, a file is created and it knows it, and still has data', async () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
+ siteCrawler.libraries.ajax = axios;
await siteCrawler.produceSiteLinks();
expect(siteCrawler.hasExportedLinks).toEqual(true);
expect(siteCrawler.linkSet.size).toBeGreaterThan(0);
expect(siteCrawler.linkSet.has('http://frankmtaylor.com'));
});
});
+ describe('nested sitemap', () => {
+ test('it can crawl a nested sitemap', async () => {
+ const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/nested-sitemap.xml' });
+ siteCrawler.libraries.ajax = axios;
+
+ await siteCrawler.setSitemap();
+ expect(siteCrawler.linkSet.size).toEqual(11);
+ });
+ });
});