diff --git a/cli.js b/cli.js
index ffdd203..ac13a50 100755
--- a/cli.js
+++ b/cli.js
@@ -215,13 +215,14 @@ ${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file
 `);
 
   await siteCrawler.produceSiteLinks();
+  const numberOfSiteLinks = siteCrawler.linkSet.size;
 
   if (!mainConfig.useExportedSitemap) {
     await log.toConsole(`
-||-> Site links exported to ${siteCrawler.exportFileName}
+||-> ${numberOfSiteLinks} URLs exported to ${siteCrawler.exportFileName}.sitemap.json
 `);
   } else {
     await log.toConsole(`
-||-> Site links read from ${siteCrawler.exportFileName}
+||-> ${numberOfSiteLinks} URLs read from ${siteCrawler.exportFileName}.sitemap.json
 `);
   }
@@ -247,8 +248,10 @@
   await outputter.writeDataAsync(formattedResult, outputFileName);
 
   log.endTimer();
+  const { elapsedTime } = log;
+  const friendlyTime = elapsedTime > 300 ? `${(elapsedTime / 60).toFixed(2)}m` : `${elapsedTime}s`;
   const endMessage = `
-| Finished after ${log.elapsedTime}s
+| Finished after ${friendlyTime}
 | Pages Scanned: ${totalPagesSearched}
 | Pages with a Match: ${pagesWithSelector.length}
 | Total Results: ${totalMatches}
diff --git a/package-lock.json b/package-lock.json
index 76a4830..afddce2 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "selector-hound",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "selector-hound",
-      "version": "2.0.0",
+      "version": "2.1.0",
       "license": "MIT",
       "dependencies": {
         "axios": "^0.21.1",
diff --git a/package.json b/package.json
index ae022f7..98c4c1d 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "selector-hound",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "description": "Find an element that matches a particular CSS selector on a website ",
   "type": "module",
   "keywords": [
diff --git a/src/site-crawler.js b/src/site-crawler.js
index ca9cc4f..dd7f003 100644
--- a/src/site-crawler.js
+++ b/src/site-crawler.js
@@ -158,17 +158,22 @@ export default class SiteCrawler {
       const parser = new this.libraries.Parser();
       parsedXml = await parser.parseStringPromise(data);
     } catch (getSitemapError) {
-      await log
-        .errorToFileAsync(getSitemapError)
-        .errorToConsoleAsync(
-          `Couldn't get the sitemap:\n ${getSitemapError}`,
-        );
+      await log.errorToFileAsync(getSitemapError);
+      await log.errorToConsoleAsync(
+        `Couldn't get the sitemap:\n ${getSitemapError}`,
+      );
     }
     return parsedXml;
   }
 
+  /**
+   * @description gets links to pages from a sitemap
+   * @param {Object} sitemapJson
+   * @returns {string[]} an array of href values to pages
+   */
   static getLinksFromSitemap(sitemapJson) {
     if (!sitemapJson) throw new Error('Sitemap JSON was not provided');
+    if (!sitemapJson.urlset) return [];
     const pageLinks = sitemapJson
       .urlset
       .url // note: each url node in the xml becomes an object in an array called url
       .map((urlObject) => urlObject.loc[0]);
 
     return pageLinks;
   }
@@ -177,6 +182,22 @@ export default class SiteCrawler {
+  /**
+   * @description gets links to sitemaps from a sitemap index
+   * @param {Object} sitemapJson
+   * @returns {string[]} an array of href values to sitemaps
+   */
+  static getSitemapsFromSitemap(sitemapJson) {
+    if (!sitemapJson) throw new Error('Sitemap JSON was not provided');
+    if (!sitemapJson.sitemapindex) return [];
+    const sitemapLinks = sitemapJson
+      .sitemapindex
+      .sitemap
+      .map((urlObject) => urlObject.loc[0]);
+
+    return sitemapLinks;
+  }
+
   /**
    * @description Gets only links from a string containing markup
    * @param {string} pageMarkup string containing markup
@@ -301,6 +322,32 @@ export default class SiteCrawler {
     }
   }
 
+  /**
+   * @description Fetches a sitemap and returns the links from it, recursing into any nested sitemaps
+   * @param {string} [sitemapUrl=this.config.startPage]
+   * @returns {Promise<string[]>} an array of href values
+   */
+  async getSitemapLinks(sitemapUrl = this.config.startPage) {
+    let sitemapUrls = [];
+    let nestedSitemaps = [];
+    try {
+      const sitemapJson = await this.getSitemapAsync(sitemapUrl);
+      sitemapUrls = SiteCrawler.getLinksFromSitemap(sitemapJson);
+      nestedSitemaps = SiteCrawler.getSitemapsFromSitemap(sitemapJson);
+
+      if (nestedSitemaps.length > 0) {
+        await forEachAsync(nestedSitemaps, async (nestedSitemap) => {
+          const nestedSitemapLinks = await this.getSitemapLinks(nestedSitemap);
+          sitemapUrls = [...sitemapUrls, ...nestedSitemapLinks];
+        });
+      }
+    } catch (getSitemapLinksError) {
+      await log.errorToFileAsync(getSitemapLinksError);
+    }
+
+    return sitemapUrls;
+  }
+
   /**
    * @description Fetches a sitemap and adds links to linkset
    * @param {string} [sitemapUrl=this.config.startPage]
@@ -309,8 +356,7 @@
     this.config.startPage = sitemapUrl;
     try {
-      const sitemapJson = await this.getSitemapAsync(sitemapUrl);
-      const sitemapUrls = SiteCrawler.getLinksFromSitemap(sitemapJson);
+      const sitemapUrls = await this.getSitemapLinks(sitemapUrl);
       this.addLinks(sitemapUrls);
     } catch (setSitemapError) {
       await this.errorToFileAsync(setSitemapError);
diff --git a/test/site-crawler.test.js b/test/site-crawler.test.js
index 1202562..e98d1ee 100644
--- a/test/site-crawler.test.js
+++ b/test/site-crawler.test.js
@@ -91,6 +91,40 @@ const MOCK_DATA = {
   `,
+  otherSitemap: `<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>http://frankmtaylor.com/foo.html</loc>
+    <lastmod>2022-01-06T16:36:33.516Z</lastmod>
+    <changefreq>monthly</changefreq>
+  </url>
+  <url>
+    <loc>http://frankmtaylor.com/bar.html</loc>
+    <lastmod>2022-01-06T16:36:33.618Z</lastmod>
+    <changefreq>monthly</changefreq>
+  </url>
+  <url>
+    <loc>http://frankmtaylor.com/baz.html</loc>
+    <lastmod>2022-01-06T16:36:33.664Z</lastmod>
+    <changefreq>monthly</changefreq>
+  </url>
+  <url>
+    <loc>http://frankmtaylor.com/beep.html</loc>
+    <lastmod>2022-01-06T16:36:33.721Z</lastmod>
+    <changefreq>monthly</changefreq>
+  </url>
+</urlset>
+  `,
+
+  nestedSitemap: `<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap>
+    <loc>https://frankmtaylor.com/sitemap.xml</loc>
+    <lastmod>2022-01-06T16:36:33.721Z</lastmod>
+  </sitemap>
+  <sitemap>
+    <loc>https://frankmtaylor.com/other-sitemap.xml</loc>
+    <lastmod>2022-01-06T16:36:33.721Z</lastmod>
+  </sitemap>
+</sitemapindex>`,
 };
@@ -99,6 +133,14 @@ axios.mockImplementation((url) => {
   switch (url) {
     case 'https://frankmtaylor.com/sitemap.xml':
       return Promise.resolve({
         data: MOCK_DATA.sitemap,
       });
+    case 'https://frankmtaylor.com/nested-sitemap.xml':
+      return Promise.resolve({
+        data: MOCK_DATA.nestedSitemap,
+      });
+    case 'https://frankmtaylor.com/other-sitemap.xml':
+      return Promise.resolve({
+        data: MOCK_DATA.otherSitemap,
+      });
     case 'https://frankmtaylor.com/portfolio/':
       return Promise.resolve({
         data: MOCK_DATA.portfolio,
       });
@@ -132,7 +174,6 @@ describe('getting file', () => {
  const siteCrawler = new SiteCrawler();
  siteCrawler.libraries.ajax = axios;
 
-
  test('getFileAsync', async () => {
    const result = await siteCrawler.getFileAsync('https://frankmtaylor.com/qualified/');
@@ -387,6 +428,32 @@ describe('SiteCrawler: Fetching Sitemap', () => {
       expect(sitemapLinks.length).toEqual(7);
     });
   });
+  describe('static getSitemapsFromSitemap', () => {
+    test('it will create an array from a json object', async () => {
+      const siteCrawler = new SiteCrawler();
+      siteCrawler.libraries.ajax = axios;
+      const siteMapJson = await siteCrawler.getSitemapAsync('https://frankmtaylor.com/nested-sitemap.xml');
+      const sitemapLinks = SiteCrawler.getSitemapsFromSitemap(siteMapJson);
+      expect(sitemapLinks).toBeInstanceOf(Array);
+      expect(sitemapLinks.length).toEqual(2);
+    });
+  });
+  describe('getSitemapLinks', () => {
+    const siteCrawler = new SiteCrawler();
+    siteCrawler.libraries.ajax = axios;
+
+    test('it gets links from a sitemap', async () => {
+      const sitemapLinks = await siteCrawler.getSitemapLinks('https://frankmtaylor.com/sitemap.xml');
+      expect(sitemapLinks).toBeInstanceOf(Array);
+      expect(sitemapLinks.length).toEqual(7);
+    });
+
+    test('it gets links from a nested sitemap', async () => {
+      const sitemapLinks = await siteCrawler.getSitemapLinks('https://frankmtaylor.com/nested-sitemap.xml');
+      expect(sitemapLinks).toBeInstanceOf(Array);
+      expect(sitemapLinks.length).toEqual(11);
+    });
+  });
   describe('setSitemap', () => {
     test('The linkSet will have the same links from sitemap', async () => {
       const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
@@ -406,10 +473,20 @@ describe('SiteCrawler: Fetching Sitemap', () => {
   describe('produceSiteLinks', () => {
     test('when produceSiteLinks is run, a file is created and it knows it, and still has data', async () => {
       const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
+      siteCrawler.libraries.ajax = axios;
       await siteCrawler.produceSiteLinks();
 
       expect(siteCrawler.hasExportedLinks).toEqual(true);
       expect(siteCrawler.linkSet.size).toBeGreaterThan(0);
       expect(siteCrawler.linkSet.has('http://frankmtaylor.com'));
     });
   });
+  describe('nested sitemap', () => {
+    test('it can crawl a nested sitemap', async () => {
+      const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/nested-sitemap.xml' });
+      siteCrawler.libraries.ajax = axios;
+
+      await siteCrawler.setSitemap();
+      expect(siteCrawler.linkSet.size).toEqual(11);
+    });
+  });
 });