From 0e1f62309d9bd3c18325018d478f8f928e5f3a3e Mon Sep 17 00:00:00 2001 From: JakubVavera <36154592+JakubVavera@users.noreply.github.com> Date: Mon, 25 Sep 2023 14:03:53 +0200 Subject: [PATCH] Vavera (#42) * Fixing issues * Remove comment * Fix node version * Fix deepscan issue * Fix wrong data type * Prettier formatting * Fix excluding pattern --------- Co-authored-by: Jakub --- .gitignore | 1 + README.md | 5 ++- package-lock.json | 89 +++++++++++++++++++++++++++++++++++++++++- package.json | 14 +++++++ src/cli-inputs.ts | 15 +++++-- src/index.ts | 25 ++++++------ src/link-visit.ts | 15 +++++-- src/logger.ts | 1 - src/sitemap-parsers.ts | 43 ++++++++++---------- src/types.ts | 3 +- 10 files changed, 166 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index f786fc3..8b9bdac 100644 --- a/.gitignore +++ b/.gitignore @@ -211,6 +211,7 @@ cmake-build-*/ # IntelliJ out/ +.idea # mpeltonen/sbt-idea plugin .idea_modules/ diff --git a/README.md b/README.md index b81380d..9b00e11 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # Site Spectre CLI -![npm](https://img.shields.io/npm/v/site-spectre) -![npm type definitions](https://img.shields.io/npm/types/site-spectre) + +[![npm](https://img.shields.io/npm/v/site-spectre)](https://www.npmjs.com/package/site-spectre) +[![npm type definitions](https://img.shields.io/npm/types/site-spectre)](https://www.typescriptlang.org/) [![DeepScan grade](https://deepscan.io/api/teams/22045/projects/25384/branches/794017/badge/grade.svg)](https://deepscan.io/dashboard#view=project&tid=22045&pid=25384&bid=794017) [![codecov](https://codecov.io/gh/profiq/site-spectre-cli/graph/badge.svg?token=AIW4AXPQ4R)](https://codecov.io/gh/profiq/site-spectre-cli) ![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/profiq/site-spectre-cli/publish-rc.yaml) diff --git a/package-lock.json b/package-lock.json index 7051469..d3b2875 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,18 +1,19 @@ { "name": "site-spectre", - "version": "1.0.0", + "version": "0.0.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "site-spectre", - "version": "1.0.0", + "version": "0.0.1", "license": "MIT", "dependencies": { "chalk": "^4.1.2", "commander": "^11.0.0", "fast-xml-parser": "^4.2.6", "lodash.chunk": "^4.2.0", + "node-fetch": "^3.3.2", "playwright-chromium": "^1.36.2", "winston": "^3.10.0" }, @@ -2955,6 +2956,14 @@ "node": ">= 8" } }, + "node_modules/data-uri-to-buffer": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz", + "integrity": "sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==", + "engines": { + "node": ">= 12" + } + }, "node_modules/debug": { "version": "4.3.4", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", @@ -3205,6 +3214,28 @@ "resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz", "integrity": "sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw==" }, + "node_modules/fetch-blob": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.2.0.tgz", + "integrity": "sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "paypal", + "url": "https://paypal.me/jimmywarting" + } + ], + "dependencies": { + "node-domexception": "^1.0.0", + "web-streams-polyfill": "^3.0.3" + }, + "engines": { + "node": "^12.20 || >= 14.13" + } + }, "node_modules/fill-range": { "version": "7.0.1", "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", @@ -3235,6 +3266,17 @@ "resolved": "https://registry.npmjs.org/fn.name/-/fn.name-1.1.0.tgz", "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==" }, + "node_modules/formdata-polyfill": { + "version": "4.0.10", + "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", + "integrity": "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==", + "dependencies": { + "fetch-blob": "^3.1.2" + }, + "engines": { + "node": ">=12.20.0" + } + }, "node_modules/fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", @@ -4761,6 +4803,41 @@ "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==", "dev": true }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz", + "integrity": "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==", + "dependencies": { + "data-uri-to-buffer": "^4.0.0", + "fetch-blob": "^3.1.4", + "formdata-polyfill": "^4.0.10" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/node-fetch" + } + }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -5720,6 +5797,14 @@ "makeerror": "1.0.12" } }, + "node_modules/web-streams-polyfill": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz", + "integrity": "sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==", + "engines": { + "node": ">= 8" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/package.json b/package.json index 74575ae..54c5eb5 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,20 @@ "reinstall": "rm -fr node_modules && npm install", "prepare": "husky install" }, + "engines": { + "node": ">18" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/profiq/site-spectre-cli.git" + }, + "keywords": [ + "cli", + "tool", + "console", + "terminal", + "command" + ], "author": "profiq", "license": "MIT", "bin": { diff --git a/src/cli-inputs.ts b/src/cli-inputs.ts index 079dee2..f96980f 100644 --- a/src/cli-inputs.ts +++ b/src/cli-inputs.ts @@ -36,6 +36,7 @@ const createConfig = (options: any) => { // custom headesr --> array parallelBlockSize: Number(options.parallel), customHeaders: {}, + excludePattern: options.exclude, dryRun: options.dry, debugMode: options.debug, silentRun: options.silent, @@ -54,7 +55,7 @@ const pasteConfigFile = (config: configType, customConfigFile: any) => { // custom headesr --> array config.parallelBlockSize = Number(customConfigFile.parallelBlockSize); config.customHeaders = {}; - config.dryRun = customConfigFile.dryRun; + (config.excludePattern = customConfigFile.exclude), (config.dryRun = customConfigFile.dryRun); config.debugMode = customConfigFile.debugMode; config.silentRun = customConfigFile.silentRun; config.configFilePath = customConfigFile.configFilePath; @@ -80,7 +81,8 @@ const updateConfig = (config: configType, options: OptionValues, customConfigFil config.pageLoadType = "network"; } - config.dryRun = options.dry ? true : customConfigFile.dryRun; + (config.excludePattern = options.exclude ? true : customConfigFile.exclude), + (config.dryRun = options.dry ? true : customConfigFile.dryRun); config.debugMode = options.debug ? true : customConfigFile.debugMode; config.silentRun = options.silent ? true : customConfigFile.silentRun; config.configFilePath = options.configFile @@ -105,4 +107,11 @@ const checkConfigFile = (config: any, options: OptionValues) => { } }; -export { sitesInput, createConfig, checkConfigFile, defaultParallel, defaultTimeout }; +export { + sitesInput, + createConfig, + pasteConfigFile, + checkConfigFile, + defaultParallel, + defaultTimeout, +}; diff --git a/src/index.ts b/src/index.ts index e04b8b5..cab5964 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,10 +1,7 @@ -import { processSitemap, _readSitemap, _parseSitemap } from "./sitemap-parsers"; +import { processSitemap } from "./sitemap-parsers"; import { visitConfigPrint, visitSitesWinston } from "./link-visit"; -import yargs, { option } from "yargs"; -import { Interface } from "readline"; import { Command, Option } from "commander"; import packageJSON from "../package.json"; -import { logger } from "./logger"; import { checkConfigFile, createConfig, @@ -12,9 +9,6 @@ import { defaultTimeout, sitesInput, } from "./cli-inputs"; -import chalk from "chalk"; -import { configType } from "./types"; -import { existsSync, readFileSync } from "fs"; const program = new Command(); @@ -41,10 +35,11 @@ program .default("document"), ) .option("-w, --no-wait-page-load", "Disable waiting for page to be loaded.") - .option("-h, --custom-headers", "Pass a custom header.") // - // .option("-d, --debug", "Sets the prinout level to debug.") // + .option("-h, --custom-headers", "Pass a custom header.") + .option("-d, --debug", "Sets the prinout level to debug.") .option("-D, --dry", "Just prints the links it would visit without visiting.") .option("-s, --silent", "Log only errors.") + .option("-e, --exclude ", "Regex expression for links that should be excluded.") .option( "-c, --config-file ", "JSON config file, if you specify any other parameters, they take priority over the config file.", @@ -56,6 +51,9 @@ const options = program.opts(); const config = createConfig(options); +const regexPattern = new RegExp(options.exclude); +config.excludePattern = regexPattern; + checkConfigFile(config, options); let sites: string[] = [ @@ -63,15 +61,18 @@ let sites: string[] = [ // "https://www.profiq.com/wp-sitemap-posts-post-1.xml", // "https://www.profiq.com/wp-sitemap-posts-page-1.xml", // "https://www.profiq.com/wp-sitemap-posts-job-1.xml", - // "https://movingfast.tech/post-sitemap.xml" + // "https://movingfast.tech/post-sitemap.xml", // "https://www.advancedhtml.co.uk/sitemap.txt" ]; visitConfigPrint(config); const runMain = async () => { - //prejmenovat linksToarary ProcessSitemap - const linksToVisit = await processSitemap(program.args[0], sitesInput(options.inputFile)); + const linksToVisit = await processSitemap( + program.args[0], + sitesInput(options.inputFile), + config, + ); await visitSitesWinston(linksToVisit, config); }; diff --git a/src/link-visit.ts b/src/link-visit.ts index 9b3058d..c529f1e 100644 --- a/src/link-visit.ts +++ b/src/link-visit.ts @@ -1,6 +1,5 @@ -import exp from "constants"; -import { BrowserContext, Page, chromium, devices } from "playwright-chromium"; -import { formatConnectionMessage, newLogger, logger, printPrefix } from "./logger"; +import { BrowserContext, chromium, devices } from "playwright-chromium"; +import { formatConnectionMessage, logger, printPrefix } from "./logger"; import chalk from "chalk"; import { totalNumberOfLinks } from "./sitemap-parsers"; import { configType } from "./types"; @@ -139,6 +138,14 @@ const visitConfigPrint = (config: configType) => { ), ); } + if (config.excludePattern) { + logger.log( + "info", + printPrefix( + `exclude: ${config.excludePattern} | Will exclude sites with ${config.excludePattern} regex. Setup using -e.`, + ), + ); + } logger.log( "info", printPrefix( @@ -182,7 +189,7 @@ async function visitSitesWinston(links: string[], config: configType) { let numOfOK = 0; let numOfErrors = 0; - logger.log("info", `expected total number of links: ${totalNumberOfLinks}\n`); + logger.log("info", `\nExpected total number of links: ${totalNumberOfLinks}\n`); const sTotalTime = performance.now(); diff --git a/src/logger.ts b/src/logger.ts index 14ff642..4f18ddc 100644 --- a/src/logger.ts +++ b/src/logger.ts @@ -1,5 +1,4 @@ import { createLogger, format, transports } from "winston"; -import chalk from "chalk"; const newLogger = () => { //@ts-ignore diff --git a/src/sitemap-parsers.ts b/src/sitemap-parsers.ts index fbb87d5..6b8f97b 100644 --- a/src/sitemap-parsers.ts +++ b/src/sitemap-parsers.ts @@ -1,8 +1,6 @@ import { XMLParser } from "fast-xml-parser"; -import { formatConnectionMessage, newLogger, logger } from "./logger"; -import chalk from "chalk"; -import { log } from "console"; -import { link } from "fs"; +import { logger } from "./logger"; +import { configType } from "./types"; let totalNumberOfLinks = 0; @@ -36,8 +34,9 @@ const _parseSitemap = async (url: string) => { * @param links array of sitemap links * @returns array of extracted links */ -const extractLinks = async (links: string[]): Promise => { +const extractLinks = async (links: string[], config?: configType): Promise => { let expandedLinks: string[] = []; + let excludedLinks = 0; let tmpLinks: string[] = []; for (let i = 0; i < links.length; i++) { @@ -47,20 +46,29 @@ const extractLinks = async (links: string[]): Promise => { tmpLinks = await _objToArray(parsedSitemapObject); } else if (links[i].endsWith(".txt")) { tmpLinks = await _txtLinkToArray(links[i]); - //aa } else { logger.log("error", `Invalid sitemap: ${links[i]}\n`); continue; } for (let j = 0; j < tmpLinks.length; j++) { + if (config?.excludePattern.source != "(?:)" && config?.excludePattern.test(tmpLinks[j])) { + if (config.debugMode) { + logger.log("info", `Skipping link (excluded): ${tmpLinks[j]}\n`); + } + excludedLinks++; + continue; + } expandedLinks.push(tmpLinks[j]); + totalNumberOfLinks++; } logger.log( "info", - `Found sitemap: ${links[i]}\nNumber of links in sitemap: ${tmpLinks.length}\n`, + `Found sitemap: ${links[i]}\nNumber of links in sitemap: ${tmpLinks.length}`, ); - //totalNumberOfLinks += tmpLinks.length; + if (config?.excludePattern.source != "(?:)") { + logger.log("info", `Number of excluded links in sitemap: ${excludedLinks}`); + } } catch (error) { logger.log("error", `Error extracting links from ${links[i]}, error: ${error}`); } @@ -87,7 +95,6 @@ const _objToArray = async (parsedSitemapObject: any): Promise => { } else if (parsedSitemapObject.hasOwnProperty("urlset")) { for (let i = 0; i < parsedSitemapObject.urlset.url.length; i++) { links.push(parsedSitemapObject.urlset.url[i].loc); - totalNumberOfLinks++; } return links; } else { @@ -125,7 +132,6 @@ const _txtLinkToArray = async (url: string) => { tmpArray.forEach((element) => { if (element.startsWith("http")) { finalArray.push(element); - totalNumberOfLinks++; } }); @@ -144,28 +150,25 @@ const _txtLinkToArray = async (url: string) => { * @param sites - array of xml site links we want to visit, optional * @returns array of all xml sitemap links we will visit */ -const processSitemap = async (url: string, sites: string[] = []): Promise => { +const processSitemap = async ( + url: string, + sites: string[] = [], + config?: configType, +): Promise => { let links = []; if (sites.length) { - logger.log("info", `reading from array of xml sites, num of links: ${sites.length}\n`); + logger.log("info", `Reading from array of xml sites, num of links: ${sites.length}\n`); return extractLinks(sites); } if (url.endsWith(".txt")) { logger.log("info", `Reading from txt sitemap, link: ${url}\n`); - // links = await _txtLinkToArray(url); - // return links; } else { logger.log("info", `Reading from xml sitemap, link: ${url}\n`); } - //vymenit objToarray a parseSitemap za extractLinks - //try catch do extractLinks - // let parsedSitemapObject = await _parseSitemap(url); - return extractLinks([url]); - // links = await _objToArray(parsedSitemapObject); - // return links; + return extractLinks([url], config); }; export { diff --git a/src/types.ts b/src/types.ts index b62f1fe..5fe795e 100644 --- a/src/types.ts +++ b/src/types.ts @@ -5,10 +5,11 @@ interface configType { parallelBlockSize: number; // 0 or 1 for non parallel mode customHeaders: Record; - dryRun: boolean; //nazev? + dryRun: boolean; debugMode: boolean; silentRun: boolean; + excludePattern: RegExp; configFilePath: string; //problem pri tvoreni sitesFilePath: string; }