Merge pull request #1 from surenpoghosian/feature/Scheduler
[feature/Scheduler] add scheduler, add scrape validator factory
surenpoghosian authored Jul 23, 2024
2 parents bc96268 + 126a2eb commit f580e65
Showing 13 changed files with 223 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -74,6 +74,7 @@ web_modules/

# dotenv environment variable files
.env
.env.production
.env.development
.env.staging
.env.development.local
4 changes: 4 additions & 0 deletions README.MD
@@ -3,5 +3,9 @@
### RUN
`npx tsx src/index.ts`

### RUN THE SCHEDULER

`npx tsx src/Scheduler/index.ts`

### TEST
`npm run test`
40 changes: 40 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions package.json
@@ -15,11 +15,14 @@
"devDependencies": {
"@types/jest": "^29.5.12",
"@types/node": "^20.14.10",
"@types/node-cron": "^3.0.11",
"@types/uuid": "^10.0.0",
"eslint": "^9.6.0",
"fs": "^0.0.1-security",
"globals": "^15.8.0",
"https": "^1.0.0",
"jest": "^29.7.0",
"node-cron": "^3.0.3",
"ts-jest": "^29.1.5",
"ts-node": "^10.9.2",
"tsx": "^4.16.2",
67 changes: 67 additions & 0 deletions src/Scheduler/index.ts
@@ -0,0 +1,67 @@
import cron from 'node-cron';
import Main from '..';

class ScraperScheduler {
  private cronExpression: string;
  private task: cron.ScheduledTask | null = null;

  constructor(
    cronExpression: string
  ) {
    this.cronExpression = cronExpression;
  }

  async scrape(): Promise<void> {
    console.log("Waking up to scrape...");

    const main = new Main();
    await main.run();

    console.log("Scraping done. Going back to sleep.");
  }

  start(): void {
    this.task = cron.schedule(this.cronExpression, () => {
      console.log('cron triggered');
      this.scrape().catch((error) => {
        console.error("Error during scraping:", error);
      });
    });

    console.log(`Scheduler started with cron expression: ${this.cronExpression}`);
  }

  stop(): void {
    if (this.task) {
      this.task.stop();
      console.log("Scheduler stopped.");
    }
  }
}

// Configuration and initialization
// const cronExpression = '0 */12 * * *'; // Every 12 hours
const cronExpression = '*/5 * * * *'; // Every 5 minutes
// const cronExpression = '* * * * *'; // Every minute

const scheduler = new ScraperScheduler(
  cronExpression
);

// Start the scheduler
scheduler.start();

// Handle graceful shutdown
process.on('SIGINT', () => {
  console.log("Received SIGINT. Gracefully shutting down.");
  scheduler.stop();
  process.exit(0);
});

process.on('SIGTERM', () => {
  console.log("Received SIGTERM. Gracefully shutting down.");
  scheduler.stop();
  process.exit(0);
});

export default ScraperScheduler;
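
Since ScraperScheduler is also the module's default export, it can be driven from another entry point with a different cron expression. A minimal sketch, assuming the caller sits directly under src/ so that './Scheduler' resolves to the file added here:

import ScraperScheduler from './Scheduler';

// Cron fields: minute hour day-of-month month day-of-week.
// '0 3 * * *' fires once a day at 03:00.
const nightly = new ScraperScheduler('0 3 * * *');
nightly.start();

// Later, e.g. from a shutdown hook or a test teardown:
// nightly.stop();

Note that importing the module as written also executes its top-level configuration block (the 5-minute schedule and the signal handlers), so in practice the class definition might be split from the self-starting script.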
6 changes: 3 additions & 3 deletions src/ScrapableFactory/index.ts
@@ -1,12 +1,12 @@
import ListAm from "./products/ListAm";
import MobileCentre from "./products/MobileCentre";
import { IScraper, ScrapeableVariant } from "../configs/types";
import { IScraper, IScrapeValidator, ScrapeableVariant } from "../configs/types";

class ScrapableFactory {
static createScrapable(type: ScrapeableVariant, scraper: IScraper) {
static createScrapable(type: ScrapeableVariant, scraper: IScraper, validator: IScrapeValidator) {
switch (type) {
case ScrapeableVariant.LISTAM:
return new ListAm(scraper);
return new ListAm(scraper, validator);
case ScrapeableVariant.MOBILECENTRE:
return new MobileCentre(scraper);
default:
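
The two factories are meant to be used together: the validator built by ScrapeValidatorFactory is passed into ScrapableFactory.createScrapable alongside the scraper. A minimal wiring sketch, assuming ScrapableFactory is the default export of its module (not shown in this hunk), that the caller sits under src/, and that a scraper instance is obtained elsewhere (ScrapersFactory is imported in ListAm.ts but its API is not part of this diff); the scrape id and path are illustrative:

import ScrapableFactory from "./ScrapableFactory";
import ScrapeValidatorFactory from "./ScrapeValidatorFactory";
import { IScraper, ScrapeableVariant, ScrapeValidatorVariant, ScrapeType } from "./configs/types";

async function runListAmScrape(scraper: IScraper) {
  // Build a validator for the target site, then hand it to the scrapable factory.
  const validator = ScrapeValidatorFactory.createScrapeValidator(ScrapeValidatorVariant.LISTAM);
  const listAm = ScrapableFactory.createScrapable(ScrapeableVariant.LISTAM, scraper, validator);

  // Hypothetical scrape id and path; real values depend on the caller.
  await listAm.scrape('scrape-001', '/category/60', ScrapeType.LIST);
}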
49 changes: 20 additions & 29 deletions src/ScrapableFactory/products/ListAm.ts
@@ -1,44 +1,35 @@
import ScrapersFactory from "../../ScrapersFactory";
import { IScraper, Scrapable, ScraperType, ScrapeType } from "../../configs/types";
import { IScraper, IScrapeValidator, Scrapable, ScrapeType } from "../../configs/types";
import { ListAmBaseURL as baseUrl } from '../../configs/constants';

class ListAm implements Scrapable {
public scraper: IScraper;
private scraper: IScraper;
private validator: IScrapeValidator;

constructor(scraper: IScraper) {
constructor(scraper: IScraper, validator: IScrapeValidator) {
this.scraper = scraper;
this.validator = validator;
}

scrape = async (scrapeId: string, path: string, scrapeType: ScrapeType) => {
const html = await this.scraper.get(baseUrl, path);

const data = this.scraper.parse(html);

if (scrapeType === ScrapeType.ITEM) {
const price = data.querySelector('.xprice')?.rawText;
const description = data.querySelector('.body')?.rawText;

const otherDetails = data.querySelectorAll('.c')
.map(item => {
return {
key: item.querySelector('.t')?.rawText,
data: item.querySelector('.i')?.rawText,
};
})
.filter(item => item.key !== undefined && item.data !== undefined) as { key: string; data: string }[];

await this.scraper.save({ scrapeId, baseUrl, path, scrapeType, html, price, description, otherDetails });

} else if (scrapeType === ScrapeType.LIST) {
const parentDivs = data.querySelectorAll('.gl');
console.log({html, parentDivs})
if (!parentDivs.length) {
throw new Error('Parent div not found');
}

const hrefs = parentDivs.map(item => item.querySelectorAll('a').map(link => link.getAttribute('href')));

await this.scraper.save({ scrapeId, baseUrl, path, scrapeType, html, hrefs });
const isHtmlValid = this.validator.validate(data, scrapeType);

if (isHtmlValid) {
await this.scraper.save({ scrapeId, baseUrl, path, scrapeType, html });
} else {
console.error('scraped html is not valid');
}

const nextLinkExists = data.querySelectorAll('a').some(a => {
const text = a.text.trim();
return text === 'Հաջորդը >' || text === 'Next >' || text === 'Следующая >';
});

if (!nextLinkExists) {
throw new Error('...it was the last page');
}
};
};
1 change: 0 additions & 1 deletion src/ScrapableFactory/products/MobileCentre.ts
@@ -1,6 +1,5 @@
import { Scrapable } from "../../configs/types";
import { IScraper } from "../../configs/types";
import { MobileCentreBaseURL } from "../../configs/constants";

export default class MobileCentre implements Scrapable {
public scraper: IScraper;
18 changes: 18 additions & 0 deletions src/ScrapeValidatorFactory/index.ts
@@ -0,0 +1,18 @@
import ListAm from "./products/ListAm";
import MobileCentre from "./products/MobileCentre";
import { ScrapeValidatorVariant } from "../configs/types";

class ScrapeValidatorFactory {
  static createScrapeValidator(type: ScrapeValidatorVariant) {
    switch (type) {
      case ScrapeValidatorVariant.LISTAM:
        return new ListAm();
      case ScrapeValidatorVariant.MOBILECENTRE:
        return new MobileCentre();
      default:
        throw new Error(`Unsupported scrape type: ${type}`);
    }
  }
};

export default ScrapeValidatorFactory;
27 changes: 27 additions & 0 deletions src/ScrapeValidatorFactory/products/ListAm.ts
@@ -0,0 +1,27 @@
import { HTMLElement } from "node-html-parser";
import { ScrapeType, IScrapeValidator } from "../../configs/types";

class ListAm implements IScrapeValidator {

  validate(html: HTMLElement, scrapeType: ScrapeType): boolean {
    console.log(`Validating scrape...`);

    switch (scrapeType) {
      case ScrapeType.ITEM:
        throw new Error('ScrapeType.ITEM validator logic is not implemented!');
      case ScrapeType.LIST:
        const parentDivs = html.querySelectorAll('.gl');
        if (!parentDivs.length) {
          return false;
        }
        break;
      default:
        break;
    }

    console.log('HTML is valid');
    return true;
  };
}

export default ListAm;
11 changes: 11 additions & 0 deletions src/ScrapeValidatorFactory/products/MobileCentre.ts
@@ -0,0 +1,11 @@
import { HTMLElement } from "node-html-parser";
import { ScrapeType, IScrapeValidator } from "../../configs/types";

class MobileCentre implements IScrapeValidator {

  validate(html: HTMLElement, scrapeType: ScrapeType): boolean {
    throw new Error('function not implemented')
  };
}

export default MobileCentre;
13 changes: 11 additions & 2 deletions src/configs/types.ts
@@ -1,7 +1,6 @@
import { HTMLElement } from "node-html-parser";

export interface Scrapable {
scraper: IScraper;
scrape: (scrapeId: string, path: string, scrapeType: ScrapeType) => void;
}

@@ -55,7 +54,7 @@ export enum ListAmCurrency {
}

export enum ListAmGeolocation {
YEREVAN = 'n=1', // n=1 Yerevan
YEREVAN = 'n=1', // n=1 Yerevan
AJAPNYAK = 'n=2', // n=2 Yerevan/Ajapnyak
ARABKIR = 'n=3', // n=3 Yerevan/Arabkir
AVAN = 'n=4', // n=4 Yerevan/Avan
@@ -79,3 +78,13 @@ export enum ListAmCategory {
PARKING_LOT_AND_GARAGE_RENT = '175',
ROOM_FOR_A_RENT = '212',
}

export interface IScrapeValidator {
validate(html: HTMLElement, scrapeType: ScrapeType): boolean;
}


export enum ScrapeValidatorVariant {
LISTAM,
MOBILECENTRE,
}
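
The IScrapeValidator contract added here is what each per-site validator implements. A hypothetical validator for a not-yet-supported source could look like the sketch below; the class name and the '.listing' selector are placeholders, the import path assumes a file directly under src/, and a matching ScrapeValidatorVariant member plus a factory case would still be needed:

import { HTMLElement } from "node-html-parser";
import { ScrapeType, IScrapeValidator } from "./configs/types";

// Hypothetical validator; not part of this commit.
class NewSiteValidator implements IScrapeValidator {
  validate(html: HTMLElement, scrapeType: ScrapeType): boolean {
    if (scrapeType === ScrapeType.LIST) {
      // Treat the page as valid only if the (placeholder) result container is present.
      return html.querySelectorAll('.listing').length > 0;
    }
    return true;
  }
}

export default NewSiteValidator;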
