
Commit

[enhancement] Added ability to resume from cache (#217)
Added the ability to pause the tool and resume where you left off. Read
the README for more details.
AlexJSully authored Aug 26, 2024
1 parent 4db21a0 commit 740b403
Showing 7 changed files with 96 additions and 19 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this p

To see tags and releases, please go to [Tags](https://github.com/AlexJSully/Publication-Figure-Retrieval/tags) on [GitHub](https://github.com/AlexJSully/Publication-Figure-Retrieval).

## [3.0.1] - 2024-08-26

Feature:

- Re-added the ability to resume the process if it was canceled

## [3.0.0] - 2024-08-25

The `Publication Figures Web Scraper` has been renamed to `Publication Figure Retrieval` as it no longer scrapes data from the web. Instead, it retrieves data from the NCBI API. This major change was done to comply with the NCBI's terms of service and policies.
10 changes: 6 additions & 4 deletions README.md
@@ -30,17 +30,19 @@ Then run
npm install
```

followed by
### Running locally

To start and run the publication figure retrieval tool, run the following command:

```bash
npm start
npm run start
```

This tool runs within your Node.js environment. On Windows, this script may need to run in administrator mode.
If you choose to cancel this process at any time, you can resume where you left off by running the same command. The tool stores the already processed PMC IDs in `build/output/cache/id.json`. To reset the cache, delete the `id.json` file.
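
For reference, a minimal sketch of the pause/resume workflow described in the paragraph above, using the command and cache path as stated in this README:

```bash
# Start retrieval; cancel at any point (e.g. Ctrl+C)
npm run start

# Run the same command again to resume; PMC IDs already listed in
# build/output/cache/id.json are skipped
npm run start

# Reset the cache to reprocess everything from scratch
rm build/output/cache/id.json
```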

### Usage

The images are downloaded locally within the `build/processor/output` directory.
The images are downloaded locally within the `build/output` directory. They are organized by species and then by publication ID.

### API Key

4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "publication-figure-retriever",
"version": "3.0.0",
"version": "3.0.1",
"description": "This tool provides a method for retrieving figures from NCBI's PubMed publications using NIH APIs for open access and publicly available publications.",
"main": "index.ts",
"scripts": {
Expand Down
37 changes: 37 additions & 0 deletions src/processor/fetchArticleDetails.test.ts
@@ -1,4 +1,6 @@
import axios from "axios";
import fs from "fs";
import path from "path";
import { fetchArticleDetails } from "./fetchArticleDetails";
import { parseFigures } from "./parseFigures";

@@ -9,9 +11,27 @@ describe("fetchArticleDetails", () => {
const throttle = jest.fn((fn) => fn());
const pmids = ["PMC123456", "PMC654321"];
const species = "Homo sapiens";
const cachedIDsFilePath = path.resolve(__dirname, "../output/cache/id.json");

beforeEach(() => {
jest.clearAllMocks();
const dir = path.dirname(cachedIDsFilePath);
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir, { recursive: true });
}
if (fs.existsSync(cachedIDsFilePath)) {
fs.unlinkSync(cachedIDsFilePath);
}
// Ensure the cached ID file is empty
fs.writeFileSync(cachedIDsFilePath, JSON.stringify([]));
});

afterEach(() => {
// Delete the output directory
const dir = path.dirname(cachedIDsFilePath);
if (fs.existsSync(dir)) {
fs.rmdirSync(dir, { recursive: true });
}
});

it("should fetch article details in batches and call parseFigures", async () => {
@@ -38,4 +58,21 @@

consoleErrorSpy.mockRestore();
});

it("should cache fetched IDs and skip already cached IDs", async () => {
const mockResponse = { data: "<xml>mock data</xml>" };
(axios.get as jest.Mock).mockResolvedValue(mockResponse);

// Initial fetch to cache the IDs
await fetchArticleDetails(throttle, pmids, species);

expect(fs.existsSync(cachedIDsFilePath)).toBe(true);
const cachedIDs = JSON.parse(fs.readFileSync(cachedIDsFilePath, "utf-8"));
expect(cachedIDs).toEqual(pmids);

// Fetch again with the same IDs, should skip fetching
await fetchArticleDetails(throttle, pmids, species);

expect(axios.get).toHaveBeenCalledTimes(1); // Should not call axios.get again
});
});
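
To run just this spec locally, something like the following should work (assuming Jest is the repository's configured test runner, which the `describe`/`jest.fn` usage suggests; the exact npm test script is not shown in this diff):

```bash
npx jest src/processor/fetchArticleDetails.test.ts
```
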
54 changes: 43 additions & 11 deletions src/processor/fetchArticleDetails.ts
@@ -1,4 +1,6 @@
import axios from "axios";
import fs from "fs";
import path from "path";
import { parseFigures } from "./parseFigures";

/**
@@ -19,34 +21,64 @@ import { parseFigures } from "./parseFigures";
export async function fetchArticleDetails(
/** The throttling function to control the rate of API requests. */
throttle: any,
/** An array of PMCIDs to fetch details for. */
/** An array of PMC IDs to fetch details for. */
pmids: string[],
/** The species name to be used in the processing of figures. */
species: string,
): Promise<void> {
/** Number of PMCIDs per batch. */
/** Number of PMC IDs per batch. */
const batchSize = 50;

// Grab cached IDs
/** Path to the cached IDs file. */
const cachedIDsFilePath = path.resolve(__dirname, "../output/cache/id.json");
/** Cached IDs list. */
let cachedIDs: string[] = [];
// Check if the cached IDs file exists
if (fs.existsSync(cachedIDsFilePath)) {
const data = fs.readFileSync(cachedIDsFilePath, "utf-8");
cachedIDs = JSON.parse(data);
} else {
// Create the directory if it doesn't exist
fs.mkdirSync(path.dirname(cachedIDsFilePath), { recursive: true });
}

// Get article details based on PMC IDs
for (let i = 0; i < pmids.length; i += batchSize) {
// Extract a batch of 50 PMCIDs
/** A batch of 50 PMCIDs. */
// Extract a batch of 50 PMC IDs
const batch = pmids.slice(i, i + batchSize);
/** Comma-separated list of PMCIDs. */
const ids = batch.join(",");
/** The URL to fetch article details for the current batch. */

// Filter out IDs that are already cached
const newBatch = batch.filter((id) => !cachedIDs.includes(id));

if (newBatch.length === 0) {
console.log(
`All IDs in ${species.replace("_", " ")} batch ${i + 1}-${i + batch.length} are already cached.`,
);

continue;
}

/** Comma-separated string of PMC IDs for the batch. */
const ids = newBatch.join(",");
/** URL for fetching article details from the NCBI API. */
let url = `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=${ids}&retmode=xml`;
// Check if there is a NCBI API key available and if so, add it to the URL
// Add the API key if available
if (process?.env?.NCBI_API_KEY) {
url += `&api_key=${process.env.NCBI_API_KEY}`;
}

console.log(`Fetching article details for batch ${i + 1}-${i + batch.length}...`);
console.log(
`Fetching ${species.replace("_", " ")} article details for batch ${i + 1}-${i + newBatch.length}...`,
);

try {
// Make HTTP request to fetch article details in XML format for the current batch
/** The response from the API request. */
const response = await throttle(async () => await axios.get(url));
await parseFigures(throttle, response.data, species);

// Add the new IDs to the cached list and write to the file
cachedIDs.push(...newBatch);
fs.writeFileSync(cachedIDsFilePath, JSON.stringify(cachedIDs, null, 2));
} catch (error) {
console.error("Error fetching article details:", error);
}
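
The cache written in the `try` block above is just a pretty-printed JSON array of PMC IDs (`JSON.stringify(cachedIDs, null, 2)`), so the progress of a partial run can be inspected from the shell; the path below is the one stated in the README:

```bash
# List the PMC IDs that have already been processed
cat build/output/cache/id.json
# Expected shape: a JSON array such as ["PMC123456", "PMC654321"]
```
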
2 changes: 1 addition & 1 deletion src/processor/parseFigures.ts
@@ -56,7 +56,7 @@ export async function parseFigures(
// Download all figures for this article
for (const url of figureUrls) {
// Create the directory path for species and PMC ID
const outputDir = path.join(__dirname, "output", species, pmcId);
const outputDir = path.join(__dirname, "../output", species, pmcId);
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
