fix(scraper): add lock to web content access to avoid duplicates
idiotWu committed Dec 17, 2024
1 parent 1911889 commit a381706
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions npiai/tools/web/scraper/app.py
@@ -81,12 +81,17 @@ class Scraper(BrowserTool):
 
     _navigator: NavigatorAgent
 
+    # asyncio lock to prevent concurrent access to the webpage
+    # to avoid retrieving the same items multiple times
+    _webpage_access_lock: asyncio.Lock
+
     def __init__(self, batch_size: int = 10, **kwargs):
         super().__init__(**kwargs)
         self._navigator = NavigatorAgent(
             playwright=self.playwright,
         )
         self._batch_size = batch_size
+        self._webpage_access_lock = asyncio.Lock()
         self.add_tool(self._navigator)
 
     @classmethod
@@ -392,13 +397,14 @@ async def _parse(
         limit: int = -1,
         skip_item_hashes: Set[str] | None = None,
     ) -> ParsedResult | None:
-        # convert relative links to absolute links
-        await self._process_relative_links()
+        async with self._webpage_access_lock:
+            # convert relative links to absolute links
+            await self._process_relative_links()
 
-        if items_selector is None:
-            return await self._parse_ancestor(ancestor_selector, skip_item_hashes)
-        else:
-            return await self._parse_items(items_selector, limit, skip_item_hashes)
+            if items_selector is None:
+                return await self._parse_ancestor(ancestor_selector, skip_item_hashes)
+            else:
+                return await self._parse_items(items_selector, limit, skip_item_hashes)
 
     async def _parse_items(
         self,
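
Why the lock helps: _parse awaits the page between extracting items and recording their hashes, so two concurrent calls could both collect the same items before either marks them as seen. Serializing the whole read-then-mark sequence behind one asyncio.Lock closes that window. Below is a minimal, self-contained sketch of the same pattern; the names (webpage_access_lock, seen_hashes, read_current_items) are illustrative stand-ins, not npiai APIs.

import asyncio

# Illustrative stand-ins (not npiai code): a shared "page" and the hashes of
# items that have already been returned to a caller.
page_items = [f"item-{i}" for i in range(3)]
seen_hashes: set[str] = set()
webpage_access_lock = asyncio.Lock()


async def read_current_items() -> list[str]:
    # Stand-in for reading the DOM; the await is where unlocked
    # workers would interleave and read the same items.
    await asyncio.sleep(0.01)
    return list(page_items)


async def parse(worker: str) -> list[str]:
    # Serialize the whole read-then-mark sequence, mirroring the
    # "async with self._webpage_access_lock" block added in this commit.
    async with webpage_access_lock:
        items = await read_current_items()
        fresh = [item for item in items if item not in seen_hashes]
        seen_hashes.update(fresh)
        return fresh


async def main() -> None:
    results = await asyncio.gather(parse("a"), parse("b"))
    print(results)  # [['item-0', 'item-1', 'item-2'], []] -- no duplicates


asyncio.run(main())

Without the lock, both gathered calls could reach read_current_items() before either updates seen_hashes, and each would return all three items.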
