Merge pull request #5 from shivendrra/dev
pulling new updated version from dev branch
Showing 30 changed files with 765 additions and 1,079 deletions.
@@ -1,4 +1,6 @@
from .youtube.base import Youtube as youtube
from .britannica.main import Britannica as britannica
from .wikipedia.main import WikiScraper as wikipedia
from .unsplash import unsplash
from .queries import Queries
from .utils import *
from ._transcripts import Youtube
from ._unsplash import Unsplash
from ._wiki import Wikipedia
from ._britannica import Britannica
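For context, this refactor swaps the old per-subpackage imports for flat module files (_transcripts, _unsplash, _wiki, _britannica). A minimal sketch of how the re-exported classes would then be imported downstream; the package name graze is an assumption here, since the diff does not show the package root:

# package name assumed for illustration only; substitute the actual package name
from graze import Youtube, Unsplash, Wikipedia, Britannica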
@@ -0,0 +1,102 @@
import os
import logging
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import timeit, time
import re

logging.basicConfig(filename="britannica_scraper.log", level=logging.ERROR)
current_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(current_dir)

def build_britannica_url(query, page_no):
  # Build a Britannica search-results URL for the given query and page number
  formatted_query = '%20'.join(query.split(' '))
  url = f"https://www.britannica.com/search?query={formatted_query}&page={page_no}"
  return url

def get_target_url(target_url, headers):
  # Fetch a search-results page and return the article links it contains,
  # retrying after 30 seconds whenever the rate limit (HTTP 429) is hit
  while True:
    r = requests.get(target_url, headers=headers)
    if r.status_code == 200:
      html_content = r.content
      soup = BeautifulSoup(html_content, 'html.parser')
      fetched_urls = soup.find_all('a', class_='md-crosslink')
      list_url = [url.get('href') for url in fetched_urls]
      return list_url
    elif r.status_code == 429:
      print(f"Rate limit exceeded. Waiting 30 secs before retrying: {target_url}")
      time.sleep(30)
    else:
      print(f"Skipping this URL due to status code {r.status_code}: {target_url}")
      return []

class Britannica:
  def __init__(self, filepath:str, max_limit:int=10, metrics:bool=False) -> None:
    self.directory, filename_with_ext = os.path.split(filepath)
    self.filename, ext = os.path.splitext(filename_with_ext)
    self.filename = self.filename.strip()
    if not os.path.exists(self.directory):
      os.makedirs(self.directory)
    self.max_limit = max_limit
    self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
    self.metrics = metrics
    self.total_urls = 0
    self.total_pages = 0

  def __call__(self, queries:list[str]):
    # For each query, walk through up to `max_limit` result pages, collect
    # article links, and append their extracted text to the output file
    if not queries:
      raise ValueError("Search queries can't be empty.")
    else:
      self.total_time = timeit.default_timer()
      for query in tqdm(queries, desc="Generating Britannica URLs"):
        page_no = 1
        for i in range(self.max_limit):
          target_url = build_britannica_url(query, page_no)
          new_urls = get_target_url(target_url, self.headers)
          if new_urls:
            self.write_urls_to_file(new_urls)
            self.total_urls += len(new_urls)
          page_no += 1

      self.total_time = timeit.default_timer() - self.total_time
      if self.metrics:
        self.get_metrics()

  def text_extractor(self, url_snippet):
    # Download one article and return its paragraph text, skipping the
    # editorial-review notice and stripping leftover HTML entities
    target_url = f"https://britannica.com{url_snippet}"
    r = requests.get(target_url, headers=self.headers)

    if r.status_code == 200:
      soup = BeautifulSoup(r.content, 'html.parser')
      paragraphs = soup.find_all('p')
      page = '\n'.join([p.get_text() for p in paragraphs if "Our editors will review what you’ve submitted and determine whether to revise the article." not in p.get_text()])
      page = re.sub(r'&\w+;', '', page)
      self.total_pages += 1
      return page
    else:
      print(f"Failed to fetch page content: {target_url}")
      return None

  def write_urls_to_file(self, url_snippets):
    # Extract the text of every fetched article link and append it to
    # <directory>/<filename>.txt
    filepath = os.path.join(self.directory, f"{self.filename}.txt")
    with open(filepath, 'a', encoding='utf-8') as f:
      for snippet in url_snippets:
        page = self.text_extractor(snippet)
        if page:
          f.write(page)
          f.write("\n")

  def get_metrics(self):
    # Print a short summary of how many URLs/pages were processed and the elapsed time
    print("\n")
    print("Britannica scraping metrics:\n")
    print("------------------------------------------------------")
    print(f"Total URLs fetched: {self.total_urls}")
    print(f"Total pages extracted: {self.total_pages}")
    if self.total_time < 60:
      print(f"Total time taken: {self.total_time:.2f} seconds")
    elif self.total_time < 3600:
      print(f"Total time taken: {self.total_time/60:.2f} minutes")
    else:
      print(f"Total time taken: {self.total_time/3600:.2f} hours")
    print("------------------------------------------------------")