diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 60ce418..d5ba21c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,6 +27,6 @@ If you encounter any bugs, issues, or have feature requests, please [open an iss
 
 ## Questions
 
-If you have any questions or need further clarification, feel free to reach out to me, email: shivharsh44@gmai.com
+If you have any questions or need further clarification, feel free to reach out to me, email: shivharsh44@gmail.com
 
 I appreciate your contributions and hope you enjoy working with us on this project!
diff --git a/README.md b/README.md
index 06ec79c..e2208f4 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This repository contains a collection of scripts to scrape content from various
 - [Usage](#usage)
   - [YouTube Scraper](#youtube-scraper)
   - [Wikipedia Scraper](#wikipedia-scraper)
-  - [Britannica Scraper](#britannica-scraper)
+  - [Unsplash Scraper](#unsplash-scraper)
 - [Configuration](#configuration)
 - [Logging](#logging)
 
@@ -33,7 +33,35 @@ This repository contains a collection of scripts to scrape content from various
 
 ## Usage
 
-### YouTube Scraper
+### 1. Queries
+
+This library contains some topics, keywords, search queries & channel ids which you can just load & use with the respective scrapers.
+
+#### Channel Ids
+
+```python
+from graze.queries import Queries
+
+queries = Queries(category="channel")
+```
+
+#### Search Queries
+
+```python
+from graze.queries import Queries
+
+queries = Queries(category="search")
+```
+
+#### Image Topics
+
+```python
+from graze.queries import Queries
+
+queries = Queries(category="images")
+```
+
+### 2. YouTube Scraper
 
 The YouTube scraper fetches video captions from a list of channels.
 
@@ -54,65 +82,40 @@ The YouTube scraper fetches video captions from a list of channels.
 
 #### Running the Scraper
 
 ```python
+import os
 from dotenv import load_dotenv
 load_dotenv()
+current_directory = os.path.dirname(os.path.abspath(__file__))
+os.chdir(current_directory)
+
 api_key = os.getenv('yt_key')
 
-from graze import youtube
+from graze import Youtube
+from graze.queries import Queries
+
+queries = Queries(category="channel")
 
-scraper = youtube(api_key=api_key, filepath='./output.txt')
-scraper()
+youtube = Youtube(api_key=api_key, filepath='../transcripts', max_results=50)
+youtube(channel_ids=queries(), videoUrls=True)
 ```
 
-### Wikipedia Scraper
+### 3. Wikipedia Scraper
 
 The Wikipedia scraper generates target URLs from provided queries, fetches the complete web page, and writes it to a file.
 
-#### Configuration
-- Define your search queries in `queries.py`:
-  ```python
-  class WikiQueries:
-      def __init__(self):
-          self.search_queries = ["topic1", "topic2", "topic3"]
-
-      def __call__(self):
-          return self.search_queries
-  ```
-
 #### Running the Scraper
 
 ```python
-from graze import wikipedia
-
-wiki = wikipedia()
-wiki(out_file='./output.txt')
-```
-
-### Britannica Scraper
-
-The Britannica scraper fetches content based on search queries and writes it to a file.
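+# Wikipedia() creates the parent directory of `filepath` if it doesn't already exist
+# and appends the scraped paragraphs to the resulting .txt file; extra_urls=True also
+# follows the links found on each query's page, which is slower but gathers far more
+# text (see graze/_wiki.py).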
+from graze import Wikipedia +from graze.queries import Queries -#### Configuration -- Define your search queries in `queries.py`: - ```python - class BritannicaQueries: - def __init__(self): - self.search_queries = ["topic1", "topic2", "topic3"] - - def __call__(self): - return self.search_queries - ``` +queries = Queries(category="search") +wiki = Wikipedia(filepath='../data.txt', metrics=True) -#### Running the Scraper - -```python -from graze import britannica - -scraper = britannica(max_limit=20) -scraper(out_file='./output.txt') +wiki(queries=queries(), extra_urls=True) ``` -### Unsplash Scraper +### 4. Unsplash Scraper The Unsplash Image scraper fetches images based on given topics & saves them in their respective folders @@ -125,9 +128,13 @@ The Unsplash Image scraper fetches images based on given topics & saves them in #### Running the Scraper ```python -import graze +from graze import Unsplash +from graze.queries import Queries + +topics = Queries("images") -scraper = graze.unsplash(topics=search_queries) +image = Unsplash(directory='../images', metrics=True) +image(topics=topics()) ``` #### Output: @@ -148,7 +155,7 @@ Downloading : 100%|████████████████████ ## Logging -The YouTube scraper logs errors to `youtube_fetch.log`. Make sure to check this file for detailed error messages and troubleshooting information. +Each scraper logs errors to respective `.log` file. Make sure to check this file for detailed error messages & troubleshooting information. ## Contribution Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. Please make sure to update tests as appropriate. diff --git a/graze/__init__.py b/graze/__init__.py index 3876f6e..89dbcaf 100644 --- a/graze/__init__.py +++ b/graze/__init__.py @@ -1,4 +1,6 @@ -from .youtube.base import Youtube as youtube -from .britannica.main import Britannica as britannica -from .wikipedia.main import WikiScraper as wikipedia -from .unsplash import unsplash \ No newline at end of file +from .queries import Queries +from .utils import * +from ._transcripts import Youtube +from ._unsplash import Unsplash +from ._wiki import Wikipedia +from ._britannica import Britannica \ No newline at end of file diff --git a/graze/_britannica.py b/graze/_britannica.py new file mode 100644 index 0000000..547dde2 --- /dev/null +++ b/graze/_britannica.py @@ -0,0 +1,102 @@ +import os +import logging +import requests +from bs4 import BeautifulSoup +from tqdm import tqdm +import timeit, time +import re + +logging.basicConfig(filename="britannica_scraper.log", level=logging.ERROR) +current_dir = os.path.dirname(os.path.abspath(__file__)) +os.chdir(current_dir) + +def build_britannica_url(query, page_no): + formatted_query = '%20'.join(query.split(' ')) + url = f"https://www.britannica.com/search?query={formatted_query}&page={page_no}" + return url + +def get_target_url(target_url, headers): + while True: + r = requests.get(target_url, headers=headers) + if r.status_code == 200: + html_content = r.content + soup = BeautifulSoup(html_content, 'html.parser') + fetched_urls = soup.find_all('a', class_='md-crosslink') + list_url = [url.get('href') for url in fetched_urls] + return list_url + elif r.status_code == 429: + print(f"Rate limit exceeded. 
Waiting 30secs before retrying: {target_url}") + time.sleep(30) + else: + print(f"Skipping this URL due to status code {r.status_code}: {target_url}") + return [] + +class Britannica: + def __init__(self, filepath:str, max_limit:int=10, metrics:bool=False) -> None: + self.directory, filename_with_ext = os.path.split(filepath) + self.filename, ext = os.path.splitext(filename_with_ext) + self.filename = self.filename.strip() + if not os.path.exists(self.directory): + os.makedirs(self.directory) + self.max_limit = max_limit + self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} + self.metrics = metrics + self.total_urls = 0 + self.total_pages = 0 + + def __call__(self, queries:list[str]): + if not queries: + raise ValueError("Search queries can't be empty.") + else: + self.total_time = timeit.default_timer() + for query in tqdm(queries, desc="Generating Britannica URLs"): + page_no = 1 + for i in range(self.max_limit): + target_url = build_britannica_url(query, page_no) + new_urls = get_target_url(target_url, self.headers) + if new_urls: + self.write_urls_to_file(new_urls) + self.total_urls += len(new_urls) + page_no += 1 + + self.total_time = timeit.default_timer() - self.total_time + if self.metrics: + self.get_metrics() + + def text_extractor(self, url_snippet): + target_url = f"https://britannica.com{url_snippet}" + r = requests.get(target_url, headers=self.headers) + + if r.status_code == 200: + soup = BeautifulSoup(r.content, 'html.parser') + paragraphs = soup.find_all('p') + page = '\n'.join([p.get_text() for p in paragraphs if "Our editors will review what you’ve submitted and determine whether to revise the article." not in p.get_text()]) + page = re.sub('&\w+;', '', page) + self.total_pages += 1 + return page + else: + print(f"Failed to fetch page content: {target_url}") + return None + + def write_urls_to_file(self, url_snippets): + filepath = os.path.join(self.directory, f"{self.filename}.txt") + with open(filepath, 'a', encoding='utf-8') as f: + for snippet in url_snippets: + page = self.text_extractor(snippet) + if page: + f.write(page) + f.write("\n") + + def get_metrics(self): + print("\n") + print("Britannica scraping metrics:\n") + print("------------------------------------------------------") + print(f"Total URLs fetched: {self.total_urls}") + print(f"Total pages extracted: {self.total_pages}") + if self.total_time < 60: + print(f"Total time taken: {self.total_time:.2f} seconds") + elif self.total_time < 3600: + print(f"Total time taken: {self.total_time/60:.2f} minutes") + else: + print(f"Total time taken: {self.total_time/3600:.2f} hours") + print("------------------------------------------------------") \ No newline at end of file diff --git a/graze/_transcripts.py b/graze/_transcripts.py new file mode 100644 index 0000000..3e866c6 --- /dev/null +++ b/graze/_transcripts.py @@ -0,0 +1,129 @@ +from typing import * +from tqdm import tqdm +import json +import os +import logging +from googleapiclient.discovery import build +from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled +import timeit + +logging.basicConfig(filename="youtube_fetch.log", level=logging.ERROR) +current_dir = os.path.dirname(os.path.realpath(__file__)) +os.chdir(current_dir) + +def get_captions(videoId): + try: + captions = YouTubeTranscriptApi.get_transcript(videoId, languages=["en"], preserve_formatting=True) + if captions: + formatted_captions = [caption["text"] for caption 
in captions] + transcript = "\n".join(formatted_captions) + return transcript, True + else: + return "", False + except TranscriptsDisabled as e: + logging.error(f"Error while fetching the videoId: {videoId} 's transcripts: \n{str(e)}") + return "", False + except Exception as e: + logging.error(f"An exception occured while fetching videoId: {videoId} \n{str(e)}") + return "", False + +class Youtube: + def __init__(self, api_key:str, filepath:str, max_results:int=50, metrics:bool=False) -> None: + self.api_key = api_key + self.directory, filename_with_ext = os.path.split(filepath) + self.filename, ext = os.path.splitext(filename_with_ext) + self.filename = self.filename.strip() + if not os.path.exists(self.directory): + os.makedirs(self.directory) + self.youtube_build = build('youtube', 'v3', developerKey=api_key) + self.maxResults = max_results + self.metrics = metrics + self.videoNo = 0 + self.total_time = 0 + self.valid_videoNo = 0 + + def __call__(self, channel_ids:list[str], videoUrls:bool=False) -> str: + if channel_ids is not None: + for channelId in channel_ids: + assert isinstance(channelId, str) and len(channelId) == 24 and channelId.startswith("UC"), "Invalid YouTube channel ID" + videoIds = self.fetch_url(channelId) + if videoUrls: + urldict = [] + for i in videoIds: + videoLink = f"https://www.youtube.com/watch?v={i}" + urldict.append(videoLink) + self.write_links_in_json(urldict) + else: + del videoIds + if self.metrics: + self.get_metrics() + else: + raise ValueError("Channel_Ids can't be empty") + + def fetch_url(self, channelId): + next_page_token = None + videoIds = [] + while True: + response = self.youtube_build.channels().list( + part='snippet', id=channelId + ).execute() + channel_name = response["items"][0]["snippet"]["title"] + + response = self.youtube_build.channels().list( + part="contentDetails", id=channelId + ).execute() + + if "items" in response and response["items"]: + playlistId = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"] + playlistRes = self.youtube_build.playlistItems().list( + part="contentDetails", playlistId=playlistId, + maxResults=self.maxResults, pageToken=next_page_token + ).execute() + + for item in playlistRes.get("items", []): + videoIds.extend([item["contentDetails"]["videoId"]]) + + next_page_token = playlistRes.get('nextPageToken') + if not next_page_token: + break + else: + logging.error(f"can't find any available video for channel id: {channelId}") + start_time = timeit.default_timer() + with tqdm(total=len(videoIds), desc=f"Fetching Captions for channelId: {channel_name}") as pbar: + for ids in videoIds: + transcripts, _captions = get_captions(videoId=ids) + if _captions is True: + self.write_in_file(transcripts) + self.valid_videoNo += 1 + self.videoNo += 1 + pbar.update(1) + end_time = timeit.default_timer() + self.total_time += end_time - start_time + return videoIds + + def get_metrics(self): + def format_time(seconds): + if seconds < 60: + return f"{seconds:.2f} seconds" + elif seconds < 3600: + return f"{seconds / 60:.2f} minutes" + else: + return f"{seconds / 3600:.2f} hours" + + print("\n") + print("Youtube video caption fetching metrics:\n") + print("-------------------------------------------") + print(f"Total videos fetched: {self.videoNo}") + print(f"Total videos that had captions: {self.valid_videoNo}") + print(f"Total time taken: {format_time(self.total_time)}") + print("-------------------------------------------") + + def write_in_file(self, transcripts): + filepath = 
os.path.join(self.directory, f"{self.filename}.txt")
+    with open(f"{filepath}", "a", encoding="utf-8") as outfile:
+      outfile.write(transcripts)
+
+  def write_links_in_json(self, urls):
+    filepath = os.path.join(self.directory, f"{self.filename}.json")
+    with open(f"{filepath}", "a", encoding="utf-8") as outfile:
+      json.dump(urls, outfile, indent=2)
\ No newline at end of file
diff --git a/graze/_unsplash.py b/graze/_unsplash.py
new file mode 100644
index 0000000..fc2afa1
--- /dev/null
+++ b/graze/_unsplash.py
@@ -0,0 +1,102 @@
+import os
+import logging
+import requests
+import base64
+from bs4 import BeautifulSoup as bs
+import re
+from tqdm import tqdm
+import timeit
+
+logging.basicConfig(filename="image_downloading.log", level=logging.ERROR)
+current_dir = os.path.dirname(os.path.realpath(__file__))
+os.chdir(current_dir)
+
+def download_img(src, img_name):
+  try:
+    image_data = requests.get(src).content
+    with open(img_name, 'wb') as img_file:
+      img_file.write(image_data)
+  except Exception as e:
+    logging.error(f"Error while downloading image: {str(e)}")
+
+def download_base64_img(data, img_name):
+  try:
+    image_data = base64.b64decode(data)
+    with open(img_name, 'wb') as img_file:
+      img_file.write(image_data)
+  except Exception as e:
+    logging.error(f"Error while downloading image: {str(e)}")
+
+class Unsplash:
+  def __init__(self, directory:str, metrics:bool=False) -> None:
+    self.download_dir = directory
+    self.headers = { 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246" }
+    self.imagesNo = 0
+    self.total_time = 0
+    self.metrics = metrics
+
+  def __call__(self, topics):
+    if not topics:
+      raise ValueError("Enter some topic, can't be empty")
+    self.topicNo = len(topics)
+    for topic in topics:
+      print(f"\nDownloading '{topic}' images:")
+      self.fetch_url(topic)
+    if self.metrics:
+      self.get_metrics()
+
+  def get_metrics(self):
+    def format_time(seconds):
+      if seconds < 60:
+        return f"{seconds:.2f} seconds"
+      elif seconds < 3600:
+        return f"{seconds / 60:.2f} minutes"
+      else:
+        return f"{seconds / 3600:.2f} hours"
+
+    print("\n")
+    print("Unsplash image downloading metrics:\n")
+    print("-------------------------------------------")
+    print(f"Total topics fetched: {self.topicNo}")
+    print(f"Total images downloaded: {self.imagesNo}")
+    print(f"Total time taken: {format_time(self.total_time)}")
+    print("-------------------------------------------")
+
+  def fetch_url(self, topic):
+    formatted_query = '-'.join(topic.split(' '))
+    url = f"https://unsplash.com/s/photos/{formatted_query}"
+
+    r = requests.get(url, headers=self.headers)
+    if r.status_code == 200:
+      html_content = r.content
+      soup = bs(html_content, 'html.parser')
+      img_tags = soup.find_all('img')
+
+      topic_dir = os.path.join(self.download_dir, formatted_query)
+      if not os.path.exists(topic_dir):
+        os.makedirs(topic_dir)
+
+      start_time = timeit.default_timer()
+      with tqdm(total=len(img_tags), desc=f"Downloading '{topic}'") as pbar:
+        for idx, img in enumerate(img_tags):
+          try:
+            img_src = img.get('src')
+            if not img_src:
+              img_src = img.get('data-src')
+
+            if img_src:
+              img_name = os.path.join(topic_dir, f'{idx}.jpg')
+
+              if img_src.startswith('data:image'):
+                base64_data = re.sub('^data:image/.+;base64,', '', img_src)
+                download_base64_img(base64_data, img_name)
+              else:
+                download_img(img_src, img_name)
+              self.imagesNo += 1
+            pbar.update(1)
+          except Exception as e:
+            logging.error(f"Error while fetching images: {str(e)}")
+      end_time = 
timeit.default_timer() + self.total_time += end_time - start_time + else: + logging.error(f"Failed to fetch the URL: {url}. Status code: {r.status_code}") \ No newline at end of file diff --git a/graze/_wiki.py b/graze/_wiki.py new file mode 100644 index 0000000..8cb0c84 --- /dev/null +++ b/graze/_wiki.py @@ -0,0 +1,124 @@ +import requests +from bs4 import BeautifulSoup as bs +from tqdm import tqdm +import os +import logging +import timeit + +logging.basicConfig(filename="wiki_scraper.log", level=logging.ERROR) +current_dir = os.path.dirname(os.path.realpath(__file__)) +os.chdir(current_dir) + +def build_urls(query): + new_query = "_".join(query.split(" ")) + wiki_url = f"https://en.wikipedia.org/wiki/{new_query}" + return wiki_url + +def scrapper(urls, headers): + noUrl = 0 + r = requests.get(urls, headers=headers) + if r.status_code == 200: + soup = bs(r.content, "html.parser") + paragraphs = soup.find_all("p") + noUrl += 1 + return paragraphs, noUrl + else: + logging.error(f"Failed to fetch the URL: {urls}. Status code: {r.status_code}") + return None, 0 + +def fetch_extra_urls(query, headers): + urls = [] + new_query = "_".join(query.split(" ")) + wiki_url = f"https://en.wikipedia.org/wiki/{new_query}" + + r = requests.get(wiki_url, headers=headers) + if r.status_code == 200: + soup = bs(r.content, "html.parser") + links = soup.find_all("a") + urls.extend([url.get("href") for url in links]) + else: + logging.error(f"Failed to fetch the URL: {wiki_url}. Status code: {r.status_code}") + + return urls + +def extra_scrape(url, headers): + noUrl = 0 + if url.startswith("/"): + target_url = f"https://en.wikipedia.org{url}" + r = requests.get(target_url, headers=headers) + else: + return None, 0 + if r.status_code == 200: + soup = bs(r.content, "html.parser") + paragraphs = soup.find_all("p") + noUrl += 1 + return paragraphs, noUrl + else: + logging.error(f"Failed to fetch the URL: {target_url}. 
Status code: {r.status_code}") + return None, 0 + +class Wikipedia: + def __init__(self, filepath: str, metrics: bool = False) -> None: + self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} + self.directory, filename_with_ext = os.path.split(filepath) + self.filename, ext = os.path.splitext(filename_with_ext) + self.filename = self.filename.strip() + if not os.path.exists(self.directory): + os.makedirs(self.directory) + self.metrics = metrics + self.list_urls = [] + self.extra_urls = [] + self.total_time = 0 + self.fetched_urls = 0 + + def __call__(self, queries: list[str], extra_urls: bool = False): + if not queries: + raise ValueError("Queries can't be empty, add some!") + else: + start_time = timeit.default_timer() + for query in tqdm(queries, desc="Generating Valid Target Urls"): + target_url = build_urls(query) + self.list_urls.append(target_url) + + for url in tqdm(self.list_urls, desc="Scraping the web-pages"): + out_page, noUrl1 = scrapper(url, self.headers) + if out_page is not None: + self.write_to_file(out_page) + self.fetched_urls += noUrl1 + + if extra_urls: + for query in queries: + extra_urls = fetch_extra_urls(query, self.headers) + for url in extra_urls: + extra_content, noUrl2 = extra_scrape(url, self.headers) + if extra_content is not None: + self.write_to_file(extra_content) + self.fetched_urls += noUrl2 + + end_time = timeit.default_timer() + self.total_time = end_time - start_time + if self.metrics: + self.get_metrics() + + def write_to_file(self, paragraphs): + filepath = os.path.join(self.directory, f"{self.filename}.txt") + with open(filepath, "a", encoding="utf-8") as f: + for paragraph in paragraphs: + text = paragraph.get_text() + f.write(text) + + def get_metrics(self): + if self.total_time < 60: + time_display = f"{self.total_time:.2f} secs" + elif self.total_time < 3600: + time_display = f"{self.total_time / 60:.2f} mins" + else: + time_display = f"{self.total_time / 3600:.2f} hours" + + print("\n") + print("Wikipedia scraping metrics:\n") + print("------------------------------------------------------") + print(f"Total URL's fetched: {len(self.list_urls) + len(self.extra_urls)}") + print(f"Total URLs processed: {self.fetched_urls}") + print(f"Total time taken: {time_display}") + print("------------------------------------------------------") \ No newline at end of file diff --git a/graze/britannica/base.py b/graze/britannica/base.py deleted file mode 100644 index e19fed4..0000000 --- a/graze/britannica/base.py +++ /dev/null @@ -1,47 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import time -class BritannicaUrls: - def __init__(self, search_queries, max_limit): - self.max_limit = max_limit - self.search_queries = search_queries - self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} - - def build_url(self, query, pageNo): - formattedQuery = '%20'.join(query.split(' ')) - url = f"https://www.britannica.com/search?query={formattedQuery}&page={pageNo}" - return url - - def get_target_url(self, target_url): - while True: - r = requests.get(target_url, headers=self.headers) - if r.status_code == 200: - html_content = r.content - soup = BeautifulSoup(html_content, 'html.parser') - fetched_urls = soup.find_all('a', class_='md-crosslink') - list_url = [url.get('href') for url in fetched_urls] - return list_url - - elif r.status_code == 429: - 
print(f"Rate limit exceeded. Waiting 30secs before retrying: {target_url}") - time.sleep(30) - else: - print(f"Skipping this URL due to status code {r.status_code}: {target_url}") - return [] - - def generate_urls(self, progress_bar=None): - page_urls = [] - current_iteration = 0 - - for query in self.search_queries: - pageNo = 1 - for i in range(self.max_limit): - target_url = self.build_url(query, pageNo) - pageNo += 1 - new_url = self.get_target_url(target_url) - if new_url: - page_urls.extend(new_url) - current_iteration += 1 - if progress_bar: - progress_bar.update(1) - return page_urls \ No newline at end of file diff --git a/graze/britannica/main.py b/graze/britannica/main.py deleted file mode 100644 index ddec877..0000000 --- a/graze/britannica/main.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -current_directory = os.path.dirname(os.path.abspath(__file__)) -os.chdir(current_directory) - -from .base import BritannicaUrls -import requests -import re -from .queries import searchQueries -from bs4 import BeautifulSoup as bs -from tqdm import tqdm - -class Britannica(BritannicaUrls): - def __init__(self, search_queries=None, max_limit=10): - if search_queries: - self.search_queries = search_queries - else: - self.search_queries = searchQueries() - self.max_limit = max_limit - self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} - - def __call__(self, outfile): - assert outfile.endswith('txt'), "File name error, should be a .txt file" - url_snippets = self.scraper(self.search_queries()) - with tqdm(total=len(url_snippets), desc="Scraping in progress: ") as pbar: - for snippet in url_snippets: - page = self.text_extractor(snippet) - if outfile is not None: - if page is not None: - with open(outfile, 'a', encoding='utf-8') as f: - f.write(page) - pbar.update(1) - else: - raise ValueError("Provide valid outfile with path") - print("Data collected and saved successfully!!") - - def text_extractor(self, url_snippet): - target_url = f"https://britannica.com{url_snippet}" - r = requests.get(target_url, headers=self.headers) - - if r.status_code == 200: - soup = bs(r.content, 'html.parser') - paragraphs = soup.find_all('p') - - page = '\n'.join([p.get_text() for p in paragraphs if "Our editors will review what you’ve submitted and determine whether to revise the article." 
not in p.get_text()]) - page = re.sub('&\w+;', '', page) - - return page - else: - print(f"Failed to fetch page content: {target_url}") - return None - - def scraper(self, query): - scrapped = BritannicaUrls(query, max_limit=self.max_limit) - with tqdm(total=len(self.search_queries()) * self.max_limit, desc="Generating URL snippets: ") as pbar: - url_snippets = scrapped.generate_urls(progress_bar=pbar) - return url_snippets \ No newline at end of file diff --git a/graze/britannica/queries.py b/graze/britannica/queries.py deleted file mode 100644 index c000329..0000000 --- a/graze/britannica/queries.py +++ /dev/null @@ -1,34 +0,0 @@ -""" - --> contains some sample search queries for the britannica scrapper -""" - -class searchQueries: - def __init__(self): - self.search_queries = [ - "antarctica", "colonization", "world war", "asia", "africa", - "australia", "holocaust", "voyages", "biological viruses", - "Martin Luther King Jr", "Abraham Lincoln", "Quarks", "Quantum Mechanics", - "Biological Viruses", "Drugs", "Rockets", "Physics", "Mathematics", - "nuclear physics", "nuclear fusion", "CRISPR CAS-9", "virginia woolf", - "cocaine", "marijuana", "apollo missions", "birds", "blogs", "journal", - "Adolf Hitler", "Presidents of United States", "genders and sexes", - "journalism", "maths theories", "matter and particles", "discoveries", - "authors and writers", "poets and novel writers", "literature", "awards and honors", - "climate change", "renewable energy", "artificial intelligence", "machine learning", - "blockchain technology", "cryptocurrencies", "space exploration", "Mars missions", - "black holes", "string theory", "evolution", "human genome project", "stem cells", - "pandemics", "influenza", "COVID-19", "vaccination", "genetic engineering", - "nanotechnology", "3D printing", "cybersecurity", "quantum computing", - "robotics", "drones", "self-driving cars", "electric vehicles", "smart cities", - "internet of things", "big data", "cloud computing", "augmented reality", - "virtual reality", "mixed reality", "social media", "digital marketing", - "e-commerce", "fintech", "global warming", "deforestation", "ocean acidification", - "biodiversity", "conservation", "sustainable agriculture", "organic farming", - "hydropower", "solar energy", "wind energy", "geothermal energy", "tidal energy", - "nuclear power", "space tourism", "interstellar travel", "terraforming", - "exoplanets", "SETI", "astrobiology", "dark matter", "dark energy", - "the big bang theory", "cosmic microwave background", "multiverse", "extraterrestrial life" - ] - - def __call__(self): - return self.search_queries \ No newline at end of file diff --git a/graze/britannica/search_queries.json b/graze/britannica/search_queries.json deleted file mode 100644 index b10b6c0..0000000 --- a/graze/britannica/search_queries.json +++ /dev/null @@ -1,41 +0,0 @@ -[ - "antarctica", - "colonization", - "world war", - "asia", - "africa", - "australia", - "holocaust", - "voyages", - "biological viruses", - "Martin Luther King Jr", - "Abraham Lincon", - "Quarks", - "Quantum Mechanincs", - "Biological Viruses", - "Drugs", - "Rockets", - "Physics", - "Mathematics", - "nuclear physics", - "nuclear fusion", - "CRISPR CAS-9", - "virginia woolf", - "cocaine", - "marijuana", - "apollo missions", - "birds", - "blogs", - "journal", - "Adolf Hitler", - "Presidents of United States", - "genders and sexes", - "journalism", - "maths theories", - "matter and particles", - "discoveries", - "authoers and writers", - "poets and novel writers", - "literature", 
- "awards and honors" -] \ No newline at end of file diff --git a/graze/imageQuery.json b/graze/imageQuery.json deleted file mode 100644 index b9c5869..0000000 --- a/graze/imageQuery.json +++ /dev/null @@ -1,104 +0,0 @@ -[ - "american football", - "india", - "indian festivals", - "cars", - "street", - "street photography", - "wild animals", - "politics", - "mountains", - "rivers", - "sunsets", - "sunrises", - "forests", - "ocean", - "beaches", - "architecture", - "bridges", - "castles", - "cityscapes", - "clouds", - "deserts", - "flowers", - "gardens", - "landscapes", - "night sky", - "waterfalls", - "birds", - "fish", - "reptiles", - "insects", - "trees", - "autumn", - "winter", - "spring", - "summer", - "abstract art", - "black and white", - "portraits", - "macro photography", - "minimalism", - "patterns", - "space", - "planets", - "stars", - "galaxies", - "technology", - "robots", - "drones", - "sports", - "yoga", - "fitness", - "cycling", - "running", - "hiking", - "camping", - "travel", - "historical sites", - "monuments", - "museums", - "food", - "drinks", - "desserts", - "coffee", - "tea", - "wine", - "cocktails", - "fashion", - "clothing", - "shoes", - "accessories", - "makeup", - "hairstyles", - "tattoos", - "piercings", - "jewelry", - "watches", - "books", - "libraries", - "education", - "schools", - "universities", - "graduation", - "science", - "laboratories", - "microscopes", - "experiments", - "meditation", - "relaxation", - "spa", - "wellness", - "interior design", - "furniture", - "home decor", - "gardening", - "pets", - "dogs", - "cats", - "horses", - "farm animals", - "zoo", - "circus", - "theater" -] \ No newline at end of file diff --git a/graze/imgScrape.py b/graze/imgScrape.py deleted file mode 100644 index eadfab8..0000000 --- a/graze/imgScrape.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import requests -from bs4 import BeautifulSoup -from urllib.parse import quote - -def create_directory(topic): - directory = f'images/{topic}' - if not os.path.exists(directory): - os.makedirs(directory) - return directory - -def download_image(url, folder): - try: - image_data = requests.get(url).content - image_name = os.path.join(folder, url.split('/')[-1]) - with open(image_name, 'wb') as image_file: - image_file.write(image_data) - except Exception as e: - print(f"Failed to download {url}. Reason: {e}") - -def scrape_images_for_topic(topic): - print(f"Scraping images for topic: {topic}") - topic_folder = create_directory(topic) - - query = quote(topic) - url = f"https://www.google.com/search?q={query}&source=lnms&tbm=isch" - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - } - - response = requests.get(url, headers=headers) - soup = BeautifulSoup(response.content, "html.parser") - image_tags = soup.find_all("img") - print(image_tags) - - # for img_tag in image_tags: - # try: - # img_url = img_tag['src'] - # if not img_url: - # img_url = img_tag['data-src'] - # if img_url: - # download_image(img_url, topic_folder) - # except Exception as e: - # print(f"Failed to process an image tag. 
Reason: {e}") - -topics = [ - "antarctica", "colonization", "world war", "asia", "africa", - "australia", "holocaust", "voyages", "biological viruses" -] - -for topic in topics: - scrape_images_for_topic(topic) \ No newline at end of file diff --git a/graze/queries.py b/graze/queries.py new file mode 100644 index 0000000..9457e04 --- /dev/null +++ b/graze/queries.py @@ -0,0 +1,28 @@ +import os +import json +current_dir = os.path.dirname(os.path.realpath(__file__)) +os.chdir(current_dir) + +with open(f"./topics.json", "r") as infile: + queries = json.load(infile) + +error_msg = ( + "can't identify the category:\n" + "Choose 'channel' for YouTube ChannelIds\n" + "'search' for Britannica or Wikipedia search topics\n" + "'images' for Unsplash & Pexels search topics" +) + +class Queries: + def __init__(self, category:str) -> None: + self.category = category + + def __call__(self) -> list: + if self.category == "search": + return queries["search_topics"] + elif self.category == "channel": + return queries["channel_ids"] + elif self.category == "images": + return queries["image_topics"] + else: + raise TypeError(error_msg) \ No newline at end of file diff --git a/graze/topics.json b/graze/topics.json new file mode 100644 index 0000000..1ddb6bf --- /dev/null +++ b/graze/topics.json @@ -0,0 +1,182 @@ +{ + "search_topics": [ + "antarctica", "colonization", "world war", "asia", "africa", + "australia", "holocaust", "voyages", "biological viruses", + "Martin Luther King Jr", "Abraham Lincoln", "Quarks", "Quantum Mechanics", + "Biological Viruses", "Drugs", "Rockets", "Physics", "Mathematics", + "nuclear physics", "nuclear fusion", "CRISPR CAS-9", "virginia woolf", + "cocaine", "marijuana", "apollo missions", "birds", "blogs", "journal", + "Adolf Hitler", "Presidents of United States", "genders and sexes", + "journalism", "maths theories", "matter and particles", "discoveries", + "authors and writers", "poets and novel writers", "literature", "awards and honors", + "climate change", "renewable energy", "artificial intelligence", "machine learning", + "blockchain technology", "cryptocurrencies", "space exploration", "Mars missions", + "black holes", "string theory", "evolution", "human genome project", "stem cells", + "pandemics", "influenza", "COVID-19", "vaccination", "genetic engineering", + "nanotechnology", "3D printing", "cybersecurity", "quantum computing", + "robotics", "drones", "self-driving cars", "electric vehicles", "smart cities", + "internet of things", "big data", "cloud computing", "augmented reality", + "virtual reality", "mixed reality", "social media", "digital marketing", + "e-commerce", "fintech", "global warming", "deforestation", "ocean acidification", + "biodiversity", "conservation", "sustainable agriculture", "organic farming", + "hydropower", "solar energy", "wind energy", "geothermal energy", "tidal energy", + "nuclear power", "space tourism", "interstellar travel", "terraforming", + "exoplanets", "SETI", "astrobiology", "dark matter", "dark energy", + "the big bang theory", "cosmic microwave background", "multiverse", "extraterrestrial life", + "neuroscience", "psychology", "behavioral economics", "cryptography", "linguistics", + "paleontology", "archaeology", "anthropology", "medieval history", "Renaissance art", + "Baroque music", "classical literature", "philosophy of science", "ethics", + "existentialism", "surrealism", "cubism", "impressionism", "romanticism", + "modernism", "postmodernism", "futurism", "dadaism", "expressionism", "neoclassicism", + "avant-garde", 
"minimalism", "pop art", "abstract art", "photorealism", "conceptual art", + "installation art", "performance art", "digital art", "cyberpunk", "steampunk", + "biopunk", "solarpunk", "architecture", "urban planning", "landscape architecture", + "industrial design", "fashion design", "graphic design", "web design", "interior design", + "product design", "animation", "film production", "video game design", "sound design", + "photography", "cinematography", "documentary filmmaking", "screenwriting", + "theater production", "stage design", "costume design", "makeup artistry", + "special effects", "visual effects", "motion capture", "virtual production", + "voice acting", "puppetry", "mime", "improvisation", "stand-up comedy", "satire", + "parody", "slapstick", "absurdism", "farce", "musical theater", "opera", "ballet", + "modern dance", "hip-hop dance", "tap dance", "ballroom dance", "folk dance", + "contemporary dance", "choreography", "dance therapy", "somatics", "martial arts", + "yoga", "meditation", "mindfulness", "holistic health", "naturopathy", "homeopathy", + "ayurveda", "traditional Chinese medicine", "acupuncture", "herbal medicine", "aromatherapy", + "reflexology", "reiki", "crystal healing", "energy medicine", "biofeedback", "hypnotherapy", + "sound healing", "art therapy", "music therapy", "drama therapy", "play therapy", + "adventure therapy", "wilderness therapy", "animal-assisted therapy", "horticulture therapy", + "nutrition therapy", "sports medicine", "physical therapy", "occupational therapy", + "speech therapy", "respiratory therapy", "cardiovascular health", "diabetes management", + "cancer treatment", "autoimmune diseases", "infectious diseases", "chronic pain management", + "geriatric care", "palliative care", "end-of-life care", "sleep disorders", "mental health", + "substance abuse", "eating disorders", "child development", "adolescent psychology", + "adult development", "gerontology", "family therapy", "couples therapy", "group therapy", + "psychopharmacology", "neuroplasticity", "brain-computer interface", "transhumanism", + "bioethics", "neuroethics", "genetic counseling", "biostatistics", "epidemiology", + "public health", "global health", "health informatics", "telemedicine", "digital health", + "wearable technology", "health economics", "health policy", "healthcare administration", + "medical education", "clinical trials", "evidence-based medicine", "patient advocacy", + "community health", "health equity", "social determinants of health", "health disparities", + "disability studies", "inclusive design", "universal design", "assistive technology", + "adaptive sports", "paralympic games", "special olympics", "accessible travel", + "inclusive education", "diversity and inclusion", "cultural competence", "multiculturalism", + "cross-cultural communication", "intercultural relations", "global citizenship", + "international relations", "diplomacy", "peace studies", "conflict resolution", + "negotiation", "mediation", "arbitration", "human rights", "social justice", + "environmental justice", "economic justice", "gender equality", "racial equality", + "LGBTQ+ rights", "disability rights", "children's rights", "indigenous rights", + "animal rights", "environmental sustainability", "corporate social responsibility", + "ethical leadership", "social entrepreneurship", "impact investing", "philanthropy", + "volunteerism", "community organizing", "grassroots movements", "advocacy", + "policy analysis", "public administration", "governance", "public management", + 
"e-governance", "smart governance", "participatory governance", "collaborative governance", + "network governance", "transparency", "accountability", "anti-corruption", "open data", + "data privacy", "cyber law", "intellectual property", "patent law", "trademark law", + "copyright law", "digital rights", "internet freedom", "media literacy", "information literacy", + "critical thinking", "creative problem solving", "innovation management", "design thinking", + "lean startup", "agile methodology", "project management", "product management", + "business strategy", "competitive analysis", "market research", "consumer behavior", + "branding", "advertising", "public relations", "sales management", "customer service", + "supply chain management", "logistics", "operations management", "quality management", + "risk management", "crisis management", "change management", "organizational behavior", + "human resource management", "talent management", "performance management", + "employee engagement", "workplace culture", "remote work", "flexible work", + "work-life balance", "professional development", "career planning", "leadership development", + "succession planning", "executive coaching", "mentorship", "networking", + "personal branding", "financial planning", "investment strategies", "retirement planning", + "estate planning", "tax planning", "insurance planning", "real estate", + "property management", "home improvement", "interior decoration", "gardening", + "landscaping", "home automation", "smart home technology", "DIY projects", + "crafts", "hobbies", "collecting", "board games", "card games", "video games", + "sports", "outdoor recreation", "camping", "hiking", "fishing", "hunting", + "boating", "sailing", "scuba diving", "snorkeling", "surfing", "skateboarding", + "snowboarding", "skiing", "mountaineering", "rock climbing", "bouldering", + "trail running", "marathon running", "triathlon", "ironman", "cycling", + "mountain biking", "road biking", "swimming", "water polo", "synchronized swimming", + "diving", "gymnastics", "cheerleading", "yoga", "pilates", "aerobics", + "strength training", "bodybuilding", "powerlifting", "crossfit", "functional fitness", + "high-intensity interval training", "calisthenics", "martial arts", "boxing", + "kickboxing", "taekwondo", "karate", "judo", "jiu-jitsu", "muay thai", + "wrestling", "sumo wrestling", "fencing", "archery", "shooting", "hunting", + "equestrian sports", "horse racing", "polo", "show jumping", "dressage", + "rodeo", "bull riding", "barrel racing", "team roping", "calf roping", + "ranch sorting", "cowboy action shooting", "reining", "cutting", "working cow horse", + "freestyle reining", "reined cow horse", "ranch riding", "trail riding", + "endurance riding", "pleasure riding", "driving", "carriage driving", + "combined driving", "pleasure driving", "team driving", "draft horse driving" + ], + + "channel_ids": [ + "UCb_MAhL8Thb3HJ_wPkH3gcw", "UCA295QVkf9O1RQ8_-s3FVXg", "UCpFFItkfZz1qz5PpHpqzYBw", + "UCY1kMZp36IQSyNx_9h4mpCg", "UCA19mAJURyYHbJzhfpqhpCA", "UCqnbDFdCpuN8CMEg0VuEBqA", + "UCddiUEpeqJcYeBxX1IVBKvQ", "UCcefcZRL2oaA_uBNeo5UOWg", "UCLXo7UDZvByw2ixzpQCufnA", + "UCsQoiOrh7jzKmE8NBofhTnQ", "UCUyvQV2JsICeLZP4c_h40kA", "UCvjgXvBlbQiydffZU7m1_aw", + "UCRI00CwLZdLRCWg5BdDOsNw", "UCEIwxahdLz7bap-VDs9h35A", "UC4bq21IPPbpu0Qrsl7LW0sw", + "UCR1IuLEqb6UEA_zQ81kwXfg", "UCIlU5KDHKFSaebYviKfOidw", "UCtYKe7-XbaDjpUwcU5x0bLg", + "UCBJycsmduvYEL83R_U4JriQ", "UCRcgy6GzDeccI7dkbbBna3Q", "UC3_BakzLfadvFrsnClMFWmQ", + "UCmGSJVG3mCRXVOP4yZrU1Dw", 
"UCFN6lQpfY8XIRdhv9G-f4bg", "UConJDkGk921yT9hISzFqpzw", + "UClWTCPVi-AU9TeCN6FkGARg", "UCyHJ94JzwY92NsBVzJ2aE3Q", "UCTqEu1wZDBju2tHkNP1dwzQ", + "UC6nSFpj9HTCZ5t-N3Rm3-HA", "UCX6b17PVsYBQ0ip5gyeme-Q", "UCONtPx56PSebXJOxbFv-2jQ", + "UCZYTClx2T1of7BRZ86-8fow", "UCzWQYUVCpZqtN93H8RR44Qw", "UCYbK_tjZ2OrIZFBvU6CCMiA", + "UCxzC4EngIsMrPmbm6Nxvb-A", "UCcabW7890RKJzL968QWEykA", "UCamLstJyCa-t5gfZegxsFMw", + "UC415bOPUcGSamy543abLmRA", "UCpMcsdZf2KkAnfmxiq2MfMQ", "UCqVEHtQoXHmUCfJ-9smpTSg", + "UCYO_jab_esuFRV4b17AJtAw", "UCHnyfMqiRRG1u-2MsSQLbXA", "UCsXVk37bltHxD1rDPwtNM8Q", + "UC9RM-iSvTu1uPJb8X5yp3EQ", "UCZaT_X_mc0BI-djXOlfhqWQ", "UCMiJRAwDNSNzuYeN2uWa0pA", + "UCHpw8xwDNhU9gdohEcJu4aA", "UCK7tptUDHh-RYDsdxO1-5QQ", "UCsooa4yRKGN_zEE8iknghZA", + "UC6n8I1UDTKP1IWjQMg6_TwA", "UC8butISFwT-Wl7EV0hUK0BQ", "UCgRQHK8Ttr1j9xCEpCAlgbQ", + "UCEBb1b_L6zDS3xTUrIALZOw", "UCN0QBfKk0ZSytyX_16M11fA", "UCBpxspUNl1Th33XbugiHJzw", + "UC3osNjJeuDdvyALIEP-nh0g", "UCaSCt8s_4nfkRglWCvNSDrg", "UCjgpFI5dU-D1-kh9H1muoxQ", + "UCBa659QWEk1AI4Tg--mrJ2A", "UCdBK94H6oZT2Q7l0-b0xmMg", "UCBA9cAuPy9L5IYYXqOduIvw", + "UCXjmz8dFzRJZrZY8eFiXNUQ", "UClZbmi9JzfnB2CEb0fG8iew", "UCUFoQUaVRt3MVFxqwPUMLCQ", + "UCgLxmJ8xER7Y7sywMN5SfWg", "UCac1MisHGa0qtzf0oWlU8Zw", "UCSIvk78tK2TiviLQn4fSHaw", + "UCUyvQV2JsICeLZP4c_h40kA", "UCqFzWxSCi39LnW1JKFR3efg", "UCqFzWxSCi39LnW1JKFR3efg", + "UCccjdJEay2hpb5scz61zY6Q", "UC8CX0LD98EDXl4UYX1MDCXg", "UC6VcWc1rAoWdBCM0JxrRQ3A", + "UCSHZKyawb77ixDdsGog4iWA", "UCVHdvAX5-R8y5l9xp6nroBQ", "UCTb6Oy0TXI03iEUdRMR9dnw", + "UCqoAEDirJPjEUFcF2FklnBA", "UCccjdJEay2hpb5scz61zY6Q", "UCNVBYBxWj9dMHqKEl_V8HBQ", + "UCNVBYBxWj9dMHqKEl_V8HBQ", "UCb-vZWBeWA5Q2818JmmJiqQ", "UChDKyKQ59fYz3JO2fl0Z6sg", + "UCupvZG-5ko_eiXAupbDfxWw", "UCDrLGkZTcNCshOLiKi5NtEw", "UCWOA1ZGywLbqmigxE4Qlvuw", + "UCrM7B7SL_g1edFOnmj-SDKg", "UCUMZ7gohGI9HcU9VNsr2FJQ", "UCF9imwPMSGz4Vq1NiTWCC7g", + "UCjmJDM5pRKbUlVIzDYYWb6g", "UCrRttZIypNTA1Mrfwo745Sg", "UC0k238zFx-Z8xFH0sxCrPJg", + "UCT9zcQNlyht7fRlcjmflRSA", "UC0C-w0YjGpqDXGB8IHb662A", "UCgQna2EqpzqzfBjlSmzT72w", + "UCeLHszkByNZtPKcaVXOCOQQ", "UCjNRJBlxvvS0UXAT2Ack-QQ", "UC-J-KZfRV8c13fOCkhXdLiQ", + "UCfM3zsQsOnfWNUppiycmBuw", "UCNjHgaLpdy1IMNK57pYiKiQ", "UCqECaJ8Gagnn7YCbPEzWH6g", + "UCb2HGwORFBo94DmRx4oLzow", "UCi4EDAgjULwwNBHOg1aaCig", "UCDPM_n1atn2ijUwHd0NNRQw", + "UCcgqSM4YEo5vVQpqwN-MaNw", "UCoUM-UJ7rirJYP8CQ0EIaHA", "UC0WP5P-ufpRfjbNrmOWwLBQ", + "UCBVjMGOIkavEAhyqpxJ73Dw", "UCPHjpfnnGklkRBBTd0k6aHg", "UCmHhviensDlGQeU8Yo80zdg", + "UC6IBMCQ6-d7p41KHxOsq4RA", "UCiMhD4jzUqG-IgPzUmmytRQ", "UCB0JSO6d5ysH2Mmqz5I9rIw", + "UC-lHJZR3Gqxm24_Vd_AJ5Yw", "UCwx7Y3W30N8aS_tiCy2x-2g", "UCZ_cuJGBis0vi6U3bWmvDIg", + "UCcNQQEvWA9BJG_yBQ9JkhnA", "UCf_XYgupvdx7rA44Ap3uI5w", "UCs52U_Q9TYSHtd9oxD4WN0A", + "UConVfxXodg78Tzh5nNu85Ew", "UCNIkB2IeJ-6AmZv7bQ1oBYg", "UCcPI9kEPhyUDLBHGOhKqxOw", + "UCVYamHliCI9rw1tHR1xbkfw", "UC-7nELDbJEPF3muAzSeT74g", "UCxqAWLTk1CmBvZFPzeZMd9A", + "UCKWaEZ-_VweaEx1j62do_vQ", "UCFQDtftsHGzSh1-TReNT4lA", "UC6biysICWOJ-C3P4Tyeggzg", + "UCFe-pfe0a9bDvWy74Jd7vFg", "UCUgZq9PkDp1xaEivtcfJPSg", "UC0-7PyfpOIJpNyi8WrHiyXA", + "UCPb7xe-MQ0KiJpaKBWFZtTA", "UCycGV6fAhD_-7GPmCkkESdw", "UCG7J20LhUeLl6y_Emi7OJrA" + ], + "image_topics": [ "american football", "india", "europe", + "indian festivals", "cars", "street", "street photography", + "wild animals", "politics", "mountains", "rivers", + "sunsets", "sunrises", "forests", "ocean", + "beaches", "architecture", "bridges", "castles", + "cityscapes", "clouds", "deserts", "flowers", + "gardens", "landscapes", "night sky", "waterfalls", + "birds", "fish", 
"reptiles", "insects", + "trees", "autumn", "winter", "spring", + "summer", "abstract art", "black and white", "portraits", + "macro photography", "minimalism", "patterns", "space", + "planets", "stars", "galaxies", "technology", + "robots", "drones", "sports", "yoga", + "fitness", "cycling", "running", "hiking", + "camping", "travel", "historical sites", "monuments", + "museums", "food", "drinks", "desserts", + "coffee", "tea", "wine", "cocktails", + "fashion", "clothing", "shoes", "accessories", + "makeup", "hairstyles", "tattoos", "piercings", + "jewelry", "watches", "books", "libraries", + "education", "schools", "universities", "graduation", + "science", "laboratories", "microscopes", "experiments", + "meditation", "relaxation", "spa", "wellness", + "interior design", "furniture", "home decor", "gardening", + "pets", "dogs", "cats", "horses", + "farm animals", "zoo", "circus", "theater" + ] +} \ No newline at end of file diff --git a/graze/unsplash.py b/graze/unsplash.py deleted file mode 100644 index 763a18c..0000000 --- a/graze/unsplash.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -current_directory = os.path.dirname(os.path.abspath(__file__)) -os.chdir(current_directory) - -import logging -logging.basicConfig(filename='image_downloading.log', level=logging.ERROR) - -import requests -from bs4 import BeautifulSoup as bs -import base64 -import re -import json -from tqdm import tqdm - -headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} - -def download_img(src, img_name): - try: - image_data = requests.get(src).content - with open(img_name, 'wb') as img_file: - img_file.write(image_data) - except Exception as e: - logging.error(f"Error while downloading image: {str(e)}") - -def download_base64_img(data, img_name): - try: - image_data = base64.b64decode(data) - with open(img_name, 'wb') as img_file: - img_file.write(image_data) - except Exception as e: - logging.error(f"Error while downloading image: {str(e)}") - -def fetch_url(topic): - formatted_query = '-'.join(topic.split(' ')) - url = f"https://unsplash.com/s/photos/{formatted_query}" - - r = requests.get(url, headers=headers) - if r.status_code == 200: - html_content = r.content - soup = bs(html_content, 'html.parser') - img_tags = soup.find_all('img') - - topic_dir = f'../images/{formatted_query}' - if not os.path.exists(topic_dir): - os.makedirs(topic_dir) - - with tqdm(total=len(img_tags), desc=f"Downloading ") as pbar: - for idx, img in enumerate(img_tags): - try: - img_src = img.get('src') - if not img_src: - img_src = img.get('data-src') - - if img_src: - img_name = f'{topic_dir}/{idx}.jpg' - - if img_src.startswith('data:image'): - base64_data = re.sub('^data:image/.+;base64,', '', img_src) - download_base64_img(base64_data, img_name) - else: - download_img(img_src, img_name) - pbar.update(1) - except Exception as e: - logging.error(f"Error while fetching images: {str(e)}") - else: - logging.error(f"Failed to fetch the URL: {url}. 
Status code: {r.status_code}") - -def unsplash(topics=None): - if topics: - target_topics = topics - else: - with open('./imageQuery.json', 'r') as infile: - target_topics = json.load(infile) - - for topic in target_topics: - print(f"\nDownloading '{topic}' images:") - fetch_url(topic) \ No newline at end of file diff --git a/graze/utils/_britannica.py b/graze/utils/_britannica.py new file mode 100644 index 0000000..e69de29 diff --git a/graze/utils/_wikipedia.py b/graze/utils/_wikipedia.py new file mode 100644 index 0000000..e69de29 diff --git a/graze/wikipedia/main.py b/graze/wikipedia/main.py deleted file mode 100644 index 950e111..0000000 --- a/graze/wikipedia/main.py +++ /dev/null @@ -1,100 +0,0 @@ -""" - --> generates a target wikipeida-url from the provided queries - --> sends a request to that url and fetches the comeplete webpage - --> writes it in a file -""" - -import os -current_dir = os.path.dirname(os.path.abspath(__file__)) -os.chdir(current_dir) - -import requests -from bs4 import BeautifulSoup as bs -from tqdm import tqdm -from .queries import WikiQueries - -class WikiScraper: - def __init__(self, search_queries=None): - self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} - self.list_urls = [] - self.extra_urls = [] - self.total_urls = 0 - if search_queries: - self.search_queries = search_queries - else: - self.search_queries = WikiQueries() - - def __call__(self, out_file=None, extra_urls=False): - if out_file is not None: - for query in tqdm(self.search_queries(), desc="Generating valid urls"): - target_url = self.build_urls(query) - self.list_urls.append(target_url) - - for url in tqdm(self.list_urls, desc="Scrapping the web-pages\t"): - out_page = self.scrapper(url) - with open(out_file, 'a', encoding='utf-8') as f: - if out_page is not None: - for paragraph in out_page: - text = paragraph.get_text() - f.write(text) - else: continue - - if extra_urls is True: - for query in tqdm(self.search_queries(), desc="Generating extra urls"): - extra_urls = self.fetch_extra_urls(query) - self.extra_urls.append(extra_urls) - for url in tqdm([item for sublist in self.extra_urls for item in sublist], desc="Scrapping extra urls"): - extra_output = self.extra_scrape(url) - with open(out_file, 'a', encoding='utf-8') as f: - if extra_output is not None: - for para in extra_output: - new_text = para.get_text() - f.write(new_text) - else: continue - else: - pass - print('\n total fetched urls: ', self.total_urls) - else: - raise ValueError('provide a output file') - - def build_urls(self, query): - new_query = '_'.join(query.split(' ')) - wiki_url = f"https://en.wikipedia.org/wiki/{new_query}" - return wiki_url - - def scrapper(self, urls): - r = requests.get(urls, headers=self.headers) - if r.status_code == 200: - soup = bs(r.content, 'html.parser') - paragraphs = soup.find_all('p') - self.total_urls += 1 - return paragraphs - else: - pass - - def fetch_extra_urls(self, query): - urls = [] - new_query = '_'.join(query.split(' ')) - wiki_url = f"https://en.wikipedia.org/wiki/{new_query}" - - r = requests.get(wiki_url, headers=self.headers) - if r.status_code == 200: - soup = bs(r.content, 'html.parser') - links = soup.find_all('a') - urls.extend([url.get('href') for url in links]) - - return urls - - def extra_scrape(self, url): - if url.startswith('/'): - target_url = f"https://en.wikipedia.org{url}" - r = requests.get(target_url, headers=self.headers) - else: - return None - if 
r.status_code == 200: - soup = bs(r.content, 'html.parser') - paragraphs = soup.find_all('p') - self.total_urls += 1 - return paragraphs - else: - return None \ No newline at end of file diff --git a/graze/wikipedia/queries.py b/graze/wikipedia/queries.py deleted file mode 100644 index b802c18..0000000 --- a/graze/wikipedia/queries.py +++ /dev/null @@ -1,186 +0,0 @@ -""" - --> contains sample queries, similar to britannicaScrapper -""" - -class WikiQueries: - def __init__(self): - self.search_queries = [ - "antarctica", "colonization", "world war", "asia", "africa", - "australia", "holocaust", "voyages", "biological viruses", - "Martin Luther King Jr", "Abraham Lincoln", "Quarks", "Quantum Mechanics", - "Biological Viruses", "Drugs", "Rockets", "Physics", "Mathematics", - "nuclear physics", "nuclear fusion", "CRISPR CAS-9", "virginia woolf", - "cocaine", "marijuana", "apollo missions", "birds", "blogs", "journal", - "Adolf Hitler", "Presidents of United States", "genders and sexes", - "journalism", "maths theories", "matter and particles", "discoveries", - "authors and writers", "poets and novel writers", "literature", "awards and honors", - "climate change", "renewable energy", "artificial intelligence", "machine learning", - "blockchain technology", "cryptocurrencies", "space exploration", "Mars missions", - "black holes", "string theory", "evolution", "human genome project", "stem cells", - "pandemics", "influenza", "COVID-19", "vaccination", "genetic engineering", - "nanotechnology", "3D printing", "cybersecurity", "quantum computing", - "robotics", "drones", "self-driving cars", "electric vehicles", "smart cities", - "internet of things", "big data", "cloud computing", "augmented reality", - "virtual reality", "mixed reality", "social media", "digital marketing", - "e-commerce", "fintech", "global warming", "deforestation", "ocean acidification", - "biodiversity", "conservation", "sustainable agriculture", "organic farming", - "hydropower", "solar energy", "wind energy", "geothermal energy", "tidal energy", - "nuclear power", "space tourism", "interstellar travel", "terraforming", - "exoplanets", "SETI", "astrobiology", "dark matter", "dark energy", - "the big bang theory", "cosmic microwave background", "multiverse", "extraterrestrial life", - "neuroscience", "psychology", "behavioral economics", "cryptography", "linguistics", - "paleontology", "archaeology", "anthropology", "medieval history", "Renaissance art", - "Baroque music", "classical literature", "philosophy of science", "ethics", - "existentialism", "surrealism", "cubism", "impressionism", "romanticism", - "modernism", "postmodernism", "futurism", "dadaism", "expressionism", "neoclassicism", - "avant-garde", "minimalism", "pop art", "abstract art", "photorealism", "conceptual art", - "installation art", "performance art", "digital art", "cyberpunk", "steampunk", - "biopunk", "solarpunk", "architecture", "urban planning", "landscape architecture", - "industrial design", "fashion design", "graphic design", "web design", "interior design", - "product design", "animation", "film production", "video game design", "sound design", - "photography", "cinematography", "documentary filmmaking", "screenwriting", - "theater production", "stage design", "costume design", "makeup artistry", - "special effects", "visual effects", "motion capture", "virtual production", - "voice acting", "puppetry", "mime", "improvisation", "stand-up comedy", "satire", - "parody", "slapstick", "absurdism", "farce", "musical theater", "opera", "ballet", - 
"modern dance", "hip-hop dance", "tap dance", "ballroom dance", "folk dance", - "contemporary dance", "choreography", "dance therapy", "somatics", "martial arts", - "yoga", "meditation", "mindfulness", "holistic health", "naturopathy", "homeopathy", - "ayurveda", "traditional Chinese medicine", "acupuncture", "herbal medicine", "aromatherapy", - "reflexology", "reiki", "crystal healing", "energy medicine", "biofeedback", "hypnotherapy", - "sound healing", "art therapy", "music therapy", "drama therapy", "play therapy", - "adventure therapy", "wilderness therapy", "animal-assisted therapy", "horticulture therapy", - "nutrition therapy", "sports medicine", "physical therapy", "occupational therapy", - "speech therapy", "respiratory therapy", "cardiovascular health", "diabetes management", - "cancer treatment", "autoimmune diseases", "infectious diseases", "chronic pain management", - "geriatric care", "palliative care", "end-of-life care", "sleep disorders", "mental health", - "substance abuse", "eating disorders", "child development", "adolescent psychology", - "adult development", "gerontology", "family therapy", "couples therapy", "group therapy", - "psychopharmacology", "neuroplasticity", "brain-computer interface", "transhumanism", - "bioethics", "neuroethics", "genetic counseling", "biostatistics", "epidemiology", - "public health", "global health", "health informatics", "telemedicine", "digital health", - "wearable technology", "health economics", "health policy", "healthcare administration", - "medical education", "clinical trials", "evidence-based medicine", "patient advocacy", - "community health", "health equity", "social determinants of health", "health disparities", - "disability studies", "inclusive design", "universal design", "assistive technology", - "adaptive sports", "paralympic games", "special olympics", "accessible travel", - "inclusive education", "diversity and inclusion", "cultural competence", "multiculturalism", - "cross-cultural communication", "intercultural relations", "global citizenship", - "international relations", "diplomacy", "peace studies", "conflict resolution", - "negotiation", "mediation", "arbitration", "human rights", "social justice", - "environmental justice", "economic justice", "gender equality", "racial equality", - "LGBTQ+ rights", "disability rights", "children's rights", "indigenous rights", - "animal rights", "environmental sustainability", "corporate social responsibility", - "ethical leadership", "social entrepreneurship", "impact investing", "philanthropy", - "volunteerism", "community organizing", "grassroots movements", "advocacy", - "policy analysis", "public administration", "governance", "public management", - "e-governance", "smart governance", "participatory governance", "collaborative governance", - "network governance", "transparency", "accountability", "anti-corruption", "open data", - "data privacy", "cyber law", "intellectual property", "patent law", "trademark law", - "copyright law", "digital rights", "internet freedom", "media literacy", "information literacy", - "critical thinking", "creative problem solving", "innovation management", "design thinking", - "lean startup", "agile methodology", "project management", "product management", - "business strategy", "competitive analysis", "market research", "consumer behavior", - "branding", "advertising", "public relations", "sales management", "customer service", - "supply chain management", "logistics", "operations management", "quality management", - "risk management", 
"crisis management", "change management", "organizational behavior", - "human resource management", "talent management", "performance management", - "employee engagement", "workplace culture", "remote work", "flexible work", - "work-life balance", "professional development", "career planning", "leadership development", - "succession planning", "executive coaching", "mentorship", "networking", - "personal branding", "financial planning", "investment strategies", "retirement planning", - "estate planning", "tax planning", "insurance planning", "real estate", - "property management", "home improvement", "interior decoration", "gardening", - "landscaping", "home automation", "smart home technology", "DIY projects", - "crafts", "hobbies", "collecting", "board games", "card games", "video games", - "sports", "outdoor recreation", "camping", "hiking", "fishing", "hunting", - "boating", "sailing", "scuba diving", "snorkeling", "surfing", "skateboarding", - "snowboarding", "skiing", "mountaineering", "rock climbing", "bouldering", - "trail running", "marathon running", "triathlon", "ironman", "cycling", - "mountain biking", "road biking", "swimming", "water polo", "synchronized swimming", - "diving", "gymnastics", "cheerleading", "yoga", "pilates", "aerobics", - "strength training", "bodybuilding", "powerlifting", "crossfit", "functional fitness", - "high-intensity interval training", "calisthenics", "martial arts", "boxing", - "kickboxing", "taekwondo", "karate", "judo", "jiu-jitsu", "muay thai", - "wrestling", "sumo wrestling", "fencing", "archery", "shooting", "hunting", - "equestrian sports", "horse racing", "polo", "show jumping", "dressage", - "rodeo", "bull riding", "barrel racing", "team roping", "calf roping", - "ranch sorting", "cowboy action shooting", "reining", "cutting", "working cow horse", - "freestyle reining", "reined cow horse", "ranch riding", "trail riding", - "endurance riding", "pleasure riding", "driving", "carriage driving", - "combined driving", "pleasure driving", "team driving", "draft horse driving", - "hackney horse", "shetland pony", "welsh pony", "miniature horse", "falabella horse", - "paint horse", "quarter horse", "appaloosa horse", "arabian horse", - "thoroughbred horse", "warmblood horse", "friesian horse", "andalusian horse", - "lusitano horse", "lipizzaner horse", "haflinger horse", "fjord horse", - "connemara pony", "new forest pony", "highland pony", "fell pony", - "dales pony", "dartmoor pony", "exmoor pony", "suffolk punch", "cleveland bay", - "shire horse", "clydesdale horse", "percheron horse", "belgian horse", - "boulonnais horse", "ardennais horse", "comtois horse", "auxois horse", - "italian heavy draft", "norwegian fjord horse", "american cream draft horse", - "haflinger", "hafling pony", "hackney horse", "hackney pony", "trotter", - "pacer", "harness horse", "standardbred horse", "roadster horse", - "roadster pony", "saddle seat horse", "saddle seat pony", "fine harness horse", - "fine harness pony", "pleasure harness horse", "pleasure harness pony", - "draft horse", "heavy horse", "plow horse", "working horse", "farm horse", - "riding horse", "driving horse", "pony", "miniature pony", "dwarf horse", - "small horse", "large pony", "small pony", "medium pony", "large horse", - "small draft horse", "medium draft horse", "large draft horse", "medium light horse", - "light horse", "heavy light horse", "medium heavy horse", "heavy heavy horse", - "small heavy horse", "medium heavy horse", "large heavy horse", "small light horse", - "medium light horse", 
"large light horse", "small medium light horse", - "medium large light horse", "large medium light horse", "small large light horse", - "large small light horse", "medium large heavy horse", "large medium heavy horse", - "small medium heavy horse", "medium small heavy horse", "small large heavy horse", - "large small heavy horse", "medium large medium horse", "large medium medium horse", - "small medium medium horse", "medium small medium horse", "small large medium horse", - "large small medium horse", "medium large small horse", "large medium small horse", - "small medium small horse", "medium small small horse", "small large small horse", - "large small small horse", "medium large tiny horse", "large medium tiny horse", - "small medium tiny horse", "medium small tiny horse", "small large tiny horse", - "large small tiny horse", "medium large giant horse", "large medium giant horse", - "small medium giant horse", "medium small giant horse", "small large giant horse", - "large small giant horse", "medium large huge horse", "large medium huge horse", - "small medium huge horse", "medium small huge horse", "small large huge horse", - "large small huge horse", "medium large enormous horse", "large medium enormous horse", - "small medium enormous horse", "medium small enormous horse", "small large enormous horse", - "large small enormous horse", "medium large tiny pony", "large medium tiny pony", - "small medium tiny pony", "medium small tiny pony", "small large tiny pony", - "large small tiny pony", "medium large giant pony", "large medium giant pony", - "small medium giant pony", "medium small giant pony", "small large giant pony", - "large small giant pony", "medium large huge pony", "large medium huge pony", - "small medium huge pony", "medium small huge pony", "small large huge pony", - "large small huge pony", "medium large enormous pony", "large medium enormous pony", - "small medium enormous pony", "medium small enormous pony", "small large enormous pony", - "large small enormous pony", "medium large tiny horse", "large medium tiny horse", - "small medium tiny horse", "medium small tiny horse", "small large tiny horse", - "large small tiny horse", "medium large giant horse", "large medium giant horse", - "small medium giant horse", "medium small giant horse", "small large giant horse", - "large small giant horse", "medium large huge horse", "large medium huge horse", - "small medium huge horse", "medium small huge horse", "small large huge horse", - "large small huge horse", "medium large enormous horse", "large medium enormous horse", - "small medium enormous horse", "medium small enormous horse", "small large enormous horse", - "large small enormous horse", "medium large tiny pony", "large medium tiny pony", - "small medium tiny pony", "medium small tiny pony", "small large tiny pony", - "large small tiny pony", "medium large giant pony", "large medium giant pony", - "small medium giant pony", "medium small giant pony", "small large giant pony", - "large small giant pony", "medium large huge pony", "large medium huge pony", - "small medium huge pony", "medium small huge pony", "small large huge pony", - "large small huge pony", "medium large enormous pony", "large medium enormous pony", - "small medium enormous pony", "medium small enormous pony", "small large enormous pony", - "large small enormous pony", "medium large tiny horse", "large medium tiny horse", - "small medium tiny horse", "medium small tiny horse", "small large tiny horse", - "large small tiny horse", "medium large 
giant horse", "large medium giant horse", - "small medium giant horse", "medium small giant horse", "small large giant horse", - "large small giant horse", "medium large huge horse", "large medium huge horse", - "small medium huge horse", "medium small huge horse", "small large huge horse", - "large small huge horse", "medium large enormous horse", "large medium enormous horse", - "small medium enormous horse", "medium small enormous horse", "small large enormous horse", - "large small enormous horse", "medium large tiny pony", "large medium tiny pony", - "small medium tiny pony", "medium small tiny pony", "small large tiny pony", - "large small tiny pony", "medium large giant pony", "large medium giant pony", - "small medium giant pony", "medium small giant pony", "small large giant pony" - ] - - def __call__(self): - return self.search_queries \ No newline at end of file diff --git a/graze/wikipedia/wiki_extractor.py b/graze/wikipedia/wiki_extractor.py deleted file mode 100644 index 03afe79..0000000 --- a/graze/wikipedia/wiki_extractor.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -import timeit -import requests -from bs4 import BeautifulSoup as bs -from concurrent.futures import ThreadPoolExecutor -from tqdm import tqdm - -class WikiScraper: - def __init__(self): - self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} - self.processed_urls = set() - - def scrape_url(self, url): - out_page = self.scrapper(url.strip()) - if out_page is not None: - text = ''.join([paragraph.get_text() for paragraph in out_page]) - return text - else: - return '' - - def scrape_from_file(self, url_file, out_file, batch_size=1000): - with open(url_file, 'r', encoding='utf-8') as f: - urls = f.readlines() - - urls = list(set([url.strip() for url in urls])) - - with ThreadPoolExecutor(max_workers=40) as executor: - futures = [] - for i in range(0, len(urls), batch_size): - urls_batch = urls[i:i+batch_size] - for url in urls_batch: - if url not in self.processed_urls: - future = executor.submit(self.scrape_url, url) - futures.append(future) - self.processed_urls.add(url) - - with open(out_file, 'a', encoding='utf-8') as outfile: - for future in tqdm(futures, desc="Scrapping URLs"): - text = future.result() - outfile.write(text) - - print(f'Total fetched URLs: {len(self.processed_urls)}') - - def scrapper(self, url): - r = requests.get(url, headers=self.headers) - if r.status_code == 200: - soup = bs(r.content, 'html.parser') - paragraphs = soup.find_all('p') - return paragraphs - else: - return None - -if __name__ == "__main__": - current_dir = os.path.dirname(os.path.abspath(__file__)) - os.chdir(current_dir) - scraper = WikiScraper() - url_file = 'extracted_urls.txt' - output_file = 'Datasets/wiki_110k.txt' - - start_time = timeit.default_timer() - scraper.scrape_from_file(url_file, output_file, batch_size=500) - print(f"Total time taken: {timeit.default_timer() - start_time:.2f} seconds") diff --git a/graze/wikipedia/xml_parser.py b/graze/wikipedia/xml_parser.py deleted file mode 100644 index 4e8fdac..0000000 --- a/graze/wikipedia/xml_parser.py +++ /dev/null @@ -1,17 +0,0 @@ -import os -current_directory = os.path.dirname(os.path.abspath(__file__)) -os.chdir(current_directory) -import xml.etree.ElementTree as ET - -# Load XML file -tree = ET.parse('xml dumps/enwiki2.xml') -root = tree.getroot() - -urls = [link.text for link in root.findall(".//link")] - -output_file = 'extracted_urls1.txt' -with 
open(output_file, 'w', encoding='utf-8') as f: - for url in urls: - f.write(url + '\n') - -print(f"Extracted URLs saved to {output_file}") \ No newline at end of file diff --git a/graze/youtube/base.py b/graze/youtube/base.py deleted file mode 100644 index 7881a91..0000000 --- a/graze/youtube/base.py +++ /dev/null @@ -1,93 +0,0 @@ -from googleapiclient.discovery import build -from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled -import logging -from tqdm import tqdm -import json -import os - -logging.basicConfig(filename='youtube_fetch.log', level=logging.ERROR) -current_dir = os.path.dirname(os.path.realpath(__file__)) -os.chdir(current_dir) - -class Youtube: - def __init__(self, api_key, filepath) -> None: - self.api_key = api_key - self.filepath = filepath - self.youtube = build('youtube', 'v3', developerKey=api_key) - - def __call__(self, channel_id=None): - if channel_id: - self.channelData = [channel_id] - else: - with open('./channelIds.json', 'r') as infile: - self.channelData = json.load(infile) - self.run() - - def fetch_url(self, channelId): - next_page_token = None - videoIds = [] - while True: - response = self.youtube.channels().list( - part='contentDetails', id=channelId - ).execute() - - if 'items' in response and response['items']: - playlistId = response['items'][0]['contentDetails']['relatedPlaylists']['uploads'] - - playlistsRes = self.youtube.playlistItems().list( - part='contentDetails', playlistId=playlistId, - maxResults=100, pageToken=next_page_token - ).execute() - - videoIds.extend([item['contentDetails']['videoId'] for item in playlistsRes.get('items', [])]) - next_page_token = playlistsRes.get('nextPageToken') - if not next_page_token: - break - return videoIds - - def convert(self, result): - with open(f"{self.filepath}.json", 'w') as outfile: - json.dump(result, outfile, indent=2) - print("Data written successfully in JSON file") - - def get_captions(self, videoId): - try: - raw_transcripts = [] - videoNo = 0 - for ids in videoId: - try: - captions = YouTubeTranscriptApi.get_transcript( - ids, languages=['en'], preserve_formatting=True - ) - if captions: - formatted_captions = [{'text': caption['text']} for caption in captions] - raw_transcripts.append(formatted_captions) - videoNo += 1 - else: - continue - except TranscriptsDisabled as e: - logging.error(f"Error while fetching the videos: {str(e)}") - except Exception as e: - logging.error(f"Error while fetching the videos: {str(e)}") - print(f"Number of videos that had captions: {videoNo}") - return raw_transcripts - except Exception as e: - logging.error(f"Error while fetching the videos: {str(e)}") - - def save_captions(self, transcripts): - with open(f"{self.filepath}.txt", 'a', encoding='utf-8') as file: - for video_captions in transcripts: - for line in video_captions: - file.write(line['text']) - - def run(self): - urldict = [] - for channel_id in tqdm(self.channelData, desc="Fetching captions from Youtube: "): - video_ids = self.fetch_url(channel_id) - captions = self.get_captions(video_ids) - self.save_captions(captions) - for i in video_ids: - videoLink = f"https://www.youtube.com/watch?v={i}" - urldict.append(videoLink) - - self.convert(urldict) \ No newline at end of file diff --git a/graze/youtube/channelIds.json b/graze/youtube/channelIds.json deleted file mode 100644 index 13bb1ce..0000000 --- a/graze/youtube/channelIds.json +++ /dev/null @@ -1,114 +0,0 @@ -[ - "UCb_MAhL8Thb3HJ_wPkH3gcw", - "UCA295QVkf9O1RQ8_-s3FVXg", - "UCpFFItkfZz1qz5PpHpqzYBw", - 
"UCY1kMZp36IQSyNx_9h4mpCg", - "UCA19mAJURyYHbJzhfpqhpCA", - "UCqnbDFdCpuN8CMEg0VuEBqA", - "UCddiUEpeqJcYeBxX1IVBKvQ", - "UCcefcZRL2oaA_uBNeo5UOWg", - "UCLXo7UDZvByw2ixzpQCufnA", - "UCsQoiOrh7jzKmE8NBofhTnQ", - "UCUyvQV2JsICeLZP4c_h40kA", - "UCvjgXvBlbQiydffZU7m1_aw", - "UCRI00CwLZdLRCWg5BdDOsNw", - "UCEIwxahdLz7bap-VDs9h35A", - "UC4bq21IPPbpu0Qrsl7LW0sw", - "UCR1IuLEqb6UEA_zQ81kwXfg", - "UCIlU5KDHKFSaebYviKfOidw", - "UCtYKe7-XbaDjpUwcU5x0bLg", - "UCBJycsmduvYEL83R_U4JriQ", - "UCRcgy6GzDeccI7dkbbBna3Q", - "UC3_BakzLfadvFrsnClMFWmQ", - "UCmGSJVG3mCRXVOP4yZrU1Dw", - "UCFN6lQpfY8XIRdhv9G-f4bg", - "UConJDkGk921yT9hISzFqpzw", - "UClWTCPVi-AU9TeCN6FkGARg", - "UCyHJ94JzwY92NsBVzJ2aE3Q", - "UCTqEu1wZDBju2tHkNP1dwzQ", - "UC6nSFpj9HTCZ5t-N3Rm3-HA", - "UCX6b17PVsYBQ0ip5gyeme-Q", - "UCONtPx56PSebXJOxbFv-2jQ", - "UCZYTClx2T1of7BRZ86-8fow", - "UCzWQYUVCpZqtN93H8RR44Qw", - "UCYbK_tjZ2OrIZFBvU6CCMiA", - "UCxzC4EngIsMrPmbm6Nxvb-A", - "UCcabW7890RKJzL968QWEykA", - "UCamLstJyCa-t5gfZegxsFMw", - "UC415bOPUcGSamy543abLmRA", - "UCpMcsdZf2KkAnfmxiq2MfMQ", - "UCqVEHtQoXHmUCfJ-9smpTSg", - "UCYO_jab_esuFRV4b17AJtAw", - "UCHnyfMqiRRG1u-2MsSQLbXA", - "UCsXVk37bltHxD1rDPwtNM8Q", - "UC9RM-iSvTu1uPJb8X5yp3EQ", - "UCZaT_X_mc0BI-djXOlfhqWQ", - "UCMiJRAwDNSNzuYeN2uWa0pA", - "UCHpw8xwDNhU9gdohEcJu4aA", - "UCK7tptUDHh-RYDsdxO1-5QQ", - "UCsooa4yRKGN_zEE8iknghZA", - "UC6n8I1UDTKP1IWjQMg6_TwA", - "UC8butISFwT-Wl7EV0hUK0BQ", - "UCgRQHK8Ttr1j9xCEpCAlgbQ", - "UCEBb1b_L6zDS3xTUrIALZOw", - "UCN0QBfKk0ZSytyX_16M11fA", - "UCBpxspUNl1Th33XbugiHJzw", - "UC3osNjJeuDdvyALIEP-nh0g", - "UCaSCt8s_4nfkRglWCvNSDrg", - "UCjgpFI5dU-D1-kh9H1muoxQ", - "UCBa659QWEk1AI4Tg--mrJ2A", - "UCdBK94H6oZT2Q7l0-b0xmMg", - "UCBA9cAuPy9L5IYYXqOduIvw", - "UCXjmz8dFzRJZrZY8eFiXNUQ", - "UClZbmi9JzfnB2CEb0fG8iew", - "UCUFoQUaVRt3MVFxqwPUMLCQ", - "UCgLxmJ8xER7Y7sywMN5SfWg", - "UCac1MisHGa0qtzf0oWlU8Zw", - "UCSIvk78tK2TiviLQn4fSHaw", - "UCUyvQV2JsICeLZP4c_h40kA", - "UCqFzWxSCi39LnW1JKFR3efg", - "UCqFzWxSCi39LnW1JKFR3efg", - "UCccjdJEay2hpb5scz61zY6Q", - "UC8CX0LD98EDXl4UYX1MDCXg", - "UC6VcWc1rAoWdBCM0JxrRQ3A", - "UCSHZKyawb77ixDdsGog4iWA", - "UCVHdvAX5-R8y5l9xp6nroBQ", - "UCTb6Oy0TXI03iEUdRMR9dnw", - "UCqoAEDirJPjEUFcF2FklnBA", - "UCccjdJEay2hpb5scz61zY6Q", - "UCNVBYBxWj9dMHqKEl_V8HBQ", - "UCNVBYBxWj9dMHqKEl_V8HBQ", - "UCb-vZWBeWA5Q2818JmmJiqQ", - "UChDKyKQ59fYz3JO2fl0Z6sg", - "UCupvZG-5ko_eiXAupbDfxWw", - "UCDrLGkZTcNCshOLiKi5NtEw", - "UCWOA1ZGywLbqmigxE4Qlvuw", - "UCrM7B7SL_g1edFOnmj-SDKg", - "UCUMZ7gohGI9HcU9VNsr2FJQ", - "UCF9imwPMSGz4Vq1NiTWCC7g", - "UCjmJDM5pRKbUlVIzDYYWb6g", - "UCrRttZIypNTA1Mrfwo745Sg", - "UC0k238zFx-Z8xFH0sxCrPJg", - "UCT9zcQNlyht7fRlcjmflRSA", - "UC0C-w0YjGpqDXGB8IHb662A", - "UCgQna2EqpzqzfBjlSmzT72w", - "UCeLHszkByNZtPKcaVXOCOQQ", - "UCjNRJBlxvvS0UXAT2Ack-QQ", - "UC-J-KZfRV8c13fOCkhXdLiQ", - "UCfM3zsQsOnfWNUppiycmBuw", - "UCNjHgaLpdy1IMNK57pYiKiQ", - "UCqECaJ8Gagnn7YCbPEzWH6g", - "UCb2HGwORFBo94DmRx4oLzow", - "UCi4EDAgjULwwNBHOg1aaCig", - "UCDPM_n1atn2ijUwHd0NNRQw", - "UCcgqSM4YEo5vVQpqwN-MaNw", - "UCoUM-UJ7rirJYP8CQ0EIaHA", - "UC0WP5P-ufpRfjbNrmOWwLBQ", - "UCBVjMGOIkavEAhyqpxJ73Dw", - "UCPHjpfnnGklkRBBTd0k6aHg", - "UCmHhviensDlGQeU8Yo80zdg", - "UC6IBMCQ6-d7p41KHxOsq4RA", - "UCiMhD4jzUqG-IgPzUmmytRQ", - "UCB0JSO6d5ysH2Mmqz5I9rIw", - "UC-lHJZR3Gqxm24_Vd_AJ5Yw" -] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 61f6a1e..592298d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,8 @@ requests tqdm timeit json -re \ No newline at end of file +re +googleapiclient +youtube_transcript_api +logging 
+typing
\ No newline at end of file
diff --git a/run.py/run_britannica.py b/run.py/run_britannica.py
deleted file mode 100644
index 1a287e4..0000000
--- a/run.py/run_britannica.py
+++ /dev/null
@@ -1,17 +0,0 @@
-"""
-  --> sample script for collecting data from britannica.com
-"""
-
-import os
-current_directory = os.path.dirname(os.path.abspath(__file__))
-os.chdir(current_directory)
-import timeit
-
-from graze import britannica
-
-scraper = britannica(max_limit=20)
-start_time = timeit.default_timer()
-scraper(out_file='./output.txt')
-end_time = timeit.default_timer()
-
-print(f"total time taken: {((end_time - start_time) / 60):2f} mins")
\ No newline at end of file
diff --git a/run.py/run_transcripts.py b/run.py/run_transcripts.py
index 18c08b5..bef766a 100644
--- a/run.py/run_transcripts.py
+++ b/run.py/run_transcripts.py
@@ -1,7 +1,3 @@
-"""
-  --> sample script to collect transcripts from youtube videos
-"""
-
 import os
 from dotenv import load_dotenv
 load_dotenv()
@@ -10,7 +6,10 @@
 api_key = os.getenv('yt_key')
 
-from graze import youtube
+from graze import Youtube
+from graze.queries import Queries
+
+queries = Queries(category="channel")
 
-scraper = youtube(api_key=api_key, filepath='./output.txt')
-scraper()
\ No newline at end of file
+youtube = Youtube(api_key=api_key, filepath='../transcripts', max_results=50)
+youtube(channel_ids=queries(), videoUrls=True)
\ No newline at end of file
diff --git a/run.py/run_unsplash.py b/run.py/run_unsplash.py
new file mode 100644
index 0000000..09c801e
--- /dev/null
+++ b/run.py/run_unsplash.py
@@ -0,0 +1,11 @@
+import os
+current_directory = os.path.dirname(os.path.abspath(__file__))
+os.chdir(current_directory)
+
+from graze import Unsplash
+from graze.queries import Queries
+
+topics = Queries("images")
+
+image = Unsplash(directory='../images', metrics=True)
+image(topics=topics())
\ No newline at end of file
diff --git a/run.py/run_wiki.py b/run.py/run_wiki.py
index 729cfd3..30f5eaf 100644
--- a/run.py/run_wiki.py
+++ b/run.py/run_wiki.py
@@ -1,18 +1,7 @@
-"""
-  --> sample script to collect data from wikipedia.com
-"""
+from graze import Wikipedia
+from graze.queries import Queries
 
-import os
-current_directory = os.path.dirname(os.path.abspath(__file__))
-os.chdir(current_directory)
-import timeit
+queries = Queries(category="search")
+wiki = Wikipedia(filepath='../data.txt', metrics=True)
 
-from graze import wikipedia
-
-wiki = wikipedia()
-
-start_time = timeit.default_timer()
-wiki(out_file='./output.txt')
-end_time = timeit.default_timer()
-
-print(f"total time taken: {((end_time - start_time) / 60):2f} mins")
\ No newline at end of file
+wiki(queries=queries(), extra_urls=True)
\ No newline at end of file
diff --git a/test.py b/test.py
index 6121e36..09c801e 100644
--- a/test.py
+++ b/test.py
@@ -1,3 +1,11 @@
-import graze
+import os
+current_directory = os.path.dirname(os.path.abspath(__file__))
+os.chdir(current_directory)
 
-graze.unsplash()
\ No newline at end of file
+from graze import Unsplash
+from graze.queries import Queries
+
+topics = Queries("images")
+
+image = Unsplash(directory='../images', metrics=True)
+image(topics=topics())
\ No newline at end of file
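
The patch deletes `run.py/run_britannica.py` outright even though the refactored package still exports `Britannica`, so that scraper is left without a sample runner. Below is a minimal sketch of what a replacement runner could look like; it assumes `Britannica` mirrors the Wikipedia scraper's calling convention (`filepath` and `metrics` on the constructor, queries passed on call) and that `Queries(category="search")` is the right query set — none of those parameter names are confirmed by this diff.

```python
# Hypothetical replacement for the deleted run.py/run_britannica.py.
# The Britannica class's real constructor/call signature is not shown in this
# patch, so every parameter below is an assumption modelled on run_wiki.py.
import os

# keep relative output paths stable, as the other runners do
current_directory = os.path.dirname(os.path.abspath(__file__))
os.chdir(current_directory)

from graze import Britannica
from graze.queries import Queries

queries = Queries(category="search")                           # assumed: same query set as the Wikipedia runner
brit = Britannica(filepath='../britannica.txt', metrics=True)  # assumed parameters
brit(queries=queries())
```

If `Britannica` instead keeps its old `max_limit`-style interface from the deleted runner, only the constructor line would change.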