From 96a0575187c704025fb6b32a096653c59661dc11 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 21 May 2024 21:47:20 +0100 Subject: [PATCH 1/8] remove fstrings from cache logging --- cazy_webscraper/cache/ncbi.py | 44 ++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/cazy_webscraper/cache/ncbi.py b/cazy_webscraper/cache/ncbi.py index e46c1865..dfc2af26 100644 --- a/cazy_webscraper/cache/ncbi.py +++ b/cazy_webscraper/cache/ncbi.py @@ -47,6 +47,9 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from cazy_webscraper import closing_message +from cazy_webscraper.ncbi.sequences import get_protein_accession + def get_cache_seqs(start_time, args): """Extract protein sequences from FASTA and/or JSON file, which will be added to the @@ -62,7 +65,7 @@ def get_cache_seqs(start_time, args): seq_records = [] if args.seq_dict: - logger.warning(f"Getting sequences from JSON cache:\n{args.seq_dict}") + logger.warning("Getting sequences from JSON cache:\n%s", args.seq_dict) try: with open(args.seq_dict, "r") as fh: @@ -70,10 +73,10 @@ def get_cache_seqs(start_time, args): except FileNotFoundError: logger.error( - f"Could not find JSON file of protein sequences at:\n" - f"{args.seq_dict}\n" - "Check the path is correct" - "Terminating program" + "Could not find JSON file of protein sequences at:\n" + "%s\n" + "Check the path is correct. 
Terminating program", + args.seq_dict ) closing_message("Get GenBank seqs", start_time, args, early_term=True) @@ -82,7 +85,7 @@ def get_cache_seqs(start_time, args): seq_dict[key] = Seq(cache_dict[key]) if args.seq_file: - logger.warning(f"Getting sequences from FASTA cache:\n{args.seq_file}") + logger.warning("Getting sequences from FASTA cache:\n%s", args.seq_file) try: for record in SeqIO.parse(args.seq_file, "fasta"): @@ -91,20 +94,23 @@ def get_cache_seqs(start_time, args): if retrieved_accession is None: logger.error( "Could not retrieve a NCBI protein version accession from cache\n" - f"from the record id '{record.id}'\n" - "The sequence from this record will not be added to the db" + "from the record id '%s'\n" + "The sequence from this record will not be added to the db", + record.id ) continue try: - seq_dict[retrieved_accession] if seq_dict[retrieved_accession] != record.seq: logger.warning( - f"Retrieved seq for {retrieved_accession} from JSON file which does NOT match " + "Retrieved seq for %s from JSON file which does NOT match " "the seq in the FASTA file.\n" "Adding seq from the FASTA file to the local CAZyme database\n" - f"JSON seq: {seq_dict[retrieved_accession]}\n" - f"FASTA seq: {record.seq}" + "JSON seq: %s\n" + "FASTA seq: %s", + retrieved_accession, + seq_dict[retrieved_accession], + record.seq ) seq_dict[retrieved_accession] = record.seq except KeyError: @@ -112,16 +118,16 @@ def get_cache_seqs(start_time, args): except FileNotFoundError: logger.error( - f"Could not find FASTA file of protein sequences at:\n" - f"{args.seq_file}\n" - "Check the path is correct" - "Terminating program" + "Could not find FASTA file of protein sequences at:\n" + "%s\n" + "Check the path is correct. 
Terminating program", + args.seq_file ) closing_message("Get GenBank seqs", start_time, args, early_term=True) - for key in seq_dict: - seq_records.append(SeqRecord(id=key, seq=Seq(seq_dict[key]))) + for key, value in seq_dict.items(): + seq_records.append(SeqRecord(id=key, seq=Seq(value))) - logger.warning(f"Retrieved {len(seq_records)} from cache") + logger.warning("Retrieved %s from cache", len(seq_records)) return seq_dict, seq_records From 2ff52322238bbec5b5165045c753a05fdb7d12b9 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 21 May 2024 21:51:24 +0100 Subject: [PATCH 2/8] add type hinting --- cazy_webscraper/cache/ncbi.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cazy_webscraper/cache/ncbi.py b/cazy_webscraper/cache/ncbi.py index dfc2af26..e5199962 100644 --- a/cazy_webscraper/cache/ncbi.py +++ b/cazy_webscraper/cache/ncbi.py @@ -40,6 +40,7 @@ """Cache data retrieved from the remove NCBI database""" +import argparse import logging import json @@ -51,7 +52,10 @@ from cazy_webscraper.ncbi.sequences import get_protein_accession -def get_cache_seqs(start_time, args): +def get_cache_seqs( + start_time: str, + args: argparse.ArgumentParser +) -> tuple(dict[str, Seq], list[SeqRecord]): """Extract protein sequences from FASTA and/or JSON file, which will be added to the local CAZyme database From 64198b609170ed9f6ba91b377c9c83d7e47f4c33 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Sun, 2 Jun 2024 09:37:23 +0100 Subject: [PATCH 3/8] remove unused imports --- cazy_webscraper/cazy_scraper.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cazy_webscraper/cazy_scraper.py b/cazy_webscraper/cazy_scraper.py index b85dc2a9..46a0dd22 100644 --- a/cazy_webscraper/cazy_scraper.py +++ b/cazy_webscraper/cazy_scraper.py @@ -68,7 +68,6 @@ import logging -import json import os from datetime import datetime @@ -81,8 +80,6 @@ from saintBioutils.utilities.logger import config_logger, build_logger from cazy_webscraper import ( - 
CITATION_INFO, - VERSION_INFO, closing_message, connect_to_new_db, connect_existing_db, From 05e0a413e0b5a766b150473e3a747c8585083912 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Sun, 16 Jun 2024 15:13:18 +0100 Subject: [PATCH 4/8] refactorise general ops to own modules --- cazy_webscraper/__init__.py | 123 +----------------- cazy_webscraper/cazy/__init__.py | 17 ++- cazy_webscraper/cazy_scraper.py | 193 ++++++++-------------------- cazy_webscraper/crawler/__init__.py | 74 +++++------ 4 files changed, 107 insertions(+), 300 deletions(-) diff --git a/cazy_webscraper/__init__.py b/cazy_webscraper/__init__.py index 5e3d75be..82b9bc2f 100644 --- a/cazy_webscraper/__init__.py +++ b/cazy_webscraper/__init__.py @@ -41,14 +41,12 @@ import logging -import os import sys import numpy as np import pandas as pd from datetime import datetime -from pathlib import Path import Bio import bioservices @@ -61,12 +59,7 @@ import sqlalchemy import tqdm -from saintBioutils.utilities.file_io import make_output_directory - -from cazy_webscraper.sql import sql_orm - - -__version__ = "2.3.0.3" +__version__ = "3.0.0.0" VERSION_INFO = f"cazy_webscraper version: {__version__}" @@ -88,6 +81,9 @@ AUTHOR_EMAIL = "eemh1@st-andrews.ac.uk" +CAZY_URL = "http://www.cazy.org" +DOWNLOAD_URL = 'http://www.cazy.org/IMG/cazy_data/cazy_data.zip' + def closing_message(job, start_time, args, early_term=False): """Write closing messsage to terminal @@ -215,114 +211,3 @@ def display_version_info(): """ print(message) - - -def connect_existing_db(args, time_stamp, start_time): - """Coordinate connecting to an existing local CAZyme database, define logger name and cache dir - - :param args: cmd-line args parser - :param time_stamp: str, time cazy_webscraper was invoked - :param start_time: pd date-time obj, time cazy_webscraper was invoked - - Return connection to local CAZyme database, logger file name, and path to cache dir - """ - logger = logging.getLogger(__name__) - - logger.info("Adding data to an 
existing local CAZyme database") - - if os.path.isfile(args.database) is False: - logger.error( - "Could not find local CAZy database.\n" - "Check path is correct.\n" - "Terminating programme." - ) - closing_message("cazy_webscraper", start_time, args) - sys.exit(1) - - try: - connection = sql_orm.get_db_connection(args.database, args.sql_echo, new=False) - logger.info("Opened connection to local CAZyme database") - except Exception: - logger.error( - "Failed to open connection to an exiting local CAZyme database\n." - "Terminating program\n", - exc_info=True, - ) - closing_message("cazy_webscraper", start_time, args) - sys.exit(1) - - # used for naming additional log files - logger_name = str(args.database).split('.')[0] - - # define path to cache family txt files - cache_dir = Path(f"{str(args.database.parent)}/.cazy_webscraper_{time_stamp}") - - return connection, logger_name, cache_dir - - -def connect_to_new_db(args, time_stamp, start_time): - """Build and connect to a new local CAZyme database. - - :param args: cmd-line args parser - :param time_stamp: str, time cazy_webscraper was invoked - :param start_time: pd date-time obj, time cazy_webscraper was invoked - - Return connection to the database, name of the logger, and path to the cache dir - """ - logger = logging.getLogger(__name__) - - if args.db_output is not None: # user defined target output for the NEW database - - if os.path.isfile(args.db_output): # target file exists - if args.force: - logger.warning( - "Overwriting existing local CAZyme database at:\n" - f"{args.db_output}" - ) - - else: - logger.warning( - "Target path for new database already exists.\n" - "Either enable forced overwriting (-f) or add data this data (-D).\n" - "Terminating program." 
- ) - closing_message("cazy_webscraper", start_time, args) - sys.exit(1) - - else: # may need to build dirs - logger.info( - "Building new local CAZyme database\n" - f"Output directory: {(args.db_output).parent}\n" - f"Force overwriting exiting output file: {args.force}" - ) - - if str((args.db_output).parent) != '.': # dirs defined in output put - output_dir = (args.db_output).parent - make_output_directory(output_dir, args.force, args.nodelete) - cache_dir = Path(f"{str(output_dir)}/.cazy_webscraper_{time_stamp}") - - else: # writing to cwd - cache_dir = Path(f".cazy_webscraper_{time_stamp}") - - logger_name = str(args.db_output).split('.')[0] - db_path = args.db_output - - else: - logger.info("Using default database name and writing to cwd") - db_path = Path(f"cazy_webscraper_{time_stamp}.db") - cache_dir = Path(f".cazy_webscraper_{time_stamp}") - logger_name = f'cazy_webscraper_{time_stamp}' - - try: - connection = sql_orm.get_db_connection(db_path, args.sql_echo, new=True) - logger.warning(f"Built new local CAZyme database at\n{db_path}") - except Exception: - logger.error( - "Failed to build new SQL database\n." 
- "Terminating program", - exc_info=True, - ) - closing_message("cazy_webscraper", start_time, args) - sys.exit(1) - - return connection, logger_name, cache_dir diff --git a/cazy_webscraper/cazy/__init__.py b/cazy_webscraper/cazy/__init__.py index 6cf8c926..20c9c98f 100644 --- a/cazy_webscraper/cazy/__init__.py +++ b/cazy_webscraper/cazy/__init__.py @@ -40,11 +40,13 @@ """Parse data retrieved from CAZy and build a dict of data matching the user's criteria.""" +import argparse import logging import re import sys from collections import namedtuple +from pathlib import Path from tqdm import tqdm from zipfile import ZipFile @@ -53,11 +55,13 @@ from cazy_webscraper.crawler import get_cazy_file -def get_cazy_txt_file_data(cache_dir, time_stamp, args): +def get_cazy_txt_file_data(cache_dir: Path, time_stamp: str, args: argparse.ArgumentParser): """Retrieve txt file of CAZy db dump from CAZy or the local disk. + :param cache_dir: Path(), path to directory where cache is written to :param time_stamp: str, date and time cazy_webscraper was intiated :param args: cmd-line args parser + Return list of lines from CAZy txt file, one line is one item in the list""" logger = logging.getLogger(__name__) @@ -84,10 +88,13 @@ def get_cazy_txt_file_data(cache_dir, time_stamp, args): if err_message is not None: logger.error( "Could not connect to CAZy to download the CAZy db txt file after " - f"{(args.retries + 1)*(args.retries + 1)}\n" - f"The following error was raised:\n{err_message}" - f"File would have been written to {cazy_txt_path}" - "Terminating program" + "%s\n" + "The following error was raised:\n%s" + "File would have been written to %s" + "Terminating program", + ((args.retries + 1)*(args.retries + 1)), + err_message, + cazy_txt_path, ) sys.exit(1) diff --git a/cazy_webscraper/cazy_scraper.py b/cazy_webscraper/cazy_scraper.py index 46a0dd22..f0b5c707 100644 --- a/cazy_webscraper/cazy_scraper.py +++ b/cazy_webscraper/cazy_scraper.py @@ -40,54 +40,37 @@ # SOFTWARE. 
""" Web scraper to scrape CAZy website and retrieve all protein data. - -:cmd_args --cache_dir: target path for cache -:cmd_args --cazy_synonms: path to yaml file containing CAZy class name synonms -:cmd_args --classes: specify CAZy classes to scrape -:cmd_args --citation: print citation information -:cmd_args --config: path to configruration file -:cmd_args --database: provide path to a local SQLite database to add additional data to -:cmd_args --db_output: path to write out new SQLite database -:cmd_args --families: specify CAZy families to retrieve CAZymes from -:cmd_args --force: force overwriting existing database -:cmd_args --genera: specify Genera to retrieve CAZymes from -:cmd_args --kingdoms: specify taxonomy Kingdoms to scrape proteins from -:cmd_args --log: path to log file, enables writing out log messages to a log file -:cmd_args --nodelete_cache: do not deleted existing content in cache dir -:cmd_args --nodelete_log: do not deleted existing content in log dir -:cmd_args --output: path to output directory -:cmd_args --retries: specify the number of times to try scraping a page if connection fails -:cmd_args --subfamilies: enable retrieval of subfamilies from CAZy -:cmd_args --species: specify species to retrieve CAZymes from -:cmd_args --strains: specify specific strains of species to retrieve CAZymes from -:cmd_args --timeout: specify the maximum time (in seconds) before determining connection timed out -:cmd_args --validate: retrieve CAZy fam population sizes and check against -:cmd_args --verbose: change logger level from warning to info, verbose logging -:cmd_args --version: print version info """ +import argparse import logging -import os +import sys from datetime import datetime from typing import List, Optional import pandas as pd +from pathlib import Path + from Bio import Entrez from saintBioutils.utilities.file_io import make_output_directory from saintBioutils.utilities.logger import config_logger, build_logger from cazy_webscraper import ( 
+ CAZY_URL, closing_message, - connect_to_new_db, - connect_existing_db, display_citation_info, display_version_info, ) from cazy_webscraper.cache.cazy import cache_cazy_data -from cazy_webscraper.crawler.get_validation_data import get_validation_data +from cazy_webscraper.cazy.download import get_cazy_db_dump +from cazy_webscraper.database.connect import ( + connect_to_new_db, + connect_existing_db, +) +from cazy_webscraper.database.scrape_log import add_main_scrape_message from cazy_webscraper.cazy import ( build_taxa_dict, get_cazy_txt_file_data, @@ -98,36 +81,32 @@ identify_multiple_taxa, replace_multiple_tax, ) -from cazy_webscraper.sql import sql_orm, sql_interface from cazy_webscraper.sql.sql_interface.add_data import add_cazyme_data from cazy_webscraper.utilities import ( parse_configuration, - termcolour, + sanity_checks ) from cazy_webscraper.utilities.parsers.cazy_webscraper_parser import build_parser -def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None): - """Set up parser, logger and coordinate overal scrapping of CAZy.""" - cazy_home_url = "http://www.cazy.org" +logger = logging.getLogger(__name__) + +def main(argv: Optional[List[str]] = None): + """Set up parser, logger and coordinate overal scrapping of CAZy.""" time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # used in naming files start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # used in terminating message start_time = pd.to_datetime(start_time) - # Program preparation - if argv is None: + if not argv: parser = build_parser() args = parser.parse_args() else: parser = build_parser(argv) args = parser.parse_args() - if logger is None: - logger = logging.getLogger(__name__) - config_logger(args, logger_name=__name__) + config_logger(args, logger_name=__name__) - # check if printing out version or citation information if args.version: display_version_info() return @@ -136,56 +115,20 @@ def main(argv: Optional[List[str]] = None, logger: 
Optional[logging.Logger] = No display_citation_info() return - if args.email is None: - logger.error( - "No email address provided.\n" - "Email address required by NCBI - which is required to retrieve the latest taxonomic\n" - "classifications for proteins listed with multiple source organisms in the CAZy database\n" - "Please provide your email address.\n" - "Terminating program." - ) - return - - # check correct output was provided, exit if not operable - if args.database is not None and args.db_output is not None: - warning_message = ( - "Target path for a NEW database (--db_output, -d) and\n" - "a path to an EXISTING database (--database, -D) were provided.\n" - "Please provide one OR the other.\n" - "Terminating program." - ) - logger.warning(termcolour(warning_message, "red")) - closing_message("cazy_webscraper", start_time, args, early_term=True) - return - - if args.db_output is not None and args.db_output.exists(): - if args.force: - logger.warning( - f"Local db {args.db_output} already exists\n" - "Force is True\n" - "Ovewriting existing database." - ) - os.remove(args.db_output) - else: - logger.warning( - f"Local db {args.db_output} already exists\n" - "Force is False\n" - "Not ovewriting existing database\n" - "Termianting program" - ) - closing_message("cazy_webscraper", start_time, args, early_term=True) - return + sanity_checks.sanity_check_main_input(args) if args.skip_ncbi_tax: logger.warning( - "skip_ncbi_tax is True - not retrieving the latest taxa from NCBI for proteins with multipe tax. Will use the first taxa listed in CAZy\n" - "The latest taxonomic data can be retrieved using any of the three options:\n" + "skip_ncbi_tax is True\n" + "The latest taxa from NCBI for proteins with multipe tax in CAZy will not be retrieved\n." 
+ "The first taxonomy retrieved from CAZy will be used instead.\n" + "The latest taxonomic data can be retrieved later using any of the three options:\n" "(i) cw_get_ncbi_taxs\n" "(ii) cw_get_genomics + cw_get_gtdb_taxs\n" "(iii) cw_get_uniprot_data with --taxonomy/-t" ) - - Entrez.email = args.email + else: + Entrez.email = args.email logger.info("Parsing configuration") ( @@ -199,54 +142,19 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No taxonomy_filter_set, ) = parse_configuration.parse_configuration(args) - scrape_config_message = ( - "Configuration:\n" - f"Classes to scrape: {config_dict['classes']}\n" - f"GH fams to scrape: {config_dict['Glycoside Hydrolases (GHs)']}\n" - f"GT fams to scrape: {config_dict['GlycosylTransferases (GTs)']}\n" - f"PL fams to scrape: {config_dict['Polysaccharide Lyases (PLs)']}\n" - f"CE fams to scrape: {config_dict['Carbohydrate Esterases (CEs)']}\n" - f"AA fams to scrape: {config_dict['Auxiliary Activities (AAs)']}\n" - f"CBM fams to scrape: {config_dict['Carbohydrate-Binding Modules (CBMs)']}\n" - f"Scraping subfamilies: {args.subfamilies}" - ) - - if len(taxonomy_filter_set) != 0: - scrape_config_message += "\nTaxonomy filters applied." 
- - if len(kingdom_filters) < 5: - scrape_config_message += f"\nScraping only tax kingdoms: {kingdom_filters}" - - logger.info(termcolour(scrape_config_message, "cyan")) - - if args.database is not None: # adding data to an EXISTING database + if args.database: connection, logger_name, cache_dir = connect_existing_db(args, time_stamp, start_time) - else: # build a new database + else: connection, logger_name, cache_dir = connect_to_new_db(args, time_stamp, start_time) - logger.info("Adding log of scrape to the local CAZyme database") - with sql_orm.Session(bind=connection) as session: - sql_interface.log_scrape_in_db( - time_stamp, - config_dict, - kingdom_filters, - taxonomy_filter_dict, - set(), # ec_filters not applied when scraping CAZy - 'CAZy', - 'CAZy annotations', - session, - args, - ) - - if args.cache_dir is not None: # use user defined cache dir + if args.cache_dir: # use user defined cache dir cache_dir = args.cache_dir make_output_directory(cache_dir, args.force, args.nodelete_cache) else: make_output_directory(cache_dir, args.force, args.nodelete_cache) + logger.warning("Created cache dir: %s", cache_dir) - logger.warning(f"Created cache dir: {cache_dir}") - - if args.log is not None: # write additional log files to user specified dir + if args.log: # write additional log files to user specified dir logger_name = args.log.name if logger_name.endswith(".log"): logger_name = logger_name[:-4] @@ -255,15 +163,24 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No # write the additional log files to the .cazy_webscraper cache dire logger_name = "log" + add_main_scrape_message( + kingdom_filters, + taxonomy_filter_set, + taxonomy_filter_dict, + time_stamp, + config_dict, + args, + connection + ) + logger.info("Starting retrieval of data from CAZy") - if args.cazy_data is not None: + if args.cazy_data: logger.warning( - f"Retrieving CAZy data from predownloaded CAZy db dump at:\n{args.cazy_data}" + "Retrieving CAZy data from 
predownloaded CAZy db dump at:\n%s", args.cazy_data ) get_cazy_data( - cazy_home_url, excluded_classes, cazy_class_synonym_dict, config_dict, @@ -282,26 +199,24 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No def get_cazy_data( - cazy_home_url, - excluded_classes, - cazy_class_synonym_dict, - config_dict, - class_filters, - fam_filters, - kingdom_filters, - taxonomy_filters, + excluded_classes: list[str], + cazy_class_synonym_dict: dict[str, list], + config_dict: dict[str, set], + class_filters: set[str], + fam_filters: set[str], + kingdom_filters: set[str], + taxonomy_filters: set[str], connection, - cache_dir, - logger_name, - time_stamp, - args, + cache_dir: Path, + logger_name: str, + time_stamp: str, + args: argparse.Namespace, ): """Coordinate retrieval of data from the CAZy website. This function coordinates the crawling through the CAZy website by calling the appropriate functions, and then retrieving the protein data by calling to the appropriate data again. 
- :param cazy_home_url: str, url of CAZy home page :param excluded_classes: list, list of classes to not scrape from CAZy :param cazy_class_synonym_dict: dict of accepted CAZy class name synonyms :param config_dict: dict of CAZy families to scrape, or None if args.validate is False @@ -329,7 +244,7 @@ def get_cazy_data( if args.validate: # retrieve CAZy family population sizes for validating all data was retrieved # {fam (str): pop size (int)} cazy_fam_populations = get_validation_data( - cazy_home_url, + CAZY_URL, excluded_classes, cazy_class_synonym_dict, config_dict, diff --git a/cazy_webscraper/crawler/__init__.py b/cazy_webscraper/crawler/__init__.py index 8631167f..753f6973 100644 --- a/cazy_webscraper/crawler/__init__.py +++ b/cazy_webscraper/crawler/__init__.py @@ -51,21 +51,23 @@ from urllib.request import urlopen from requests.exceptions import ConnectionError, MissingSchema +from cazy_webscraper import DOWNLOAD_URL + def download_file_decorator(func): """Decorator to re-invoke the wrapped function up to 'args.retries' times.""" - + def wrapper(*args, **kwargs): logger = logging.getLogger(__name__) tries, success, err = 0, False, None - + while not success and (tries < kwargs['max_tries']): # reset storing error messsage err_message = None - + try: func(*args, **kwargs) - + except ( IOError, HTTPError, @@ -81,21 +83,23 @@ def wrapper(*args, **kwargs): if err is None: success = True - + tries += 1 - + if (not success) and (tries < kwargs['max_tries']): logger.warning( - f'Failed to connect to CAZy on try {tries}/{kwargs["max_tries"]}\n' - f'Error raised: {err}\n' - 'Retrying connection to CAZy in 10s' + 'Failed to connect to CAZy on try %s/{kwargs["max_tries"]}\n' + 'Error raised: %s\n' + 'Retrying connection to CAZy in 10s', + tries, err ) time.sleep(10) - + if success is False: logger.warning( - f'Failed to connect to CAZy after {kwargs["max_tries"]} tries\n' - f'Error raised: {err}\n' + 'Failed to connect to CAZy after %s tries\n' + 'Error raised: 
%s\n', + kwargs["max_tries"], err ) return err else: @@ -105,7 +109,7 @@ def wrapper(*args, **kwargs): @download_file_decorator -def get_cazy_file(out_path, args, **kwargs): +def get_cazy_file(out_path: Path, args: args, **kwargs): """Download plain text file database dumb from the CAZy website :param out_path: Path, target path to write out downloaded txt file @@ -115,29 +119,25 @@ def get_cazy_file(out_path, args, **kwargs): Return nothing """ logger = logging.getLogger(__name__) - download_url = 'http://www.cazy.org/IMG/cazy_data/cazy_data.zip' # HTTPError, URLError or timeout error may be raised, handled by wrapper - response = urlopen(download_url, timeout=args.timeout) - - file_size = int(response.info().get("Content-length")) - bsize = 1_048_576 - - # IOError may be raised, handled by wrapper - with open(out_path, 'wb') as fh: - with tqdm( - total=file_size, - desc=f"Downloading CAZy txt file", - ) as pbar: - while True: - buffer = response.read(bsize) - if not buffer: - break - pbar.update(len(buffer)) - fh.write(buffer) - - if os.path.isfile(out_path) is False: - logger.error('CAZy txt file not created locally.') - raise IOError - - return + with urlopen(DOWNLOAD_URL, timeout=args.timeout) as response: + file_size = int(response.info().get("Content-length")) + bsize = 1_048_576 + + # IOError may be raised, handled by wrapper + with open(out_path, 'wb') as fh: + with tqdm( + total=file_size, + desc="Downloading CAZy txt file", + ) as pbar: + while True: + buffer = response.read(bsize) + if not buffer: + break + pbar.update(len(buffer)) + fh.write(buffer) + + if os.path.isfile(out_path) is False: + logger.error('CAZy txt file not created locally.') + raise IOError From 12774a1910f7c0ca6367e714990d21d147bbee82 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Sun, 16 Jun 2024 16:24:20 +0100 Subject: [PATCH 5/8] dump cazy data into a temp table --- cazy_webscraper/cazy_scraper.py | 45 +++++++----------- cazy_webscraper/database/cazy.py | 81 
++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 28 deletions(-) create mode 100644 cazy_webscraper/database/cazy.py diff --git a/cazy_webscraper/cazy_scraper.py b/cazy_webscraper/cazy_scraper.py index f0b5c707..57faa507 100644 --- a/cazy_webscraper/cazy_scraper.py +++ b/cazy_webscraper/cazy_scraper.py @@ -66,11 +66,14 @@ ) from cazy_webscraper.cache.cazy import cache_cazy_data from cazy_webscraper.cazy.download import get_cazy_db_dump + from cazy_webscraper.database.connect import ( connect_to_new_db, connect_existing_db, ) from cazy_webscraper.database.scrape_log import add_main_scrape_message +from cazy_webscraper.database.cazy import dump_cazy_txt + from cazy_webscraper.cazy import ( build_taxa_dict, get_cazy_txt_file_data, @@ -115,7 +118,8 @@ def main(argv: Optional[List[str]] = None): display_citation_info() return - sanity_checks.sanity_check_main_input(args) + db = sanity_checks.sanity_check_main_input(args) + # db = byte representation of path to the local cazyme db if args.skip_ncbi_tax: logger.warning( @@ -193,6 +197,7 @@ def main(argv: Optional[List[str]] = None): logger_name, time_stamp, args, + db, ) closing_message("cazy_webscraper", start_time, args) @@ -211,6 +216,7 @@ def get_cazy_data( logger_name: str, time_stamp: str, args: argparse.Namespace, + db: Path, ): """Coordinate retrieval of data from the CAZy website. @@ -231,8 +237,6 @@ def get_cazy_data( Return nothing. 
""" - logger = logging.getLogger(__name__) - # define paths for additional logs files # unless specifed they are added to the logs dir in the cache dir connection_failures_logger = build_logger( @@ -241,45 +245,30 @@ def get_cazy_data( multiple_taxa_logger = build_logger(cache_dir, f"{logger_name}_{time_stamp}_multiple_taxa.log") replaced_taxa_logger = build_logger(cache_dir, f"{logger_name}_{time_stamp}_replaced_taxa.log") - if args.validate: # retrieve CAZy family population sizes for validating all data was retrieved - # {fam (str): pop size (int)} - cazy_fam_populations = get_validation_data( - CAZY_URL, - excluded_classes, - cazy_class_synonym_dict, - config_dict, - cache_dir, - connection_failures_logger, - time_stamp, - args, - ) + if args.cazy_data: + dump_cazy_txt(args.cazy_data, db) else: - cazy_fam_populations = None - - cazy_txt_lines = get_cazy_txt_file_data(cache_dir, time_stamp, args) + cazy_txt_path = get_cazy_db_dump(cache_dir, time_stamp, args) + dump_cazy_txt(cazy_txt_path, db) - logger.info(f"Retrieved {len(cazy_txt_lines)} lines from the CAZy db txt file") + sys.exit(0) - if (len(class_filters) == 0) and \ - (len(fam_filters) == 0) and \ - (len(kingdom_filters) == 0) and \ - (len(taxonomy_filters) == 0): - cazy_data = parse_all_cazy_data(cazy_txt_lines, cazy_fam_populations, args) + if not any((class_filters, fam_filters, kingdom_filters, taxonomy_filters)): + cazy_data = parse_all_cazy_data(args) else: cazy_data = parse_cazy_data_with_filters( - cazy_txt_lines, class_filters, fam_filters, kingdom_filters, taxonomy_filters, - cazy_fam_populations, args, ) logger.info( - f"Retrieved {len((list(cazy_data.keys())))} proteins from the CAZy txt file " - "matching the scraping criteria" + "Retrieved %s proteins from the CAZy txt file " + "matching the scraping criteria", + len((list(cazy_data.keys()))) ) # check for GenBank accessions with multiple source organisms in the CAZy data diff --git a/cazy_webscraper/database/cazy.py 
b/cazy_webscraper/database/cazy.py new file mode 100644 index 00000000..0fb6ea36 --- /dev/null +++ b/cazy_webscraper/database/cazy.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# (c) University of St Andrews 2024 +# (c) University of Strathclyde 2024 +# (c) James Hutton Institute 2024 +# Author: +# Emma E. M. Hobbs +# +# Contact +# eemh1@st-andrews.ac.uk +# +# Emma E. M. Hobbs, +# Biomolecular Sciences Building, +# University of St Andrews, +# North Haugh Campus, +# St Andrews, +# KY16 9ST +# Scotland, +# UK +# +# The MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
def dump_cazy_txt(cazy_txt_path: Path, db: Path):
    """Load the zipped CAZy database dump into the TempTable of the local db.

    Each line of the dump is whitespace-delimited, e.g.
    ``GH157 Bacteria Bacteroides cellulosilyticus BFG-250 UBD70155.1 ncbi``.
    The first three fields are the family, kingdom and genus, the last two
    are the protein accession and its source, and every field in between is
    stored (space-joined) in the ``species`` column.

    :param cazy_txt_path: Path to the zip archive holding the CAZy dump; only
        the first member of the archive is read
    :param db: Path to the local CAZyme db; the TempTable table must already
        exist because rows are only inserted here

    Return nothing
    """
    insert_stmt = """
        INSERT INTO TempTable (family, kingdom, genus, species, protein_id, source)
        VALUES (?, ?, ?, ?, ?, ?)
    """

    conn = sqlite3.connect(db)
    try:
        cur = conn.cursor()

        with ZipFile(cazy_txt_path) as zip_handle:
            cazy_filepath = zip_handle.namelist()[0]

            with zip_handle.open(cazy_filepath) as fh:
                num_lines = sum(1 for _ in fh)  # Count total lines for the progress bar
                fh.seek(0)  # Reset file pointer to the beginning

                for line_bytes in tqdm(
                    fh,
                    desc="Dumping CAZy data into a temp table",
                    total=num_lines,
                ):
                    data = line_bytes.decode('utf-8').split()
                    if len(data) < 5:
                        # blank or truncated line: the five fixed columns cannot
                        # be told apart, so there is nothing reliable to insert
                        continue

                    fam, king, genus = data[:3]
                    protein_id, source = data[-2:]
                    # Slice by position rather than filtering by value, so that a
                    # token inside the organism name that happens to equal the
                    # family/kingdom/genus/accession/source is not dropped
                    sp = ' '.join(data[3:-2])

                    cur.execute(insert_stmt, (fam, king, genus, sp, protein_id, source))

        conn.commit()
    finally:
        # always release the connection, even if parsing or an insert fails
        conn.close()
class TempTable(Base):
    """Represent the temporary table used to store the CAZy database dump.

    Extract from a CAZy file:
    GH157   Bacteria    Bacteroides cellulosilyticus BFG-250    UBD70155.1  ncbi
    GH157   Bacteria    Bacteroides cellulosilyticus BFG-371    UVP51702.1  ncbi
    GH157   Bacteria    Bacteroides cellulosilyticus WH2    ALJ59177.1  ncbi
    GH157   Bacteria    Bacteroides cellulosilyticus WH2    WP_029429093.1  ncbi
    GH157   Bacteria    Bacteroides sp. BFG-257 UVO98786.1  ncbi
    GH157   Bacteria    Bacteroides sp. BFG-257 UVO98787.1  ncbi
    """
    __tablename__ = "TempTable"

    # Index(name, column): an index called "record_id" over the protein_id column
    __table_args__ = (
        Index("record_id", "protein_id"),
    )

    record_id = Column(Integer, primary_key=True)
    family = Column(ReString)
    kingdom = Column(String)
    genus = Column(ReString)
    species = Column(ReString)
    protein_id = Column(String)
    source = Column(String)

    def __str__(self):
        return f"-temp protein record, protein={self.protein_id}-"

    def __repr__(self):
        # previously returned an empty string, which made records
        # indistinguishable in logs and debugger output
        return (
            f"<Class TempTable: record_id={self.record_id}, family={self.family}, "
            f"kingdom={self.kingdom}, genus={self.genus}, species={self.species}, "
            f"protein_id={self.protein_id}, source={self.source}>"
        )
.../cazy_class_synonym_dict.py | 12 +- 7 files changed, 601 insertions(+), 22 deletions(-) create mode 100644 cazy_webscraper/cazy/download.py create mode 100644 cazy_webscraper/cazy/filter_data.py create mode 100644 cazy_webscraper/database/connect.py create mode 100644 cazy_webscraper/database/scrape_log.py diff --git a/cazy_webscraper/cazy/download.py b/cazy_webscraper/cazy/download.py new file mode 100644 index 00000000..92465416 --- /dev/null +++ b/cazy_webscraper/cazy/download.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# (c) University of St Andrews 2022 +# (c) University of Strathclyde 2022 +# (c) James Hutton Institute 2022 +# Author: +# Emma E. M. Hobbs +# +# Contact +# eemh1@st-andrews.ac.uk +# +# Emma E. M. Hobbs, +# Biomolecular Sciences Building, +# University of St Andrews, +# North Haugh Campus, +# St Andrews, +# KY16 9ST +# Scotland, +# UK +# +# The MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
def get_cazy_db_dump(cache_dir: Path, time_stamp: str, args: argparse.ArgumentParser):
    """Retrieve txt file of CAZy db dump from CAZy or the local disk.

    :param cache_dir: Path(), path to directory where cache is written to
    :param time_stamp: str, date and time cazy_webscraper was intiated
    :param args: cmd-line args parser

    Returns the path to the CAZy DB dump (txt file)

    Raises the last error captured by download_cazy if the download failed on
    every attempt, so a missing or partial file is never handed to the caller.
    """
    if args.cazy_data:  # use the predownloaded CAZy txt file
        return args.cazy_data

    logger = logging.getLogger(__name__)

    # download cazy db dump
    cazy_txt_path = cache_dir / f"cazy_db_{time_stamp}.zip"
    max_tries = args.retries + 1

    # download_cazy already retries up to max_tries times (see
    # download_file_decorator), so it is invoked exactly once here
    err_message = download_cazy(cazy_txt_path, args, max_tries=max_tries)

    if err_message is not None:
        logger.error(
            "Could not download the CAZy database dump after %s tries",
            max_tries,
        )
        raise err_message

    return cazy_txt_path


def download_file_decorator(func):
    """Decorator to re-invoke the wrapped function up to 'max_tries' times.

    The wrapped callable is called with the keyword argument 'max_tries'
    (defaults to 1 when omitted). The wrapper returns None as soon as one
    attempt succeeds, or the last exception caught if all attempts failed.
    """
    import functools  # local import: only needed when building the wrapper

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logger = logging.getLogger(__name__)
        max_tries = kwargs.get('max_tries', 1)
        err = None

        for tries in range(1, max_tries + 1):
            try:
                func(*args, **kwargs)

            except (
                IOError,
                HTTPError,
                URLError,
                timeout,
                ConnectionError,
                OSError,
                MissingSchema,
                RequestError,
            ) as err_message:
                err = err_message

            else:
                # success: stop immediately so an earlier failure cannot
                # mask this attempt and trigger a needless re-download
                return None

            if tries < max_tries:
                logger.warning(
                    'Failed to connect to CAZy on try %s/%s\n'
                    'Error raised: %s\n'
                    'Retrying connection to CAZy in 10s',
                    tries, max_tries, err
                )
                time.sleep(10)

        logger.warning(
            'Failed to connect to CAZy after %s tries\n'
            'Error raised: %s\n',
            max_tries, err
        )
        return err

    return wrapper


@download_file_decorator
def download_cazy(out_path: Path, args: argparse.ArgumentParser, **kwargs):
    """Download plain text file database dump from the CAZy website

    :param out_path: Path, target path to write out downloaded txt file
    :param args: cmd-line args parser
    :param max_tries: int, max number of times connection to CAZy can be attempted

    Raises IOError if no file exists at out_path after the download.
    """
    logger = logging.getLogger(__name__)

    # HTTPError, URLError or timeout error may be raised, handled by wrapper
    with urlopen(DOWNLOAD_URL, timeout=args.timeout) as response:
        # the header may be absent; int(None) would raise a TypeError that the
        # wrapper does not catch, so fall back to a progress bar with no total
        content_length = response.info().get("Content-length")
        file_size = int(content_length) if content_length is not None else None
        bsize = 1_048_576

        # IOError may be raised, handled by wrapper
        with open(out_path, 'wb') as fh:
            with tqdm(
                total=file_size,
                desc="Downloading CAZy txt file",
            ) as pbar:
                while True:
                    buffer = response.read(bsize)
                    if not buffer:
                        break
                    pbar.update(len(buffer))
                    fh.write(buffer)

    if os.path.isfile(out_path) is False:
        logger.error('CAZy txt file not created locally.')
        raise IOError('CAZy txt file not created locally.')
logger = logging.getLogger(__name__)


def _delete_from_temp_table(db, query, parameters):
    """Run one parameterized DELETE against the db, commit, and always close the connection."""
    conn = sqlite3.connect(db)
    try:
        conn.execute(query, list(parameters))
        conn.commit()
    finally:
        conn.close()


def apply_kingdom_filers(kingdom_filter: set[str], db: Path):
    """Delete TempTable rows whose kingdom is not one of the selected kingdoms.

    :param kingdom_filter: set of kingdom names to retain
    :param db: Path to the local CAZyme db

    Return nothing. An empty filter is treated as 'no filter' and leaves the
    table untouched (an empty NOT IN list would otherwise delete every row).
    """
    if not kingdom_filter:
        return

    logger.warning("Filtering to kingdoms: %s", kingdom_filter)
    query = "DELETE FROM TempTable WHERE kingdom NOT IN ({})".format(
        ', '.join('?' for _ in kingdom_filter)
    )
    _delete_from_temp_table(db, query, kingdom_filter)


def apply_tax_filters(genera: set[str], species: set[str], strains: set[str], db: Path):
    """Delete TempTable rows that match none of the taxonomy filters.

    A row is retained if its genus is in 'genera', OR its organism name starts
    with one of 'species', OR its organism name equals one of 'strains'.

    :param genera: set of genera, e.g. {'Aspergillus', 'AnotherGenus'}
    :param species: set of 'Genus species' names,
        e.g. {'Bacteroides cellulosilyticus', 'Genus species'}
    :param strains: set of 'Genus species strain' names,
        e.g. {'Alternaria alternata SRC1lrK2f v1.0', 'Genus species strain'}
    :param db: Path to the local CAZyme db

    Return nothing. If all three filters are empty the table is left untouched.
    """
    # The species column holds the organism name WITHOUT the genus (see the
    # TempTable columns), whereas the species/strain filters include the
    # genus, so compare against the re-assembled full organism name.
    organism = "(genus || ' ' || species)"

    clauses = []
    parameters = []

    if genera:
        clauses.append("genus NOT IN ({})".format(','.join('?' for _ in genera)))
        parameters.extend(genera)

    if species:
        # prefix match so every strain of a selected species is retained
        clauses.extend(f"{organism} NOT LIKE ?" for _ in species)
        parameters.extend(s + '%' for s in species)

    if strains:
        clauses.append("{} NOT IN ({})".format(organism, ','.join('?' for _ in strains)))
        parameters.extend(strains)

    if not clauses:
        # no filter supplied: "DELETE ... WHERE " alone is not valid SQL
        return

    query = "DELETE FROM TempTable WHERE " + " AND ".join(clauses)
    logger.debug("Taxonomy filter query: %s; parameters: %s", query, parameters)

    _delete_from_temp_table(db, query, parameters)


def apply_class_and_family_filters(excluded_classes: list[str], fam_filter: set[str], db: Path):
    """Delete TempTable rows by CAZy class prefix and CAZy family.

    A row is deleted when its family starts with none of the entries in
    'excluded_classes' AND is not listed in 'fam_filter'.

    NOTE(review): this retains rows whose family matches an entry of
    'excluded_classes', which looks inverted relative to the parameter name.
    The original selection logic is preserved here - confirm against the caller.

    :param excluded_classes: list of class prefixes (e.g. 'GH'), may be None or empty
    :param fam_filter: set of CAZy family names (e.g. 'PL1'), may be empty
    :param db: Path to the local CAZyme db

    Return nothing. If both filters are empty the table is left untouched.
    """
    clauses = []
    parameters = []

    if excluded_classes:
        clauses.extend("family NOT LIKE ?" for _ in excluded_classes)
        parameters.extend(f"{class_}%" for class_ in excluded_classes)

    if fam_filter:
        clauses.append("family NOT IN ({})".format(', '.join('?' for _ in fam_filter)))
        parameters.extend(fam_filter)

    if not clauses:
        return

    # values are bound as parameters, never interpolated into the SQL text
    query = "DELETE FROM TempTable WHERE " + " AND ".join(clauses)

    _delete_from_temp_table(db, query, parameters)
b/cazy_webscraper/cazy_scraper.py @@ -66,6 +66,11 @@ ) from cazy_webscraper.cache.cazy import cache_cazy_data from cazy_webscraper.cazy.download import get_cazy_db_dump +from cazy_webscraper.cazy.filter_data import ( + apply_kingdom_filers, + apply_tax_filters, + apply_class_and_family_filters, +) from cazy_webscraper.database.connect import ( connect_to_new_db, @@ -74,6 +79,8 @@ from cazy_webscraper.database.scrape_log import add_main_scrape_message from cazy_webscraper.database.cazy import dump_cazy_txt + + from cazy_webscraper.cazy import ( build_taxa_dict, get_cazy_txt_file_data, @@ -118,7 +125,7 @@ def main(argv: Optional[List[str]] = None): display_citation_info() return - db = sanity_checks.sanity_check_main_input(args) + db = sanity_checks.sanity_check_main_input(time_stamp, args) # db = byte representation of path to the local cazyme db if args.skip_ncbi_tax: @@ -186,12 +193,10 @@ def main(argv: Optional[List[str]] = None): get_cazy_data( excluded_classes, - cazy_class_synonym_dict, - config_dict, class_filters, fam_filters, kingdom_filters, - taxonomy_filter_set, + taxonomy_filter_dict, connection, cache_dir, logger_name, @@ -205,12 +210,10 @@ def main(argv: Optional[List[str]] = None): def get_cazy_data( excluded_classes: list[str], - cazy_class_synonym_dict: dict[str, list], - config_dict: dict[str, set], class_filters: set[str], fam_filters: set[str], kingdom_filters: set[str], - taxonomy_filters: set[str], + taxonomy_filter_dict: dict, connection, cache_dir: Path, logger_name: str, @@ -224,8 +227,6 @@ def get_cazy_data( functions, and then retrieving the protein data by calling to the appropriate data again. 
:param excluded_classes: list, list of classes to not scrape from CAZy - :param cazy_class_synonym_dict: dict of accepted CAZy class name synonyms - :param config_dict: dict of CAZy families to scrape, or None if args.validate is False :param class_filters: set of CAZy classes to retrieve proteins from :param fam_filters: set of CAZy families to retrieve proteins from :param taxonomy_filters: set of genera, species and strains to restrict the scrape to @@ -239,9 +240,6 @@ def get_cazy_data( """ # define paths for additional logs files # unless specifed they are added to the logs dir in the cache dir - connection_failures_logger = build_logger( - cache_dir, f"{logger_name}_{time_stamp}_connection_failures.log" - ) multiple_taxa_logger = build_logger(cache_dir, f"{logger_name}_{time_stamp}_multiple_taxa.log") replaced_taxa_logger = build_logger(cache_dir, f"{logger_name}_{time_stamp}_replaced_taxa.log") @@ -251,8 +249,29 @@ def get_cazy_data( cazy_txt_path = get_cazy_db_dump(cache_dir, time_stamp, args) dump_cazy_txt(cazy_txt_path, db) + # filter data in the cazy db dump to only retain records that match the user criteria + if kingdom_filters: + apply_kingdom_filers(kingdom_filters, db) sys.exit(0) + # need to fix the taxonomy filter + # retain rows that match at least one criteria + if any([taxonomy_filter_dict['genera'], taxonomy_filter_dict['species'], taxonomy_filter_dict['strains']]): + apply_tax_filters( + taxonomy_filter_dict['genera'], + taxonomy_filter_dict['species'], + taxonomy_filter_dict['strains'], + db + ) + + sys.exit(0) + if excluded_classes or fam_filters: + apply_class_and_family_filters(excluded_classes, fam_filters, db) + + sys.exit(0) + + # deal with instances of multiple taxonomies + if not any((class_filters, fam_filters, kingdom_filters, taxonomy_filters)): cazy_data = parse_all_cazy_data(args) diff --git a/cazy_webscraper/database/connect.py b/cazy_webscraper/database/connect.py new file mode 100644 index 00000000..8c6f2d6e --- /dev/null +++ 
b/cazy_webscraper/database/connect.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# (c) University of St Andrews 2024 +# (c) University of Strathclyde 2024 +# (c) James Hutton Institute 2024 +# Author: +# Emma E. M. Hobbs +# +# Contact +# eemh1@st-andrews.ac.uk +# +# Emma E. M. Hobbs, +# Biomolecular Sciences Building, +# University of St Andrews, +# North Haugh Campus, +# St Andrews, +# KY16 9ST +# Scotland, +# UK +# +# The MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
def _logger_name_from_db_path(db_path):
    """Return the database path as a string with only its final file extension removed."""
    # Splitting the whole path on the first '.' returned '' for './db.sqlite'
    # and truncated any path containing a dotted directory name.
    return str(Path(db_path).with_suffix(""))


def connect_existing_db(args, time_stamp, start_time):
    """Coordinate connecting to an existing local CAZyme database, define logger name and cache dir

    :param args: cmd-line args parser
    :param time_stamp: str, time cazy_webscraper was invoked
    :param start_time: pd date-time obj, time cazy_webscraper was invoked

    Return connection to local CAZyme database, logger file name, and path to cache dir.
    Terminates the program (exit code 1) if the database file does not exist
    or a connection to it cannot be opened.
    """
    logger = logging.getLogger(__name__)

    logger.info("Adding data to an existing local CAZyme database")

    if not os.path.isfile(args.database):
        logger.error(
            "Could not find local CAZy database.\n"
            "Check path is correct.\n"
            "Terminating programme."
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    try:
        connection = sql_orm.get_db_connection(args.database, args.sql_echo, new=False)
        logger.info("Opened connection to local CAZyme database")
    except Exception:
        logger.error(
            "Failed to open connection to an exiting local CAZyme database\n."
            "Terminating program\n",
            exc_info=True,
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    # used for naming additional log files
    logger_name = _logger_name_from_db_path(args.database)

    # define path to cache family txt files: a hidden dir next to the database
    cache_dir = Path(f"{str(args.database.parent)}/.cazy_webscraper_{time_stamp}")

    return connection, logger_name, cache_dir


def connect_to_new_db(args, time_stamp, start_time):
    """Build and connect to a new local CAZyme database.

    :param args: cmd-line args parser
    :param time_stamp: str, time cazy_webscraper was invoked
    :param start_time: pd date-time obj, time cazy_webscraper was invoked

    Return connection to the database, name of the logger, and path to the cache dir.
    Terminates the program (exit code 1) if the target file already exists and
    forced overwriting is not enabled, or if the database cannot be built.
    """
    logger = logging.getLogger(__name__)

    if args.db_output:  # user defined target output for the NEW database

        if os.path.isfile(args.db_output):  # target file exists
            if args.force:
                logger.warning(
                    "Overwriting existing local CAZyme database at:\n%s",
                    args.db_output
                )

            else:
                logger.warning(
                    "Target path for new database already exists.\n"
                    "Either enable forced overwriting (-f) or add data this data (-D).\n"
                    "Terminating program."
                )
                closing_message("cazy_webscraper", start_time, args)
                sys.exit(1)

        else:  # may need to build dirs
            logger.info(
                "Building new local CAZyme database\n"
                "Output directory: %s\n"
                "Force overwriting exiting output file: %s",
                (args.db_output).parent, args.force
            )

        if str((args.db_output).parent) != '.':  # dirs defined in output put
            output_dir = (args.db_output).parent
            make_output_directory(output_dir, args.force, args.nodelete)
            cache_dir = Path(f"{str(output_dir)}/.cazy_webscraper_{time_stamp}")

        else:  # writing to cwd
            cache_dir = Path(f".cazy_webscraper_{time_stamp}")

        # used for naming additional log files
        logger_name = _logger_name_from_db_path(args.db_output)
        db_path = args.db_output

    else:
        logger.info("Using default database name and writing to cwd")
        db_path = Path(f"cazy_webscraper_{time_stamp}.db")
        cache_dir = Path(f".cazy_webscraper_{time_stamp}")
        logger_name = f'cazy_webscraper_{time_stamp}'

    try:
        connection = sql_orm.get_db_connection(db_path, args.sql_echo, new=True)
        logger.warning("Built new local CAZyme database at\n%s", db_path)
    except Exception:
        logger.error(
            "Failed to build new SQL database\n."
            "Terminating program",
            exc_info=True,
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    return connection, logger_name, cache_dir
logger = logging.getLogger(__name__)


def add_main_scrape_message(
    kingdom_filters: set[str],
    taxonomy_filters: set[str],
    taxonomy_filter_dict: dict,
    time_stamp: str,
    config_dict: dict,
    args: argparse.Namespace,
    connection
):
    """Log the scrape configuration and record the scrape in the local CAZyme database.

    :param kingdom_filters: set of taxonomy kingdoms the scrape is limited to
    :param taxonomy_filters: set of taxonomy filters; only its size is inspected here
    :param taxonomy_filter_dict: dict of taxonomy filters, passed on to the db log
    :param time_stamp: str, time cazy_webscraper was invoked
    :param config_dict: dict with a 'classes' entry and one entry per CAZy class
    :param args: cmd-line args parser
    :param connection: open connection to the local CAZyme database

    Return nothing
    """
    # (label used in the message, key of the class in config_dict), in report order
    family_entries = (
        ("GH", 'Glycoside Hydrolases (GHs)'),
        ("GT", 'GlycosylTransferases (GTs)'),
        ("PL", 'Polysaccharide Lyases (PLs)'),
        ("CE", 'Carbohydrate Esterases (CEs)'),
        ("AA", 'Auxiliary Activities (AAs)'),
        ("CBM", 'Carbohydrate-Binding Modules (CBMs)'),
    )

    message_lines = [
        "Configuration:",
        f"Classes to scrape: {config_dict['classes']}",
    ]
    for label, class_key in family_entries:
        message_lines.append(f"{label} fams to scrape: {config_dict[class_key]}")
    message_lines.append(f"Scraping subfamilies: {args.subfamilies}")

    if len(taxonomy_filters) != 0:
        message_lines.append("Taxonomy filters applied.")
    if len(kingdom_filters) < 5:
        # fewer than five kingdoms selected, so the kingdoms are reported
        message_lines.append(f"Scraping only tax kingdoms: {kingdom_filters}")

    scrape_config_message = "\n".join(message_lines)
    logger.info(termcolour(scrape_config_message, "cyan"))

    logger.info("Adding log of scrape to the local CAZyme database")
    no_ec_filters = set()  # ec_filters not applied when scraping CAZy
    with sql_orm.Session(bind=connection) as session:
        sql_interface.log_scrape_in_db(
            time_stamp,
            config_dict,
            kingdom_filters,
            taxonomy_filter_dict,
            no_ec_filters,
            'CAZy',
            'CAZy annotations',
            session,
            args,
        )
CAZy classes.""" cazy_class_synonym_dict = { - "Glycoside Hydrolases (GHs)": ["Glycoside-Hydrolases", "Glycoside-Hydrolases", "Glycoside_Hydrolases", "GlycosideHydrolases", "GLYCOSIDE-HYDROLASES", "GLYCOSIDE-HYDROLASES", "GLYCOSIDE_HYDROLASES", "GLYCOSIDEHYDROLASES", "glycoside-hydrolases", "glycoside-hydrolases", "glycoside_hydrolases", "glycosidehydrolases", "GH", "gh", "GHs", "ghs"], - "GlycosylTransferases (GTs)": ["Glycosyl-Transferases", "GlycosylTransferases", "Glycosyl_Transferases", "Glycosyl Transferases", "GLYCOSYL-TRANSFERASES", "GLYCOSYLTRANSFERASES", "GLYCOSYL_TRANSFERASES", "GLYCOSYL TRANSFERASES", "glycosyl-transferases", "glycosyltransferases", "glycosyl_transferases", "glycosyl transferases", "GT", "gt", "GTs", "gts"], - "Polysaccharide Lyases (PLs)": ["Polysaccharide Lyases", "Polysaccharide-Lyases", "Polysaccharide_Lyases", "PolysaccharideLyases", "POLYSACCHARIDE LYASES", "POLYSACCHARIDE-LYASES", "POLYSACCHARIDE_LYASES", "POLYSACCHARIDELYASES", "polysaccharide lyases", "polysaccharide-lyases", "polysaccharide_lyases", "polysaccharidelyases", "PL", "pl"], - "Carbohydrate Esterases (CEs)": ["Carbohydrate Esterases", "Carbohydrate-Esterases", "Carbohydrate_Esterases", "CarbohydrateEsterases", "CARBOHYDRATE ESTERASES", "CARBOHYDRATE-ESTERASES", "CARBOHYDRATE_ESTERASES", "CARBOHYDRATEESTERASES", "carbohydrate esterases", "carbohydrate-esterases", "carbohydrate_esterases", "carbohydrateesterases", "CE", "ce", "CEs", "ces"], - "Auxiliary Activities (AAs)": ["Auxiliary Activities", "Auxiliary-Activities", "Auxiliary_Activities", "AuxiliaryActivities", "AUXILIARY ACTIVITIES", "AUXILIARY-ACTIVITIES", "AUXILIARY_ACTIVITIES", "AUXILIARYACTIVITIES", "auxiliary activities", "auxiliary-activities", "auxiliary_activities", "auxiliaryactivities", "AA", "aa", "AAs", "aas"], - "Carbohydrate-Binding Modules (CBMs)": ["Carbohydrate-Binding-Modules", "Carbohydrate_Binding_Modules", "Carbohydrate_Binding Modules", "CarbohydrateBindingModules", 
"CARBOHYDRATE-BINDING-MODULES", "CARBOHYDRATE_BINDING_MODULES", "CARBOHYDRATE_BINDING MODULES", "CARBOHYDRATEBINDINGMODULES", "carbohydrate-binding-modules", "carbohydrate_binding_modules", "carbohydrate_binding modules", "carbohydratebindingmodules", "CBMs", "CBM", "cbms", "cbm"] + "GH": ["Glycoside-Hydrolases", "Glycoside-Hydrolases", "Glycoside_Hydrolases", "GlycosideHydrolases", "GLYCOSIDE-HYDROLASES", "GLYCOSIDE-HYDROLASES", "GLYCOSIDE_HYDROLASES", "GLYCOSIDEHYDROLASES", "glycoside-hydrolases", "glycoside-hydrolases", "glycoside_hydrolases", "glycosidehydrolases", "GH", "gh", "GHs", "ghs"], + "GT": ["Glycosyl-Transferases", "GlycosylTransferases", "Glycosyl_Transferases", "Glycosyl Transferases", "GLYCOSYL-TRANSFERASES", "GLYCOSYLTRANSFERASES", "GLYCOSYL_TRANSFERASES", "GLYCOSYL TRANSFERASES", "glycosyl-transferases", "glycosyltransferases", "glycosyl_transferases", "glycosyl transferases", "GT", "gt", "GTs", "gts"], + "PLs": ["Polysaccharide Lyases", "Polysaccharide-Lyases", "Polysaccharide_Lyases", "PolysaccharideLyases", "POLYSACCHARIDE LYASES", "POLYSACCHARIDE-LYASES", "POLYSACCHARIDE_LYASES", "POLYSACCHARIDELYASES", "polysaccharide lyases", "polysaccharide-lyases", "polysaccharide_lyases", "polysaccharidelyases", "PL", "pl"], + "CE": ["Carbohydrate Esterases", "Carbohydrate-Esterases", "Carbohydrate_Esterases", "CarbohydrateEsterases", "CARBOHYDRATE ESTERASES", "CARBOHYDRATE-ESTERASES", "CARBOHYDRATE_ESTERASES", "CARBOHYDRATEESTERASES", "carbohydrate esterases", "carbohydrate-esterases", "carbohydrate_esterases", "carbohydrateesterases", "CE", "ce", "CEs", "ces"], + "AA": ["Auxiliary Activities", "Auxiliary-Activities", "Auxiliary_Activities", "AuxiliaryActivities", "AUXILIARY ACTIVITIES", "AUXILIARY-ACTIVITIES", "AUXILIARY_ACTIVITIES", "AUXILIARYACTIVITIES", "auxiliary activities", "auxiliary-activities", "auxiliary_activities", "auxiliaryactivities", "AA", "aa", "AAs", "aas"], + "CBM": ["Carbohydrate-Binding-Modules", "Carbohydrate_Binding_Modules", 
"Carbohydrate_Binding Modules", "CarbohydrateBindingModules", "CARBOHYDRATE-BINDING-MODULES", "CARBOHYDRATE_BINDING_MODULES", "CARBOHYDRATE_BINDING MODULES", "CARBOHYDRATEBINDINGMODULES", "carbohydrate-binding-modules", "carbohydrate_binding_modules", "carbohydrate_binding modules", "carbohydratebindingmodules", "CBMs", "CBM", "cbms", "cbm"] } return cazy_class_synonym_dict From abcede90f54fe65dba6731d118b7842474f3a88a Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Sun, 16 Jun 2024 17:55:23 +0100 Subject: [PATCH 8/8] remove old crawler mod --- cazy_webscraper/crawler/__init__.py | 143 ----- .../crawler/get_validation_data.py | 581 ------------------ 2 files changed, 724 deletions(-) delete mode 100644 cazy_webscraper/crawler/__init__.py delete mode 100644 cazy_webscraper/crawler/get_validation_data.py diff --git a/cazy_webscraper/crawler/__init__.py b/cazy_webscraper/crawler/__init__.py deleted file mode 100644 index 753f6973..00000000 --- a/cazy_webscraper/crawler/__init__.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# (c) University of St Andrews 2022 -# (c) University of Strathclyde 2022 -# (c) James Hutton Institute 2022 -# Author: -# Emma E. M. Hobbs -# -# Contact -# eemh1@st-andrews.ac.uk -# -# Emma E. M. 
Hobbs, -# Biomolecular Sciences Building, -# University of St Andrews, -# North Haugh Campus, -# St Andrews, -# KY16 9ST -# Scotland, -# UK -# -# The MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-"""Retrieve CAZy text file, extract data, apply user scraping filters.""" - - -import logging -import os -import time - -from socket import timeout -from tqdm import tqdm -from urllib.error import HTTPError, URLError -from urllib3.exceptions import HTTPError, RequestError -from urllib.request import urlopen -from requests.exceptions import ConnectionError, MissingSchema - -from cazy_webscraper import DOWNLOAD_URL - - -def download_file_decorator(func): - """Decorator to re-invoke the wrapped function up to 'args.retries' times.""" - - def wrapper(*args, **kwargs): - logger = logging.getLogger(__name__) - tries, success, err = 0, False, None - - while not success and (tries < kwargs['max_tries']): - # reset storing error messsage - err_message = None - - try: - func(*args, **kwargs) - - except ( - IOError, - HTTPError, - URLError, - timeout, - ConnectionError, - OSError, - MissingSchema, - RequestError, - ) as err_message: - success = False - err = err_message - - if err is None: - success = True - - tries += 1 - - if (not success) and (tries < kwargs['max_tries']): - logger.warning( - 'Failed to connect to CAZy on try %s/{kwargs["max_tries"]}\n' - 'Error raised: %s\n' - 'Retrying connection to CAZy in 10s', - tries, err - ) - time.sleep(10) - - if success is False: - logger.warning( - 'Failed to connect to CAZy after %s tries\n' - 'Error raised: %s\n', - kwargs["max_tries"], err - ) - return err - else: - return None - - return wrapper - - -@download_file_decorator -def get_cazy_file(out_path: Path, args: args, **kwargs): - """Download plain text file database dumb from the CAZy website - - :param out_path: Path, target path to write out downloaded txt file - :param args: cmd-line args parser - :param max_tries: int, max number of times connection to CAZy can be attempted - - Return nothing - """ - logger = logging.getLogger(__name__) - - # HTTPError, URLError or timeout error may be raised, handled by wrapper - with urlopen(DOWNLOAD_URL, timeout=args.timeout) as 
response: - file_size = int(response.info().get("Content-length")) - bsize = 1_048_576 - - # IOError may be raised, handled by wrapper - with open(out_path, 'wb') as fh: - with tqdm( - total=file_size, - desc="Downloading CAZy txt file", - ) as pbar: - while True: - buffer = response.read(bsize) - if not buffer: - break - pbar.update(len(buffer)) - fh.write(buffer) - - if os.path.isfile(out_path) is False: - logger.error('CAZy txt file not created locally.') - raise IOError diff --git a/cazy_webscraper/crawler/get_validation_data.py b/cazy_webscraper/crawler/get_validation_data.py deleted file mode 100644 index a4eef5c9..00000000 --- a/cazy_webscraper/crawler/get_validation_data.py +++ /dev/null @@ -1,581 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# (c) University of St Andrews 2022 -# (c) University of Strathclyde 2022 -# (c) James Hutton Institute 2022 -# Author: -# Emma E. M. Hobbs -# -# Contact -# eemh1@st-andrews.ac.uk -# -# Emma E. M. Hobbs, -# Biomolecular Sciences Building, -# University of St Andrews, -# North Haugh Campus, -# St Andrews, -# KY16 9ST -# Scotland, -# UK -# -# The MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -"""Retrieve CAZy family populations from CAZy website to check the expected number of gamily members -were added to the local CAZyme database while scraping CAZy.""" - - -import logging -import re -import time - -from urllib.error import HTTPError - -from tqdm import tqdm -from requests.exceptions import ConnectionError, MissingSchema -from saintBioutils.utilities import file_io -from saintBioutils.utilities.file_io import make_output_directory -from urllib3.exceptions import HTTPError, RequestError - -import mechanicalsoup - - -class CazyClass: - """A single CAZy class. - - Used to keep track of specific families that need to be scraped again. - """ - - def __init__(self, name, url, tries, failed_families=None): - self.name = name - self.url = url - self.tries = tries # number of attempts to be scraped - if failed_families is None: - self.failed_families = {} # keyed by Family instance, valued by attempted scrapes (int) - else: - self.failed_families = failed_families - - def __str__(self): - return f"" - - def __repr__(self): - return( - f"" - ) - - -def get_validation_data( - cazy_home_url, - excluded_classes, - cazy_synonym_dict, - config_dict, - cache_dir, - connection_failures_logger, - time_stamp, - args, -): - """Coordinate retrieving the population sizes of CAZy familes from the CAZy website. 
- - :param cazy_home_url: str, URL to CAZy home page - :param excluded_classes: list of CAZy classes NOT to scrape - :param cazy_synonym_dict: dict of accepted CAZy class name synonyms - :param config_dict: dict keyed by CAZy classes, values by set of CAZy families to scrape - :param cache_dir: path to cache dir - :param connection_failures_logger: logger, logg incorrect URLs and URLs to which a connection - could not be made - :param time_stamp: str, time cazy_webscraper was invoked - :param args: cmd-line args parser - - Return dict, keyed by CAZy family (str) and valued by population size (int) - """ - # make dir fo caching HTML files - cache_dir = cache_dir / "html" - make_output_directory(cache_dir, args.force, args.nodelete_cache) - - cazy_fam_populations = {} # {fam(str): population(int)} - - # retrieve list of CAZy class instances, one instance per class to be scrapped - cazy_classes = get_cazy_classes( - cazy_home_url, - excluded_classes, - cazy_synonym_dict, - cache_dir, - time_stamp, - args, - ) - if cazy_classes is None: - return - - for cazy_class in tqdm(cazy_classes, desc="Retrieving CAZy family population sizes"): - - # first attempt of scraping, retrieve URLs to CAZy families - if len(list(cazy_class.failed_families.keys())) == 0: - fam_pops_to_retrieve = config_dict[cazy_class.name] # retrieve user specified fams - else: - fam_pops_to_retrieve = list(cazy_class.failed_families.keys()) # retry failed connections - - family_populations, err_message, incorrect_urls, failed_families = get_cazy_family_pops( - cazy_class.name, - cazy_class.url, - cazy_home_url, - fam_pops_to_retrieve, - cache_dir, - time_stamp, - args, - ) - - if incorrect_urls is not None: # log families for which compiled URL is incorrect - [connection_failures_logger.warning(url_message) for url_message in incorrect_urls] - - if family_populations is None: # couldn't retrieve family populations - cazy_class.tries += 1 - - # check if maximum number of attempts to connect have been met 
- if cazy_class.tries == (args.retries + 1): # Maximum number of tries met - connection_failures_logger.warning( - f"{cazy_class.url}\t" - f"{cazy_class.name}\t" - f"CAZy family populations not retrieved from {cazy_class.name}\t" - f"{err_message}" - ) - - else: - for fam in failed_families: - try: - cazy_class.failed_families[fam] += 1 - if cazy_class.failed_families[fam] == (args.retries + 1): - # max number of attemptes made, do not retry connection - del cazy_class.failed_families[fam] - except KeyError: - cazy_class.failed_families[fam] = 1 - - cazy_classes.append(cazy_class) # retry retriving family populations later - - continue # go onto next CAZy class - - else: # retrieved CAZy family populations - cazy_fam_populations.update(family_populations) - - # log any errors that meant no family population could be retrieved - for cazy_class in cazy_classes: - for fam in list(cazy_class.failed_families.keys()): - connection_failures_logger.warning( - f"{fam}\t" - "Retrieved no family population for data retrieval validation\n" - f"Failed to conencted to CAZy after {(args.retries + 1)*(args.retries +1)} attempts" - ) - - return cazy_fam_populations - - -def get_cazy_classes( - cazy_home_url, - excluded_classes, - cazy_synonym_dict, - cache_dir, - time_stamp, - args, - unit_test=False, -): - """Returns a list of CAZy class instances. - - :param cazy_url: str, URL to the CAZy home page. - :param excluded_classes: list, list of CAZy classes not to be scraped - :param cazy_synonym_dict: dictionary of offical CAZy class names - :param cache_dir: path to cache dir - :param time_stamp: str, time cazy_webscraper was invoked - :param args: cmd line args parser - - Return list of CazyClass instances, or None and an error message. 
- """ - logger = logging.getLogger(__name__) - logger.info("Retrieving URLs to summary CAZy class pages") - - # define items to be excluded from returned class list, ALWAYS exlide links to genomes - if excluded_classes is not None: - exclusions = tuple(excluded_classes) - else: - exclusions = tuple() - - homepage, error = get_page(cazy_home_url, args, max_tries=(args.retries + 1)) - - if homepage is None: - logger.error( - ( - f"Failed to connect to CAZy home page after {args.retries} attempts.\n" - "The following error was raised:\n" - f"{error}" - "Could not retrieve URLs to CAZy classes.\n" - "Check the network connection.\n" - "Terminating program." - ) - ) - return - - cache_name = cazy_home_url.replace('.', '_') - cache_path = cache_dir / f"{cache_name}_{time_stamp}.html" - if unit_test is False: - with open(cache_path, "w") as cache: - cache.write(homepage) - - # retrieve the h3 elements with class spip - h3_spip_elements = homepage.find_all("h3", {"class": "spip"}) - - # retrieve the div section containing the h3 element for Enzyme classes catalgoued by CAZy - try: - enzyme_classes_div = [ - _ for _ in h3_spip_elements if ( - str(_.contents[0].strip()).replace(u'\xa0', ' ') - ) == 'Enzyme Classes currently covered'][0].parent - - # Retreive the enzyme class page URLs suffixs - enzyme_class_urls = [ - f"{cazy_home_url}/{_['href']}" for _ in enzyme_classes_div.find_all("a") - if (not _["href"].startswith("http")) - and (str(_.contents[0]) not in exclusions) - ] - - # retrieve the div section containing the h3 element for Associated Module catalgoued by CAZy - associated_module_div = [ - _ for _ in h3_spip_elements if ( - str(_.contents[0].strip()).replace(u'\xa0', ' ') - ) == 'Associated Modules currently covered'][0].parent - - # Retreive the enzyme class page URLs suffixs - associated_module_urls = [ - f"{cazy_home_url}/{_['href']}" for _ in associated_module_div.find_all("a") - if (not _["href"].startswith("http")) - and (str(_.contents[0]) not in 
exclusions) - ] - - except (AttributeError, IndexError) as err: - logger.error( - ( - "Error raised during retrieving of CAZy class URLs.\n" - "Therefore, cannot validate data retrieval. \n" - "Will proceed with scraping CAZy. Error message:\n" - ), - exc_info=1, - ) - return - - # compile the full CAZy class URLs from the homepage url and class suffixes - - if len(enzyme_class_urls) == 0 and len(associated_module_urls) == 0: - logger.error( - ( - "Failed retrieve URLs to CAZy classes from the CAZy homepage.\n" - "Therefore, cannot validate data retrieval. \n" - "Will proceed with scraping CAZy" - ), - exc_info=1, - ) - return - - # create CAZyClass objects - cazy_class_urls = enzyme_class_urls + associated_module_urls - cazy_classes = [] - - for url in cazy_class_urls: - # retrieve class name and standardise it - class_name = url[20:-5] - for key in cazy_synonym_dict: - if class_name in cazy_synonym_dict[key]: - class_name = key - - cazy_class = CazyClass(class_name, url, 0) - cazy_classes.append(cazy_class) - - logger.info( - "Retrieved URLs for:" - f"{len(enzyme_class_urls)} Enzyme Classes and\n" - f"{len(associated_module_urls)} Associated Modules classes" - ) - - return cazy_classes - - -def get_cazy_family_pops( - class_name, - class_url, - cazy_home_url, - fam_pops_to_retrieve, - cache_dir, - time_stamp, - args, - unit_test=False, -): - """Retrieve all protein members of each CAZy family within the given CAZy class. 
- - :param class_name: str, name of CAZy class - :param class_url: str, URL to CAZy class webpage - :param cazy_home_url: str, URL to CAZy home page - :param fam_pops_to_retrieve: list of CAZy families to scrape - :param cache_dir: str representing Path to dir to write out downloaded family file to - :param time_stamp: str, date and time cazy_webscraper was invoked - :param args: args parser object - - Returns: - A dict of CAZy family populations (fam: pop) - An error message from when retrieving CAZy family URLs - A list of incorrectly formated URLs - A list of URLs of families from which a connection could not be made - """ - logger = logging.getLogger(__name__) - logger.info(f"Retrieving URLs to families under {class_name}") - - failed_connections = [] - incorrect_urls = [] - family_populations = {} - - # get the html code of the class page - class_page, error = get_page(class_url, args, max_tries=(args.retries + 1)) - - if class_page is None: - logger.error( - f"Couldn't connect to {class_url} after {(args.retries + 1)} attempts.\n" - f"The following error was raised:\n{error}" - ) - return None, error, incorrect_urls, failed_connections - - cache_name = class_url.replace('.', '_') - cache_path = cache_dir / f"{cache_name}_{time_stamp}.html" - - if unit_test is False: - with open(cache_path, "w") as cache: - cache.write(class_page) - - family_urls, url_err_message, incorrect_urls = get_families_urls(cazy_home_url, class_page, args) - - if family_urls is None: - return None, url_err_message, incorrect_urls, failed_connections - - for fam_url in tqdm(family_urls, desc=f"Retrieing fam populations for {class_name}"): - fam_name = fam_url.replace(cazy_home_url, "").split(".")[0] - - if (fam_pops_to_retrieve is not None) and (fam_name not in fam_pops_to_retrieve): - continue - - family_page, err = get_page(fam_url, args, max_tries=(args.retries + 1)) - if err is not None: - logger.warning( - f"Failed to connect to {fam_name} webpage at {fam_url}\n" - f"to retrieve 
the population size, after trying {args.retries + 1} times\n" - ) - failed_connections.append(fam_url) - continue - - cache_name = fam_url.replace('.', '_') - cache_path = cache_dir / f"{cache_name}_{time_stamp}.html" - if unit_test is False: - with open(cache_path, "w") as cache: - cache.write(family_page) - - # retrieve the table containing the Family data - family_data = family_page.find_all("div", {"class": "pos_choix"}) - - try: - fam_pop = int(re.findall( - r"Download \D{2,3}\d+? \(\d+?\)", - family_data[0].text, - flags=re.IGNORECASE, - )[0].split("(")[1].replace(")", "")) - - except (IndexError, AttributeError) as err: - fam_pop = 0 - - if fam_pop == 0: # check if an empty or deleted fam - try: - family_activities_cell = family_page.select("table")[ - 0].select("tr")[0].select("td")[0].contents[0].strip() - - if family_activities_cell == 'Deleted family!': - logger.warning(f"{fam_name} is a deleted family in CAZy") - else: - logger.warning(f"{fam_name} is an empty CAZy family") - except Exception: - logger.warning(f"Could not retrieve family population for {fam_name}") - fam_pop = 'Failed Retrieval' - - logger.warning( - f"{fam_name}\t" - f"{fam_url}\t" - f"Failed to retrieve population for {fam_name}\t" - f"{err}" - ) - - family_populations[fam_name] = fam_pop # handle errors - - logger.info(f"Retrieved URLs for {len(family_urls)} from {class_name} class page") - - return family_urls, url_err_message, incorrect_urls, failed_connections - - -def get_families_urls(cazy_home_url, class_name, class_page, args): - """Retrieve the URLs to CAZy family pages. 
- - :param cazy_home_url: str, CAZ home page URL - :param class_name: str, name of CAZy class - :param class_page: bs4 soup object, CAZy class summary page - :param args: cmd-line args parser - - Return: - List of CAZy family URLs - Str, message if any errors arose - List of inccorectly formated CAZy family URLs - """ - logger = logging.getLogger(__name__) - incorrect_urls = [] - err_message = None - - # retrieve the

<h3> element that titles the div section containing the tables of family links - family_h3_element = [ - _ - for _ in class_page.find_all("h3", {"class": "spip"}) - if str(_.contents[0]).strip() == "Tables for Direct Access" - ][0] - - # retrieve all tables within the parent div section of the <h3>

element - tables = family_h3_element.parent.find_all("table") - - # tables[0] is the table containing links to CAZy families - # tables[1] is the table containing the link to unclassified proteins - - family_urls = [f"{cazy_home_url}/{_['href']}" for _ in tables[0].find_all("a")] - try: - family_urls.append(f"{cazy_home_url}/{tables[1].a['href']}") - except TypeError: - family_urls = None - - if (args.subfamilies is False) and (family_urls is None): - err_message = f"Failed to retrieve URLs to CAZy families for {class_name}" - logger.warning(err_message) - return None, err_message, incorrect_urls - - # retrieve URLs to subfamilies - if args.subfamilies is True: - subfam_urls = get_subfamily_links(family_h3_element, cazy_home_url) - - if (family_urls is None) and (subfam_urls is None): - err_message = f"Failed to retrieve URLs to CAZy subfamilies for {class_name}" - logger.warning(err_message) - return None, err_message, incorrect_urls - - elif family_urls is None: - family_urls = subfam_urls - err_message = ( - f"Failed to retrieve URLs to CAZy families for {class_name}\n" - f"But successfully retrieved the URLs to the CAZy subfamilies for {class_name}" - ) - logger.warning(err_message) - - else: - family_urls += subfam_urls - - return family_urls, err_message, incorrect_urls - - -def get_subfamily_links(family_h3_element, cazy_home_url): - """Retrieve URL links to CAZy subfamilies. - - :param family_h3_element: bs4.element.Tag, h3 element titling the page div - :param cazy_home_url: str, URL to CAZy homepage - - Return list of URLs to subfamilies. 
- """ - parent_div = family_h3_element.parent - all_links = parent_div.find_all("a") - - pattern = re.compile(r"\D+?\d+?_\d+?\.html") - - urls = [] # empty list to store subfamily URLs - - for link in all_links: - try: - search_result = re.search(pattern, link["href"]) - urls.append(f"{cazy_home_url}/{search_result.group()}") - except (KeyError, AttributeError) as error: - # KeyError raised if link does not have ['href'] - # AttributeError error raised if search_result is None becuase not subfam link - pass - - if len(urls) == 0: - return - else: - return urls - - -def browser_decorator(func): - """Decorator to re-invoke the wrapped function up to 'args.retries' times.""" - - def wrapper(*args, **kwargs): - logger = logging.getLogger(__name__) - tries, success, err = 0, False, None - - while not success and (tries < kwargs['max_tries']): - try: - response = func(*args, **kwargs) - except ( - ConnectionError, - HTTPError, - OSError, - MissingSchema, - RequestError, - ) as err_message: - if (tries < kwargs['max_tries']): - logger.warning( - f"Failed to connect to CAZy on try {tries}/{kwargs['max_tries']}.\n" - f"Error: {err_message}" - "Retrying connection to CAZy in 10s" - ) - success = False - response = None - err = err_message - if response is not None: # response was successful - success = True - # if response from webpage was not successful - tries += 1 - time.sleep(10) - if (not success) or (response is None): - logger.warning(f"Failed to connect to CAZy.\nError: {err}") - return None, err - else: - return response, None - - return wrapper - - -@browser_decorator -def get_page(url, args, **kwargs): - """Create browser and use browser to retrieve page for given URL. - - :param url: str, url to webpage - :param args: cmd-line args parser - :kwargs max_tries: max number of times connection to CAZy can be attempted - - Return browser response object (the page). 
- """ - # create browser object - browser = mechanicalsoup.Browser() - # create response object - page = browser.get(url, timeout=args.timeout) - page = page.soup - - return page