Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issues 129 ncbi #130

Draft
wants to merge 8 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 4 additions & 119 deletions cazy_webscraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,12 @@


import logging
import os
import sys

import numpy as np
import pandas as pd

from datetime import datetime
from pathlib import Path

import Bio
import bioservices
Expand All @@ -61,12 +59,7 @@
import sqlalchemy
import tqdm

from saintBioutils.utilities.file_io import make_output_directory

from cazy_webscraper.sql import sql_orm


__version__ = "2.3.0.3"
__version__ = "3.0.0.0"


VERSION_INFO = f"cazy_webscraper version: {__version__}"
Expand All @@ -88,6 +81,9 @@

AUTHOR_EMAIL = "eemh1@st-andrews.ac.uk"

CAZY_URL = "http://www.cazy.org"
DOWNLOAD_URL = 'http://www.cazy.org/IMG/cazy_data/cazy_data.zip'


def closing_message(job, start_time, args, early_term=False):
"""Write closing messsage to terminal
Expand Down Expand Up @@ -215,114 +211,3 @@ def display_version_info():
"""

print(message)


def connect_existing_db(args, time_stamp, start_time):
    """Coordinate connecting to an existing local CAZyme database, define logger name and cache dir

    :param args: cmd-line args parser (must provide .database (Path), .sql_echo)
    :param time_stamp: str, time cazy_webscraper was invoked
    :param start_time: pd date-time obj, time cazy_webscraper was invoked

    Return connection to local CAZyme database, logger file name, and path to cache dir
    """
    logger = logging.getLogger(__name__)

    logger.info("Adding data to an existing local CAZyme database")

    # Fail fast with a clear message when the database file does not exist
    if not os.path.isfile(args.database):
        logger.error(
            "Could not find local CAZy database.\n"
            "Check path is correct.\n"
            "Terminating programme."
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    try:
        connection = sql_orm.get_db_connection(args.database, args.sql_echo, new=False)
        logger.info("Opened connection to local CAZyme database")
    except Exception:
        # Fixed typo ('exiting' -> 'existing') and misplaced '\n.' in the message
        logger.error(
            "Failed to open connection to an existing local CAZyme database.\n"
            "Terminating program\n",
            exc_info=True,
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    # used for naming additional log files: database path stripped of its extension
    logger_name = str(args.database).split('.')[0]

    # define path to cache family txt files, placed alongside the database file
    cache_dir = Path(f"{str(args.database.parent)}/.cazy_webscraper_{time_stamp}")

    return connection, logger_name, cache_dir


def connect_to_new_db(args, time_stamp, start_time):
    """Build and connect to a new local CAZyme database.

    :param args: cmd-line args parser (must provide .db_output, .force, .nodelete, .sql_echo)
    :param time_stamp: str, time cazy_webscraper was invoked
    :param start_time: pd date-time obj, time cazy_webscraper was invoked

    Return connection to the database, name of the logger, and path to the cache dir
    """
    logger = logging.getLogger(__name__)

    if args.db_output is not None:  # user defined target output for the NEW database

        if os.path.isfile(args.db_output):  # target file exists
            if args.force:
                logger.warning(
                    "Overwriting existing local CAZyme database at:\n%s",
                    args.db_output,
                )
                # BUG FIX: cache_dir was never assigned on this (force-overwrite)
                # path, causing UnboundLocalError at the final return.
                if str(args.db_output.parent) != '.':
                    cache_dir = Path(
                        f"{str(args.db_output.parent)}/.cazy_webscraper_{time_stamp}"
                    )
                else:
                    cache_dir = Path(f".cazy_webscraper_{time_stamp}")

            else:
                # Fixed garbled message text ('add data this data (-D)')
                logger.warning(
                    "Target path for new database already exists.\n"
                    "Either enable forced overwriting (-f) "
                    "or add the data to the existing database (-D).\n"
                    "Terminating program."
                )
                closing_message("cazy_webscraper", start_time, args)
                sys.exit(1)

        else:  # may need to build dirs
            # Fixed typo: 'exiting' -> 'existing'
            logger.info(
                "Building new local CAZyme database\n"
                "Output directory: %s\n"
                "Force overwriting existing output file: %s",
                args.db_output.parent,
                args.force,
            )

            if str(args.db_output.parent) != '.':  # dirs defined in output path
                output_dir = args.db_output.parent
                make_output_directory(output_dir, args.force, args.nodelete)
                cache_dir = Path(f"{str(output_dir)}/.cazy_webscraper_{time_stamp}")

            else:  # writing to cwd
                cache_dir = Path(f".cazy_webscraper_{time_stamp}")

        # logger named after the database path stripped of its extension
        logger_name = str(args.db_output).split('.')[0]
        db_path = args.db_output

    else:
        logger.info("Using default database name and writing to cwd")
        db_path = Path(f"cazy_webscraper_{time_stamp}.db")
        cache_dir = Path(f".cazy_webscraper_{time_stamp}")
        logger_name = f'cazy_webscraper_{time_stamp}'

    try:
        connection = sql_orm.get_db_connection(db_path, args.sql_echo, new=True)
        logger.warning("Built new local CAZyme database at\n%s", db_path)
    except Exception:
        # Fixed misplaced '\n.' in the message
        logger.error(
            "Failed to build new SQL database.\n"
            "Terminating program",
            exc_info=True,
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    return connection, logger_name, cache_dir
50 changes: 30 additions & 20 deletions cazy_webscraper/cache/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,22 @@
"""Cache data retrieved from the remove NCBI database"""


import argparse
import logging
import json

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from cazy_webscraper import closing_message
from cazy_webscraper.ncbi.sequences import get_protein_accession

def get_cache_seqs(start_time, args):

def get_cache_seqs(
start_time: str,
args: argparse.ArgumentParser
) -> tuple(dict[str, Seq], list[SeqRecord]):
"""Extract protein sequences from FASTA and/or JSON file, which will be added to the
local CAZyme database

Expand All @@ -62,18 +69,18 @@ def get_cache_seqs(start_time, args):
seq_records = []

if args.seq_dict:
logger.warning(f"Getting sequences from JSON cache:\n{args.seq_dict}")
logger.warning("Getting sequences from JSON cache:\n%s", args.seq_dict)

try:
with open(args.seq_dict, "r") as fh:
cache_dict = json.load(fh)

except FileNotFoundError:
logger.error(
f"Could not find JSON file of protein sequences at:\n"
f"{args.seq_dict}\n"
"Check the path is correct"
"Terminating program"
"Could not find JSON file of protein sequences at:\n"
"%s\n"
"Check the path is correct. Terminating program",
args.seq_dict
)
closing_message("Get GenBank seqs", start_time, args, early_term=True)

Expand All @@ -82,7 +89,7 @@ def get_cache_seqs(start_time, args):
seq_dict[key] = Seq(cache_dict[key])

if args.seq_file:
logger.warning(f"Getting sequences from FASTA cache:\n{args.seq_file}")
logger.warning("Getting sequences from FASTA cache:\n%s", args.seq_file)

try:
for record in SeqIO.parse(args.seq_file, "fasta"):
Expand All @@ -91,37 +98,40 @@ def get_cache_seqs(start_time, args):
if retrieved_accession is None:
logger.error(
"Could not retrieve a NCBI protein version accession from cache\n"
f"from the record id '{record.id}'\n"
"The sequence from this record will not be added to the db"
"from the record id '%s'\n"
"The sequence from this record will not be added to the db",
record.id
)
continue

try:
seq_dict[retrieved_accession]
if seq_dict[retrieved_accession] != record.seq:
logger.warning(
f"Retrieved seq for {retrieved_accession} from JSON file which does NOT match "
"Retrieved seq for %s from JSON file which does NOT match "
"the seq in the FASTA file.\n"
"Adding seq from the FASTA file to the local CAZyme database\n"
f"JSON seq: {seq_dict[retrieved_accession]}\n"
f"FASTA seq: {record.seq}"
"JSON seq: %s\n"
"FASTA seq: %s",
retrieved_accession,
seq_dict[retrieved_accession],
record.seq
)
seq_dict[retrieved_accession] = record.seq
except KeyError:
seq_dict[retrieved_accession] = record.seq

except FileNotFoundError:
logger.error(
f"Could not find FASTA file of protein sequences at:\n"
f"{args.seq_file}\n"
"Check the path is correct"
"Terminating program"
"Could not find FASTA file of protein sequences at:\n"
"%s\n"
"Check the path is correct. Terminating program",
args.seq_file
)
closing_message("Get GenBank seqs", start_time, args, early_term=True)

for key in seq_dict:
seq_records.append(SeqRecord(id=key, seq=Seq(seq_dict[key])))
for key, value in seq_dict.items():
seq_records.append(SeqRecord(id=key, seq=Seq(value)))

logger.warning(f"Retrieved {len(seq_records)} from cache")
logger.warning("Retrieved %s from cache", len(seq_records))

return seq_dict, seq_records
22 changes: 16 additions & 6 deletions cazy_webscraper/cazy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,31 @@
"""Parse data retrieved from CAZy and build a dict of data matching the user's criteria."""


import argparse
import logging
import re
import sys

from collections import namedtuple
from pathlib import Path

from tqdm import tqdm
from zipfile import ZipFile

from cazy_webscraper import crawler
from cazy_webscraper.crawler import get_cazy_file


def get_cazy_txt_file_data(cache_dir, time_stamp, args):
def get_cazy_file():
    """Placeholder stub for retrieving the CAZy db dump file.

    Currently a no-op: performs no work and always returns None.
    """
    return None


def get_cazy_txt_file_data(cache_dir: Path, time_stamp: str, args: argparse.ArgumentParser):
"""Retrieve txt file of CAZy db dump from CAZy or the local disk.

:param cache_dir: Path(), path to directory where cache is written to
:param time_stamp: str, date and time cazy_webscraper was intiated
:param args: cmd-line args parser

Return list of lines from CAZy txt file, one line is one item in the list"""
logger = logging.getLogger(__name__)

Expand All @@ -84,10 +91,13 @@ def get_cazy_txt_file_data(cache_dir, time_stamp, args):
if err_message is not None:
logger.error(
"Could not connect to CAZy to download the CAZy db txt file after "
f"{(args.retries + 1)*(args.retries + 1)}\n"
f"The following error was raised:\n{err_message}"
f"File would have been written to {cazy_txt_path}"
"Terminating program"
"%s\n"
"The following error was raised:\n%s"
"File would have been written to %s"
"Terminating program",
((args.retries + 1)*(args.retries + 1)),
err_message,
cazy_txt_path,
)
sys.exit(1)

Expand Down
Loading