Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issues 129 ncbi #130

Draft
wants to merge 8 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 4 additions & 119 deletions cazy_webscraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,12 @@


import logging
import os
import sys

import numpy as np
import pandas as pd

from datetime import datetime
from pathlib import Path

import Bio
import bioservices
Expand All @@ -61,12 +59,7 @@
import sqlalchemy
import tqdm

from saintBioutils.utilities.file_io import make_output_directory

from cazy_webscraper.sql import sql_orm


__version__ = "2.3.0.3"
__version__ = "3.0.0.0"


VERSION_INFO = f"cazy_webscraper version: {__version__}"
Expand All @@ -88,6 +81,9 @@

AUTHOR_EMAIL = "eemh1@st-andrews.ac.uk"

CAZY_URL = "http://www.cazy.org"
DOWNLOAD_URL = 'http://www.cazy.org/IMG/cazy_data/cazy_data.zip'


def closing_message(job, start_time, args, early_term=False):
"""Write closing messsage to terminal
Expand Down Expand Up @@ -215,114 +211,3 @@ def display_version_info():
"""

print(message)


def connect_existing_db(args, time_stamp, start_time):
    """Coordinate connecting to an existing local CAZyme database, define logger name and cache dir

    :param args: cmd-line args parser (must provide .database (Path), .sql_echo)
    :param time_stamp: str, time cazy_webscraper was invoked
    :param start_time: pd date-time obj, time cazy_webscraper was invoked

    Return connection to local CAZyme database, logger file name, and path to cache dir
    """
    logger = logging.getLogger(__name__)

    logger.info("Adding data to an existing local CAZyme database")

    # Fail fast with a clear message when the database file does not exist
    if not os.path.isfile(args.database):
        logger.error(
            "Could not find local CAZy database.\n"
            "Check path is correct.\n"
            "Terminating programme."
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    try:
        connection = sql_orm.get_db_connection(args.database, args.sql_echo, new=False)
        logger.info("Opened connection to local CAZyme database")
    except Exception:
        # Fixed typo ('exiting' -> 'existing') and misplaced '\n.' in the message
        logger.error(
            "Failed to open connection to an existing local CAZyme database.\n"
            "Terminating program\n",
            exc_info=True,
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    # used for naming additional log files: database path stripped of its extension
    logger_name = str(args.database).split('.')[0]

    # define path to cache family txt files, placed alongside the database file
    cache_dir = Path(f"{str(args.database.parent)}/.cazy_webscraper_{time_stamp}")

    return connection, logger_name, cache_dir


def connect_to_new_db(args, time_stamp, start_time):
    """Build and connect to a new local CAZyme database.

    :param args: cmd-line args parser (must provide .db_output, .force, .nodelete, .sql_echo)
    :param time_stamp: str, time cazy_webscraper was invoked
    :param start_time: pd date-time obj, time cazy_webscraper was invoked

    Return connection to the database, name of the logger, and path to the cache dir
    """
    logger = logging.getLogger(__name__)

    if args.db_output is not None:  # user defined target output for the NEW database

        if os.path.isfile(args.db_output):  # target file exists
            if args.force:
                logger.warning(
                    "Overwriting existing local CAZyme database at:\n%s",
                    args.db_output,
                )
                # BUG FIX: cache_dir was never assigned on this (force-overwrite)
                # path, causing UnboundLocalError at the final return.
                if str(args.db_output.parent) != '.':
                    cache_dir = Path(
                        f"{str(args.db_output.parent)}/.cazy_webscraper_{time_stamp}"
                    )
                else:
                    cache_dir = Path(f".cazy_webscraper_{time_stamp}")

            else:
                # Fixed garbled message text ('add data this data (-D)')
                logger.warning(
                    "Target path for new database already exists.\n"
                    "Either enable forced overwriting (-f) "
                    "or add the data to the existing database (-D).\n"
                    "Terminating program."
                )
                closing_message("cazy_webscraper", start_time, args)
                sys.exit(1)

        else:  # may need to build dirs
            # Fixed typo: 'exiting' -> 'existing'
            logger.info(
                "Building new local CAZyme database\n"
                "Output directory: %s\n"
                "Force overwriting existing output file: %s",
                args.db_output.parent,
                args.force,
            )

            if str(args.db_output.parent) != '.':  # dirs defined in output path
                output_dir = args.db_output.parent
                make_output_directory(output_dir, args.force, args.nodelete)
                cache_dir = Path(f"{str(output_dir)}/.cazy_webscraper_{time_stamp}")

            else:  # writing to cwd
                cache_dir = Path(f".cazy_webscraper_{time_stamp}")

        # logger named after the database path stripped of its extension
        logger_name = str(args.db_output).split('.')[0]
        db_path = args.db_output

    else:
        logger.info("Using default database name and writing to cwd")
        db_path = Path(f"cazy_webscraper_{time_stamp}.db")
        cache_dir = Path(f".cazy_webscraper_{time_stamp}")
        logger_name = f'cazy_webscraper_{time_stamp}'

    try:
        connection = sql_orm.get_db_connection(db_path, args.sql_echo, new=True)
        logger.warning("Built new local CAZyme database at\n%s", db_path)
    except Exception:
        # Fixed misplaced '\n.' in the message
        logger.error(
            "Failed to build new SQL database.\n"
            "Terminating program",
            exc_info=True,
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    return connection, logger_name, cache_dir
50 changes: 30 additions & 20 deletions cazy_webscraper/cache/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,22 @@
"""Cache data retrieved from the remove NCBI database"""


import argparse
import logging
import json

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from cazy_webscraper import closing_message
from cazy_webscraper.ncbi.sequences import get_protein_accession

def get_cache_seqs(start_time, args):

def get_cache_seqs(
start_time: str,
args: argparse.ArgumentParser
) -> tuple(dict[str, Seq], list[SeqRecord]):
"""Extract protein sequences from FASTA and/or JSON file, which will be added to the
local CAZyme database

Expand All @@ -62,18 +69,18 @@ def get_cache_seqs(start_time, args):
seq_records = []

if args.seq_dict:
logger.warning(f"Getting sequences from JSON cache:\n{args.seq_dict}")
logger.warning("Getting sequences from JSON cache:\n%s", args.seq_dict)

try:
with open(args.seq_dict, "r") as fh:
cache_dict = json.load(fh)

except FileNotFoundError:
logger.error(
f"Could not find JSON file of protein sequences at:\n"
f"{args.seq_dict}\n"
"Check the path is correct"
"Terminating program"
"Could not find JSON file of protein sequences at:\n"
"%s\n"
"Check the path is correct. Terminating program",
args.seq_dict
)
closing_message("Get GenBank seqs", start_time, args, early_term=True)

Expand All @@ -82,7 +89,7 @@ def get_cache_seqs(start_time, args):
seq_dict[key] = Seq(cache_dict[key])

if args.seq_file:
logger.warning(f"Getting sequences from FASTA cache:\n{args.seq_file}")
logger.warning("Getting sequences from FASTA cache:\n%s", args.seq_file)

try:
for record in SeqIO.parse(args.seq_file, "fasta"):
Expand All @@ -91,37 +98,40 @@ def get_cache_seqs(start_time, args):
if retrieved_accession is None:
logger.error(
"Could not retrieve a NCBI protein version accession from cache\n"
f"from the record id '{record.id}'\n"
"The sequence from this record will not be added to the db"
"from the record id '%s'\n"
"The sequence from this record will not be added to the db",
record.id
)
continue

try:
seq_dict[retrieved_accession]
if seq_dict[retrieved_accession] != record.seq:
logger.warning(
f"Retrieved seq for {retrieved_accession} from JSON file which does NOT match "
"Retrieved seq for %s from JSON file which does NOT match "
"the seq in the FASTA file.\n"
"Adding seq from the FASTA file to the local CAZyme database\n"
f"JSON seq: {seq_dict[retrieved_accession]}\n"
f"FASTA seq: {record.seq}"
"JSON seq: %s\n"
"FASTA seq: %s",
retrieved_accession,
seq_dict[retrieved_accession],
record.seq
)
seq_dict[retrieved_accession] = record.seq
except KeyError:
seq_dict[retrieved_accession] = record.seq

except FileNotFoundError:
logger.error(
f"Could not find FASTA file of protein sequences at:\n"
f"{args.seq_file}\n"
"Check the path is correct"
"Terminating program"
"Could not find FASTA file of protein sequences at:\n"
"%s\n"
"Check the path is correct. Terminating program",
args.seq_file
)
closing_message("Get GenBank seqs", start_time, args, early_term=True)

for key in seq_dict:
seq_records.append(SeqRecord(id=key, seq=Seq(seq_dict[key])))
for key, value in seq_dict.items():
seq_records.append(SeqRecord(id=key, seq=Seq(value)))

logger.warning(f"Retrieved {len(seq_records)} from cache")
logger.warning("Retrieved %s from cache", len(seq_records))

return seq_dict, seq_records
22 changes: 16 additions & 6 deletions cazy_webscraper/cazy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,31 @@
"""Parse data retrieved from CAZy and build a dict of data matching the user's criteria."""


import argparse
import logging
import re
import sys

from collections import namedtuple
from pathlib import Path

from tqdm import tqdm
from zipfile import ZipFile

from cazy_webscraper import crawler
from cazy_webscraper.crawler import get_cazy_file


def get_cazy_txt_file_data(cache_dir, time_stamp, args):
def get_cazy_file():
    """Placeholder stub for retrieving the CAZy db dump file.

    Currently a no-op: performs no work and always returns None.
    """
    return None


def get_cazy_txt_file_data(cache_dir: Path, time_stamp: str, args: argparse.ArgumentParser):
"""Retrieve txt file of CAZy db dump from CAZy or the local disk.

:param cache_dir: Path(), path to directory where cache is written to
:param time_stamp: str, date and time cazy_webscraper was intiated
:param args: cmd-line args parser

Return list of lines from CAZy txt file, one line is one item in the list"""
logger = logging.getLogger(__name__)

Expand All @@ -84,10 +91,13 @@ def get_cazy_txt_file_data(cache_dir, time_stamp, args):
if err_message is not None:
logger.error(
"Could not connect to CAZy to download the CAZy db txt file after "
f"{(args.retries + 1)*(args.retries + 1)}\n"
f"The following error was raised:\n{err_message}"
f"File would have been written to {cazy_txt_path}"
"Terminating program"
"%s\n"
"The following error was raised:\n%s"
"File would have been written to %s"
"Terminating program",
((args.retries + 1)*(args.retries + 1)),
err_message,
cazy_txt_path,
)
sys.exit(1)

Expand Down
Loading