From 95030669dc346289ccc638a86c1fc1706ca17905 Mon Sep 17 00:00:00 2001
From: Rutger Vos
Date: Fri, 16 Aug 2024 20:03:08 +0200
Subject: [PATCH] tarfile handling

Open the gzip-compressed NCBI taxdump archive directly instead of
requiring a manually gunzipped tar file.

The stdlib module is imported under the alias `tarfile_module` because
read_ncbi_taxonomy() has a parameter that is itself named `tarfile`;
inside the function that parameter shadows the module, so a plain
`tarfile.open(...)` would be looked up on the path argument and raise
AttributeError. The archive is opened in a `with` block so the handle
is closed once parsing has finished.

---
 barcode_validator/taxonomy.py | 7 ++++++-
 config/config.yml             | 4 ++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/barcode_validator/taxonomy.py b/barcode_validator/taxonomy.py
index 0003c3b..f44f209 100644
--- a/barcode_validator/taxonomy.py
+++ b/barcode_validator/taxonomy.py
@@ -4,6 +4,7 @@
 import os
 import time
 import pandas as pd
+import tarfile as tarfile_module
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
 from Bio.Blast import NCBIWWW, NCBIXML
@@ -20,7 +21,11 @@ def read_bold_taxonomy(spreadsheet):
 
 def read_ncbi_taxonomy(tarfile):
+    """Open the gzip-compressed NCBI taxdump archive at `tarfile` and parse it."""
     logging.info("Reading NCBI taxonomy")
-    return NCBIParser(tarfile).parse()
+    # The `tarfile` parameter shadows the stdlib module inside this function,
+    # so the module is reached through its `tarfile_module` alias.
+    with tarfile_module.open(tarfile, "r:gz") as tar:
+        return NCBIParser(tar).parse()
 
 
 def run_seqid(sequence, ncbi_tree):
diff --git a/config/config.yml b/config/config.yml
index 1c933a5..4d5a78a 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -29,8 +29,8 @@ word_size: 28
 BLASTDB_LMDB_MAP_SIZE: 180000000000 # MaaS 37, 180GB
 
 # Location of the NCBI taxonomy dump. This must be the tar file that contains the nodes.dmp and names.dmp files.
-# When downloaded, the tar file is gzip compressed. This needs to be uncompressed before use, e.g. with `gunzip`.
-ncbi_taxonomy: /home/rutger.vos/data/ncbi/taxdump/taxdump.tar
+# Use the gzip-compressed archive as downloaded (no `gunzip` needed): http://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
+ncbi_taxonomy: /home/rutger.vos/data/ncbi/taxdump/taxdump.tar.gz
 
 # Configuration for logging. The verbosity level specified here is overridden by the value provided on the command
 # line with the -v/-verbosity argument. The log file is written to the current working directory.