From a5a6c43ed32fd30d4f3857d3c73c7555766d20eb Mon Sep 17 00:00:00 2001 From: genomics-geek Date: Fri, 14 Jun 2019 07:27:23 -0400 Subject: [PATCH] Resolved #117 - Ensure gene entries created from RefSeq set chromosome attribute - Added better logging with status of what has been completed - Added setting of default chromsome when creating gene objects from RefSeq Updated reformat chromosome to try to catch more scenarios Updated genome sync to be more efficient when looking up chromosomes removed unused import Updated chromosome lookup to be once from database and then from memory --- genome/management/commands/genome_sync.py | 96 +++++++++++++++-------- genome/utils.py | 27 +++++-- requirements.txt | 1 + requirements_dev.txt | 2 + tests/test_utils.py | 6 ++ 5 files changed, 92 insertions(+), 40 deletions(-) diff --git a/genome/management/commands/genome_sync.py b/genome/management/commands/genome_sync.py index 5d1fd4cf..85332a10 100644 --- a/genome/management/commands/genome_sync.py +++ b/genome/management/commands/genome_sync.py @@ -1,11 +1,13 @@ import logging -from django.core.exceptions import ObjectDoesNotExist from django.core.management import BaseCommand +from django.db import IntegrityError from genome import app_settings, choices, models, utils from genomix.utils import retrieve_data +from tqdm import tqdm + logger = logging.getLogger(__name__) @@ -37,41 +39,63 @@ def handle(self, *args, **options): defaults={'description_url': description_url} ) self.sync_chromosomes(genome) - self.sync_cytobands(genome) - self.sync_genes(genome) - self.sync_transcripts(genome, sync_exons=options['sync_exons']) + + chromosomes = self.get_chromosomes(genome) + self.sync_cytobands(genome, chromosomes) + self.sync_genes(genome, chromosomes) + self.sync_transcripts(genome, chromosomes, sync_exons=options['sync_exons']) logger.info('Syncing {0} complete!'.format(build)) + @staticmethod + def get_chromosomes(genome): + chromosomes = {} + for item in models.Chromosome.objects.filter(genome=genome): + chromosomes[item.label] = item + return chromosomes + + @staticmethod + def get_chromosome(genome, chromosomes, value): + if value: + label = utils.reformat_chromosome(value) + try: + return chromosomes[label] + except KeyError: + logger.warning('Chromosome: {0} does not exist!'.format(value)) + def sync_chromosomes(self, genome): logger.info('Syncing chromosomes...') - for row in self.run_ucsc_query(genome, self.ucsc_chromosome_sql()): + for row in tqdm(self.run_ucsc_query(genome, self.ucsc_chromosome_sql())): chromosome = utils.reformat_chromosome(row[0]) models.Chromosome.objects.update_or_create( genome=genome, label=chromosome, defaults={'length': row[1]} ) + logger.info('Syncing chromosomes complete!') - def sync_cytobands(self, genome): + def sync_cytobands(self, genome, chromosomes): logger.info('Syncing cytobands...') - for row in self.run_ucsc_query(genome, self.ucsc_cytoband_sql()): - chromosome = utils.reformat_chromosome(row[0]) + for row in tqdm(self.run_ucsc_query(genome, self.ucsc_cytoband_sql())): + chromosome = self.get_chromosome(genome, chromosomes, row[0]) models.CytoBand.objects.update_or_create( label=row[3], - chromosome=models.Chromosome.objects.get(genome=genome, label=chromosome), + chromosome=chromosome, defaults={ 'start': row[1], 'end': row[2], 'stain': row[4], } ) + logger.info('Syncing cytobands complete!') - def sync_genes(self, genome): + def sync_genes(self, genome, chromosomes): logger.info('Syncing HGNC genes...') + logger.info('Downloading from HGNC...') hgnc_data = retrieve_data(getattr(app_settings, 'HGNC_GENES')) + logger.info('Downloading complete!') header = [text.upper() for text in hgnc_data.pop(0).strip().split('\t')] - for line in hgnc_data: + for line in tqdm(hgnc_data): columns = line.split('\t') symbol = columns[header.index('APPROVED SYMBOL')] name = columns[header.index('APPROVED NAME')] @@ -93,14 +117,7 @@ def sync_genes(self, genome): not_curated_rat_genome_database = columns[ header.index('RAT GENOME DATABASE ID(SUPPLIED BY RGD)')] - try: - chromosome = models.Chromosome.objects.get( - genome=genome, - label=utils.reformat_chromosome(chromosome), - ) - except ObjectDoesNotExist: - chromosome = None - + chromosome = self.get_chromosome(genome, chromosomes, chromosome) gene, created = models.Gene.objects.update_or_create( symbol=symbol.upper(), chromosome=chromosome, @@ -133,30 +150,41 @@ def sync_genes(self, genome): gene.synonyms.add(synonym) gene.save() + logger.info('Syncing HGNC genes complete!') - def sync_transcripts(self, genome, sync_exons=False): + def sync_transcripts(self, genome, chromosomes, sync_exons=False): message = 'Syncing RefSeq transcripts' if sync_exons: message += ' and exons' logger.info('{0}...'.format(message)) - for row in self.run_ucsc_query(genome, self.ucsc_refgene_sql()): + for row in tqdm(self.run_ucsc_query(genome, self.ucsc_refgene_sql())): + chromosome = self.get_chromosome(genome, chromosomes, row[10]) gene, created = models.Gene.objects.get_or_create( + chromosome__genome=genome, symbol=row[0].upper(), - defaults={'status': getattr(choices.HGNC_GENE_STATUS, 'ucsc_gene')} - ) - transcript, created = models.Transcript.objects.update_or_create( - label=row[1], - gene=gene, defaults={ - 'strand': getattr(choices.STRAND_TYPES, row[2]), - 'transcription_start': row[3], - 'transcription_end': row[4], - 'cds_start': row[5], - 'cds_end': row[6], + 'status': getattr(choices.HGNC_GENE_STATUS, 'ucsc_gene'), + 'chromosome': chromosome, } ) + try: + transcript, created = models.Transcript.objects.update_or_create( + label=row[1], + gene=gene, + defaults={ + 'strand': getattr(choices.STRAND_TYPES, row[2]), + 'transcription_start': row[3], + 'transcription_end': row[4], + 'cds_start': row[5], + 'cds_end': row[6], + } + ) + except IntegrityError: + logger.warning('Transcript: {0} was skipped!'.format(row[1])) + continue + if sync_exons: strand = row[2] number_of_exons = row[7] @@ -179,8 +207,10 @@ def sync_transcripts(self, genome, sync_exons=False): 'end': ends[index], } ) + logger.info('{0} complete!'.format(message)) def run_ucsc_query(self, genome, query): + logger.info('Querying UCSC...') build = genome.label conn = self.ucsc_connection(build) cursor = conn.cursor() @@ -188,6 +218,7 @@ def run_ucsc_query(self, genome, query): rows = cursor.fetchall() cursor.close() conn.close() + logger.info('Querying UCSC complete!') return rows @staticmethod @@ -227,7 +258,8 @@ def ucsc_refgene_sql(): rg.cdsEnd, rg.exonCount, rg.exonStarts, - rg.exonEnds + rg.exonEnds, + rg.chrom FROM hg19.refGene rg INNER JOIN hgFixed.gbCdnaInfo gi ON rg.name=gi.acc; """ diff --git a/genome/utils.py b/genome/utils.py index 385115bd..df2c4b94 100644 --- a/genome/utils.py +++ b/genome/utils.py @@ -1,14 +1,25 @@ # -*- coding: utf-8 def reformat_chromosome(chromosome): - if 'p' in chromosome: - new_chrom = chromosome.split('p')[0].upper() - elif 'q' in chromosome: - new_chrom = chromosome.split('q')[0].upper() - else: - new_chrom = chromosome.replace('chr', '').upper() + chromosome = chromosome.replace('chr', '').replace('unplaced', '').upper().strip() + accepted = list(map(str, range(1, 23))) + ['X', 'Y', 'M', 'MT'] + + if chromosome not in accepted: + if all(x in chromosome for x in ['P']): + return chromosome.split('P')[0] + elif all(x in chromosome for x in ['Q']): + return chromosome.split('Q')[0] + else: + new_chromosome = chromosome.lower() \ + .replace('alternate reference locus', '') \ + .replace('mitochondria', 'M') \ + .replace('not on reference assembly', '') \ + .upper() \ + .strip() - if new_chrom in list(map(str, range(1, 23))) or new_chrom in ['X', 'Y', 'M', 'MT']: - return new_chrom + if new_chromosome in accepted: + return new_chromosome + else: + return chromosome def chromosome_number(chromosome): diff --git a/requirements.txt b/requirements.txt index b893b110..74414749 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ django-genomix==0.6.7 django-model-utils==3.1.2 mysqlclient==1.4.2.post1 +tqdm==4.32.1 diff --git a/requirements_dev.txt b/requirements_dev.txt index 7e324a6e..4b74740a 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -6,3 +6,5 @@ twine==1.13.0 Sphinx==2.1.0 sphinx-rtd-theme==0.4.3 sphinxcontrib-napoleon==0.7 + +autopep8==1.4.4 diff --git a/tests/test_utils.py b/tests/test_utils.py index a99eb4bb..b9f8df26 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -18,6 +18,12 @@ ('1p10.1', '1'), ('1q10.1', '1'), ('dasfa', None), + ('12 alternate reference locus', '12'), + ('mitochondria', 'M'), + ('13 not on reference assembly', '13'), + ('15 unplaced', '15'), + ('4q27', '4'), + ('5q31', '5'), ]) def test_reformat_chromosome(chromosome, expected): assert utils.reformat_chromosome(chromosome) == expected