diff --git a/genome/management/commands/genome_sync.py b/genome/management/commands/genome_sync.py index 5d1fd4cf..85332a10 100644 --- a/genome/management/commands/genome_sync.py +++ b/genome/management/commands/genome_sync.py @@ -1,11 +1,13 @@ import logging -from django.core.exceptions import ObjectDoesNotExist from django.core.management import BaseCommand +from django.db import IntegrityError from genome import app_settings, choices, models, utils from genomix.utils import retrieve_data +from tqdm import tqdm + logger = logging.getLogger(__name__) @@ -37,41 +39,63 @@ def handle(self, *args, **options): defaults={'description_url': description_url} ) self.sync_chromosomes(genome) - self.sync_cytobands(genome) - self.sync_genes(genome) - self.sync_transcripts(genome, sync_exons=options['sync_exons']) + + chromosomes = self.get_chromosomes(genome) + self.sync_cytobands(genome, chromosomes) + self.sync_genes(genome, chromosomes) + self.sync_transcripts(genome, chromosomes, sync_exons=options['sync_exons']) logger.info('Syncing {0} complete!'.format(build)) + @staticmethod + def get_chromosomes(genome): + chromosomes = {} + for item in models.Chromosome.objects.filter(genome=genome): + chromosomes[item.label] = item + return chromosomes + + @staticmethod + def get_chromosome(genome, chromosomes, value): + if value: + label = utils.reformat_chromosome(value) + try: + return chromosomes[label] + except KeyError: + logger.warning('Chromosome: {0} does not exist!'.format(value)) + def sync_chromosomes(self, genome): logger.info('Syncing chromosomes...') - for row in self.run_ucsc_query(genome, self.ucsc_chromosome_sql()): + for row in tqdm(self.run_ucsc_query(genome, self.ucsc_chromosome_sql())): chromosome = utils.reformat_chromosome(row[0]) models.Chromosome.objects.update_or_create( genome=genome, label=chromosome, defaults={'length': row[1]} ) + logger.info('Syncing chromosomes complete!') - def sync_cytobands(self, genome): + def sync_cytobands(self, genome, chromosomes): logger.info('Syncing cytobands...') - for row in self.run_ucsc_query(genome, self.ucsc_cytoband_sql()): - chromosome = utils.reformat_chromosome(row[0]) + for row in tqdm(self.run_ucsc_query(genome, self.ucsc_cytoband_sql())): + chromosome = self.get_chromosome(genome, chromosomes, row[0]) models.CytoBand.objects.update_or_create( label=row[3], - chromosome=models.Chromosome.objects.get(genome=genome, label=chromosome), + chromosome=chromosome, defaults={ 'start': row[1], 'end': row[2], 'stain': row[4], } ) + logger.info('Syncing cytobands complete!') - def sync_genes(self, genome): + def sync_genes(self, genome, chromosomes): logger.info('Syncing HGNC genes...') + logger.info('Downloading from HGNC...') hgnc_data = retrieve_data(getattr(app_settings, 'HGNC_GENES')) + logger.info('Downloading complete!') header = [text.upper() for text in hgnc_data.pop(0).strip().split('\t')] - for line in hgnc_data: + for line in tqdm(hgnc_data): columns = line.split('\t') symbol = columns[header.index('APPROVED SYMBOL')] name = columns[header.index('APPROVED NAME')] @@ -93,14 +117,7 @@ def sync_genes(self, genome): not_curated_rat_genome_database = columns[ header.index('RAT GENOME DATABASE ID(SUPPLIED BY RGD)')] - try: - chromosome = models.Chromosome.objects.get( - genome=genome, - label=utils.reformat_chromosome(chromosome), - ) - except ObjectDoesNotExist: - chromosome = None - + chromosome = self.get_chromosome(genome, chromosomes, chromosome) gene, created = models.Gene.objects.update_or_create( symbol=symbol.upper(), chromosome=chromosome, @@ -133,30 +150,41 @@ def sync_genes(self, genome): gene.synonyms.add(synonym) gene.save() + logger.info('Syncing HGNC genes complete!') - def sync_transcripts(self, genome, sync_exons=False): + def sync_transcripts(self, genome, chromosomes, sync_exons=False): message = 'Syncing RefSeq transcripts' if sync_exons: message += ' and exons' logger.info('{0}...'.format(message)) - for row in self.run_ucsc_query(genome, self.ucsc_refgene_sql()): + for row in tqdm(self.run_ucsc_query(genome, self.ucsc_refgene_sql())): + chromosome = self.get_chromosome(genome, chromosomes, row[10]) gene, created = models.Gene.objects.get_or_create( + chromosome__genome=genome, symbol=row[0].upper(), - defaults={'status': getattr(choices.HGNC_GENE_STATUS, 'ucsc_gene')} - ) - transcript, created = models.Transcript.objects.update_or_create( - label=row[1], - gene=gene, defaults={ - 'strand': getattr(choices.STRAND_TYPES, row[2]), - 'transcription_start': row[3], - 'transcription_end': row[4], - 'cds_start': row[5], - 'cds_end': row[6], + 'status': getattr(choices.HGNC_GENE_STATUS, 'ucsc_gene'), + 'chromosome': chromosome, } ) + try: + transcript, created = models.Transcript.objects.update_or_create( + label=row[1], + gene=gene, + defaults={ + 'strand': getattr(choices.STRAND_TYPES, row[2]), + 'transcription_start': row[3], + 'transcription_end': row[4], + 'cds_start': row[5], + 'cds_end': row[6], + } + ) + except IntegrityError: + logger.warning('Transcript: {0} was skipped!'.format(row[1])) + continue + if sync_exons: strand = row[2] number_of_exons = row[7] @@ -179,8 +207,10 @@ def sync_transcripts(self, genome, sync_exons=False): 'end': ends[index], } ) + logger.info('{0} complete!'.format(message)) def run_ucsc_query(self, genome, query): + logger.info('Querying UCSC...') build = genome.label conn = self.ucsc_connection(build) cursor = conn.cursor() @@ -188,6 +218,7 @@ def run_ucsc_query(self, genome, query): rows = cursor.fetchall() cursor.close() conn.close() + logger.info('Querying UCSC complete!') return rows @staticmethod @@ -227,7 +258,8 @@ def ucsc_refgene_sql(): rg.cdsEnd, rg.exonCount, rg.exonStarts, - rg.exonEnds + rg.exonEnds, + rg.chrom FROM hg19.refGene rg INNER JOIN hgFixed.gbCdnaInfo gi ON rg.name=gi.acc; """ diff --git a/genome/utils.py b/genome/utils.py index 385115bd..df2c4b94 100644 --- a/genome/utils.py +++ b/genome/utils.py @@ -1,14 +1,25 @@ # -*- coding: utf-8 def reformat_chromosome(chromosome): - if 'p' in chromosome: - new_chrom = chromosome.split('p')[0].upper() - elif 'q' in chromosome: - new_chrom = chromosome.split('q')[0].upper() - else: - new_chrom = chromosome.replace('chr', '').upper() + chromosome = chromosome.replace('chr', '').replace('unplaced', '').upper().strip() + accepted = list(map(str, range(1, 23))) + ['X', 'Y', 'M', 'MT'] + + if chromosome not in accepted: + if all(x in chromosome for x in ['P']): + return chromosome.split('P')[0] + elif all(x in chromosome for x in ['Q']): + return chromosome.split('Q')[0] + else: + new_chromosome = chromosome.lower() \ + .replace('alternate reference locus', '') \ + .replace('mitochondria', 'M') \ + .replace('not on reference assembly', '') \ + .upper() \ + .strip() - if new_chrom in list(map(str, range(1, 23))) or new_chrom in ['X', 'Y', 'M', 'MT']: - return new_chrom + if new_chromosome in accepted: + return new_chromosome + else: + return chromosome def chromosome_number(chromosome): diff --git a/requirements.txt b/requirements.txt index b893b110..74414749 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ django-genomix==0.6.7 django-model-utils==3.1.2 mysqlclient==1.4.2.post1 +tqdm==4.32.1 diff --git a/requirements_dev.txt b/requirements_dev.txt index 7e324a6e..4b74740a 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -6,3 +6,5 @@ twine==1.13.0 Sphinx==2.1.0 sphinx-rtd-theme==0.4.3 sphinxcontrib-napoleon==0.7 + +autopep8==1.4.4 diff --git a/tests/test_utils.py b/tests/test_utils.py index a99eb4bb..b9f8df26 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -18,6 +18,12 @@ ('1p10.1', '1'), ('1q10.1', '1'), ('dasfa', None), + ('12 alternate reference locus', '12'), + ('mitochondria', 'M'), + ('13 not on reference assembly', '13'), + ('15 unplaced', '15'), + ('4q27', '4'), + ('5q31', '5'), ]) def test_reformat_chromosome(chromosome, expected): assert utils.reformat_chromosome(chromosome) == expected