Skip to content

Commit

Permalink
Resolved #117 - Ensure gene entries created from RefSeq set chromosom…
Browse files Browse the repository at this point in the history
…e attribute

- Added better logging with status of what has been completed
- Added setting of default chromsome when creating gene objects from RefSeq

Updated reformat chromosome to try to catch more scenarios

Updated genome sync to be more efficient when looking up chromosomes

removed unused import

Updated chromosome lookup to be once from database and then from memory
  • Loading branch information
genomics-geek committed Jun 14, 2019
1 parent 2e532ce commit a5a6c43
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 40 deletions.
96 changes: 64 additions & 32 deletions genome/management/commands/genome_sync.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging

from django.core.exceptions import ObjectDoesNotExist
from django.core.management import BaseCommand
from django.db import IntegrityError

from genome import app_settings, choices, models, utils
from genomix.utils import retrieve_data

from tqdm import tqdm


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -37,41 +39,63 @@ def handle(self, *args, **options):
defaults={'description_url': description_url}
)
self.sync_chromosomes(genome)
self.sync_cytobands(genome)
self.sync_genes(genome)
self.sync_transcripts(genome, sync_exons=options['sync_exons'])

chromosomes = self.get_chromosomes(genome)
self.sync_cytobands(genome, chromosomes)
self.sync_genes(genome, chromosomes)
self.sync_transcripts(genome, chromosomes, sync_exons=options['sync_exons'])
logger.info('Syncing {0} complete!'.format(build))

@staticmethod
def get_chromosomes(genome):
chromosomes = {}
for item in models.Chromosome.objects.filter(genome=genome):
chromosomes[item.label] = item
return chromosomes

@staticmethod
def get_chromosome(genome, chromosomes, value):
if value:
label = utils.reformat_chromosome(value)
try:
return chromosomes[label]
except KeyError:
logger.warning('Chromosome: {0} does not exist!'.format(value))

def sync_chromosomes(self, genome):
logger.info('Syncing chromosomes...')
for row in self.run_ucsc_query(genome, self.ucsc_chromosome_sql()):
for row in tqdm(self.run_ucsc_query(genome, self.ucsc_chromosome_sql())):
chromosome = utils.reformat_chromosome(row[0])
models.Chromosome.objects.update_or_create(
genome=genome,
label=chromosome,
defaults={'length': row[1]}
)
logger.info('Syncing chromosomes complete!')

def sync_cytobands(self, genome):
def sync_cytobands(self, genome, chromosomes):
logger.info('Syncing cytobands...')
for row in self.run_ucsc_query(genome, self.ucsc_cytoband_sql()):
chromosome = utils.reformat_chromosome(row[0])
for row in tqdm(self.run_ucsc_query(genome, self.ucsc_cytoband_sql())):
chromosome = self.get_chromosome(genome, chromosomes, row[0])
models.CytoBand.objects.update_or_create(
label=row[3],
chromosome=models.Chromosome.objects.get(genome=genome, label=chromosome),
chromosome=chromosome,
defaults={
'start': row[1],
'end': row[2],
'stain': row[4],
}
)
logger.info('Syncing cytobands complete!')

def sync_genes(self, genome):
def sync_genes(self, genome, chromosomes):
logger.info('Syncing HGNC genes...')
logger.info('Downloading from HGNC...')
hgnc_data = retrieve_data(getattr(app_settings, 'HGNC_GENES'))
logger.info('Downloading complete!')
header = [text.upper() for text in hgnc_data.pop(0).strip().split('\t')]

for line in hgnc_data:
for line in tqdm(hgnc_data):
columns = line.split('\t')
symbol = columns[header.index('APPROVED SYMBOL')]
name = columns[header.index('APPROVED NAME')]
Expand All @@ -93,14 +117,7 @@ def sync_genes(self, genome):
not_curated_rat_genome_database = columns[
header.index('RAT GENOME DATABASE ID(SUPPLIED BY RGD)')]

try:
chromosome = models.Chromosome.objects.get(
genome=genome,
label=utils.reformat_chromosome(chromosome),
)
except ObjectDoesNotExist:
chromosome = None

chromosome = self.get_chromosome(genome, chromosomes, chromosome)
gene, created = models.Gene.objects.update_or_create(
symbol=symbol.upper(),
chromosome=chromosome,
Expand Down Expand Up @@ -133,30 +150,41 @@ def sync_genes(self, genome):
gene.synonyms.add(synonym)

gene.save()
logger.info('Syncing HGNC genes complete!')

def sync_transcripts(self, genome, sync_exons=False):
def sync_transcripts(self, genome, chromosomes, sync_exons=False):
message = 'Syncing RefSeq transcripts'
if sync_exons:
message += ' and exons'
logger.info('{0}...'.format(message))

for row in self.run_ucsc_query(genome, self.ucsc_refgene_sql()):
for row in tqdm(self.run_ucsc_query(genome, self.ucsc_refgene_sql())):
chromosome = self.get_chromosome(genome, chromosomes, row[10])
gene, created = models.Gene.objects.get_or_create(
chromosome__genome=genome,
symbol=row[0].upper(),
defaults={'status': getattr(choices.HGNC_GENE_STATUS, 'ucsc_gene')}
)
transcript, created = models.Transcript.objects.update_or_create(
label=row[1],
gene=gene,
defaults={
'strand': getattr(choices.STRAND_TYPES, row[2]),
'transcription_start': row[3],
'transcription_end': row[4],
'cds_start': row[5],
'cds_end': row[6],
'status': getattr(choices.HGNC_GENE_STATUS, 'ucsc_gene'),
'chromosome': chromosome,
}
)

try:
transcript, created = models.Transcript.objects.update_or_create(
label=row[1],
gene=gene,
defaults={
'strand': getattr(choices.STRAND_TYPES, row[2]),
'transcription_start': row[3],
'transcription_end': row[4],
'cds_start': row[5],
'cds_end': row[6],
}
)
except IntegrityError:
logger.warning('Transcript: {0} was skipped!'.format(row[1]))
continue

if sync_exons:
strand = row[2]
number_of_exons = row[7]
Expand All @@ -179,15 +207,18 @@ def sync_transcripts(self, genome, sync_exons=False):
'end': ends[index],
}
)
logger.info('{0} complete!'.format(message))

def run_ucsc_query(self, genome, query):
logger.info('Querying UCSC...')
build = genome.label
conn = self.ucsc_connection(build)
cursor = conn.cursor()
cursor.execute(query)
rows = cursor.fetchall()
cursor.close()
conn.close()
logger.info('Querying UCSC complete!')
return rows

@staticmethod
Expand Down Expand Up @@ -227,7 +258,8 @@ def ucsc_refgene_sql():
rg.cdsEnd,
rg.exonCount,
rg.exonStarts,
rg.exonEnds
rg.exonEnds,
rg.chrom
FROM hg19.refGene rg
INNER JOIN hgFixed.gbCdnaInfo gi ON rg.name=gi.acc;
"""
Expand Down
27 changes: 19 additions & 8 deletions genome/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
# -*- coding: utf-8
def reformat_chromosome(chromosome):
if 'p' in chromosome:
new_chrom = chromosome.split('p')[0].upper()
elif 'q' in chromosome:
new_chrom = chromosome.split('q')[0].upper()
else:
new_chrom = chromosome.replace('chr', '').upper()
chromosome = chromosome.replace('chr', '').replace('unplaced', '').upper().strip()
accepted = list(map(str, range(1, 23))) + ['X', 'Y', 'M', 'MT']

if chromosome not in accepted:
if all(x in chromosome for x in ['P']):
return chromosome.split('P')[0]
elif all(x in chromosome for x in ['Q']):
return chromosome.split('Q')[0]
else:
new_chromosome = chromosome.lower() \
.replace('alternate reference locus', '') \
.replace('mitochondria', 'M') \
.replace('not on reference assembly', '') \
.upper() \
.strip()

if new_chrom in list(map(str, range(1, 23))) or new_chrom in ['X', 'Y', 'M', 'MT']:
return new_chrom
if new_chromosome in accepted:
return new_chromosome
else:
return chromosome


def chromosome_number(chromosome):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ django-genomix==0.6.7
django-model-utils==3.1.2

mysqlclient==1.4.2.post1
tqdm==4.32.1
2 changes: 2 additions & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ twine==1.13.0
Sphinx==2.1.0
sphinx-rtd-theme==0.4.3
sphinxcontrib-napoleon==0.7

autopep8==1.4.4
6 changes: 6 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
('1p10.1', '1'),
('1q10.1', '1'),
('dasfa', None),
('12 alternate reference locus', '12'),
('mitochondria', 'M'),
('13 not on reference assembly', '13'),
('15 unplaced', '15'),
('4q27', '4'),
('5q31', '5'),
])
def test_reformat_chromosome(chromosome, expected):
assert utils.reformat_chromosome(chromosome) == expected
Expand Down

0 comments on commit a5a6c43

Please sign in to comment.