Skip to content

Commit

Permalink
Merge pull request #118 from chopdgd/issue-117
Browse files Browse the repository at this point in the history
Resolved #117 - Ensure gene entries created from RefSeq set chromosome attribute
  • Loading branch information
genomics-geek authored Jun 14, 2019
2 parents 2e532ce + a5a6c43 commit 057d0b6
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 40 deletions.
96 changes: 64 additions & 32 deletions genome/management/commands/genome_sync.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging

from django.core.exceptions import ObjectDoesNotExist
from django.core.management import BaseCommand
from django.db import IntegrityError

from genome import app_settings, choices, models, utils
from genomix.utils import retrieve_data

from tqdm import tqdm


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -37,41 +39,63 @@ def handle(self, *args, **options):
defaults={'description_url': description_url}
)
self.sync_chromosomes(genome)
self.sync_cytobands(genome)
self.sync_genes(genome)
self.sync_transcripts(genome, sync_exons=options['sync_exons'])

chromosomes = self.get_chromosomes(genome)
self.sync_cytobands(genome, chromosomes)
self.sync_genes(genome, chromosomes)
self.sync_transcripts(genome, chromosomes, sync_exons=options['sync_exons'])
logger.info('Syncing {0} complete!'.format(build))

@staticmethod
def get_chromosomes(genome):
chromosomes = {}
for item in models.Chromosome.objects.filter(genome=genome):
chromosomes[item.label] = item
return chromosomes

@staticmethod
def get_chromosome(genome, chromosomes, value):
if value:
label = utils.reformat_chromosome(value)
try:
return chromosomes[label]
except KeyError:
logger.warning('Chromosome: {0} does not exist!'.format(value))

def sync_chromosomes(self, genome):
logger.info('Syncing chromosomes...')
for row in self.run_ucsc_query(genome, self.ucsc_chromosome_sql()):
for row in tqdm(self.run_ucsc_query(genome, self.ucsc_chromosome_sql())):
chromosome = utils.reformat_chromosome(row[0])
models.Chromosome.objects.update_or_create(
genome=genome,
label=chromosome,
defaults={'length': row[1]}
)
logger.info('Syncing chromosomes complete!')

def sync_cytobands(self, genome):
def sync_cytobands(self, genome, chromosomes):
logger.info('Syncing cytobands...')
for row in self.run_ucsc_query(genome, self.ucsc_cytoband_sql()):
chromosome = utils.reformat_chromosome(row[0])
for row in tqdm(self.run_ucsc_query(genome, self.ucsc_cytoband_sql())):
chromosome = self.get_chromosome(genome, chromosomes, row[0])
models.CytoBand.objects.update_or_create(
label=row[3],
chromosome=models.Chromosome.objects.get(genome=genome, label=chromosome),
chromosome=chromosome,
defaults={
'start': row[1],
'end': row[2],
'stain': row[4],
}
)
logger.info('Syncing cytobands complete!')

def sync_genes(self, genome):
def sync_genes(self, genome, chromosomes):
logger.info('Syncing HGNC genes...')
logger.info('Downloading from HGNC...')
hgnc_data = retrieve_data(getattr(app_settings, 'HGNC_GENES'))
logger.info('Downloading complete!')
header = [text.upper() for text in hgnc_data.pop(0).strip().split('\t')]

for line in hgnc_data:
for line in tqdm(hgnc_data):
columns = line.split('\t')
symbol = columns[header.index('APPROVED SYMBOL')]
name = columns[header.index('APPROVED NAME')]
Expand All @@ -93,14 +117,7 @@ def sync_genes(self, genome):
not_curated_rat_genome_database = columns[
header.index('RAT GENOME DATABASE ID(SUPPLIED BY RGD)')]

try:
chromosome = models.Chromosome.objects.get(
genome=genome,
label=utils.reformat_chromosome(chromosome),
)
except ObjectDoesNotExist:
chromosome = None

chromosome = self.get_chromosome(genome, chromosomes, chromosome)
gene, created = models.Gene.objects.update_or_create(
symbol=symbol.upper(),
chromosome=chromosome,
Expand Down Expand Up @@ -133,30 +150,41 @@ def sync_genes(self, genome):
gene.synonyms.add(synonym)

gene.save()
logger.info('Syncing HGNC genes complete!')

def sync_transcripts(self, genome, sync_exons=False):
def sync_transcripts(self, genome, chromosomes, sync_exons=False):
message = 'Syncing RefSeq transcripts'
if sync_exons:
message += ' and exons'
logger.info('{0}...'.format(message))

for row in self.run_ucsc_query(genome, self.ucsc_refgene_sql()):
for row in tqdm(self.run_ucsc_query(genome, self.ucsc_refgene_sql())):
chromosome = self.get_chromosome(genome, chromosomes, row[10])
gene, created = models.Gene.objects.get_or_create(
chromosome__genome=genome,
symbol=row[0].upper(),
defaults={'status': getattr(choices.HGNC_GENE_STATUS, 'ucsc_gene')}
)
transcript, created = models.Transcript.objects.update_or_create(
label=row[1],
gene=gene,
defaults={
'strand': getattr(choices.STRAND_TYPES, row[2]),
'transcription_start': row[3],
'transcription_end': row[4],
'cds_start': row[5],
'cds_end': row[6],
'status': getattr(choices.HGNC_GENE_STATUS, 'ucsc_gene'),
'chromosome': chromosome,
}
)

try:
transcript, created = models.Transcript.objects.update_or_create(
label=row[1],
gene=gene,
defaults={
'strand': getattr(choices.STRAND_TYPES, row[2]),
'transcription_start': row[3],
'transcription_end': row[4],
'cds_start': row[5],
'cds_end': row[6],
}
)
except IntegrityError:
logger.warning('Transcript: {0} was skipped!'.format(row[1]))
continue

if sync_exons:
strand = row[2]
number_of_exons = row[7]
Expand All @@ -179,15 +207,18 @@ def sync_transcripts(self, genome, sync_exons=False):
'end': ends[index],
}
)
logger.info('{0} complete!'.format(message))

def run_ucsc_query(self, genome, query):
logger.info('Querying UCSC...')
build = genome.label
conn = self.ucsc_connection(build)
cursor = conn.cursor()
cursor.execute(query)
rows = cursor.fetchall()
cursor.close()
conn.close()
logger.info('Querying UCSC complete!')
return rows

@staticmethod
Expand Down Expand Up @@ -227,7 +258,8 @@ def ucsc_refgene_sql():
rg.cdsEnd,
rg.exonCount,
rg.exonStarts,
rg.exonEnds
rg.exonEnds,
rg.chrom
FROM hg19.refGene rg
INNER JOIN hgFixed.gbCdnaInfo gi ON rg.name=gi.acc;
"""
Expand Down
27 changes: 19 additions & 8 deletions genome/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
# -*- coding: utf-8
def reformat_chromosome(chromosome):
if 'p' in chromosome:
new_chrom = chromosome.split('p')[0].upper()
elif 'q' in chromosome:
new_chrom = chromosome.split('q')[0].upper()
else:
new_chrom = chromosome.replace('chr', '').upper()
chromosome = chromosome.replace('chr', '').replace('unplaced', '').upper().strip()
accepted = list(map(str, range(1, 23))) + ['X', 'Y', 'M', 'MT']

if chromosome not in accepted:
if all(x in chromosome for x in ['P']):
return chromosome.split('P')[0]
elif all(x in chromosome for x in ['Q']):
return chromosome.split('Q')[0]
else:
new_chromosome = chromosome.lower() \
.replace('alternate reference locus', '') \
.replace('mitochondria', 'M') \
.replace('not on reference assembly', '') \
.upper() \
.strip()

if new_chrom in list(map(str, range(1, 23))) or new_chrom in ['X', 'Y', 'M', 'MT']:
return new_chrom
if new_chromosome in accepted:
return new_chromosome
else:
return chromosome


def chromosome_number(chromosome):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ django-genomix==0.6.7
django-model-utils==3.1.2

mysqlclient==1.4.2.post1
tqdm==4.32.1
2 changes: 2 additions & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ twine==1.13.0
Sphinx==2.1.0
sphinx-rtd-theme==0.4.3
sphinxcontrib-napoleon==0.7

autopep8==1.4.4
6 changes: 6 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
('1p10.1', '1'),
('1q10.1', '1'),
('dasfa', None),
('12 alternate reference locus', '12'),
('mitochondria', 'M'),
('13 not on reference assembly', '13'),
('15 unplaced', '15'),
('4q27', '4'),
('5q31', '5'),
])
def test_reformat_chromosome(chromosome, expected):
assert utils.reformat_chromosome(chromosome) == expected
Expand Down

0 comments on commit 057d0b6

Please sign in to comment.