Skip to content

Commit

Permalink
Add script to update taxon IDs with new mappings
Browse files Browse the repository at this point in the history
If species_strain_map is updated, existing genes that have the taxon
ID of a strain aren't automatically updated to the organism-level
taxon ID.

This script updates the out-of-date taxon IDs of genes in all sessions
and also updates genes in the TrackDB gene cache with new mappings.

Refs #2831
  • Loading branch information
kimrutherford committed Jun 10, 2024
1 parent fe94441 commit 1d26537
Showing 1 changed file with 144 additions and 0 deletions.
144 changes: 144 additions & 0 deletions etc/reapply_species_strain_map.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/perl -w

# Look up the taxon ID associated with each gene in all CursDBs using
# Config::get_species_taxon_of_strain_taxon().
# If the lookup returns a result, change the Organism of the gene
# to the Organism that has the returned taxon ID.
# Then also update the organism of the corresponding genes in the TrackDB.

###########################
# START OF BOILERPLATE CODE

use strict;
use warnings;
use Carp;
use feature ':5.10';

use File::Basename;

BEGIN {
  # Name of this script without any directory part
  my $script_name = basename $0;

  # If a file with this script's name is in the current directory and the
  # parent directory contains an "etc" directory, we were started from
  # inside a sub-directory of the application root - go up one level so
  # that relative paths such as "lib" resolve from the root.
  if (-f $script_name && -d "../etc") {
    # Fail loudly: carrying on in the wrong directory would make the
    # module loading that follows fail in a confusing way.
    chdir ".." or die "can't change directory to '..': $!\n";
  }
}

use lib qw(lib);

use Canto::Config;
use Canto::TrackDB;
use Canto::Track;
use Canto::Track::LoadUtil;
use Canto::Meta::Util;

# Name of the application, needed for the initialisation check below
my $app_name = Canto::Config::get_application_name();

# Use "deploy" as the local configuration suffix unless the environment
# already provides a (true) value
if (!$ENV{CANTO_CONFIG_LOCAL_SUFFIX}) {
  $ENV{CANTO_CONFIG_LOCAL_SUFFIX} = 'deploy';
}

my $suffix = $ENV{CANTO_CONFIG_LOCAL_SUFFIX};

# Refuse to run against an application that hasn't been set up yet
unless (Canto::Meta::Util::app_initialised($app_name, $suffix)) {
  die "The application is not yet initialised, try running the canto_start script\n";
}

my $config = Canto::Config::get_config();

# Two separate TrackDB schema objects are created from the same
# configuration
my $schema = Canto::TrackDB->new(config => $config);
my $track_schema = Canto::TrackDB->new(config => $config);

# END OF BOILERPLATE CODE
#########################


# A collection of IDs of genes that need to have their organisms
# updated in the TrackDB.  We populate this map while iterating over
# the CursDBs.  Keys are gene primary identifiers, values are always 1.
my %genes_to_update = ();


# Callback passed to Canto::Track::curs_map().  For one CursDB it:
#   - finds every Organism whose taxon ID maps to a different taxon ID
#     via $config->get_species_taxon_of_strain_taxon()
#   - finds or creates the Organism for the mapped taxon ID
#   - re-points each Gene of an out-of-date Organism to the new Organism
#   - records the primary identifier of each changed gene in
#     %genes_to_update
# Arguments: the curs object, the CursDB schema and the TrackDB schema;
# only the CursDB schema is used.
my $proc = sub {
  my (undef, $curs_schema) = @_;

  my $organism_rs = $curs_schema->resultset('Organism');

  # orig taxon ID to new Organism object map
  my %taxon_map = ();

  # First find all Organisms in this session that need updating,
  # capturing the replacements into %taxon_map.  All rows are fetched
  # before the loop starts because find_or_create() may insert into the
  # Organism table; inserting while a cursor on the same table is still
  # open makes it unpredictable whether the new rows are visited.
  for my $organism ($organism_rs->all()) {
    my $orig_taxonid = $organism->taxonid();
    my $lookup_taxonid =
      $config->get_species_taxon_of_strain_taxon($orig_taxonid);

    # no mapping, or already mapped to itself: nothing to do
    next if !defined $lookup_taxonid || $orig_taxonid == $lookup_taxonid;

    $taxon_map{$orig_taxonid} =
      $curs_schema->resultset('Organism')
        ->find_or_create({ taxonid => $lookup_taxonid });
  }

  # no out-of-date organisms means no gene in this session can change
  return if !%taxon_map;

  # As above, read all genes before changing any of them so that no
  # cursor is open on the Gene table while its rows are updated.
  my @genes = $curs_schema->resultset('Gene')
    ->search({}, { prefetch => 'organism' })
    ->all();

  # Iterate over genes and update the Organism based on %taxon_map
  for my $gene (@genes) {
    my $gene_taxonid = $gene->organism()->taxonid();

    my $new_org = $taxon_map{$gene_taxonid};

    next if !defined $new_org;

    # remember the gene so that the TrackDB copy can be updated later
    $genes_to_update{$gene->primary_identifier()} = 1;

    print "updating ", $gene->primary_identifier(), " in CursDB\n";
    print " $gene_taxonid -> ", $new_org->taxonid(), "\n";
    $gene->organism($new_org);
    $gene->update();
  }

  return;
};

# Helper used to look up organisms in the TrackDB.  It is given
# $track_schema, the schema object that runs the transaction below, so
# that the lookups use the same connection as the updates.
my $load_util = Canto::Track::LoadUtil->new(schema => $track_schema);


# Body of the TrackDB transaction:
#   1. run $proc via Canto::Track::curs_map(), which fills
#      %genes_to_update
#   2. for each recorded gene that exists in the TrackDB, read the
#      "taxon_id" property of its organism, map it with
#      $config->get_species_taxon_of_strain_taxon() and, if it maps to a
#      different taxon ID, point the gene at the organism with that ID
# Dies if a gene's organism has no "taxon_id" property or if the organism
# for the mapped taxon ID can't be found.
# NOTE(review): the transaction is on $track_schema only; whether changes
# made to the CursDBs by $proc are undone on failure is not visible here.
my $txn_proc = sub {
  # iterate over CursDBs
  Canto::Track::curs_map($config, $track_schema, $proc);

  # update organisms of genes in the TrackDB; sorted so that the output
  # order is the same from run to run
  GENE:
  for my $gene_primary_identifier (sort keys %genes_to_update) {
    my $gene = $track_schema->resultset('Gene')
      ->find({ primary_identifier => $gene_primary_identifier },
             { prefetch => 'organism' });

    # genes that aren't in the TrackDB need no update
    next GENE if !defined $gene;

    my $props_rs = $gene->organism()->organismprops()
      ->search({}, { prefetch => 'type' });

    # the current taxon ID is stored as the "taxon_id" organism property
    my $orig_taxonid;
    while (defined (my $prop = $props_rs->next())) {
      if ($prop->type()->name() eq 'taxon_id') {
        $orig_taxonid = $prop->value();
        last;
      }
    }

    if (!defined $orig_taxonid) {
      die "internal error: can't find taxon ID for $gene_primary_identifier\n";
    }

    my $new_taxonid =
      $config->get_species_taxon_of_strain_taxon($orig_taxonid);

    # The TrackDB copy of the gene may already have an up-to-date
    # organism even though a CursDB copy didn't; without this check an
    # undefined taxon ID would be used for the lookup below.
    if (!defined $new_taxonid || $new_taxonid == $orig_taxonid) {
      next GENE;
    }

    # assumes find_organism_by_taxonid() returns undef when there is no
    # matching organism - TODO confirm
    my $new_organism =
      $load_util->find_organism_by_taxonid($new_taxonid);

    # never assign an undefined organism to the gene
    if (!defined $new_organism) {
      die "can't find an organism with taxon ID $new_taxonid in the " .
        "TrackDB, needed to update $gene_primary_identifier\n";
    }

    print "updating ", $gene->primary_identifier(), " in TrackDB\n";
    $gene->organism($new_organism);
    $gene->update();
  }

  return;
};

# Run the CursDB iteration and the TrackDB updates inside one transaction
# on the TrackDB schema
$track_schema->txn_do($txn_proc);

exit 0;

0 comments on commit 1d26537

Please sign in to comment.