Daily build

scikal · Aug 29, 2021 · d02ffd7 · d02ffd7
1 parent 545aa86
commit d02ffd7
Show file tree

Hide file tree

Showing 8 changed files with 610 additions and 315 deletions.
diff --git a/ANEUPLOIDY_TEST.py b/ANEUPLOIDY_TEST.py
diff --git a/DISTANT_ADMIXTURE_MODELS.py b/DISTANT_ADMIXTURE_MODELS.py
@@ -2,11 +2,11 @@
 # -*- coding: utf-8 -*-
 
 """
-COMPLEX_ADMIXTURE_MODELS
+DISTANT_ADMIXTURE_MODELS
 
 Given reads that originated from the same genomic window and a reference panel
 of two populations, the likelihood of observed reads under four scenarios,
-namely, monosomy, disomy, SPH and BPH is calculated. This module is for complex
+namely, monosomy, disomy, SPH and BPH is calculated. This module is for distant
 admixtures, where each descendant haplotype has a certain probability to
 originate from one of two ancestral populations.
 
@@ -18,7 +18,7 @@
 Aug 10, 2021
 """
 
-import pickle, os, sys, bz2, collections
+import pickle, os, sys, bz2, collections, gzip
 
 from functools import reduce
 from operator import and_, itemgetter
@@ -37,12 +37,11 @@ def popcount(x):
         """ Counts non-zero bits in positive integer. """
         return bin(x).count('1')
 
-class complex_admixture:
-    """ Based on two IMPUTE2 arrays, which contain the legend and haplotypes,
-    and a dictionary with statisitcal models (models_dict), it allows to
-    calculate the likelihoods of observed alleles under various statistical
-    models (monosomy, disomy, SPH and BPH). """
-
+class distant_admixture:
+    """ Based on the statistical models (models_dict) and the reference panel
+    (leg_tab, hap_tab and sam_tab), this class calculates the likelihoods of
+    observed alleles under various statistical models (monosomy, disomy, SPH
+    and BPH). """
 
     def __init__(self, obs_tab, leg_tab, hap_tab, sam_tab, models_dict, total_number_of_haplotypes, admixture):
         """ Initialize the attributes of the class. """
@@ -101,7 +100,7 @@ def build_hap_dict(self, obs_tab, leg_tab, hap_tab):
 
         fraction_of_matches = 1-mismatches/len(obs_tab)
 
-        print('Algorithm for complex admixtures: %.2f%% of the observed alleles matched the reference panel.' % (100*fraction_of_matches))
+        print('Algorithm for distant admixtures: %.2f%% of the observed alleles matched the reference panel.' % (100*fraction_of_matches))
 
         return hap_dict, fraction_of_matches
 
@@ -288,34 +287,42 @@ def get_likelihoods(self, *x):
             result = self.likelihoods(*x)
         return result
 
-def wrapper_of_complex_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sample_filename,models_filename,admixture):
-    """ Wrapper function of the class complex_admixture. It receives an observations
-    file, IMPUTE2 legend file, IMPUTE2 haplotypes file, IMPUTE2 samples file,
-    and a file with four statistical models. Based on the given data it creates
-    and returns an instance of the class. """
-
-    from MAKE_OBS_TAB import read_impute2
+def wrapper_of_distant_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sample_filename,models_filename,admixture):
+    """ Wrapper function of the class 'distant_admixture'. It receives an
+    observations file, legend file, haplotypes file, samples file and a file
+    with the statistical models. Based on the given data it creates and returns
+    an instance of the class. """
 
     if not os.path.isfile(obs_filename): raise Exception('Error: OBS file does not exist.')
     if not os.path.isfile(leg_filename): raise Exception('Error: LEGEND file does not exist.')
     if not os.path.isfile(hap_filename): raise Exception('Error: HAP file does not exist.')
     if not os.path.isfile(sample_filename): raise Exception('Error: SAMPLE file does not exist.')
     if not os.path.isfile(models_filename): raise Exception('Error: MODELS file does not exist.')
 
-    leg_tab = read_impute2(leg_filename, filetype='leg')
-    hap_tab, total_number_of_haplotypes = read_impute2(hap_filename, filetype='hap')
-    sam_tab  = read_impute2(sample_filename, filetype='sam')
+    load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open)  #Adjusts the opening method according to the file extension.
+
+    open_hap = load(hap_filename)
+    with open_hap(hap_filename,'rb') as hap_in:
+        hap_tab, total_number_of_haplotypes = pickle.load(hap_in)
+
+    open_leg = load(leg_filename)
+    with open_leg(leg_filename,'rb') as leg_in:
+        leg_tab = pickle.load(leg_in)
+
+    open_samp = load(sample_filename)
+    with open_samp(sample_filename,'rb') as samp_in:
+        sam_tab = pickle.load(samp_in)
 
-    load_obs = bz2.BZ2File if obs_filename[-6:]=='.p.bz2' else open
-    with load_obs(obs_filename, 'rb') as f:
-        obs_tab = pickle.load(f)
+    open_obs = load(obs_filename)
+    with open_obs(obs_filename, 'rb') as obs_in:
+        obs_tab = pickle.load(obs_in)
         #info = pickle.load(f)
 
-    load_model = bz2.BZ2File if models_filename[-6:]=='.p.bz2' else open
-    with load_model(models_filename, 'rb') as f:
-        models_dict = pickle.load(f)
+    open_model = load(models_filename)
+    with open_model(models_filename, 'rb') as model_in:
+        models_dict = pickle.load(model_in)
 
-    return complex_admixture(obs_tab, leg_tab, hap_tab, sam_tab, models_dict, total_number_of_haplotypes, admixture)
+    return distant_admixture(obs_tab, leg_tab, hap_tab, sam_tab, models_dict, total_number_of_haplotypes, admixture)
 
 if __name__ != "__main__":
     print('The module COMPLEX_ADMIXTURE_MODELS was imported.')
@@ -341,7 +348,7 @@ def wrapper_of_complex_admixture_for_debugging(obs_filename,leg_filename,hap_fil
     models_filename = 'MODELS/MODELS16.p'
     admixture = admix_tuple('EUR',0.8)
 
-    A = wrapper_of_complex_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sam_filename,models_filename,admixture)
+    A = wrapper_of_distant_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sam_filename,models_filename,admixture)
 
     alleles = tuple(A.hap_dict.keys())
 

diff --git a/EXTRACT_GENOTYPES.py b/EXTRACT_GENOTYPES.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+EXTRACT_GENOTYPES
+
+Simulates an observation table (obs_tab) of a haploid, using phased genotypes from an LD-PGTA reference panel.
+
+Daniel Ariad (daniel@ariad.org)
+Jan 13, 2021
+"""
+import pickle, os, sys, time, argparse, random, gzip, collections
+
+leg_tuple = collections.namedtuple('leg_tuple', ('chr_id', 'pos', 'ref', 'alt')) #Encodes the rows of the legend table
+sam_tuple = collections.namedtuple('sam_tuple', ('sample_id', 'group1', 'group2', 'sex')) #Encodes the rows of the samples table
+obs_tuple = collections.namedtuple('obs_tuple', ('pos', 'read_id', 'base')) #Encodes the rows of the observations table
+
+def get_haplotypes(sample_filename, hap_filename, sample_id):
+    """ Extracts haplotypes that correspond to a specific sample ID. """
+
+    with gzip.open(sample_filename, 'rb') as sam_in:
+        SAM = pickle.load(sam_in)
+
+    samples = [s.sample_id for s in SAM]
+
+    if sample_id in samples:
+        ind = samples[::-1].index(sample_id)
+    else:
+        raise Exception('Error: sample_id not found.')
+
+    a = -2*(ind+1)
+    b = None if ind==0 else -2*(ind+1)+2
+
+    #print(samples[-(ind+1)])
+    string2tuple = {'00': (0,0), '01': (0,1), '10': (1,0), '11': (1,1), '': (0,0), '0': (0,0), '1': (0,1)}
+    with gzip.open(hap_filename,'rb') as hap_in:
+        hap_tab, number_of_haplotypes = pickle.load(hap_in)
+    result = [string2tuple[bin(h)[2:][a:b]] for h in hap_tab]
+
+    return result
+
+def extract(leg_filename,hap_filename,samp_filename,chr_id,sample_id,**kwargs):
+    """ Builds observation tables of effective haploids by extracting
+        phased genotypes from an LD-PGTA reference panel. """
+
+    a = time.time()
+    random.seed(None,version=2)
+
+    genotypes = kwargs.get('genotypes', 'AB')
+
+    output_dir = kwargs.get('output_dir', '')
+    if output_dir!='' and not os.path.exists(output_dir): os.makedirs(output_dir)
+    output_dir += '/' if output_dir[-1:]!='/' else ''
+
+    haplotypes = get_haplotypes(samp_filename, hap_filename, sample_id)
+
+    with gzip.open(leg_filename,'rb') as leg_in:
+        legend = pickle.load(leg_in)
+
+    info = {'chr_id': chr_id,
+            'depth': 1,
+            'read_length': 1,
+            'sample_id': sample_id}
+
+    if genotypes in ('A','AB'):
+        obs_tab1 = tuple(obs_tuple(pos, 'XXX', alt if allele1 else ref)
+                             for (chrID,pos,ref,alt),(allele1,allele2) in zip(legend,haplotypes)
+                                 if chr_id==chrID)
+
+        with open(output_dir+sample_id+'A.%s.hg38.obs.p' % chr_id, 'wb') as binfile:
+            info1 = {**info, 'haplotype': 'A'}
+            pickle.dump(obs_tab1, binfile, protocol=4)
+            pickle.dump(info1 , binfile, protocol=4)
+
+    if genotypes in ('B','AB'):
+        obs_tab2 = tuple(obs_tuple(pos, 'XXX', alt if allele2 else ref)
+                            for (chrID,pos,ref,alt),(allele1,allele2) in zip(legend,haplotypes)
+                                if chr_id==chrID)
+
+        with open(output_dir+sample_id+'B.%s.hg38.obs.p' % chr_id, 'wb') as binfile:
+            info2 = {**info, 'haplotype': 'B'}
+            pickle.dump(obs_tab2, binfile, protocol=4)
+            pickle.dump(info2, binfile, protocol=4)
+
+    b = time.time()
+    print('Done in %.3f sec.' % ((b-a)))
+
+    return 0
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser( description='Simulates two observation tables of haploids, using phased genotypes from a LD-PGTA reference panel. ')
+
+    parser.add_argument('leg_filename', metavar='legend_filename', type=str,
+                        help='IMPUTE2 legend file')
+    parser.add_argument('hap_filename', metavar='haplotypes_filename', type=str,
+                        help='IMPUTE2 haplotypes file')
+    parser.add_argument('samp_filename', metavar='samples_filename', type=str,
+                        help='IMPUTE2 samples file')
+    parser.add_argument('chr_id', metavar='chromosomeID', type=str,
+                        help='Chromosome ID')
+    parser.add_argument('sample_id', metavar='sampleID', type=str,
+                        help='Sample ID')
+    parser.add_argument('-g', '--genotypes', metavar='A/B/AB', type=str, default='AB',
+                        help='Which of the individual\'s haplotypes should be used. For each specified haplotype, one haploid would be genereated. Default is both (AB).')
+
+
+    args = parser.parse_args()
+    sys.exit(extract(**vars(args)))
+
+
+def test():
+    sample_id = 'HG00097'
+    chr_id = 'chr21'
+    leg_filename = f'EUR_panel.hg38/{chr_id:s}_EUR_panel.legend.gz'
+    hap_filename = f'EUR_panel.hg38/{chr_id:s}_EUR_panel.hap.gz'
+    samp_filename = 'EUR_panel.hg38/EUR_panel.samples.gz'
+
+    work_dir='results_TEMP'
+    return extract(leg_filename,hap_filename,samp_filename,chr_id,sample_id,output_dir=work_dir)