diff --git a/apobec/count_cg_to_ta.py b/apobec/count_cg_to_ta.py
new file mode 100644
index 0000000..29a984f
--- /dev/null
+++ b/apobec/count_cg_to_ta.py
@@ -0,0 +1,419 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jan 10 13:20:05 2019
+
+@author: babin
+"""
+# TODO redo the comment according to c > t and g > a change
+# TODO redo all the comments
+""" count_snp_duplex.py
+this script count exact number of SNPs and their percentage according
+to the dinucleotide context in "fasta" files
+It writes results into  excel spreadsheets.
+the script compares each sequence with the reference
+and counts SNPs in the context of duplexes
+it takes 'fasta' file imported from Geneious as an input
+'fasta' file must be imported from Geneious with the following parameters:
+    - wrap sequence lines every 80 chars: No
+    - replace spaces in sequence names with undescores: Yes
+    - include sequence description: Yes
+    - export sequence in: upper case
+    - export missing ends of alignments as: Ns
+IMPORTANT NOTES:
+    - first record in fasta file must be your reference sequence
+    - reference sequence must start and end with nucleotides, not gaps
+    f.e.: 'A-A---A---A' is the right string, 
+    but: '----A---AAAA---' is not valid
+    if reference sequence starts or ends with gap, the scrips will throw: 
+    'ValueError: Length mismatch: Expected axis has N elements, 
+    new values have n elements'. This is the pandas exception concerning indices.
+NOTE: to speed up the exection instead of 'SeqIO.parse()
+low level fasta parser 'SimpleFastaParser' is used 
+"""
+
+import datetime
+import os
+import sys
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from Bio.SeqIO.FastaIO import SimpleFastaParser  # low level fast fasta parser
+from ipywidgets import IntProgress
+from IPython.display import display
+from time import time
+from loguru import logger
+sns.set()
+
+
+def get_ref_name(in_file):
+    """deriving the first record 
+    from the input fasta
+    """
+
+    with open("./input_data/" + in_file) as in_handle:
+        for record in SimpleFastaParser(in_handle):
+            # sequence is under the 2nd index in the record's tuple
+            ref_name = record[0]
+            break
+
+    return ref_name
+
+def get_input_files_names(path_to_input):
+    """returns a list of input fasta files
+    """
+    fasta_extensions = ["fasta", "fa", "fas"]
+    input_files = os.listdir(path_to_input)
+    input_files = [f for f in input_files if f.rsplit(
+        ".", 1)[-1] in fasta_extensions]
+
+    return input_files
+
+
+def get_ref(in_file):
+    """deriving the first record 
+    from the input fasta
+    """
+
+    with open("./input_data/" + in_file) as in_handle:
+        for record in SimpleFastaParser(in_handle):
+            # sequence is under the 2nd index in the record's tuple
+            ref_seq = record[1]
+            break
+
+    return ref_seq
+
+
+def get_cg_posits_in_ref(ref_seq):
+    """
+    """
+
+    d = {'C_pos': [], 'G_pos': []}
+
+    for nuc_index in range(0, len(ref_seq)):
+        if ref_seq[nuc_index] == "C":
+            d["C_pos"].append(nuc_index)
+
+        elif ref_seq[nuc_index] == "G":
+            d["G_pos"].append(nuc_index)
+
+    return d
+
+
+def get_num_of_reads_in_file(in_file):
+    """
+    """
+    reads_counter = 0
+    with open("./input_data/" + in_file) as in_handle:
+        for record in SimpleFastaParser(in_handle):
+            reads_counter += 1
+
+    return reads_counter - 1
+
+def count_num_of_c_and_g_in_ref(cg_posits):
+    """
+    """
+    num_c_in_ref = len(cg_posits["C_pos"])
+    num_g_in_ref = len(cg_posits["G_pos"])
+
+    return num_c_in_ref, num_g_in_ref
+
+
+def count_total_c_and_g_in_reads(ref_name, cg_posits, in_file):
+
+    c_counter = 0
+    g_counter = 0
+    nucs = ["A", "T", "G", "C"]
+
+    with open("./input_data/" + in_file) as in_handle:
+        for record in SimpleFastaParser(in_handle):
+           
+            if record[0] == ref_name:
+                continue
+            for c_index in cg_posits["C_pos"]:
+                if record[1][c_index] in nucs:
+                    c_counter += 1
+
+            for g_index in cg_posits["G_pos"]:
+                if record[1][g_index] in nucs:
+                    g_counter += 1
+
+    return c_counter, g_counter
+
+
+#def count_total_c_and_g_in_reads(num_c_in_ref,
+#                                num_g_in_ref,
+ #                               num_reads_in_file):
+ #   """
+#    """
+#    total_num_c_in_reads = num_c_in_ref * num_reads_in_file
+ #   total_num_g_in_reads = num_g_in_ref * num_reads_in_file
+#
+#    return total_num_c_in_reads, total_num_g_in_reads
+
+
+def count_c_t_change(f, cg_posits):
+    """
+    """
+    c_t_change_counter = 0
+    with open("./input_data/" + f) as in_handle:
+        for record in SimpleFastaParser(in_handle):
+            for c_index in cg_posits["C_pos"]:
+
+                if record[1][c_index] == "T":
+                    c_t_change_counter += 1
+
+    return c_t_change_counter 
+
+
+
+def count_g_a_change(f, cg_posits):
+    """
+    """
+    g_a_change_counter = 0
+    with open("./input_data/" + f) as in_handle:
+        for record in SimpleFastaParser(in_handle):
+            for g_index in cg_posits["G_pos"]:
+
+                if record[1][g_index] == "A":
+                    g_a_change_counter += 1
+
+    return g_a_change_counter 
+
+
+def calculate_percent_c_t_change(num_c_t_change_in_reads, 
+                                    total_num_c_in_reads, 
+                                    num_c_in_ref):
+    """
+    """
+    if num_c_in_ref: # check if there are Cs nucs in ref
+        percent_c_t_change = (num_c_t_change_in_reads/total_num_c_in_reads) * 100
+    else:
+        percent_c_t_change = "no C in reference"
+
+    return percent_c_t_change
+
+
+def calculate_percent_g_a_change(num_g_a_change_in_reads, 
+                                    total_num_g_in_reads,
+                                    num_g_in_ref):
+    """
+    """
+    if num_g_in_ref: # check if there are Gs nucs in ref
+        percent_g_a_change = (num_g_a_change_in_reads/total_num_g_in_reads) * 100
+    else:
+        percent_g_a_change = "no G in reference"
+
+    return percent_g_a_change
+
+
+def calculate_c_t_and_g_a_change_percent(total_num_c_in_reads,
+                                            total_num_g_in_reads,
+                                            num_c_t_change_in_reads,
+                                            num_g_a_change_in_reads):
+    """
+    """                                        
+    total_gc = total_num_c_in_reads + total_num_g_in_reads
+    total_cg_ta_change_in_reads = num_c_t_change_in_reads + num_g_a_change_in_reads
+
+    c_t_and_g_a_change_percent = (total_cg_ta_change_in_reads/total_gc) * 100
+
+    return c_t_and_g_a_change_percent
+
+
+def create_df(num_reads_in_file, 
+                total_num_c_in_reads, 
+                total_num_g_in_reads, 
+                num_c_t_change_in_reads,
+                num_g_a_change_in_reads, 
+                percent_c_t_change, 
+                percent_g_a_change,
+                c_t_and_g_a_change_percent):
+    """
+    """
+
+    data = {"number of reads" : [num_reads_in_file],
+            "C in reads count" : [total_num_c_in_reads],
+            "G in reads count" : [total_num_g_in_reads],
+            "C>T change count" : [num_c_t_change_in_reads],
+            "G>A change count" : [num_g_a_change_in_reads],
+            "C>T change percent" : [percent_c_t_change],
+            "G>A change percent" : [percent_g_a_change],
+            "C>T + G>A change percent" : [c_t_and_g_a_change_percent]}
+
+    df = pd.DataFrame(data)
+
+    return df
+
+
+def save_df(f, df):
+    """saves dataframes into an excel spreadsheet
+    """
+    if not os.path.exists("./output_count_cg_to_ta_change"):
+        os.mkdir("./output_count_cg_to_ta_change")
+
+    file_name = f.rsplit(".", 1)[0]
+    writer = pd.ExcelWriter("./output_count_cg_to_ta_change/" + file_name + '.xlsx')
+
+    df.to_excel(writer, "cg_to_ta_change")
+
+    writer.save()
+
+
+def get_current_time():
+    """just returns time stamp
+    """
+    time_stamp = datetime.datetime.fromtimestamp(
+        time()).strftime('%Y-%m-%d %H:%M:%S')
+
+    return time_stamp
+
+
+def show_report(total_time, total_files):
+    """prints out  brief report 
+    """
+
+    hours = total_time // 3600
+    minutes = (total_time % 3600) // 60
+    seconds = total_time % 60
+
+    print("""
+    
+    file processed: {0}
+    time taken: {1} hours {2} minutes {3} seconds
+    
+    the results are in the folder 'output_apobec'
+    
+    ---------------
+    ... job finished at {4}
+    ---------------
+    
+    """.format(total_files, hours, minutes, int(seconds), get_current_time()))
+
+
+
+
+#TODO put messages into separate file, into a dict, for example
+# and import it to just call the items to show them
+# so as to make main function not so large
+
+# TODO delete all the try inside the main
+# or just test them prior to it
+
+
+
+
+def main():
+
+    start_time = time()
+    file_counter = 0
+    #############################
+    # logging
+    # TODO make function for logging
+    if not os.path.exists("./log"):
+        os.mkdir("./log")
+    logger.remove() # don't put messages into notebook output
+    logger.add("./log/apobec_count_cg_to_ta_log_{time}" + "_" + ".txt", backtrace=False)
+    logger.add(sys.stderr, level="CRITICAL")
+    ##############################
+    
+
+    # TODO: make function of if/else
+    if os.path.exists("./input_data"): #TODO make if not and then return
+    # to get rid of else block below
+        
+        input_files = get_input_files_names("./input_data")
+        num_files = len(input_files)
+        
+        #############################
+        # a progress bar 
+        print("""
+               ---------------
+               job started at {0} ...
+               ---------------
+               """.format(get_current_time()))
+
+        progress_bar = IntProgress(min=0, max=num_files, bar_style='success')
+        display(progress_bar)
+        ##################################
+        
+
+        for f in input_files:
+            try:
+                ref_seq = get_ref(f) 
+                ref_name = get_ref_name(f)
+                cg_posits = get_cg_posits_in_ref(ref_seq)
+                num_reads_in_file = get_num_of_reads_in_file(f)
+                num_c_in_ref, num_g_in_ref = count_num_of_c_and_g_in_ref(cg_posits)
+                total_num_c_in_reads, total_num_g_in_reads = count_total_c_and_g_in_reads(ref_name, cg_posits, f)
+
+                # C>T and G>A change                                                                                        
+                num_c_t_change_in_reads = count_c_t_change(f, cg_posits)
+                num_g_a_change_in_reads = count_g_a_change(f, cg_posits)
+                
+                percent_c_t_change = calculate_percent_c_t_change(num_c_t_change_in_reads, 
+                                                                        total_num_c_in_reads, 
+                                                                        num_c_in_ref)
+               
+                percent_g_a_change = calculate_percent_g_a_change(num_g_a_change_in_reads, 
+                                                                        total_num_g_in_reads,
+                                                                        num_g_in_ref)
+    
+                c_t_and_g_a_change_percent = calculate_c_t_and_g_a_change_percent(total_num_c_in_reads,
+                                                                                    total_num_g_in_reads,
+                                                                                    num_c_t_change_in_reads,
+                                                                                    num_g_a_change_in_reads)
+
+                df = create_df(num_reads_in_file, 
+                                total_num_c_in_reads, 
+                                total_num_g_in_reads, 
+                                num_c_t_change_in_reads,
+                                num_g_a_change_in_reads, 
+                                percent_c_t_change, 
+                                percent_g_a_change,
+                                c_t_and_g_a_change_percent)
+
+                save_df(f, df)
+
+                
+                
+               
+                #df_cov = pd.DataFrame.from_dict(
+                #    {"coverage": coverage}, orient='index')
+                
+        
+                
+                ##############################
+                # again progress bar
+                progress_bar.value += 1
+                file_counter += 1
+            
+            except Exception:
+                logger.info(
+                    """\n\t==================================================
+                    file: {0} 
+                    sequence id: {1}
+                    ====================================================""".format(f, record_id))
+                logger.exception("")
+                print("exception detected. see log for more details")
+                progress_bar.value += 1                
+
+        finish_time = time()
+        total_time = finish_time - start_time
+        show_report(total_time, file_counter)
+
+    else:
+        os.mkdir("./input_data")
+        print(
+            """
+        Houston, we have a problem...
+        --------
+        folder 'input_data' doesn't exist in the current directory 
+        or may be you've created it but misspelled its name.
+        Anyway, it has just been created by this script.
+        Paste your 'fasta' files into the folder 'input_data' 
+        and run this script again.
+        --------
+        """
+        )
+if __name__ == "__main__":
+    main()