# utils.py

import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import os
from tqdm import tqdm
import sys
import datetime


class Flatten(nn.Module):
    """Layer that collapses every dimension after the first into a single one."""

    def forward(self, input):
        # Keep the leading dimension as-is and let reshape infer the remainder.
        leading = input.shape[0]
        flattened = input.reshape(leading, -1)
        return flattened


class PrintLayerShape(nn.Module):
    """Pass-through debugging layer that prints the shape of its input."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # The tensor itself is returned untouched; only its shape is reported.
        shape = x.shape
        print(shape)
        return x


# One-hot vectors of length 21: each symbol's vector has a single 1 at the
# symbol's position in the string below. 'X' (last slot) is the placeholder
# for "no amino acid".
one_hot = {
    symbol: np.array(tuple(int(slot == position) for slot in range(21)))
    for position, symbol in enumerate('ARNDCQEGHILKMFPSTWYVX')
}

#  https://github.com/gifford-lab/DeepLigand/blob/master/data/onehot_first20BLOSUM50
# Every vector has 40 entries: a 20-slot one-hot identity (the slot is the
# residue's position in the table below) followed by 20 integer scores
# (presumably the residue's BLOSUM50 row, going by the name of the file above).
onehot_Blosum50 = {
    residue: np.array(tuple(int(slot == position) for slot in range(20)) + scores)
    for position, (residue, scores) in enumerate((
        ('I', (-1, -4, -3, -4, -2, -3, -4, -4, -4, 5, 2, -3, 2, 0, -3, -3, -1, -3, -1, 4)),
        ('L', (-2, -3, -4, -4, -2, -2, -3, -4, -3, 2, 5, -3, 3, 1, -4, -3, -1, -2, -1, 1)),
        ('V', (0, -3, -3, -4, -1, -3, -3, -4, -4, 4, 1, -3, 1, -1, -3, -2, 0, -3, -1, 5)),
        ('F', (-3, -3, -4, -5, -2, -4, -3, -4, -1, 0, 1, -4, 0, 8, -4, -3, -2, 1, 4, -1)),
        ('M', (-1, -2, -2, -4, -2, 0, -2, -3, -1, 2, 3, -2, 7, 0, -3, -2, -1, -1, 0, 1)),
        ('C', (-1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1)),
        ('A', (5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -2, -1, -1, -3, -1, 1, 0, -3, -2, 0)),
        ('G', (0, -3, 0, -1, -3, -2, -3, 8, -2, -4, -4, -2, -3, -4, -2, 0, -2, -3, -3, -4)),
        ('P', (-1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, -4, 10, -1, -1, -4, -3, -3)),
        ('T', (0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 2, 5, -3, -2, 0)),
        ('S', (1, -1, 1, 0, -1, 0, -1, 0, -1, -3, -3, 0, -2, -3, -1, 5, 2, -4, -2, -2)),
        ('Y', (-2, -1, -2, -3, -3, -1, -2, -3, 2, -1, -1, -2, 0, 4, -3, -2, -2, 2, 8, -1)),
        ('W', (-3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1, 1, -4, -4, -3, 15, 2, -3)),
        ('Q', (-1, 1, 0, 0, -3, 7, 2, -2, 1, -3, -2, 2, 0, -4, -1, 0, -1, -1, -1, -3)),
        ('N', (-1, -1, 7, 2, -2, 0, 0, 0, 1, -3, -4, 0, -2, -4, -2, 1, 0, -4, -2, -3)),
        ('H', (-2, 0, 1, -1, -3, 1, 0, -2, 10, -4, -3, 0, -1, -1, -2, -1, -2, -3, 2, -4)),
        ('E', (-1, 0, 0, 2, -3, 2, 6, -3, 0, -4, -3, 1, -2, -3, -1, -1, -1, -3, -2, -3)),
        ('D', (-2, -2, 2, 8, -4, 0, 2, -1, -1, -4, -4, -1, -4, -5, -1, 0, -1, -5, -3, -4)),
        ('K', (-1, 3, 0, -1, -3, 2, 1, -2, 0, -3, -3, 6, -2, -4, -1, 0, -1, -3, -2, -3)),
        ('R', (-2, 7, -1, -2, -4, 1, 0, -3, 0, -4, -3, 3, -2, -3, -3, -1, -1, -3, -1, -3)),
    ))
}
# 'X' spreads the identity half uniformly (0.05 in each of the 20 slots), which
# makes this the only floating-point vector in the table.
onehot_Blosum50['X'] = np.array(
    (0.05,) * 20 + (-1, -1, -1, -1, -2, -1, -1, -2, -1, -1, -1, -1, -1, -2, -2, -1, 0, -3, -1, -1))
# 'J' has an all-zero identity half and -5 in every score slot.
onehot_Blosum50['J'] = np.array((0,) * 20 + (-5,) * 20)


def one_hot_encoding(list, encoding_dict, max_len):
    """Encode a sequence symbol by symbol and pad it to ``max_len`` rows.

    Intended to be used with ``Series.apply`` on a column of sequences.

    Args:
        list: Iterable of symbols (typically a string). Symbols are
            upper-cased before the lookup because the preprocessed data still
            contains some lower-case letters.
        encoding_dict: Mapping from upper-case symbol to its encoding vector.
            It must contain ``'X'`` whenever padding or a substitution for an
            unknown symbol is needed.
        max_len: Minimum number of rows of the result; shorter inputs are
            padded with ``encoding_dict['X']``. Longer inputs are NOT
            truncated.

    Returns:
        ``np.ndarray`` with one row per input symbol plus padding rows.
    """
    matrix = []
    for symbol in list:
        try:
            row = encoding_dict[symbol.upper()]
        except (KeyError, AttributeError):
            # Unknown symbol (or non-string element): report it, then keep the
            # position by inserting the 'X' vector. Dropping the row instead
            # would leave the matrix shorter than max_len and make it
            # impossible to stack with the other encoded sequences.
            print(list, symbol)
            row = encoding_dict['X']
        matrix.append(row)

    missing = int(max_len) - len(matrix)
    if missing > 0:
        matrix.extend([encoding_dict['X']] * missing)

    return np.array(matrix)


class MHC_dataset(torch.utils.data.Dataset):
    """
    Dataset over one or more partition files, fully encoded at construction.

    The files are read with ``MHC_df`` and converted with ``df_ToTensor`` in
    ``__init__``, so the complete encoded data is held in memory as the
    tensors ``self.X`` and ``self.y``.
    """

    def __init__(self, filepath, Partition, BA_EL, mapping_dict, MHC_len, Peptide_len=15):
        self.filepath = filepath
        self.partition = Partition
        self.BA_EL = BA_EL
        self.mapping_dict = mapping_dict

        # Forward the caller's Peptide_len rather than a hard-coded 15 so both
        # helpers always receive the same value.
        df = MHC_df(filepath, Partition, BA_EL, mapping_dict, Peptide_len=Peptide_len)
        self.X, self.y = df_ToTensor(df, MHC_len, Peptide_len)

    def __len__(self):
        # Number of samples = size of the first dimension of X.
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        batch = self.X[idx]
        label = self.y[idx]
        return batch, label


def MHC_df(filepath, Partition, BA_EL, mapping_dict, Peptide_len=15):
    """Read one or more partition files into a single pandas DataFrame.

    Each file is expected at ``filepath + 'c00' + str(partition) + '_' +
    BA_EL.lower()`` and to hold three whitespace-separated columns without a
    header: peptide, binding affinity and MHC name.

    Args:
        filepath: Path prefix of the partition files. It is concatenated
            as-is, so include the trailing separator.
        Partition: A single partition id, or a list/tuple of ids.
        BA_EL: Suffix selecting the file variant; it is lower-cased.
        mapping_dict: Maps the MHC name found in the file to the value stored
            in the ``MHC`` column. Names missing from the mapping become NaN.
        Peptide_len: Unused here; kept for interface compatibility.

    Returns:
        DataFrame with columns ``Peptide``, ``BindingAffinity``, ``MHC``
        (mapped value) and ``MHC_names`` (name as read from the file), with a
        fresh 0..n-1 index. With no partitions, an empty frame holding only
        the first three columns is returned.
    """
    if not isinstance(Partition, (list, tuple)):
        Partition = [Partition]
    colnames = ['Peptide', 'BindingAffinity', 'MHC']

    # Collect the per-file frames and concatenate once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0 and re-copied the
    # accumulated data on every iteration.
    frames = []
    for i in Partition:
        complete_path = filepath + 'c00' + str(i) + "_" + BA_EL.lower()
        tmp = pd.read_csv(complete_path, header=None, sep=r'\s+', names=colnames)
        tmp['Peptide'] = tmp['Peptide'].astype(str)
        tmp['MHC_names'] = tmp['MHC'].astype(str)
        tmp['MHC'] = tmp['MHC_names'].map(mapping_dict)
        frames.append(tmp)

    if not frames:
        return pd.DataFrame(columns=colnames)
    return pd.concat(frames, ignore_index=True)


def df_ToTensor(df, MHC_len, Peptide_len):
    """Convert a frame with Peptide / MHC / BindingAffinity columns to tensors.

    Returns:
        ``(X, y)`` where ``y`` holds the binding affinities as a column
        (one extra trailing axis) and ``X`` holds the encoded peptide rows
        followed by the encoded MHC rows for every sample, cast to int.
    """

    def encode(column, length):
        # Encode every sequence of the column, then stack the per-sample
        # matrices along a new leading axis.
        encoded = column.apply(one_hot_encoding, encoding_dict=onehot_Blosum50, max_len=length)
        return np.stack(encoded.values)

    affinities = df['BindingAffinity'].values
    y = torch.from_numpy(np.expand_dims(affinities, 1))

    features = df.drop('BindingAffinity', axis=1)
    # Peptide rows first, MHC rows second, joined along the sequence axis.
    joined = np.concatenate(
        (encode(features.Peptide, Peptide_len), encode(features.MHC, MHC_len)),
        axis=1)
    # NOTE: the int cast truncates fractional entries (the 0.05 values of the
    # 'X' vector become 0).
    X = torch.from_numpy(joined.astype(int))

    return X, y


def save_checkpoint(state, save_dir, ckpt_name='best.pth.tar'):
    """Serialize ``state`` with ``torch.save`` to ``save_dir/ckpt_name``.

    Args:
        state: Object to serialize (e.g. a dict with a ``state_dict`` entry).
        save_dir: Target directory; created, including any missing parent
            directories, when it does not exist yet.
        ckpt_name: File name of the checkpoint inside ``save_dir``.
    """
    file_path = os.path.join(save_dir, ckpt_name)
    if not os.path.exists(save_dir):
        print("Save directory dosen't exist! Making directory {}".format(save_dir))
        # makedirs also creates missing parents; exist_ok guards against the
        # directory appearing between the check above and this call.
        os.makedirs(save_dir, exist_ok=True)

    torch.save(state, file_path)


def load_checkpoint(checkpoint, model):
    """Load the weights stored in a checkpoint file into ``model`` in place.

    The file must contain a mapping with a ``'state_dict'`` entry. The saved
    entries are merged over the model's current state dict before loading, so
    parameters absent from the checkpoint keep their current values.

    Args:
        checkpoint: Path of the checkpoint file.
        model: Module that receives the weights.

    Raises:
        FileNotFoundError: If ``checkpoint`` does not exist.
    """
    if not os.path.exists(checkpoint):
        raise FileNotFoundError("File {} dosen't exists!".format(checkpoint))
    # Deserialize onto the CPU so a checkpoint written on a CUDA machine can
    # also be restored where no GPU is available; load_state_dict copies the
    # values into the model's existing parameters afterwards.
    saved = torch.load(checkpoint, map_location='cpu')
    saved_dict = saved['state_dict']
    new_dict = model.state_dict()
    new_dict.update(saved_dict)
    model.load_state_dict(new_dict)


def print_stdout(a_string):
    """Write ``a_string`` to standard output, terminated by a newline."""
    print(a_string, file=sys.stdout)


def make_dir(directory_path):
    """Create ``directory_path`` if needed and report the outcome on stdout.

    Missing parent directories are created as well. Nothing is changed when
    the path already exists.
    """
    try:
        # Attempt the creation directly instead of checking first, so there is
        # no window between the existence check and the creation.
        os.makedirs(directory_path)
    except FileExistsError:
        print_stdout("Directory {} already exists.".format(directory_path))
    else:
        print_stdout("Directory {} created.".format(directory_path))


def generate_experiment_folders(root_folder, argument_parser):
    """Build the experiment folder name from the run settings and create it.

    The folder name is today's date followed by dash-separated tags derived
    from attributes of ``argument_parser``. The experiment folder and its
    ``save_dir/`` and ``figures/`` sub-folders are created through
    ``make_dir``.

    Returns:
        Tuple ``(exp_path, save_dir, figure_dir)``; every path ends with '/'.
    """
    opts = argument_parser
    tags = [
        datetime.date.today().strftime("%Y%m%d"),
        '{}_seed'.format(str(opts.seed)),
        '{}_lr'.format(str(opts.lr)),
        '{}_pat'.format(opts.patience),
        'gauss' if opts.gauss else 'predic',
        '{}_nresb'.format(opts.n_reslayers),
        '{}_bt'.format(opts.block_type),
    ]

    # At most one of these model-variant tags is added; the first match wins.
    if opts.fucking_raw:
        tags.append('raw')
    elif opts.vae:
        tags.append('VAE')
    elif opts.lstm:
        tags.append('{}_nh-{}_ly_lstm'.format(opts.lstm_nhidden, opts.lstm_nlayers))

    # These two flags are independent of the variant above.
    if opts.rezero:
        tags.append('rezero')
    if opts.full_lstm:
        tags.append('full_lstm')

    exp_path = str(root_folder) + "/" + '-'.join(tags) + '/'
    save_dir = exp_path + 'save_dir/'  # name:'checkpoints' cannot be accessed through Jupyter Notebook
    figure_dir = exp_path + 'figures/'

    for folder in (exp_path, save_dir, figure_dir):
        make_dir(folder)
    return exp_path, save_dir, figure_dir


def performance_testing_print(data_path, test_set, BA_EL, MHC_dict, batch_size, MHC_len, Peptide_len, net, k, outfile,
                              net2=None, resnet=False):
    """Run the network(s) on a test partition and write one line per sample.

    The partition is read with ``MHC_df``, shuffled, and processed in batches
    of ``batch_size``. Each sample produces one tab-separated line in
    ``outfile``: ``k``, MHC name, peptide, target value, prediction and, when
    ``resnet`` is False, the second output of ``net`` as well.

    Args:
        data_path, test_set, BA_EL, MHC_dict: Forwarded to ``MHC_df``.
        batch_size: Number of samples per forward pass.
        MHC_len, Peptide_len: Forwarded to ``df_ToTensor``.
        net: Model called as ``net(X)``. With ``resnet=False`` it must return
            a ``(mu, std)`` pair.
        k: Value written in the first column of every line.
        outfile: Writable text stream receiving the lines.
        net2: Model called as ``net2(X, net(X))``; required when ``resnet``
            is True.
        resnet: Selects the two-stage ``net`` -> ``net2`` prediction path.
    """
    # The batches are built by hand (not via a DataLoader) because the MHC
    # name and the peptide of every sample are needed for the output.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test_df = MHC_df(data_path, test_set, BA_EL, MHC_dict)
    batches_per_epoch = int(np.ceil(test_df.shape[0] / batch_size))
    test_df = test_df.sample(frac=1).reset_index(drop=True)  # Shuffling data set

    for i in tqdm(range(batches_per_epoch)):
        # iloc slicing clips at the end of the frame, so the final (possibly
        # shorter) batch needs no special case.
        batch_df = test_df.iloc[batch_size * i:batch_size * (i + 1)]
        X, y = df_ToTensor(batch_df, MHC_len, Peptide_len)
        # Swap the last two axes before the forward pass.
        X = X.permute(0, 2, 1).float().to(device)
        with torch.no_grad():
            if resnet:
                res_out = net(X)
                mu = net2(X, res_out)  # mu for easy variable name stuff
            else:
                mu, std = net(X)

        # One output line per sample; names and peptides come straight from
        # the batch frame, whose row order matches the tensors.
        for j, (MHC, Peptide) in enumerate(zip(batch_df.MHC_names, batch_df.Peptide)):
            if resnet:
                print(k, MHC, Peptide, y[j].item(), mu[j].item(), sep='\t', file=outfile)
            else:
                print(k, MHC, Peptide, y[j].item(), mu[j].item(), std[j].item(), sep='\t', file=outfile)


def _batch_loss(model, trained_model, loss_function, X, y, normal_dist):
    """Return the loss of ``model`` on one batch (tensors already on the target device)."""
    if trained_model is None:
        mu, std = model(X)
        return loss_function(y, mu, std, normal_dist=normal_dist)

    # Two-stage setup: the already trained network only provides features, so
    # it runs in eval mode and without gradient tracking.
    trained_model.eval()
    with torch.no_grad():
        res_out = trained_model(X)
    y_pred = model(X, res_out)
    return nn.functional.mse_loss(y_pred, y)


def train_epochs(args, model, loss_function, train_loader, validation_loader, optimizer, save_dir, crossvalsplit,
                 model_name, trained_model=None):
    """Train ``model`` with early stopping and restore its best checkpoint.

    After every epoch the mean validation loss is computed. Whenever it
    improves, the model is checkpointed in ``save_dir``. Training stops once
    more than ``args.patience`` epochs have passed since the last improvement,
    or after ``args.n_epochs`` epochs. Finally the checkpoint is loaded back
    into ``model``.

    Args:
        args: Namespace providing ``n_epochs``, ``patience`` and ``gauss``.
        model: Network to train. Called as ``model(X)`` returning
            ``(mu, std)``, or as ``model(X, trained_model(X))`` when
            ``trained_model`` is given.
        loss_function: Called as ``loss_function(y, mu, std, normal_dist=...)``;
            only used when ``trained_model`` is None (otherwise MSE is used).
        train_loader, validation_loader: Iterables of ``(X, y)`` batches.
        optimizer: Optimizer stepping the parameters of ``model``.
        save_dir: Directory holding the checkpoint.
        crossvalsplit, model_name: Used in the checkpoint file name and the
            progress message.
        trained_model: Optional frozen network whose output is passed to
            ``model`` as a second argument.

    Returns:
        Tuple ``(model, optimizer)``.

    Raises:
        Exception: From ``load_checkpoint`` when no checkpoint file exists,
            e.g. because the validation loss never improved on infinity.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ckpt_name = 'best' + str(crossvalsplit) + '_' + model_name + '.pth.tar'
    train_epoch_loss, val_epoch_loss = [], []
    best_validation_MSE = np.inf
    # 0 = "no improvement yet". Keeps the early-stopping test well defined
    # even if the first validation loss is NaN.
    best_epoch = 0

    for epoch in range(1, args.n_epochs + 1):
        # Fresh list every epoch so the reported training loss is the mean of
        # this epoch only, like the validation loss.
        train_batch_loss = []
        model.train()
        for X, y in tqdm(train_loader):
            X = X.permute(0, 2, 1).float().to(device)
            y = y.to(device).float()
            loss = _batch_loss(model, trained_model, loss_function, X, y, normal_dist=args.gauss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_batch_loss.append(loss.item())

        val_batch_loss = []
        model.eval()
        with torch.no_grad():
            for X, y in tqdm(validation_loader):
                X = X.permute(0, 2, 1).float().to(device)
                y = y.to(device).float()
                # Validation always uses normal_dist=False, independent of
                # args.gauss.
                loss = _batch_loss(model, trained_model, loss_function, X, y, normal_dist=False)
                val_batch_loss.append(loss.item())

        train_epoch_loss.append(np.mean(train_batch_loss))
        validation_loss = np.mean(val_batch_loss)
        val_epoch_loss.append(validation_loss)

        print('Validation Split: [{}/20], Epoch: {}, Training Loss: {}, Validation Loss {}'.format(
            crossvalsplit, epoch, train_epoch_loss[-1], val_epoch_loss[-1]))

        if validation_loss < best_validation_MSE:
            best_epoch = epoch
            best_validation_MSE = validation_loss
            save_checkpoint({'epoch': best_epoch, 'state_dict': model.state_dict()},
                            save_dir,
                            ckpt_name=ckpt_name)

        if epoch - best_epoch > args.patience:  # Early stopping
            break

    # Build the path the same way save_checkpoint does, so the file is found
    # whether or not save_dir ends with a path separator.
    load_checkpoint(os.path.join(save_dir, ckpt_name), model)

    return model, optimizer