# overall_pipeline.py

# -*- coding: utf-8 -*-
"""
Created on Mon Aug  8 16:28:41 2022

@author: IIT
"""

import torch
from torch.utils.data import  DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

import pandas as pd
from sklearn import preprocessing

# from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from tqdm.auto import tqdm
from transformers import AdamW
import collections


class Dataset(torch.utils.data.Dataset):
    """
    Class to store the data as PyTorch Dataset
    """

    def __init__(self, encodings, labels):
        """Keep references to the encoded inputs and their labels.

        Args:
            encodings: Mapping whose values are indexable per sample (its
                ``items()`` is iterated in ``__getitem__``); keys such as
                ``input_ids`` and ``attention_mask`` are expected by callers.
            labels: Indexable of per-sample labels; its length defines the
                dataset length.
        """
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor that shares no storage with it."""
        # torch.tensor(existing_tensor) emits a UserWarning on every call;
        # clone().detach() is the copy PyTorch recommends for tensors.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus ``labels``."""
        # The item mirrors the keys of the encoding and adds a 'labels' entry.
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)


print(Dataset.__doc__)


class Predictor:
    """Fine-tune a BERT sequence classifier on texts and predict their labels.

    Typical use, as done by the script at the bottom of this file:
    ``prepare_data`` builds the train/validation loaders and fits the label
    encoder, ``train_model`` fine-tunes a model and stores it on
    ``self.model``, and ``predict`` maps new texts back to original labels.
    """

    # Tokenisation length and loader batch size shared by training and inference.
    _MAX_LENGTH = 128
    _BATCH_SIZE = 64

    def __init__(self, tokenizer='./model/bert_uncased_L-4_H-512_A-8', num_of_epochs=10, learning_rate=5e-5):
        """Load the tokenizer and set up training hyper-parameters.

        Args:
            tokenizer: Name or path handed to ``BertTokenizer.from_pretrained``.
            num_of_epochs: Total number of epochs run by ``train_model``.
            learning_rate: Learning rate given to the optimizer.
        """
        self.num_of_epochs = num_of_epochs
        self.learning_rate = learning_rate
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer, do_lower_case=True)
        self.encoder = preprocessing.LabelEncoder()
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def _tokenize(self, texts):
        """Tokenize a list of texts with the settings used for train and predict."""
        return self.tokenizer(
            texts,
            return_tensors="pt",
            padding="max_length",
            max_length=self._MAX_LENGTH,
            truncation=True,
        )

    def train(self, model, optimizer):
        """Method to train the model"""
        # Runs one pass over self.train_loader (set by prepare_data), updating
        # `model` in place with `optimizer`, and prints the mean training loss.
        # The docstring above is printed at class-definition time (see below),
        # so it is intentionally left as the original one-liner.
        dataloader = self.train_loader
        model.train()

        epoch_loss = 0.0
        size = len(dataloader.dataset)

        for batch in dataloader:
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].long().to(self.device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            print(loss)
            # The model's loss is a mean over the batch (see the Transformers
            # docs), so weight it by the batch size before dividing by the
            # number of samples; otherwise the reported value is too small by
            # roughly a factor of the batch size.
            epoch_loss += loss.item() * labels.size(0)
            loss.backward()
            optimizer.step()

        print('Training loss: {:.3f}'.format(epoch_loss / max(size, 1)))

    print(train.__doc__)

    def test(self, model):
        """Method to test the model's accuracy and loss on the validation set"""
        dataloader = self.test_loader
        model.eval()

        size = len(dataloader.dataset)
        test_loss, correct = 0.0, 0

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(self.device)
                # Mask the padding exactly as train() does; without the mask
                # the padded positions take part in attention at eval time.
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].long().to(self.device)

                pred = model(input_ids, attention_mask=attention_mask, labels=labels)

                # Same per-sample weighting of the batch-mean loss as in train().
                test_loss += pred.loss.item() * labels.size(0)
                # argmax of the logits equals argmax of their softmax.
                correct += (pred.logits.argmax(1) == labels).sum().item()

        test_loss /= max(size, 1)
        accuracy = correct / max(size, 1)

        print("Test loss: {:.3f}, accuracy: {:.3f}%".format(test_loss, accuracy * 100))

    def prepare_data(self, descriptions, label):
        """Encode labels, split 80/20, tokenize, and build both data loaders.

        Args:
            descriptions: Texts to learn from; must expose ``.values.tolist()``
                after splitting (a pandas Series in this file).
            label: Target values aligned with ``descriptions``.

        Sets ``self.train_loader`` and ``self.test_loader`` and fits
        ``self.encoder`` on ``label``.
        """
        # label processing
        label = self.encoder.fit_transform(label)
        X_train, X_test, y_train, y_test = train_test_split(descriptions, label, test_size=0.2, random_state=42)

        # converting to list
        X_train = X_train.values.tolist()
        X_test = X_test.values.tolist()

        # tokenize
        X_train = self._tokenize(X_train)
        X_test = self._tokenize(X_test)

        # dataloader: only the training data needs shuffling; the validation
        # metrics are sums over all samples and do not depend on order.
        self.train_loader = DataLoader(Dataset(X_train, y_train), batch_size=self._BATCH_SIZE, shuffle=True)
        self.test_loader = DataLoader(Dataset(X_test, y_test), batch_size=self._BATCH_SIZE, shuffle=False)

    def train_model(self, num_labels, baseModel='./model/bert_uncased_L-4_H-512_A-8'):
        """Fine-tune a sequence classifier in two phases and keep it on ``self``.

        The first ``num_of_epochs // 2`` epochs train only parameters whose
        name contains ``'classifier'``; the remaining epochs train every
        parameter. ``prepare_data`` must have been called first.

        Args:
            num_labels: Number of output classes for the classification head.
            baseModel: Name or path handed to
                ``AutoModelForSequenceClassification.from_pretrained``.
        """
        model = AutoModelForSequenceClassification.from_pretrained(baseModel, num_labels=num_labels)
        model.to(self.device)

        # optimizer
        optimizer = AdamW(model.parameters(), lr=self.learning_rate)

        tqdm.pandas()

        # Phase 1: freeze everything except the classification head.
        for name, param in model.named_parameters():
            param.requires_grad = 'classifier' in name

        for i in tqdm(range(self.num_of_epochs // 2)):
            print("Epoch: #{}".format(i + 1))
            self.train(model, optimizer)
            self.test(model)

        print("After Unfreezing the layer......................")

        # Phase 2: unfreeze every parameter and continue with the same optimizer.
        for name, param in model.named_parameters():
            param.requires_grad = True

        for i in tqdm(range(self.num_of_epochs // 2, self.num_of_epochs)):
            print("Epoch: #{}".format(i + 1))
            self.train(model, optimizer)
            self.test(model)

        # saving the model not saving optimizer
        self.model = model

    def predict(self, descriptions):
        """Predict the original label for each text in ``descriptions``.

        Args:
            descriptions: Texts exposing ``.values.tolist()`` (a pandas Series
                in this file).

        Returns:
            The result of ``self.encoder.inverse_transform`` applied to the
            predicted class indices, one entry per input text.
        """
        # preprocess data
        texts = descriptions.values.tolist()
        if not texts:
            # Nothing to tokenize; return the encoder's empty result.
            return self.encoder.inverse_transform([])

        # Use this instance's tokenizer rather than a module-level variable,
        # so the method works wherever the class is used.
        encoded = self._tokenize(texts)
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)

        self.model.eval()
        with torch.no_grad():
            predictions = self.model(input_ids, token_type_ids=None, attention_mask=attention_mask)

        # One argmax over the class dimension replaces a per-row softmax of
        # the full logits matrix; the selected classes are identical.
        predictions_class = predictions.logits.argmax(dim=1).tolist()

        return self.encoder.inverse_transform(predictions_class)
    
def roundup(val):
    """Round ``val`` up to one decimal place and return it as a float.

    Implements the integer-based ``Roundup`` helper from the CVSS v3.1
    specification: the input is scaled to an integer first, so a value that
    already sits on a one-decimal boundary (e.g. ``0.5`` or ``4.0``) is
    returned unchanged instead of being pushed up by floating-point error.

    Args:
        val: Non-negative score to round.

    Returns:
        The smallest number with one decimal place that is >= ``val``.
    """
    scaled = int(round(val * 100000))
    if scaled % 10000 == 0:
        # Already exactly on a one-decimal boundary.
        return scaled / 100000.0
    return (scaled // 10000 + 1) / 10.0
def onedec(val):
    """Return ``val`` rounded to one decimal place, as a float."""
    one_decimal_text = format(val, '.1f')
    return float(one_decimal_text)

def score_calculator(pred_df):
    """Add ``baseScore``, ``exploitabilityScore`` and ``impactScore`` columns.

    Each row's categorical metric values are translated to numeric weights
    and combined into an impact score, an exploitability score and a base
    score. The equations and constants look like the CVSS v3 base-score
    formulas -- NOTE(review): confirm against the specification.

    Args:
        pred_df: DataFrame with the columns ``attackVector``,
            ``attackComplexity``, ``privilegesRequired``, ``userInteraction``,
            ``scope``, ``confidentialityImpact``, ``integrityImpact`` and
            ``availabilityImpact``.

    Returns:
        The same ``pred_df`` object, modified in place with the three score
        columns added.
    """
    # Impact metrics share one weight table.
    cia_weights = {'NONE': 0, 'HIGH': 0.56, 'LOW': 0.22}
    weights = {
        'attackVector': {
            'NETWORK': 0.85,
            'ADJACENT_NETWORK': 0.62,
            'LOCAL': 0.55,
            'PHYSICAL': 0.2,
        },
        'attackComplexity': {'LOW': 0.77, 'HIGH': 0.44},
        # Keyed by "<privilegesRequired><scope>" because the weight depends on both.
        'privilegesRequired': {
            'NONEUNCHANGED': 0.85,
            'NONECHANGED': 0.85,
            'LOWUNCHANGED': 0.62,
            'LOWCHANGED': 0.68,
            'HIGHUNCHANGED': 0.27,
            'HIGHCHANGED': 0.5,
        },
        'userInteraction': {'NONE': 0.85, 'REQUIRED': 0.62},
        'confidentialityImpact': dict(cia_weights),
        'integrityImpact': dict(cia_weights),
        'availabilityImpact': dict(cia_weights),
    }
    impact_metrics = ('confidentialityImpact', 'integrityImpact', 'availabilityImpact')

    base_scores = []
    exploitability_scores = []
    impact_scores = []

    for row in pred_df.index:
        # Impact sub-score base: 1 minus the product of the (1 - weight) terms.
        surviving = 1
        for metric in impact_metrics:
            surviving = surviving * (1 - weights[metric][pred_df[metric][row]])
        iss = 1 - surviving

        scope = pred_df['scope'][row]
        scope_changed = scope == 'CHANGED'
        if scope_changed:
            impact = 7.52 * (iss - 0.029) - 3.25 * (iss - 0.02) ** 15
        else:
            impact = iss * 6.42

        exploitability = (
            8.22
            * weights['attackVector'][pred_df['attackVector'][row]]
            * weights['attackComplexity'][pred_df['attackComplexity'][row]]
            * weights['privilegesRequired'][pred_df['privilegesRequired'][row] + scope]
            * weights['userInteraction'][pred_df['userInteraction'][row]]
        )

        # A non-positive impact yields a base score of 0; otherwise the sum
        # is capped at 10 (scaled by 1.08 first when the scope changed).
        if impact <= 0:
            base = 0
        elif scope_changed:
            base = roundup(min(1.08 * (impact + exploitability), 10))
        else:
            base = roundup(min((impact + exploitability), 10))

        exploitability_scores.append(onedec(exploitability))
        impact_scores.append(onedec(impact))
        base_scores.append(base)

    pred_df['baseScore'] = base_scores
    pred_df['exploitabilityScore'] = exploitability_scores
    pred_df['impactScore'] = impact_scores
    return pred_df

if __name__ == '__main__':
    # Train one classifier per metric column on the 2019 data, predict the
    # held-out 20 %, then derive the numeric scores from the predictions.

    # Column groups used below.
    only_met_clmn = ['attackVector','attackComplexity','privilegesRequired','userInteraction','scope',
            'confidentialityImpact','integrityImpact','availabilityImpact']
    only_scr_clmn = ['baseScore','baseSeverity','exploitabilityScore','impactScore']
    metc_clmn = ['ID','attackVector','attackComplexity','privilegesRequired','userInteraction','scope',
            'confidentialityImpact','integrityImpact','availabilityImpact','baseScore','baseSeverity','exploitabilityScore','impactScore']
    allcols = ['ID','description','attackVector','attackComplexity','privilegesRequired','userInteraction','scope',
            'confidentialityImpact','integrityImpact','availabilityImpact','baseScore','baseSeverity','exploitabilityScore','impactScore']

    predictors = []
    predictions = pd.DataFrame()

    # Keep only the needed columns and drop rows with any missing value.
    data = pd.read_csv('../data/output/data_2019.csv')
    data = data[allcols].dropna()

    # Number of distinct classes per metric, used as the classifier head size.
    unique_val = {column: data[column].nunique() for column in only_met_clmn}

    X_train, X_test, y_train, y_test = train_test_split(
        data['description'], data[metc_clmn], test_size=0.2, random_state=42
    )
    predictions['ID'] = y_test['ID'].values.tolist()

    for metc in only_met_clmn:
        # Bound at module level as `predictor`; keep the name stable in case
        # other code in this file refers to it.
        predictor = Predictor()
        predictor.prepare_data(X_train, y_train[metc])
        predictor.train_model(unique_val[metc])
        predictions[metc] = predictor.predict(X_test)

    predictions = score_calculator(predictions)
    
    
#%% saving files


# Everything below uses `y_test` and `predictions`, which only exist when the
# file is run as a script, so it is guarded the same way; without the guard,
# importing this module fails with a NameError.
if __name__ == '__main__':

    # Persist the ground truth and the predictions, then reload the
    # predictions from disk.
    y_test.to_csv('actual.csv',index=False)
    predictions.to_csv('predictions.csv',index=False)

    predictions = pd.read_csv('predictions.csv')

    # %% compare

    # These bare expressions only display a value when run interactively
    # (e.g. as a cell); in a plain script run the results are discarded.
    vari = 'baseScore'
    y_test[vari].mean()
    y_test[vari].std()

    # print(roundup(4.001))

    # Scratch experiment with train_test_split on a toy frame.
    # NOTE: it rebinds X_train, X_test, y_train and y_test.
    fm = pd.DataFrame()
    fm['a'] = [1,2,3,4,5,6,7]
    fm['b'] = [1,2,3,4,5,6,7]
    fm['c'] = [1,2,3,4,5,6,7]
    fm['d'] = [1,2,3,4,5,6,7]

    X_train, X_test, y_train, y_test = train_test_split(fm[['a','d']], fm[['b','d']], test_size=0.2, random_state=42)


    X_train['e'] = [1,2,3,4,5]

    e = X_train['e'].values.tolist()