# utils.py

import torch
from torch import nn
from torch.nn import functional as F
import pandas as pd
from collections import Counter
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import re
from sklearn.utils import shuffle


def cos_dist(x, y):
    """Return the batch-averaged cosine distance between ``x`` and ``y``.

    Both inputs are flattened to ``(x.size(0), -1)`` before comparison. The
    per-row distance is ``1 - cosine_similarity`` floored at zero, and the
    mean over all rows is returned as a scalar tensor.
    """
    n_rows = x.size(0)
    flat_x = x.view(n_rows, -1)
    flat_y = y.view(n_rows, -1)
    similarity = F.cosine_similarity(flat_x, flat_y, dim=1, eps=1e-6)
    # Floor at zero so rounding error never produces a negative distance.
    distance = (1 - similarity).clamp(min=0)
    return distance.mean()


def tag_mapping(tags):
    """
    Count tag occurrences and build the tag <-> ID lookup tables.

    Returns a 3-tuple ``(counts, tag_to_id, id_to_tag)`` where ``counts`` is
    the ``Counter`` of ``tags`` and the two dictionaries come from
    ``create_mapping``. The number of distinct tags is printed to stdout.
    """
    tag_counts = Counter(tags)
    forward, backward = create_mapping(tag_counts)
    print("Found %i unique named entity tags" % len(tag_counts))
    return tag_counts, forward, backward


def create_mapping(dico):
    """
    Build the item -> ID and ID -> item lookups for a frequency dictionary.

    IDs are assigned by decreasing count; items with equal counts are ordered
    by the items themselves (ascending). Returns ``(item_to_id, id_to_item)``.
    """
    ranked = sorted(dico, key=lambda item: (-dico[item], item))
    id_to_item = dict(enumerate(ranked))
    item_to_id = {item: index for index, item in enumerate(ranked)}
    return item_to_id, id_to_item


# Ordered (pattern, replacement) rewrite rules applied by ``clean_str``.
# Order matters: contractions are split before punctuation is padded, and
# whitespace is collapsed last.
# The replacements for "(", ")" and "?" are raw strings on purpose: the
# backslash is not a recognised template escape, so ``re.sub`` emits it
# verbatim. The resulting tokens therefore carry a literal backslash, which is
# kept so the output stays identical to the previous implementation.
_CLEAN_STR_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
)


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Characters other than letters, digits and ``( ) , ! ? ' ` `` are replaced
    by spaces, English contraction suffixes are split off as separate tokens,
    punctuation is padded with spaces, and runs of whitespace are collapsed.
    Parentheses and question marks come out prefixed with a literal backslash.

    Args:
        string: The raw text to clean.

    Returns:
        The cleaned text, stripped of surrounding whitespace and lower-cased.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = pattern.sub(replacement, string)
    return string.strip().lower()


def clean_doc(x, word_freq):
    """Clean every document in ``x`` and filter its tokens by frequency.

    Each document is stripped, passed through ``clean_str`` and split on
    whitespace. English stop words and tokens whose ``word_freq`` count is
    below 5 are dropped; surviving tokens outside the 50000 most common
    entries of ``word_freq`` are replaced by ``"<UNK>"``.

    ``word_freq`` must support ``len``, indexing by token and
    ``most_common`` (presumably a ``collections.Counter`` -- confirm against
    the caller).

    Returns a list with one space-joined string per input document.
    """
    english_stops = set(stopwords.words('english'))
    vocab_size = min(len(word_freq), 50000)
    vocabulary = dict(word_freq.most_common(vocab_size))

    cleaned_docs = []
    for raw_doc in x:
        kept_tokens = [
            token if token in vocabulary else "<UNK>"
            for token in clean_str(raw_doc.strip()).split()
            if token not in english_stops and word_freq[token] >= 5
        ]
        cleaned_docs.append(' '.join(kept_tokens).strip())
    return cleaned_docs


# Share of the training portion kept for training; the remainder is validation.
_VALIDATION_SPLIT = 0.8
# Share of the Web of Science documents used for train + validation; the
# remainder is the test split.
_WOS_TESTING_SPLIT = 0.6

_NEWS_DATA_HOME = 'dataset/20news'
_NEWS_15_CATEGORIES = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'misc.forsale',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
]
_NEWS_5_CATEGORIES = [
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

_WOS_X_PATH = './dataset/WebOfScience/WOS46985/X.txt'
_WOS_Y_PATH = './dataset/WebOfScience/WOS46985/Y.txt'
# Labels 0..99 are the ones kept by 'wos-100'; 'wos-34' keeps everything else.
_WOS_IN_DOMAIN_LABELS = range(100)


def _load_sst():
    """Read the SST-2 train/dev/test TSV files into numpy value arrays."""
    train_df = pd.read_csv("./dataset/sst/SST-2/train.tsv", delimiter='\t', header=0)
    val_df = pd.read_csv("./dataset/sst/SST-2/dev.tsv", delimiter='\t', header=0)
    # The test file has no header row, so the column names are supplied here.
    test_df = pd.read_csv("./dataset/sst/SST-2/sst-test.tsv", delimiter='\t',
                          header=None, names=['sentence', 'label'])
    return (train_df.sentence.values, val_df.sentence.values, test_df.sentence.values,
            train_df.label.values, val_df.label.values, test_df.label.values)


def _load_20news(categories=None, test_only=False):
    """Fetch 20 Newsgroups, optionally limited to ``categories`` or to the test set."""
    # ``data_home`` is passed by keyword: recent scikit-learn releases make
    # every parameter of fetch_20newsgroups keyword-only.
    if test_only:
        newsgroups_test = fetch_20newsgroups(data_home=_NEWS_DATA_HOME, subset='test',
                                             shuffle=False, categories=categories)
        print(newsgroups_test.target_names)
        print(len(newsgroups_test.data))
        return None, None, newsgroups_test.data, None, None, newsgroups_test.target

    newsgroups_train = fetch_20newsgroups(data_home=_NEWS_DATA_HOME, subset='train',
                                          shuffle=True, categories=categories,
                                          random_state=0)
    print(newsgroups_train.target_names)
    print(len(newsgroups_train.data))

    newsgroups_test = fetch_20newsgroups(data_home=_NEWS_DATA_HOME, subset='test',
                                         shuffle=False, categories=categories)
    print(len(newsgroups_test.data))

    train_len = int(_VALIDATION_SPLIT * len(newsgroups_train.data))
    return (newsgroups_train.data[:train_len],
            newsgroups_train.data[train_len:],
            newsgroups_test.data,
            newsgroups_train.target[:train_len],
            newsgroups_train.target[train_len:],
            newsgroups_test.target)


def _read_lines(path):
    """Return every line of the text file at ``path`` (line endings kept)."""
    with open(path, 'r') as read_file:
        return read_file.readlines()


def _load_wos(keep_label=None, test_only=False):
    """Load Web of Science documents, filter them by label and split in file order."""
    x_all = _read_lines(_WOS_X_PATH)
    print(len(x_all))

    y_all = [int(line) for line in _read_lines(_WOS_Y_PATH)]
    print(len(y_all))
    print(max(y_all), min(y_all))

    if len(x_all) != len(y_all):
        raise ValueError(
            "WOS documents and labels differ in length: %d vs %d"
            % (len(x_all), len(y_all)))

    selected = [index for index, label in enumerate(y_all)
                if keep_label is None or keep_label(label)]
    x_in = [x_all[index] for index in selected]
    y_in = [y_all[index] for index in selected]

    train_val_len = int(_WOS_TESTING_SPLIT * len(x_in))
    test_sentences = x_in[train_val_len:]
    test_labels = y_in[train_val_len:]

    if test_only:
        print(len(test_labels))
        return None, None, test_sentences, None, None, test_labels

    train_len = int(_VALIDATION_SPLIT * train_val_len)
    train_labels = y_in[:train_len]
    val_labels = y_in[train_len:train_val_len]

    print(len(train_labels))
    print(len(val_labels))
    print(len(test_labels))

    return (x_in[:train_len], x_in[train_len:train_val_len], test_sentences,
            train_labels, val_labels, test_labels)


def _read_agnews_csv(path):
    """Read a header-less AG News CSV and name its first three columns."""
    frame = pd.read_csv(path, header=None)
    return frame.rename(columns={0: 'label', 1: 'title', 2: 'sentence'})


def _load_agnews():
    """Load AG News, shifting labels down by one and splitting off validation data."""
    train_df = _read_agnews_csv('./dataset/agnews/train.csv')
    print(train_df.dtypes)
    all_train_sentences = [str(sentence) for sentence in train_df.sentence.values]
    # Labels in the CSV are shifted down by one before being returned.
    all_train_labels = [label - 1 for label in train_df.label.values]

    test_df = _read_agnews_csv('./dataset/agnews/test.csv')
    test_sentences = [str(sentence) for sentence in test_df.sentence.values]
    test_labels = [label - 1 for label in test_df.label.values]

    train_len = int(_VALIDATION_SPLIT * len(all_train_sentences))
    train_sentences = all_train_sentences[:train_len]
    val_sentences = all_train_sentences[train_len:]

    print(len(train_sentences))
    print(len(val_sentences))
    print(len(test_sentences))

    return (train_sentences, val_sentences, test_sentences,
            all_train_labels[:train_len], all_train_labels[train_len:], test_labels)


def load_dataset(dataset):
    """Load one of the supported text-classification datasets.

    Supported names:
        'sst'       -- SST-2 TSV files under ./dataset/sst/SST-2.
        '20news'    -- all 20 Newsgroups categories.
        '20news-15' -- the 15 categories listed in ``_NEWS_15_CATEGORIES``.
        '20news-5'  -- the 5 categories in ``_NEWS_5_CATEGORIES``; test only.
        'wos'       -- every Web of Science document.
        'wos-100'   -- Web of Science documents whose label is in 0..99.
        'wos-34'    -- Web of Science documents whose label is outside 0..99;
                       test only.
        'agnews'    -- AG News CSV files under ./dataset/agnews.

    Splits are taken in storage order (no shuffling is done here apart from
    the seeded shuffle requested from scikit-learn for the newsgroups training
    data). Dataset sizes are printed to stdout while loading.

    Args:
        dataset: Name of the dataset to load (see the list above).

    Returns:
        A 6-tuple ``(train_sentences, val_sentences, test_sentences,
        train_labels, val_labels, test_labels)``. For the test-only datasets
        the train and validation entries are ``None``.

    Raises:
        ValueError: If ``dataset`` is not a supported name, or if the Web of
            Science document and label files have different line counts.
    """
    if dataset == 'sst':
        return _load_sst()
    if dataset == '20news':
        return _load_20news()
    if dataset == '20news-15':
        return _load_20news(categories=_NEWS_15_CATEGORIES)
    if dataset == '20news-5':
        return _load_20news(categories=_NEWS_5_CATEGORIES, test_only=True)
    if dataset == 'wos':
        return _load_wos()
    if dataset == 'wos-100':
        return _load_wos(keep_label=lambda label: label in _WOS_IN_DOMAIN_LABELS)
    if dataset == 'wos-34':
        return _load_wos(keep_label=lambda label: label not in _WOS_IN_DOMAIN_LABELS,
                         test_only=True)
    if dataset == 'agnews':
        return _load_agnews()
    # Previously an unknown name fell through to the return statement and
    # surfaced as an UnboundLocalError.
    raise ValueError("Unknown dataset: %r" % (dataset,))