# -*- coding: utf-8 -*-
"""utils_story

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1WRjRiugq32lgefIhXYNjOdW0vANexE8N
"""
import os
import re
import string

import torch
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: WordNet data for the lemmatizer,
# the Punkt tokenizer models, and the English stop-word list.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')  # the WordNet lemmatizer also needs this on newer NLTK releases
def Searching():
    """Index every story file under PATH and print the available keys."""
    PATH = '/content/drive/MyDrive/Modeling/Keywording/section-stories'
    Key_value_dict = dict()
    Folder_list = os.listdir(PATH)
    for Folder in Folder_list:
        if '.' in Folder:  # skip plain files; only descend into folders
            continue
        # Dummy entry so the folder name prints as a header in the key listing.
        Key_value_dict[f'\n\n - File list of {Folder}\n'] = 'Dummy'
        for File in os.listdir(PATH + '/' + Folder):
            # Key is the file name without its extension; value is the full path.
            Key_value_dict[File.split('.')[0]] = PATH + '/' + Folder + '/' + File
    for key in Key_value_dict.keys():
        print(key)
    return Key_value_dict
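
# Illustrative shape of the returned dict ('Snow-man-story' is looked up below;
# the folder name here is a placeholder, not an actual path from the project):
#     {'\n\n - File list of <folder>\n': 'Dummy',
#      'Snow-man-story': '<PATH>/<folder>/Snow-man-story.csv', ...}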
def Importing(PATH):
    """Load one story CSV (UTF-8 with BOM) into a DataFrame."""
    Data = pd.read_csv(PATH, encoding='utf-8-sig')
    return Data


# Build the file index and load the snowman story at import time.
Dict = Searching()
story = Importing(Dict['Snow-man-story'])
def preprocess(text):
    """Lowercase the text and strip punctuation, digits, and extra whitespace."""
    text = text.lower()
    text = "".join([i for i in text if i not in string.punctuation])
    text = re.sub(r'\d', '', text)    # drop digits
    text = re.sub(r'\n', ' ', text)   # newlines become spaces
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    text = text.strip()
    return text
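
# For example (illustrative only):
#     preprocess("Hello, World!\n  It's 2023.")  ->  "hello world its"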
# Start from NLTK's English stop words, but keep negation and degree words
# (removed from the stop list below) and additionally filter the contraction
# fragments that word_tokenize splits off (e.g. "it's" -> "it", "'s").
STOP_WORDS = set(stopwords.words('english'))
NOT_USED_STOP_WORDS = {'more', 'aren', "mightn't", 'doesn', 'isn', "didn't", 'wouldn', "won't", 'ain', 'couldn',
                       "shouldn't", "weren't", 'didn', "hadn't", 'needn', 'shouldn', 'mustn', "mustn't", "wasn't",
                       "couldn't", 'wasn', "hasn't", 'very', 'most', 'hadn', "wouldn't", "don't", "aren't", 'hasn',
                       "needn't", "haven't", 'nor', 'no', 'won', 'not', 'haven', "isn't", 'don', "doesn't"}
ADDITIONAL_STOP_WORDS = {"'s", "'re", "'m", "'ve", "'d", "'ll"}
STOP_WORDS = (STOP_WORDS - NOT_USED_STOP_WORDS) | ADDITIONAL_STOP_WORDS
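
# Net effect (illustrative): 'the' is still filtered, 'not' now survives,
# and the tokenizer fragment "'s" is filtered as well.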
def remove_stop_words(text: str) -> str:
    """Drop stop words, then clean up stray apostrophes left by tokenization."""
    text_without_stop_words = ' '.join([word for word in word_tokenize(text) if word not in STOP_WORDS])
    text_without_stop_words = re.sub(r'\s+\'\s+', ' ', text_without_stop_words)
    return text_without_stop_words
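
# For example (illustrative only): remove_stop_words("this is not the end")
# -> "not end" ('not' survives because it was pulled out of the stop list above).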
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_text(text: str) -> str:
    """Lemmatize each token using WordNet's default (noun) part of speech."""
    return ' '.join([wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(text)])
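
# For example (illustrative only): lemmatize_text("the stories of wolves")
# -> "the story of wolf" (noun lemmas only; verb forms are left as-is).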
def story_preprocessing(story):
    """Run the full cleaning pipeline, storing the result in a 'clean_text' column."""
    story['clean_text'] = story['text'].apply(preprocess)
    story['clean_text'] = story['clean_text'].apply(remove_stop_words)
    story['clean_text'] = story['clean_text'].apply(lemmatize_text)
    return story
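
# Typical use (a sketch; assumes the loaded CSV has a 'text' column, which
# story_preprocessing already requires):
#     story = story_preprocessing(story)
#     print(story['clean_text'].head())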