# -*- coding: utf-8 -*-
"""utils_story

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1WRjRiugq32lgefIhXYNjOdW0vANexE8N
"""
import os
import re
import string

import torch
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: WordNet data for the lemmatizer,
# the Punkt tokenizer models, and the English stop-word list.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')  # the WordNet lemmatizer also needs this on newer NLTK releases
def Searching():
    """Index every story file under PATH and print the available keys."""
    PATH = '/content/drive/MyDrive/Modeling/Keywording/section-stories'
    Key_value_dict = dict()
    Folder_list = os.listdir(PATH)
    for Folder in Folder_list:
        if '.' in Folder:  # skip plain files; only descend into folders
            continue
        # Dummy entry so the folder name prints as a header in the key listing.
        Key_value_dict[f'\n\n - File list of {Folder}\n'] = 'Dummy'
        for File in os.listdir(PATH + '/' + Folder):
            # Key is the file name without its extension; value is the full path.
            Key_value_dict[File.split('.')[0]] = PATH + '/' + Folder + '/' + File
    for key in Key_value_dict.keys():
        print(key)
    return Key_value_dict
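
# Illustrative shape of the returned dict ('Snow-man-story' is looked up below;
# the folder name here is a placeholder, not an actual path from the project):
#     {'\n\n - File list of <folder>\n': 'Dummy',
#      'Snow-man-story': '<PATH>/<folder>/Snow-man-story.csv', ...}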
def Importing(PATH):
    """Load one story CSV (UTF-8 with BOM) into a DataFrame."""
    Data = pd.read_csv(PATH, encoding='utf-8-sig')
    return Data


# Build the file index and load the snowman story at import time.
Dict = Searching()
story = Importing(Dict['Snow-man-story'])
def preprocess(text):
    """Lowercase the text and strip punctuation, digits, and extra whitespace."""
    text = text.lower()
    text = "".join([i for i in text if i not in string.punctuation])
    text = re.sub(r'\d', '', text)    # drop digits
    text = re.sub(r'\n', ' ', text)   # newlines become spaces
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    text = text.strip()
    return text
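
# For example (illustrative only):
#     preprocess("Hello, World!\n  It's 2023.")  ->  "hello world its"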
# Start from NLTK's English stop words, but keep negation and degree words
# (removed from the stop list below) and additionally filter the contraction
# fragments that word_tokenize splits off (e.g. "it's" -> "it", "'s").
STOP_WORDS = set(stopwords.words('english'))
NOT_USED_STOP_WORDS = {'more', 'aren', "mightn't", 'doesn', 'isn', "didn't", 'wouldn', "won't", 'ain', 'couldn',
                       "shouldn't", "weren't", 'didn', "hadn't", 'needn', 'shouldn', 'mustn', "mustn't", "wasn't",
                       "couldn't", 'wasn', "hasn't", 'very', 'most', 'hadn', "wouldn't", "don't", "aren't", 'hasn',
                       "needn't", "haven't", 'nor', 'no', 'won', 'not', 'haven', "isn't", 'don', "doesn't"}
ADDITIONAL_STOP_WORDS = {"'s", "'re", "'m", "'ve", "'d", "'ll"}
STOP_WORDS = (STOP_WORDS - NOT_USED_STOP_WORDS) | ADDITIONAL_STOP_WORDS
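
# Net effect (illustrative): 'the' is still filtered, 'not' now survives,
# and the tokenizer fragment "'s" is filtered as well.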
def remove_stop_words(text: str) -> str:
    """Drop stop words, then clean up stray apostrophes left by tokenization."""
    text_without_stop_words = ' '.join([word for word in word_tokenize(text) if word not in STOP_WORDS])
    text_without_stop_words = re.sub(r'\s+\'\s+', ' ', text_without_stop_words)
    return text_without_stop_words
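
# For example (illustrative only): remove_stop_words("this is not the end")
# -> "not end" ('not' survives because it was pulled out of the stop list above).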
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_text(text: str) -> str:
    """Lemmatize each token using WordNet's default (noun) part of speech."""
    return ' '.join([wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(text)])
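
# For example (illustrative only): lemmatize_text("the stories of wolves")
# -> "the story of wolf" (noun lemmas only; verb forms are left as-is).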
def story_preprocessing(story):
    """Run the full cleaning pipeline, storing the result in a 'clean_text' column."""
    story['clean_text'] = story['text'].apply(preprocess)
    story['clean_text'] = story['clean_text'].apply(remove_stop_words)
    story['clean_text'] = story['clean_text'].apply(lemmatize_text)
    return story
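
# Typical use (a sketch; assumes the loaded CSV has a 'text' column, which
# story_preprocessing already requires):
#     story = story_preprocessing(story)
#     print(story['clean_text'].head())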