forked from namu-tree/SynestheticReading_TeamD
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutils_lyrics.py
59 lines (45 loc) · 2.16 KB
/
utils_lyrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
import string
string.punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK data this module relies on: WordNet for the lemmatizer,
# Punkt for word_tokenize, and the stop-word lists. Recent NLTK releases
# (3.9+) load the Punkt models from the separate 'punkt_tab' package, so that
# one is requested as well; without it word_tokenize raises LookupError there.
for _resource in ('wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)
# Song data in CSV format.
# A raw string keeps the backslash of the Windows path literal: '\s' is not a
# valid escape sequence and triggers a SyntaxWarning on Python 3.12+.
song_list = pd.read_csv(r'D:\spotify_millsongdata.csv')
# Basic normalization of English text (stop words are handled separately).
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text: str) -> str:
    """Return a normalized copy of *text*.

    The text is lowercased, every character in ``string.punctuation`` and
    every digit is deleted, each run of whitespace (newlines included) is
    collapsed to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text; an empty string if nothing remains.
    """
    # One pass of str.translate replaces the per-character membership scan.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    text = _DIGIT_RE.sub('', text)
    # '\s+' already matches '\n', so no separate newline substitution is needed.
    return _WHITESPACE_RE.sub(' ', text).strip()
# Words that are never treated as stop words, i.e. they stay in the text even
# if the NLTK English list contains them (negations such as 'not'/"don't" and
# the words 'more', 'very', 'most').
NOT_USED_STOP_WORDS = {
    'more', 'aren', "mightn't", 'doesn', 'isn', "didn't", 'wouldn', "won't",
    'ain', 'couldn', "shouldn't", "weren't", 'didn', "hadn't", 'needn',
    'shouldn', 'mustn', "mustn't", "wasn't", "couldn't", 'wasn', "hasn't",
    'very', 'most', 'hadn', "wouldn't", "don't", "aren't", 'hasn', "needn't",
    "haven't", 'nor', 'no', 'won', 'not', 'haven', "isn't", 'don', "doesn't",
}

# Extra tokens to drop; presumably the contraction suffixes that the tokenizer
# splits off (e.g. "'s", "'ll") -- confirm against word_tokenize output.
ADDITIONAL_STOP_WORDS = {"'s", "'re", "'m", "'ve", "'d", "'ll"}

# Final stop-word set: the NLTK English list minus the kept words, plus the
# extra tokens. Parentheses spell out the precedence ('-' binds before '|').
STOP_WORDS = (
    (set(stopwords.words('english')) - NOT_USED_STOP_WORDS)
    | ADDITIONAL_STOP_WORDS
)
# Remove stop words
def remove_stop_words(text: str) -> str:
    """Return *text* without the tokens listed in ``STOP_WORDS``.

    The text is split with ``word_tokenize``; tokens that are not in
    ``STOP_WORDS`` are joined back with single spaces. Afterwards any
    apostrophe that stands alone between whitespace is replaced by one space.
    """
    tokens = word_tokenize(text)
    kept = (token for token in tokens if token not in STOP_WORDS)
    return re.sub(r'\s+\'\s+', ' ', ' '.join(kept))
# Single lemmatizer instance shared by every lemmatize_text call.
wordnet_lemmatizer = WordNetLemmatizer()


# Convert text to its lemmas
def lemmatize_text(text: str) -> str:
    """Return *text* with every token replaced by its lemma.

    Tokens are produced by ``word_tokenize``; each one is passed to
    ``wordnet_lemmatizer.lemmatize`` with its default arguments, and the
    results are joined with single spaces.
    """
    lemmas = map(wordnet_lemmatizer.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)
# Run the full preprocessing pipeline over the song lyrics
def lyrics_preprocessing(song_list):
    """Add cleaned lyrics to *song_list* and return the relevant columns.

    The 'text' column is passed, in order, through ``preprocess``,
    ``remove_stop_words`` and ``lemmatize_text``; the result is stored in a
    'clean_lyrics' column on *song_list* itself (the argument is modified).

    Args:
        song_list: Table with 'artist', 'song' and 'text' columns
            (presumably a pandas DataFrame -- confirm against callers).

    Returns:
        The 'artist', 'song', 'text' and 'clean_lyrics' columns of
        *song_list*.
    """
    cleaned = song_list['text']
    for step in (preprocess, remove_stop_words, lemmatize_text):
        cleaned = cleaned.apply(step)
    song_list['clean_lyrics'] = cleaned
    return song_list[['artist', 'song', 'text', 'clean_lyrics']]