forked from namu-tree/SynestheticReading_TeamD
-
Notifications
You must be signed in to change notification settings - Fork 2
/
main_story.py
83 lines (64 loc) · 2.63 KB
/
main_story.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
"""main_story
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1IeAH8xsWfh1J7pwqVMTCqt7VWnuSdCWp
"""
from utils_story import story_preprocessing
from utils_story import Searching
from utils_story import Importing
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
# Story input: feed the title through the utils_story helpers in sequence
# (Searching -> Importing -> story_preprocessing).
title = 'Snow-man-story'
story = story_preprocessing(Importing(Searching(title)))

# Set up the emotion-analysis model and the objects that drive it.
model_name = "j-hartmann/emotion-english-distilroberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The Trainer is built with the model only (no training arguments or data).
trainer = Trainer(model=model)
# Wrap tokenized text data so it can be supplied to the model as input.
class SimpleDataset:
    """Index-addressable view over a mapping of tokenized fields.

    ``tokenized_texts`` must support ``["input_ids"]`` lookup and ``.items()``;
    every value it holds is indexed by sample position.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The sample count is read from the "input_ids" field.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th entry of every field, keyed by field name.
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample
# Module-level list built from `story.clean_text`.
# NOTE(review): assumes `story` exposes a `clean_text` attribute with a
# `to_list()` method (presumably a pandas DataFrame column) — confirm against
# utils_story.story_preprocessing.
pred_texts = story.clean_text.to_list()
# Per-emotion probability columns, listed in the class-index order used to
# slice the probability matrix (index 0 -> 'anger', ..., index 6 -> 'surprise').
# NOTE(review): this order is hard-coded; presumably it mirrors
# model.config.id2label for the loaded checkpoint — confirm if the model changes.
_EMOTION_COLUMNS = ('anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise')
_RESULT_COLUMNS = ('text', 'pred', 'label', 'score') + _EMOTION_COLUMNS


def _softmax(logits):
    """Return the softmax of ``logits`` taken along the last axis."""
    # Work in float64 and subtract each row's maximum before exponentiating:
    # the result is mathematically unchanged, but np.exp can no longer
    # overflow to inf (and produce NaN) on large logits.
    logits = np.asarray(logits, dtype=np.float64)
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)


def story_emotion_classification(pred_texts):
    """Classify each text's emotion and return the results as a DataFrame.

    Relies on the module-level ``tokenizer``, ``trainer``, ``model`` and
    ``SimpleDataset``.

    Args:
        pred_texts: Sequence of texts; it is passed to the tokenizer as-is
            and must support ``len()``.

    Returns:
        A ``pandas.DataFrame`` with one row per text and the columns
        ``text``, ``pred`` (argmax class index), ``label`` (``pred`` mapped
        through ``model.config.id2label``), ``score`` (highest softmax
        probability) and one probability column per entry of
        ``_EMOTION_COLUMNS``. An empty input yields an empty frame with the
        same columns, without invoking the tokenizer or the model.
    """
    if len(pred_texts) == 0:
        return pd.DataFrame(columns=list(_RESULT_COLUMNS))

    # Tokenize texts and create the prediction dataset.
    tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)
    pred_dataset = SimpleDataset(tokenized_texts)

    # Run predictions.
    predictions = trainer.predict(pred_dataset)
    logits = predictions.predictions

    # Transform predictions into class indices and label names.
    preds = logits.argmax(-1)
    labels = pd.Series(preds).map(model.config.id2label)

    # Softmax once; reuse it for both the top score and the per-class columns.
    probs = _softmax(logits)
    scores = probs.max(axis=1)

    # Assemble texts, predictions, labels, top score and per-emotion scores.
    columns = {
        'text': list(pred_texts),
        'pred': preds,
        'label': labels,
        'score': scores,
    }
    for class_idx, emotion in enumerate(_EMOTION_COLUMNS):
        columns[emotion] = probs[:, class_idx]

    story_classified = pd.DataFrame(columns, columns=list(_RESULT_COLUMNS))
    return story_classified