Merge pull request #62 from boostcampaitech5/feature-sentimental
[Feature] Sentimental complete
dbsrlskfdk authored Aug 1, 2023
2 parents 0e209f8 + fc2e7ee commit 82f6636
Showing 13 changed files with 489 additions and 1,973 deletions.
2 changes: 1 addition & 1 deletion Backend/Sentimental/__main__.py
@@ -1,4 +1,4 @@
 if __name__ == "__main__":
     import uvicorn

-    uvicorn.run("app.main:app", host="0.0.0.0", port=30007, reload=True)
+    uvicorn.run("Sentimental.main:app", host="0.0.0.0", port=30007, reload=True)
112 changes: 70 additions & 42 deletions Backend/Sentimental/main.py
@@ -11,66 +11,94 @@

app = FastAPI()

MODEL_PATH = "/opt/ml/input/model-roberta_large-sota_trainer"
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
MODEL_PATH = "/opt/ml/outputs/klue/roberta-large_merged-4_100-26_100"

model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
special_tokens_dict = {'additional_special_tokens': ['[COMPANY]','[/COMPANY]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

@app.get("/")
def hello_world():
return {"hello": "world"}
model.resize_token_embeddings(len(tokenizer))

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

def predict_sentiment(text):
model.eval()
with torch.no_grad() :
temp = tokenizer(
text,
return_tensors='pt',
padding=True,
truncation=True,
##
max_length=100,
# stride=stride,
# return_overflowing_tokens=True,
return_offsets_mapping=False
)


predicted_label = model(input_ids=temp['input_ids'],
token_type_ids=temp['token_type_ids'])

print(predicted_label)

results = []
results = torch.nn.Softmax(dim=-1)(predicted_label.logits)
def extract_sentences_token(input_dict, pad_token_id):
'''
If the input is longer than 512 tokens, split input_ids, token_type_ids and attention_mask
into the first 128 and the last 384 tokens.
Otherwise, the tail stays filled with pad tokens.
Usage:
train_encoding = tokenizer(text)
train_encoding = extract_sentences_token(train_encoding, tokenizer.pad_token_id)
'''
new = {}
batch_size = len(input_dict['input_ids'])
new['input_ids'] = torch.ones(batch_size, 512, dtype=int)
new['token_type_ids'] = torch.ones(batch_size, 512, dtype=int)
new['attention_mask'] = torch.ones(batch_size, 512, dtype=int)
# batch_size, 512
for i in range(batch_size):
a = input_dict['input_ids'][i]
a = a[a != pad_token_id]
length = len(a)
if length > 512:
left, right = 1, 3
a = torch.cat((a[:128*left], a[-128*right:]), dim=0)
new['input_ids'][i] = a
new['token_type_ids'][i] = input_dict['token_type_ids'][i][:512]
new['attention_mask'][i] = input_dict['attention_mask'][i][:512]
else:
new['input_ids'][i] = input_dict['input_ids'][i][:512]
new['token_type_ids'][i] = input_dict['token_type_ids'][i][:512]
new['attention_mask'][i] = input_dict['attention_mask'][i][:512]
return new

def predict_sentiment(text: List[str]) -> List[str]:
answer = []
print(results)
for result in results :
if result[0]>=result[1] :
answer.append("부정")

else :
answer.append("긍정")

model.eval()

loader = torch.utils.data.DataLoader(dataset=text, batch_size=32, shuffle=False)
with torch.no_grad() :
for batch_text in loader:
temp = tokenizer(
batch_text,
return_tensors='pt',
padding="max_length",
truncation=True,
max_length=3000, # large enough to tokenize every article without dropping tokens
)
temp = extract_sentences_token(temp, tokenizer.pad_token_id)
if torch.cuda.is_available():
temp = {key: value.to(device) for key, value in temp.items()}
predicted_label = model(**temp)

results = torch.nn.Softmax(dim=-1)(predicted_label.logits)
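# Softmax index 0 is treated as 부정 (negative) and index 1 as 긍정 (positive) in the comparison below.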
for result in results :
if result[0]>=result[1] :
answer.append("부정")
else :
answer.append("긍정")

return answer

class FinanaceSentiment(BaseModel):
corpus_list: list = []
title: str = "title"
company: str = "삼성전자"
result: Optional[List]
company_list: list = []


@app.post("/classify_sentiment", description="문장의 감정을 분류합니다.")
async def classify_sentiment(finance: FinanaceSentiment):
# Run the sentiment model on the input text.
predictions = predict_sentiment(finance.corpus_list)
input = []
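# Prefix each article with its company wrapped in the [COMPANY] special tokens; the Korean
# template reads "This article is about [COMPANY]{company}[/COMPANY]. {corpus}".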
for corpus, company in zip(finance.corpus_list, finance.company_list) :
input.append(f"이 기사는[COMPANY]{company}[/COMPANY]에 대한 기사야. {corpus}")

predictions = predict_sentiment(input)

# Return the result.
result = {
"title": finance.title,
# "input_text": finance.corpus,
"sentiment": predictions
}

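For reference, a minimal sketch of how a client could call the new endpoint, assuming the service runs locally on port 30007 as configured in __main__.py. The payload values are made-up placeholders following the FinanaceSentiment schema; corpus_list and company_list must have the same length because the handler zips them together:

import requests

payload = {
    "title": "sample batch",                                   # free-form title
    "company": "삼성전자",                                       # schema default, not used per article
    "corpus_list": ["기사 본문 1 (placeholder)", "기사 본문 2 (placeholder)"],
    "company_list": ["삼성전자", "LG전자"],                       # one company per article
}

resp = requests.post("http://localhost:30007/classify_sentiment", json=payload)
# Based on the result dict built in the handler above (its tail is collapsed in this view),
# the response should look like: {"title": "sample batch", "sentiment": ["부정", "긍정", ...]}
print(resp.json())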
173 changes: 173 additions & 0 deletions Notebooks/Sentimental/glob.ipynb
@@ -0,0 +1,173 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## 회사명 기준으로 파일 나누기\n",
"preprocessed_data_로 시작하는 chatgpt 라벨링 데이터를 회사명으로 분류해서 merged_(회사명) 으로 저장"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def merge_and_move_files(directory_path):\n",
" file_list = os.listdir(directory_path)\n",
"\n",
" files_by_company = {}\n",
"\n",
" # 파일들을 기업 이름에 따라 그룹화\n",
" for file_name in file_list:\n",
" # if file_name[0].isdigit():\n",
" if file_name.startswith(\"preprocessed_data_\"):\n",
" file_path = os.path.join(directory_path, file_name)\n",
" data = pd.read_csv(file_path)\n",
" \n",
" company_name = data['company'].iloc[0]\n",
" \n",
" if company_name not in files_by_company:\n",
" files_by_company[company_name] = []\n",
" \n",
" files_by_company[company_name].append(file_name)\n",
"\n",
" # 각 기업에 대해 폴더를 만들고 해당 폴더로 각각 파일들을 저장\n",
" for company_name, files in files_by_company.items():\n",
" folder_path = os.path.join(directory_path, \"merged\")\n",
" os.makedirs(folder_path, exist_ok=True)\n",
"\n",
" combined_data = pd.concat([pd.read_csv(os.path.join(directory_path, file)) for file in files], axis=0)\n",
"\n",
" merged_file_path = os.path.join(folder_path, f\"merged_{company_name}.csv\")\n",
" combined_data.to_csv(merged_file_path, index=False)\n",
" \n",
"\n",
"if __name__ == \"__main__\":\n",
" target_directory = \"/opt/ml/finance_sentiment_corpus\"\n",
" merge_and_move_files(target_directory)\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## 파일 합치기\n",
"merged_(회사명)을 학습 시키기위한 merged_all.csv 만들기"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def merge_csv_files(directory_path, output_file_name):\n",
" file_list = os.listdir(directory_path)\n",
" combined_data = pd.DataFrame()\n",
"\n",
" for file_name in file_list:\n",
" if file_name.endswith(\".csv\") and file_name != \"merged_all.csv\" : # merged_all.csv 제외하고 합치기\n",
" file_path = os.path.join(directory_path, file_name)\n",
"\n",
" data = pd.read_csv(file_path)\n",
" \n",
" # 혹여나 \"labels\"로 지정해둔 column 이름 변경\n",
" if \"labels\" in data.columns and \"label\" not in data.columns :\n",
" data[\"label\"] = data[\"labels\"]\n",
" \n",
" data = data[[\"company\", \"title\", \"date\", \"content_corpus\", \"label\"]]\n",
" combined_data = pd.concat([combined_data, data], axis=0, ignore_index=True)\n",
" \n",
" output_file_path = os.path.join(directory_path, output_file_name)\n",
"\n",
" # 지우고 다시 만들기\n",
" if os.path.exists(output_file_path):\n",
" os.remove(output_file_path)\n",
" \n",
" combined_data.to_csv(output_file_path, index=False)\n",
" return combined_data\n",
"\n",
"directory_path = \"/opt/ml/finance_sentiment_corpus/merged\"\n",
"output_file_name = \"merged_all.csv\"\n",
"output_file_path = directory_path + output_file_name\n",
" \n",
"data = merge_csv_files(directory_path , output_file_name)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## TEST 데이터셋 만들기\n",
"26개의 회사에서 20개씩 뽑아 test_dataset 만들기"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"csv_file_path = \"/opt/ml/finance_sentiment_corpus/label_0_to_521.csv\"\n",
"df = pd.read_csv(csv_file_path)\n",
"\n",
"# 새로운 DataFrame을 저장할 리스트 생성\n",
"new_dfs = []\n",
"\n",
"# company 컬럼의 고유한 값들을 추출하여 각 회사별로 20개씩 행을 샘플링하여 새로운 DataFrame으로 생성\n",
"for company_name in df['company'].unique():\n",
" company_subset = df[df['company'] == company_name].sample(n=10, random_state=42) # 20개씩 랜덤 샘플링 (여기서는 random_state를 고정하여 재현성을 위해 사용)\n",
" new_dfs.append(company_subset)\n",
"\n",
"# 새로운 DataFrame을 병합하여 하나의 DataFrame으로 합치기\n",
"result_df = pd.concat(new_dfs)\n",
"\n",
"# 새로운 DataFrame을 CSV 파일로 저장\n",
"result_csv_file_path = \"/opt/ml/finance_sentiment_corpus/26_company_half_labeled.csv\" # 저장할 파일 경로 설정 (적절하게 변경해주세요)\n",
"result_df.to_csv(result_csv_file_path, index=False) # index=False를 지정하여 인덱스를 저장하지 않도록 설정합니다.\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "final",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "981f108a204f421f158e0977940335d851edffa6dd3586828a3e1aec045160e4"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
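As an optional follow-up, a quick sanity check of the merged training file produced by the notebook might look like this sketch; the path mirrors the directory used in the cells above, and the column list is exactly what the merge cell keeps:

import pandas as pd

merged = pd.read_csv("/opt/ml/finance_sentiment_corpus/merged/merged_all.csv")

# the merge cell keeps exactly these columns, in this order
assert list(merged.columns) == ["company", "title", "date", "content_corpus", "label"]

print(merged["company"].value_counts())  # articles per company
print(merged["label"].value_counts())    # label distribution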