-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #62 from boostcampaitech5/feature-sentimental
[Feature] Sentimental complete
- Loading branch information
Showing
13 changed files
with
489 additions
and
1,973 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
if __name__ == "__main__": | ||
import uvicorn | ||
|
||
uvicorn.run("app.main:app", host="0.0.0.0", port=30007, reload=True) | ||
uvicorn.run("Sentimental.main:app", host="0.0.0.0", port=30007, reload=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import os" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## 회사명 기준으로 파일 나누기\n", | ||
"preprocessed_data_로 시작하는 chatgpt 라벨링 데이터를 회사명으로 분류해서 merged_(회사명) 으로 저장" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def merge_and_move_files(directory_path):\n", | ||
" file_list = os.listdir(directory_path)\n", | ||
"\n", | ||
" files_by_company = {}\n", | ||
"\n", | ||
" # 파일들을 기업 이름에 따라 그룹화\n", | ||
" for file_name in file_list:\n", | ||
" # if file_name[0].isdigit():\n", | ||
" if file_name.startswith(\"preprocessed_data_\"):\n", | ||
" file_path = os.path.join(directory_path, file_name)\n", | ||
" data = pd.read_csv(file_path)\n", | ||
" \n", | ||
" company_name = data['company'].iloc[0]\n", | ||
" \n", | ||
" if company_name not in files_by_company:\n", | ||
" files_by_company[company_name] = []\n", | ||
" \n", | ||
" files_by_company[company_name].append(file_name)\n", | ||
"\n", | ||
" # 각 기업에 대해 폴더를 만들고 해당 폴더로 각각 파일들을 저장\n", | ||
" for company_name, files in files_by_company.items():\n", | ||
" folder_path = os.path.join(directory_path, \"merged\")\n", | ||
" os.makedirs(folder_path, exist_ok=True)\n", | ||
"\n", | ||
" combined_data = pd.concat([pd.read_csv(os.path.join(directory_path, file)) for file in files], axis=0)\n", | ||
"\n", | ||
" merged_file_path = os.path.join(folder_path, f\"merged_{company_name}.csv\")\n", | ||
" combined_data.to_csv(merged_file_path, index=False)\n", | ||
" \n", | ||
"\n", | ||
"if __name__ == \"__main__\":\n", | ||
" target_directory = \"/opt/ml/finance_sentiment_corpus\"\n", | ||
" merge_and_move_files(target_directory)\n" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## 파일 합치기\n", | ||
"merged_(회사명)을 학습 시키기위한 merged_all.csv 만들기" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def merge_csv_files(directory_path, output_file_name):\n", | ||
" file_list = os.listdir(directory_path)\n", | ||
" combined_data = pd.DataFrame()\n", | ||
"\n", | ||
" for file_name in file_list:\n", | ||
" if file_name.endswith(\".csv\") and file_name != \"merged_all.csv\" : # merged_all.csv 제외하고 합치기\n", | ||
" file_path = os.path.join(directory_path, file_name)\n", | ||
"\n", | ||
" data = pd.read_csv(file_path)\n", | ||
" \n", | ||
" # 혹여나 \"labels\"로 지정해둔 column 이름 변경\n", | ||
" if \"labels\" in data.columns and \"label\" not in data.columns :\n", | ||
" data[\"label\"] = data[\"labels\"]\n", | ||
" \n", | ||
" data = data[[\"company\", \"title\", \"date\", \"content_corpus\", \"label\"]]\n", | ||
" combined_data = pd.concat([combined_data, data], axis=0, ignore_index=True)\n", | ||
" \n", | ||
" output_file_path = os.path.join(directory_path, output_file_name)\n", | ||
"\n", | ||
" # 지우고 다시 만들기\n", | ||
" if os.path.exists(output_file_path):\n", | ||
" os.remove(output_file_path)\n", | ||
" \n", | ||
" combined_data.to_csv(output_file_path, index=False)\n", | ||
" return combined_data\n", | ||
"\n", | ||
"directory_path = \"/opt/ml/finance_sentiment_corpus/merged\"\n", | ||
"output_file_name = \"merged_all.csv\"\n", | ||
"output_file_path = directory_path + output_file_name\n", | ||
" \n", | ||
"data = merge_csv_files(directory_path , output_file_name)" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## TEST 데이터셋 만들기\n", | ||
"26개의 회사에서 20개씩 뽑아 test_dataset 만들기" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"csv_file_path = \"/opt/ml/finance_sentiment_corpus/label_0_to_521.csv\"\n", | ||
"df = pd.read_csv(csv_file_path)\n", | ||
"\n", | ||
"# 새로운 DataFrame을 저장할 리스트 생성\n", | ||
"new_dfs = []\n", | ||
"\n", | ||
"# company 컬럼의 고유한 값들을 추출하여 각 회사별로 20개씩 행을 샘플링하여 새로운 DataFrame으로 생성\n", | ||
"for company_name in df['company'].unique():\n", | ||
" company_subset = df[df['company'] == company_name].sample(n=10, random_state=42) # 20개씩 랜덤 샘플링 (여기서는 random_state를 고정하여 재현성을 위해 사용)\n", | ||
" new_dfs.append(company_subset)\n", | ||
"\n", | ||
"# 새로운 DataFrame을 병합하여 하나의 DataFrame으로 합치기\n", | ||
"result_df = pd.concat(new_dfs)\n", | ||
"\n", | ||
"# 새로운 DataFrame을 CSV 파일로 저장\n", | ||
"result_csv_file_path = \"/opt/ml/finance_sentiment_corpus/26_company_half_labeled.csv\" # 저장할 파일 경로 설정 (적절하게 변경해주세요)\n", | ||
"result_df.to_csv(result_csv_file_path, index=False) # index=False를 지정하여 인덱스를 저장하지 않도록 설정합니다.\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "final", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.5" | ||
}, | ||
"orig_nbformat": 4, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "981f108a204f421f158e0977940335d851edffa6dd3586828a3e1aec045160e4" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.