Merge pull request #77 from Micheallei/main

Update RecExplainer for unifying folder paths in different scripts (fixes #71)
microsoft · Nov 15, 2024 · a2c9cf6 · a2c9cf6
2 parents 2fe6d63 + ad14e23
commit a2c9cf6
Show file tree

Hide file tree

Showing 9 changed files with 48 additions and 45 deletions.
diff --git a/RecExplainer/README.md b/RecExplainer/README.md
@@ -49,7 +49,7 @@ export MODEL=xxx;
 
 
 ## Dataset Preparation for Target Recommender Model
-For data preparation, you need to download three raw files: Amazon review, Amazon metadata, ShareGPT
+For data preparation, you need to download three raw files (Amazon reviews, Amazon metadata, and ShareGPT) and put them under `$HOME/RecAI/RecExplainer/data/amazon_video_games_v3/raw_data`:
 * Amazon Video Games 5-core reviews: https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Video_Games_5.json.gz
 * Amazon Video Games metadata: https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles2/meta_Video_Games.json.gz
 * ShareGPT: https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json
@@ -76,17 +76,26 @@ bash shell/unirec_mf_train.sh
 You need to copy some files to the UniRec directory in advance.
 ```bash
 cp preprocess/unirec_utils/data4Exp.py $HOME/UniRec/unirec/main
-cp $HOME/RecExplainer/data/unirec_raw_data/amazon_video_games_v3/train_ids.csv $HOME/UniRec/data/amazon_video_games_v3
-cp $HOME/RecExplainer/data/unirec_raw_data/amazon_video_games_v3/test_ids.csv $HOME/UniRec/data/amazon_video_games_v3
+cp $HOME/RecAI/RecExplainer/data/unirec_raw_data/amazon_video_games_v3/train_ids.csv $HOME/UniRec/data/amazon_video_games_v3
+cp $HOME/RecAI/RecExplainer/data/unirec_raw_data/amazon_video_games_v3/test_ids.csv $HOME/UniRec/data/amazon_video_games_v3
 ```
 For SASRec model:
 ```bash
 bash shell/unirec_sasrec_infer.sh
 ```
+After inference, copy the contents of `$HOME/UniRec/output/amazon_video_games_v3/SASRec/RecExplainer/xxx/` to `$HOME/RecAI/RecExplainer/data/amazon_video_games_v3`.
+
+When this is done, `$HOME/RecAI/RecExplainer/data/amazon_video_games_v3` should contain the following files: datamaps.json, metadata.json, SASRec.pth, sequential_data.txt, sim_item.txt, test_top.txt, train_top.txt.
+
 For MF model:
 ```bash
 bash shell/unirec_mf_infer.sh
 ```
+After inference, copy the contents of `$HOME/UniRec/output/amazon_video_games_v3/MF/RecExplainer/xxx/` to `$HOME/RecAI/RecExplainer/data/mf_amazon_video_games_v3`.
+
+In addition, copy the following files from `$HOME/RecAI/RecExplainer/data/amazon_video_games_v3` to `$HOME/RecAI/RecExplainer/data/mf_amazon_video_games_v3`: datamaps.json, metadata.json, sequential_data.txt.
+
+When this is done, `$HOME/RecAI/RecExplainer/data/mf_amazon_video_games_v3` should contain the following files: datamaps.json, metadata.json, MF.pth, sequential_data.txt, sim_item.txt, test_top.txt, train_top.txt.
 
 ## Dataset Preparation for RecExplainer Model
 ```bash

diff --git a/RecExplainer/shell/eval_explan.sh b/RecExplainer/shell/eval_explan.sh
@@ -1,9 +1,9 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-DATA_DIR=$HOME/RecExplainer/output/amazon_video_games_v3/explan
+DATA_DIR=$HOME/RecAI/RecExplainer/output/amazon_video_games_v3/explan
 
-cd $HOME/RecExplainer/preprocess
+cd $HOME/RecAI/RecExplainer/preprocess
 
 python eval_explan.py --model_names "recexplainer-B,recexplainer-I,recexplainer-H,llama3,chatgpt" \
     --model_response_files "$DATA_DIR/recexplainer-B_response.csv,$DATA_DIR/recexplainer-I_response.csv,$DATA_DIR/recexplainer-H_response.csv,$DATA_DIR/llama3_response.csv,$DATA_DIR/chatgpt_response.csv" \

diff --git a/RecExplainer/shell/infer_alignment.sh b/RecExplainer/shell/infer_alignment.sh
@@ -1,10 +1,9 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-DATA_DIR=$HOME/blob/RecExplainer/amazon_video_games_v3
-UNIREC_DATA_DIR="$HOME/blob/RecExplainer/amazon_video_games_v3"
+DATA_DIR=$HOME/RecAI/RecExplainer/data/amazon_video_games_v3
 
-output_dir=$HOME/RecExplainer/output/amazon_video_games_v3/
+output_dir=$HOME/RecAI/RecExplainer/output/amazon_video_games_v3/
 model_name_or_path="path to your merged model"
 validation_file=$DATA_DIR/both_valid.json
 sequential_file=$DATA_DIR/sequential_data.txt
@@ -15,13 +14,13 @@ task_type="both"
 template_name="llama-3"
 
 metadata_file=$DATA_DIR/metadata.json
-test_top_file=$UNIREC_DATA_DIR/test_top.txt
+test_top_file=$DATA_DIR/test_top.txt
 torch_dtype="bfloat16"
 attn_implementation="flash_attention_2"
 rec_model_type="SASRec"
 
 
-cd $HOME/RecExplainer
+cd $HOME/RecAI/RecExplainer
 
 ## infer for item recovery task
 accelerate launch --config_file ./shell/config/infer_single_node.yaml ./src/inference.py \

diff --git a/RecExplainer/shell/infer_explan.sh b/RecExplainer/shell/infer_explan.sh
@@ -1,10 +1,9 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-DATA_DIR="$HOME/blob/RecExplainer/amazon_video_games_v3"
-UNIREC_DATA_DIR="$HOME/blob/RecExplainer/amazon_video_games_v3"
+DATA_DIR="$HOME/RecAI/RecExplainer/data/amazon_video_games_v3"
 
-output_dir=$HOME/RecExplainer/output/amazon_video_games_v3/explan_valid.csv
+output_dir=$HOME/RecAI/RecExplainer/output/amazon_video_games_v3/explan/recexplainer-H_response.csv
 model_name_or_path="path to your merged model"
 validation_file=$DATA_DIR/explan_both_valid.json
 sequential_file=$DATA_DIR/sequential_data.txt
@@ -15,12 +14,12 @@ task_type="both"
 template_name="llama-3"
 
 metadata_file=$DATA_DIR/metadata.json
-test_top_file=$UNIREC_DATA_DIR/test_top.txt
+test_top_file=$DATA_DIR/test_top.txt
 torch_dtype="bfloat16"
 attn_implementation="flash_attention_2"
 rec_model_type="SASRec"
 
-cd $HOME/RecExplainer
+cd $HOME/RecAI/RecExplainer
 
 accelerate launch --config_file ./shell/config/infer.yaml ./src/inference.py \
     --preprocessing_num_workers 4 \

diff --git a/RecExplainer/shell/merge.sh b/RecExplainer/shell/merge.sh
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-cd $HOME/RecExplainer
+cd $HOME/RecAI/RecExplainer
 
 ### --model_name_or_path: the path to the original LLM
 
@@ -10,7 +10,7 @@ python ./src/merge.py \
     --cache_dir $HOME/.cache \
     --peft_model_name path/to/your/training/checkpoint \
     --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
-    --rec_model_name_or_path $HOME/blob/RecExplainer/amazon_video_games_v3/SASRec.pth \
+    --rec_model_name_or_path $HOME/RecAI/RecExplainer/data/amazon_video_games_v3/SASRec.pth \
     --task_type both \
     --torch_dtype bfloat16 \
     --attn_implementation flash_attention_2 \

diff --git a/RecExplainer/shell/preprocess_recmodel.sh b/RecExplainer/shell/preprocess_recmodel.sh
@@ -1,24 +1,24 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-RAW_DATA_DIR="$HOME/RecExplainer/data/amazon_video_games_v3/raw_data"
+RAW_DATA_DIR="$HOME/RecAI/RecExplainer/data/amazon_video_games_v3/raw_data"
 full_data_name="Video_Games"
 meta_file="$RAW_DATA_DIR/meta_Video_Games.json.gz"
 review_file="$RAW_DATA_DIR/Video_Games_5.json.gz"
 raw_save_data_file="$RAW_DATA_DIR/sequential_data.txt"
 raw_save_metadata_file="$RAW_DATA_DIR/metadata.json"
 raw_save_datamaps_file="$RAW_DATA_DIR/datamaps.json"
 
-PROCESS_DATA_DIR="$HOME/RecExplainer/data/amazon_video_games_v3/process_data"
+PROCESS_DATA_DIR="$HOME/RecAI/RecExplainer/data/amazon_video_games_v3"
 process_save_data_file="$PROCESS_DATA_DIR/sequential_data.txt"
 process_save_metadata_file="$PROCESS_DATA_DIR/metadata.json"
 process_save_datamaps_file="$PROCESS_DATA_DIR/datamaps.json"
 item_thred=2000
 user_thred=4000
 
-UNIREC_RAW_DATA_DIR="$HOME/RecExplainer/data/unirec_raw_data/amazon_video_games_v3"
+UNIREC_RAW_DATA_DIR="$HOME/RecAI/RecExplainer/data/unirec_raw_data/amazon_video_games_v3"
 
-EXE_DIR="$HOME/RecExplainer/preprocess"
+EXE_DIR="$HOME/RecAI/RecExplainer/preprocess"
 cd $EXE_DIR
 
 python data_preprocess_amazon.py --full_data_name $full_data_name --meta_file $meta_file --review_file $review_file \

diff --git a/RecExplainer/shell/recexplainer_data_pipeline.sh b/RecExplainer/shell/recexplainer_data_pipeline.sh
@@ -1,19 +1,17 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-RAW_DATA_DIR="$HOME/blob/RecExplainer/amazon_video_games_v3"
-PROCESS_DATA_DIR="$HOME/blob/RecExplainer/amazon_video_games_v3"
-UNIREC_DATA_DIR="$HOME/blob/RecExplainer/amazon_video_games_v3"
+RAW_DATA_DIR="$HOME/RecAI/RecExplainer/data/amazon_video_games_v3/raw_data"
+PROCESS_DATA_DIR="$HOME/RecAI/RecExplainer/data/amazon_video_games_v3"
 
-MF_PROCESS_DATA_DIR="$HOME/blob/RecExplainer/mf_amazon_video_games_v3"
-MF_UNIREC_DATA_DIR="$HOME/blob/RecExplainer/mf_amazon_video_games_v3"
+MF_PROCESS_DATA_DIR="$HOME/RecAI/RecExplainer/data/mf_amazon_video_games_v3"
 
 gpt_response_file="$PROCESS_DATA_DIR/gpt4_data/test_response.csv"
 max_seq_len=9
 model_name="meta-llama/Meta-Llama-3-8B-Instruct"
 model_max_length=1024
 
-EXE_DIR="$HOME/RecExplainer/preprocess"
+EXE_DIR="$HOME/RecAI/RecExplainer/preprocess"
 cd $EXE_DIR
 
 
@@ -28,7 +26,7 @@ if [[ -e $gpt_response_file ]]; then
 else
     echo "generate gpt_response_file"
 
-    python preprocess/gpt_api.py --input_file $PROCESS_DATA_DIR/gpt4_data/test_query.csv --output_file $gpt_response_file
+    python gpt_api.py --input_file $PROCESS_DATA_DIR/gpt4_data/test_query.csv --output_file $gpt_response_file
 
 fi
 
@@ -37,18 +35,18 @@ fi
 ### generate training and testing data for alignment tasks
 python amazon_generate_v3.py --sharegpt_file $RAW_DATA_DIR/ShareGPT_V3_unfiltered_cleaned_split.json \
     --seqdata_file $PROCESS_DATA_DIR/sequential_data.txt --metadata_file $PROCESS_DATA_DIR/metadata.json \
-    --sim_item_file $UNIREC_DATA_DIR/sim_item.txt --train_top_file $UNIREC_DATA_DIR/train_top.txt --test_top_file $UNIREC_DATA_DIR/test_top.txt \
+    --sim_item_file $PROCESS_DATA_DIR/sim_item.txt --train_top_file $PROCESS_DATA_DIR/train_top.txt --test_top_file $PROCESS_DATA_DIR/test_top.txt \
     --gpt_response_file $gpt_response_file \
     --save_intention_file $PROCESS_DATA_DIR/intention --save_behavior_file $PROCESS_DATA_DIR/behaviour --save_both_file $PROCESS_DATA_DIR/both \
     --max_seq_len $max_seq_len --model_name $model_name --model_max_length $model_max_length
 
 ### generate testing data for explanation task
 python explan_data_gen.py --data_dir $PROCESS_DATA_DIR --seqdata_file $PROCESS_DATA_DIR/sequential_data.txt --metadata_file $PROCESS_DATA_DIR/metadata.json \
-    --test_top_file $UNIREC_DATA_DIR/test_top.txt --max_seq_len $max_seq_len --max_samples 500
+    --test_top_file $PROCESS_DATA_DIR/test_top.txt --max_seq_len $max_seq_len --max_samples 500
 
 ### generate training data for explanation task, used to train classifier and score predictor
 python explan_data_gen.py --data_dir $PROCESS_DATA_DIR --seqdata_file $PROCESS_DATA_DIR/sequential_data.txt --metadata_file $PROCESS_DATA_DIR/metadata.json \
-    --test_top_file $UNIREC_DATA_DIR/train_top.txt --max_seq_len $max_seq_len --max_samples 2000 --split "train"
+    --test_top_file $PROCESS_DATA_DIR/train_top.txt --max_seq_len $max_seq_len --max_samples 2000 --split "train"
 
 
 ######################################################
@@ -58,15 +56,15 @@ python explan_data_gen.py --data_dir $PROCESS_DATA_DIR --seqdata_file $PROCESS_D
 ###  generate training and testing data for alignment tasks
 python mf_amazon_video_games_generate.py --sharegpt_file $RAW_DATA_DIR/ShareGPT_V3_unfiltered_cleaned_split.json \
     --seqdata_file $MF_PROCESS_DATA_DIR/sequential_data.txt --metadata_file $MF_PROCESS_DATA_DIR/metadata.json \
-    --sim_item_file $MF_UNIREC_DATA_DIR/sim_item.txt --test_top_file $MF_UNIREC_DATA_DIR/test_top.txt \
+    --sim_item_file $MF_PROCESS_DATA_DIR/sim_item.txt --test_top_file $MF_PROCESS_DATA_DIR/test_top.txt \
     --gpt_response_file $gpt_response_file \
     --save_intention_file $MF_PROCESS_DATA_DIR/intention --save_behavior_file $MF_PROCESS_DATA_DIR/behaviour --save_both_file $MF_PROCESS_DATA_DIR/both \
     --max_seq_len $max_seq_len --model_name $model_name --model_max_length $model_max_length
 
 ### generate testing data for explanation task
 python explan_data_gen.py --data_dir $MF_PROCESS_DATA_DIR --seqdata_file $MF_PROCESS_DATA_DIR/sequential_data.txt --metadata_file $MF_PROCESS_DATA_DIR/metadata.json \
-    --test_top_file $MF_UNIREC_DATA_DIR/test_top.txt --max_seq_len $max_seq_len --max_samples 500 --rec_model_type "MF"
+    --test_top_file $MF_PROCESS_DATA_DIR/test_top.txt --max_seq_len $max_seq_len --max_samples 500 --rec_model_type "MF"
 
 ### generate training data for explanation task, used to train classifier and score predictor
 python explan_data_gen.py --data_dir $MF_PROCESS_DATA_DIR --seqdata_file $MF_PROCESS_DATA_DIR/sequential_data.txt --metadata_file $MF_PROCESS_DATA_DIR/metadata.json \
-    --test_top_file $MF_UNIREC_DATA_DIR/test_top.txt --max_seq_len $max_seq_len --max_samples 2000 --split "train" --rec_model_type "MF"
+    --test_top_file $MF_PROCESS_DATA_DIR/test_top.txt --max_seq_len $max_seq_len --max_samples 2000 --split "train" --rec_model_type "MF"
diff --git a/RecExplainer/shell/train.sh b/RecExplainer/shell/train.sh
@@ -8,12 +8,11 @@ export DISABLE_MLFLOW_INTEGRATION=true;
 export WANDB_DIR=$HOME/.cache/
 export WANDB_PROJECT="RecExplainer"
 
-DATA_DIR="$HOME/blob/RecExplainer/amazon_video_games_v3"
-UNIREC_DATA_DIR="$HOME/blob/RecExplainer/amazon_video_games_v3"
+DATA_DIR="$HOME/RecAI/RecExplainer/data/amazon_video_games_v3"
 
 attn_implementation="flash_attention_2"
 model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct"
-rec_model_name_or_path=$UNIREC_DATA_DIR/SASRec.pth
+rec_model_name_or_path=$DATA_DIR/SASRec.pth
 rec_model_type="SASRec"
 model_max_length=1024
 torch_dtype="bfloat16"
@@ -27,7 +26,7 @@ template_name="llama-3"
 
 output_dir=$DATA_DIR/output/both_flashattn2_llam3-8b_len1024_bf16_lr1e-4_epoch20_batch4_accu4_warmratio0.1_4gpus
 
-cd $HOME/RecExplainer
+cd $HOME/RecAI/RecExplainer
 
 torchrun --nnodes=1 --nproc_per_node 4 --master_port=29501 ./src/sft_training.py \
     --seed 2024 \
@@ -61,7 +60,7 @@ torchrun --nnodes=1 --nproc_per_node 4 --master_port=29501 ./src/sft_training.py
     --save_strategy epoch \
     --evaluation_strategy epoch \
     --report_to wandb \
-    --run_name "amazon_video_games_v3_both_flashattn2_llam3-8b_len1024_bf16_lr1e-4_epoch20_batch4_accu4_warmratio0.1_4gpus" > $HOME/RecExplainer/training.log 2>&1
+    --run_name "amazon_video_games_v3_both_flashattn2_llam3-8b_len1024_bf16_lr1e-4_epoch20_batch4_accu4_warmratio0.1_4gpus" > $HOME/RecAI/RecExplainer/training.log 2>&1
 
 
 ### for training MF model
@@ -70,12 +69,11 @@ export DISABLE_MLFLOW_INTEGRATION=true;
 export WANDB_DIR=$HOME/.cache/
 export WANDB_PROJECT="RecExplainer"
 
-DATA_DIR="$HOME/blob/RecExplainer/mf_amazon_video_games_v3"
-UNIREC_DATA_DIR="$HOME/blob/RecExplainer/mf_amazon_video_games_v3"
+DATA_DIR="$HOME/RecAI/RecExplainer/data/mf_amazon_video_games_v3"
 
 attn_implementation="flash_attention_2"
 model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct"
-rec_model_name_or_path=$UNIREC_DATA_DIR/MF.pth
+rec_model_name_or_path=$DATA_DIR/MF.pth
 rec_model_type="MF"
 model_max_length=1024
 torch_dtype="bfloat16"
@@ -89,7 +87,7 @@ template_name="llama-3"
 
 output_dir=$DATA_DIR/output/both_flashattn2_llam3-8b_len1024_bf16_lr1e-4_epoch20_batch4_accu4_warmratio0.1_4gpus
 
-cd $HOME/RecExplainer
+cd $HOME/RecAI/RecExplainer
 
 torchrun --nnodes=1 --nproc_per_node 4 --master_port=29501 ./src/sft_training.py \
     --seed 2024 \
@@ -123,5 +121,5 @@ torchrun --nnodes=1 --nproc_per_node 4 --master_port=29501 ./src/sft_training.py
     --save_strategy epoch \
     --evaluation_strategy epoch \
     --report_to wandb \
-    --run_name "mf_amazon_video_games_v3_both_flashattn2_llam3-8b_len1024_bf16_lr1e-4_epoch20_batch4_accu4_warmratio0.1_4gpus" > $HOME/RecExplainer/mf_training.log 2>&1
+    --run_name "mf_amazon_video_games_v3_both_flashattn2_llam3-8b_len1024_bf16_lr1e-4_epoch20_batch4_accu4_warmratio0.1_4gpus" > $HOME/RecAI/RecExplainer/mf_training.log 2>&1
 
diff --git a/RecExplainer/shell/unirec_prepare_data.sh b/RecExplainer/shell/unirec_prepare_data.sh
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 
-RAW_DATA_DIR="$HOME/RecExplainer/data/unirec_raw_data/"
+RAW_DATA_DIR="$HOME/RecAI/RecExplainer/data/unirec_raw_data/"
 
 ROOT_DIR="$HOME/UniRec"
 DATA_ROOT="$ROOT_DIR/data"