From 065dc981d66fbfe8cebbe3b91317734c7d1dbf6e Mon Sep 17 00:00:00 2001 From: Joel Ye Date: Wed, 29 May 2024 12:42:18 -0400 Subject: [PATCH] add verbose mode --- decoder_demos/ndt2_sample.Dockerfile | 21 ++++++++++++++------- decoder_demos/ndt2_sample.py | 2 +- falcon_challenge/evaluator.py | 20 ++++++++++++++++---- setup.py | 2 +- test_docker_local.sh | 1 + 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/decoder_demos/ndt2_sample.Dockerfile b/decoder_demos/ndt2_sample.Dockerfile index 3a3387f..45a7411 100644 --- a/decoder_demos/ndt2_sample.Dockerfile +++ b/decoder_demos/ndt2_sample.Dockerfile @@ -26,21 +26,28 @@ ENV EVALUATION_LOC remote # Note that Docker cannot easily import across symlinks, make sure data is not symlinked # H1 -# ADD ./local_data/ndt2_h1_sample_nokey.pth data/decoder.pth +# ADD ./local_data/ndt2_h1_sample.pth data/decoder.pth # ADD ./local_data/ndt2_zscore_h1.pt data/zscore.pt +# ENV SPLIT "h1" +# ENV CONFIG_STEM falcon/h1/h1_100 # M1 +ADD ./local_data/ndt2_m1_sample_continual.pth data/decoder.pth +ADD ./local_data/ndt2_zscore_m1.pt data/zscore.pt +ENV SPLIT "m1" +ENV CONFIG_STEM falcon/m1/m1_100 -# M2 -ADD ./local_data/ndt2_m2_sample_continual.pth data/decoder.pth -ADD ./local_data/ndt2_zscore_m2.pt data/zscore.pt +# # M2 +# ADD ./local_data/ndt2_m2_sample_continual.pth data/decoder.pth +# ADD ./local_data/ndt2_zscore_m2.pt data/zscore.pt +# ENV SPLIT "m2" +# ENV CONFIG_STEM falcon/m2/m2_100 # Add runfile RUN pwd ADD ./decoder_demos/ndt2_sample.py decode.py -ADD ./decoder_demos/ndt2_decoder.py ndt2_decoder.py -ENV SPLIT "h1" +ENV BATCH_SIZE 16 ENV PHASE "test" # Make sure this matches the mounted data volume path. Generally leave as is. @@ -50,4 +57,4 @@ ENV EVAL_DATA_PATH "/dataset/evaluation_data" # CMD specifies a default command to run when the container is launched. # It can be overridden with any cmd e.g. sudo docker run -it my_image /bin/bash CMD ["/bin/bash", "-c", \ - "python decode.py --evaluation $EVALUATION_LOC --model-path data/decoder.pth --zscore-path data/zscore.pt --split $SPLIT --phase $PHASE"] \ No newline at end of file + "python decode.py --evaluation $EVALUATION_LOC --model-path data/decoder.pth --config-stem $CONFIG_STEM --zscore-path data/zscore.pt --split $SPLIT --batch-size $BATCH_SIZE --phase $PHASE"] \ No newline at end of file diff --git a/decoder_demos/ndt2_sample.py b/decoder_demos/ndt2_sample.py index 456833e..e85b59f 100644 --- a/decoder_demos/ndt2_sample.py +++ b/decoder_demos/ndt2_sample.py @@ -53,7 +53,7 @@ def main(): task = getattr(FalconTask, args.split) config = FalconConfig(task=task) max_bins = 50 if task in [FalconTask.m1, FalconTask.m2] else 200 # h1 - + decoder = NDT2Decoder( task_config=config, model_ckpt_path=args.model_path, diff --git a/falcon_challenge/evaluator.py b/falcon_challenge/evaluator.py index 9e45a02..d08a76f 100644 --- a/falcon_challenge/evaluator.py +++ b/falcon_challenge/evaluator.py @@ -235,6 +235,7 @@ def evaluate( mask_dict['held_out'].append(dataset_mask) else: raise ValueError(f"Dataset {dataset} submitted but not found in held-in or held-out list of split {datasplit}.") + for in_or_out in pred_dict: if len(pred_dict[in_or_out]) < len(DATASET_HELDINOUT_MAP[datasplit][in_or_out]): raise ValueError(f"Missing predictions for {datasplit} {in_or_out}. User submitted: {user_submission[datasplit].keys()}. Expecting more like: {HELDIN_OR_OUT_MAP[datasplit][in_or_out]}.") @@ -312,13 +313,17 @@ def simple_collater(batch, task): class FalconEvaluator: - def __init__(self, eval_remote=False, split='h1'): + def __init__(self, eval_remote=False, split='h1', verbose=False): + r""" + verbose: Print out dataset specific metrics for movement tasks. + """ self.eval_remote = eval_remote assert split in ['h1', 'h2', 'm1', 'm2'], "Split must be h1, h2, m1, or m2." if split in ['h1', 'm1', 'm2']: self.continual = True else: self.continual = False + self.verbose = verbose self.dataset: FalconTask = getattr(FalconTask, split) self.cfg = FalconConfig(self.dataset) @@ -554,9 +559,9 @@ def evaluate( else: for k, v in metrics.items(): logger.info("{}: {}".format(k, v)) - + @staticmethod - def compute_metrics_regression(preds, targets, eval_mask, dset_lens): + def compute_metrics_regression(preds, targets, eval_mask, dset_lens, verbose=False): # Verbose drop-in dset_lens = np.cumsum([sum(dset_lens[key]) for key in sorted(dset_lens.keys())]) masked_points = np.cumsum(~eval_mask) dset_lens = [0] + [dset_len - masked_points[dset_len - 1] for dset_len in dset_lens] @@ -566,11 +571,18 @@ def compute_metrics_regression(preds, targets, eval_mask, dset_lens): raise ValueError(f"Targets and predictions have different lengths: {targets.shape[0]} vs {preds.shape[0]}.") r2_scores = [r2_score(targets[dset_lens[i]:dset_lens[i+1]], preds[dset_lens[i]:dset_lens[i+1]], multioutput='variance_weighted') for i in range(len(dset_lens) - 1)] + if verbose: + dsets = sorted(dset_lens.keys()) + print([f'{k}: {r2}' for k, r2 in zip(dsets, r2_scores)]) + preds_dict = {k: preds[dset_lens[i]:dset_lens[i+1]] for i, k in enumerate(dsets)} + with open('preds.pkl', 'wb') as f: + pickle.dump(preds_dict, f) return { "R2 Mean": np.mean(r2_scores), "R2 Std.": np.std(r2_scores) } + @staticmethod def compute_metrics_edit_distance(preds, targets, eval_mask): if len(preds) != len(targets): @@ -609,7 +621,7 @@ def compute_metrics(self, all_preds, all_targets, all_eval_mask=None): all_eval_mask: array of shape (n_timesteps, k_dim). True if we should evaluate this timestep. """ if self.dataset in [FalconTask.h1, FalconTask.m1, FalconTask.m2]: - metrics = self.compute_metrics_regression(all_preds, all_targets, all_eval_mask) + metrics = self.compute_metrics_regression(all_preds, all_targets, all_eval_mask, verbose=self.verbose) elif self.dataset in [FalconTask.h2]: metrics = self.compute_metrics_edit_distance(all_preds, all_targets, all_eval_mask) else: diff --git a/setup.py b/setup.py index cb972d9..5e713cb 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='falcon_challenge', - version='0.3.9', + version='0.3.10', url='https://github.com/snel-repo/stability-benchmark', author='Joel Ye', diff --git a/test_docker_local.sh b/test_docker_local.sh index f43a6df..31130a2 100755 --- a/test_docker_local.sh +++ b/test_docker_local.sh @@ -22,4 +22,5 @@ done docker run \ -v $(pwd)/data:/dataset/evaluation_data \ -e "EVALUATION_LOC=local" \ + --gpus all \ ${DOCKER_NAME}\ \ No newline at end of file