diff --git a/tensorflow_model_analysis/evaluators/metrics_plots_and_validations_evaluator_test.py b/tensorflow_model_analysis/evaluators/metrics_plots_and_validations_evaluator_test.py
index dd5840f506..16241e3d99 100644
--- a/tensorflow_model_analysis/evaluators/metrics_plots_and_validations_evaluator_test.py
+++ b/tensorflow_model_analysis/evaluators/metrics_plots_and_validations_evaluator_test.py
@@ -166,330 +166,6 @@ def ci_derived_fn():
     self.assertLen(derived, 1)
     self.assertLen(ci_derived, 1)

-  def testEvaluateWithKerasAndValidateMetrics(self):
-    model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
-    eval_shared_model = self._build_keras_model('candidate', model_dir, mul=0)
-    baseline_eval_shared_model = self._build_keras_model(
-        'baseline', baseline_dir, mul=1
-    )
-
-    schema = text_format.Parse(
-        """
-        tensor_representation_group {
-          key: ""
-          value {
-            tensor_representation {
-              key: "input_1"
-              value {
-                dense_tensor {
-                  column_name: "input_1"
-                  shape { dim { size: 1 } }
-                }
-              }
-            }
-          }
-        }
-        feature {
-          name: "input_1"
-          type: FLOAT
-        }
-        feature {
-          name: "label"
-          type: FLOAT
-        }
-        feature {
-          name: "example_weight"
-          type: FLOAT
-        }
-        feature {
-          name: "extra_feature"
-          type: BYTES
-        }
-        """,
-        schema_pb2.Schema(),
-    )
-    tfx_io = test_util.InMemoryTFExampleRecord(
-        schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN
-    )
-    tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
-        arrow_schema=tfx_io.ArrowSchema(),
-        tensor_representations=tfx_io.TensorRepresentations(),
-    )
-
-    examples = [
-        self._makeExample(
-            input_1=0.0,
-            label=1.0,
-            example_weight=1.0,
-            extra_feature='non_model_feature',
-        ),
-        self._makeExample(
-            input_1=1.0,
-            label=0.0,
-            example_weight=0.5,
-            extra_feature='non_model_feature',
-        ),
-    ]
-
-    eval_config = config_pb2.EvalConfig(
-        model_specs=[
-            config_pb2.ModelSpec(
-                name='candidate',
-                label_key='label',
-                example_weight_key='example_weight',
-            ),
-            config_pb2.ModelSpec(
-                name='baseline',
-                label_key='label',
-                example_weight_key='example_weight',
-                is_baseline=True,
-            ),
-        ],
-        slicing_specs=[config_pb2.SlicingSpec()],
-        metrics_specs=[
-            config_pb2.MetricsSpec(
-                metrics=[
-                    config_pb2.MetricConfig(
-                        class_name='ExampleCount',
-                        # 2 > 10, NOT OK.
-                        threshold=config_pb2.MetricThreshold(
-                            value_threshold=config_pb2.GenericValueThreshold(
-                                lower_bound={'value': 10}
-                            )
-                        ),
-                    ),
-                ],
-                model_names=['candidate', 'baseline'],
-                example_weights=config_pb2.ExampleWeightOptions(
-                    unweighted=True
-                ),
-            ),
-            config_pb2.MetricsSpec(
-                metrics=[
-                    config_pb2.MetricConfig(
-                        class_name='WeightedExampleCount',
-                        # 1.5 < 1, NOT OK.
-                        threshold=config_pb2.MetricThreshold(
-                            value_threshold=config_pb2.GenericValueThreshold(
-                                upper_bound={'value': 1}
-                            )
-                        ),
-                    ),
-                ],
-                model_names=['candidate', 'baseline'],
-                example_weights=config_pb2.ExampleWeightOptions(weighted=True),
-            ),
-            config_pb2.MetricsSpec(
-                metrics=[
-                    config_pb2.MetricConfig(
-                        class_name='MeanLabel',
-                        # 0 > 1 and 0 > 1?: NOT OK.
-                        threshold=config_pb2.MetricThreshold(
-                            change_threshold=config_pb2.GenericChangeThreshold(
-                                direction=config_pb2.MetricDirection.HIGHER_IS_BETTER,
-                                relative={'value': 1},
-                                absolute={'value': 1},
-                            )
-                        ),
-                    ),
-                    config_pb2.MetricConfig(
-                        # MeanPrediction = (0+0)/(1+0.5) = 0
-                        class_name='MeanPrediction',
-                        # -.01 < 0 < .01, OK.
-                        # Diff% = -.333/.333 = -100% < -99%, OK.
-                        # Diff = 0 - .333 = -.333 < 0, OK.
-                        threshold=config_pb2.MetricThreshold(
-                            value_threshold=config_pb2.GenericValueThreshold(
-                                upper_bound={'value': 0.01},
-                                lower_bound={'value': -0.01},
-                            ),
-                            change_threshold=config_pb2.GenericChangeThreshold(
-                                direction=config_pb2.MetricDirection.LOWER_IS_BETTER,
-                                relative={'value': -0.99},
-                                absolute={'value': 0},
-                            ),
-                        ),
-                    ),
-                ],
-                thresholds={
-                    'loss': config_pb2.MetricThreshold(
-                        value_threshold=config_pb2.GenericValueThreshold(
-                            upper_bound={'value': 0}
-                        )
-                    )
-                },
-                model_names=['candidate', 'baseline'],
-            ),
-        ],
-    )
-    eval_shared_models = [eval_shared_model, baseline_eval_shared_model]
-    extractors = [
-        features_extractor.FeaturesExtractor(
-            eval_config=eval_config,
-            tensor_representations=tensor_adapter_config.tensor_representations,
-        ),
-        labels_extractor.LabelsExtractor(eval_config),
-        example_weights_extractor.ExampleWeightsExtractor(eval_config),
-        predictions_extractor.PredictionsExtractor(
-            eval_shared_model=eval_shared_models, eval_config=eval_config
-        ),
-        unbatch_extractor.UnbatchExtractor(),
-        slice_key_extractor.SliceKeyExtractor(eval_config=eval_config),
-    ]
-    evaluators = [
-        metrics_plots_and_validations_evaluator.MetricsPlotsAndValidationsEvaluator(
-            eval_config=eval_config, eval_shared_model=eval_shared_models
-        )
-    ]
-
-    with beam.Pipeline() as pipeline:
-      # pylint: disable=no-value-for-parameter
-      evaluations = (
-          pipeline
-          | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
-          | 'BatchExamples' >> tfx_io.BeamSource()
-          | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
-          | 'ExtractEvaluate'
-          >> model_eval_lib.ExtractAndEvaluate(
-              extractors=extractors, evaluators=evaluators
-          )
-      )
-
-      # pylint: enable=no-value-for-parameter
-
-      def check_validations(got):
-        try:
-          self.assertLen(got, 1)
-          got = got[0]
-          expected_metric_validations_per_slice = [
-              text_format.Parse(
-                  """
-                  metric_key {
-                    name: "loss"
-                    model_name: "candidate"
-                  }
-                  metric_threshold {
-                    value_threshold {
-                      upper_bound {
-                        value: 0.0
-                      }
-                    }
-                  }
-                  metric_value {
-                    double_value {
-                      value: 7.712474346160889
-                    }
-                  }
-                  """,
-                  validation_result_pb2.ValidationFailure(),
-              ),
-              text_format.Parse(
-                  """
-                  metric_key {
-                    name: "example_count"
-                    model_name: "candidate"
-                    example_weighted { }
-                  }
-                  metric_threshold {
-                    value_threshold {
-                      lower_bound {
-                        value: 10.0
-                      }
-                    }
-                  }
-                  metric_value {
-                    double_value {
-                      value: 2.0
-                    }
-                  }
-                  """,
-                  validation_result_pb2.ValidationFailure(),
-              ),
-              text_format.Parse(
-                  """
-                  metric_key {
-                    name: "weighted_example_count"
-                    model_name: "candidate"
-                    example_weighted { value: true }
-                  }
-                  metric_threshold {
-                    value_threshold {
-                      upper_bound {
-                        value: 1.0
-                      }
-                    }
-                  }
-                  metric_value {
-                    double_value {
-                      value: 1.5
-                    }
-                  }
-                  """,
-                  validation_result_pb2.ValidationFailure(),
-              ),
-              text_format.Parse(
-                  """
-                  metric_key {
-                    name: "mean_label"
-                    model_name: "candidate"
-                    is_diff: true
-                    example_weighted { value: true }
-                  }
-                  metric_threshold {
-                    change_threshold {
-                      absolute {
-                        value: 1.0
-                      }
-                      relative {
-                        value: 1.0
-                      }
-                      direction: HIGHER_IS_BETTER
-                    }
-                  }
-                  metric_value {
-                    double_value {
-                      value: 0.0
-                    }
-                  }
-                  """,
-                  validation_result_pb2.ValidationFailure(),
-              ),
-          ]
-          # Loss not supported in TFv1
-          if _TF_MAJOR_VERSION < 2:
-            expected_metric_validations_per_slice[0].ClearField('metric_value')
-            expected_metric_validations_per_slice[0].message = (
-                'Metric not found.'
-            )
-          self.assertFalse(got.validation_ok)
-          self.assertLen(got.metric_validations_per_slice, 1)
-          self.assertLen(
-              got.metric_validations_per_slice[0].failures,
-              len(expected_metric_validations_per_slice),
-          )
-          self.assertCountEqual(
-              got.metric_validations_per_slice[0].failures,
-              expected_metric_validations_per_slice,
-          )
-
-        except AssertionError as err:
-          raise util.BeamAssertException(err)
-
-      util.assert_that(
-          evaluations[constants.VALIDATIONS_KEY], check_validations
-      )
-
-      metric_filter = beam.metrics.metric.MetricsFilter().with_name(
-          'metric_computed_ExampleCount_v2_' + constants.TF_KERAS
-      )
-      actual_metrics_count = (
-          pipeline.run()
-          .metrics()
-          .query(filter=metric_filter)['counters'][0]
-          .committed
-      )
-      self.assertEqual(actual_metrics_count, 1)
-
   def testEvaluateWithKerasAndDiffMetrics(self):
     model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
     eval_shared_model = self._build_keras_model('candidate', model_dir, mul=0)
@@ -1080,31 +756,19 @@ def check_metrics(got):
               name='mean_label', example_weighted=False
           )
           binary_accuracy_key = metric_types.MetricKey(
-              name='binary_accuracy', example_weighted=None
+              name='binary_accuracy', example_weighted=False
          )
          self.assertIn(binary_accuracy_key, got_metrics)
          binary_accuracy_unweighted_key = metric_types.MetricKey(
              name='binary_accuracy', example_weighted=False
          )
          self.assertIn(binary_accuracy_unweighted_key, got_metrics)
-          # Loss not supported in TFv1
-          if _TF_MAJOR_VERSION > 1:
-            loss_key = metric_types.MetricKey(
-                name='loss', example_weighted=None
-            )
-            self.assertIn(loss_key, got_metrics)
          expected_values = {
              example_count_key: 2,
              weighted_example_count_key: 1.0 + 0.5,
              label_key: (1.0 * 1.0 + 0.0 * 0.5) / (1.0 + 0.5),
              label_unweighted_key: (1.0 + 0.0) / (1.0 + 1.0),
          }
-          if add_custom_metrics:
-            custom_key = metric_types.MetricKey(
-                name='custom', example_weighted=None
-            )
-            self.assertIn(custom_key, got_metrics)
-            expected_values[custom_key] = 0.0 + 1.0 + 0.0 + 1.0
          self.assertDictElementsAlmostEqual(got_metrics, expected_values)

        except AssertionError as err: