diff --git a/tensorflow_model_analysis/evaluators/metrics_plots_and_validations_evaluator_test.py b/tensorflow_model_analysis/evaluators/metrics_plots_and_validations_evaluator_test.py
index dd5840f506..16241e3d99 100644
--- a/tensorflow_model_analysis/evaluators/metrics_plots_and_validations_evaluator_test.py
+++ b/tensorflow_model_analysis/evaluators/metrics_plots_and_validations_evaluator_test.py
@@ -166,330 +166,6 @@ def ci_derived_fn():
     self.assertLen(derived, 1)
     self.assertLen(ci_derived, 1)

-  def testEvaluateWithKerasAndValidateMetrics(self):
-    model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
-    eval_shared_model = self._build_keras_model('candidate', model_dir, mul=0)
-    baseline_eval_shared_model = self._build_keras_model(
-        'baseline', baseline_dir, mul=1
-    )
-
-    schema = text_format.Parse(
-        """
-        tensor_representation_group {
-          key: ""
-          value {
-            tensor_representation {
-              key: "input_1"
-              value {
-                dense_tensor {
-                  column_name: "input_1"
-                  shape { dim { size: 1 } }
-                }
-              }
-            }
-          }
-        }
-        feature {
-          name: "input_1"
-          type: FLOAT
-        }
-        feature {
-          name: "label"
-          type: FLOAT
-        }
-        feature {
-          name: "example_weight"
-          type: FLOAT
-        }
-        feature {
-          name: "extra_feature"
-          type: BYTES
-        }
-        """,
-        schema_pb2.Schema(),
-    )
-    tfx_io = test_util.InMemoryTFExampleRecord(
-        schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN
-    )
-    tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
-        arrow_schema=tfx_io.ArrowSchema(),
-        tensor_representations=tfx_io.TensorRepresentations(),
-    )
-
-    examples = [
-        self._makeExample(
-            input_1=0.0,
-            label=1.0,
-            example_weight=1.0,
-            extra_feature='non_model_feature',
-        ),
-        self._makeExample(
-            input_1=1.0,
-            label=0.0,
-            example_weight=0.5,
-            extra_feature='non_model_feature',
-        ),
-    ]
-
-    eval_config = config_pb2.EvalConfig(
-        model_specs=[
-            config_pb2.ModelSpec(
-                name='candidate',
-                label_key='label',
-                example_weight_key='example_weight',
-            ),
-            config_pb2.ModelSpec(
-                name='baseline',
-                label_key='label',
-                example_weight_key='example_weight',
-                is_baseline=True,
-            ),
-        ],
-        slicing_specs=[config_pb2.SlicingSpec()],
-        metrics_specs=[
-            config_pb2.MetricsSpec(
-                metrics=[
-                    config_pb2.MetricConfig(
-                        class_name='ExampleCount',
-                        # 2 > 10, NOT OK.
-                        threshold=config_pb2.MetricThreshold(
-                            value_threshold=config_pb2.GenericValueThreshold(
-                                lower_bound={'value': 10}
-                            )
-                        ),
-                    ),
-                ],
-                model_names=['candidate', 'baseline'],
-                example_weights=config_pb2.ExampleWeightOptions(
-                    unweighted=True
-                ),
-            ),
-            config_pb2.MetricsSpec(
-                metrics=[
-                    config_pb2.MetricConfig(
-                        class_name='WeightedExampleCount',
-                        # 1.5 < 1, NOT OK.
-                        threshold=config_pb2.MetricThreshold(
-                            value_threshold=config_pb2.GenericValueThreshold(
-                                upper_bound={'value': 1}
-                            )
-                        ),
-                    ),
-                ],
-                model_names=['candidate', 'baseline'],
-                example_weights=config_pb2.ExampleWeightOptions(weighted=True),
-            ),
-            config_pb2.MetricsSpec(
-                metrics=[
-                    config_pb2.MetricConfig(
-                        class_name='MeanLabel',
-                        # 0 > 1 and 0 > 1?: NOT OK.
-                        threshold=config_pb2.MetricThreshold(
-                            change_threshold=config_pb2.GenericChangeThreshold(
-                                direction=config_pb2.MetricDirection.HIGHER_IS_BETTER,
-                                relative={'value': 1},
-                                absolute={'value': 1},
-                            )
-                        ),
-                    ),
-                    config_pb2.MetricConfig(
-                        # MeanPrediction = (0+0)/(1+0.5) = 0
-                        class_name='MeanPrediction',
-                        # -.01 < 0 < .01, OK.
-                        # Diff% = -.333/.333 = -100% < -99%, OK.
-                        # Diff = 0 - .333 = -.333 < 0, OK.
-                        threshold=config_pb2.MetricThreshold(
-                            value_threshold=config_pb2.GenericValueThreshold(
-                                upper_bound={'value': 0.01},
-                                lower_bound={'value': -0.01},
-                            ),
-                            change_threshold=config_pb2.GenericChangeThreshold(
-                                direction=config_pb2.MetricDirection.LOWER_IS_BETTER,
-                                relative={'value': -0.99},
-                                absolute={'value': 0},
-                            ),
-                        ),
-                    ),
-                ],
-                thresholds={
-                    'loss': config_pb2.MetricThreshold(
-                        value_threshold=config_pb2.GenericValueThreshold(
-                            upper_bound={'value': 0}
-                        )
-                    )
-                },
-                model_names=['candidate', 'baseline'],
-            ),
-        ],
-    )
-    eval_shared_models = [eval_shared_model, baseline_eval_shared_model]
-    extractors = [
-        features_extractor.FeaturesExtractor(
-            eval_config=eval_config,
-            tensor_representations=tensor_adapter_config.tensor_representations,
-        ),
-        labels_extractor.LabelsExtractor(eval_config),
-        example_weights_extractor.ExampleWeightsExtractor(eval_config),
-        predictions_extractor.PredictionsExtractor(
-            eval_shared_model=eval_shared_models, eval_config=eval_config
-        ),
-        unbatch_extractor.UnbatchExtractor(),
-        slice_key_extractor.SliceKeyExtractor(eval_config=eval_config),
-    ]
-    evaluators = [
-        metrics_plots_and_validations_evaluator.MetricsPlotsAndValidationsEvaluator(
-            eval_config=eval_config, eval_shared_model=eval_shared_models
-        )
-    ]
-
-    with beam.Pipeline() as pipeline:
-      # pylint: disable=no-value-for-parameter
-      evaluations = (
-          pipeline
-          | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
-          | 'BatchExamples' >> tfx_io.BeamSource()
-          | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
-          | 'ExtractEvaluate'
-          >> model_eval_lib.ExtractAndEvaluate(
-              extractors=extractors, evaluators=evaluators
-          )
-      )
-
-      # pylint: enable=no-value-for-parameter
-
-      def check_validations(got):
-        try:
-          self.assertLen(got, 1)
-          got = got[0]
-          expected_metric_validations_per_slice = [
-              text_format.Parse(
-                  """
-                  metric_key {
-                    name: "loss"
-                    model_name: "candidate"
-                  }
-                  metric_threshold {
-                    value_threshold {
-                      upper_bound {
-                        value: 0.0
-                      }
-                    }
-                  }
-                  metric_value {
-                    double_value {
-                      value: 7.712474346160889
-                    }
-                  }
-                  """,
-                  validation_result_pb2.ValidationFailure(),
-              ),
-              text_format.Parse(
-                  """
-                  metric_key {
-                    name: "example_count"
-                    model_name: "candidate"
-                    example_weighted { }
-                  }
-                  metric_threshold {
-                    value_threshold {
-                      lower_bound {
-                        value: 10.0
-                      }
-                    }
-                  }
-                  metric_value {
-                    double_value {
-                      value: 2.0
-                    }
-                  }
-                  """,
-                  validation_result_pb2.ValidationFailure(),
-              ),
-              text_format.Parse(
-                  """
-                  metric_key {
-                    name: "weighted_example_count"
-                    model_name: "candidate"
-                    example_weighted { value: true }
-                  }
-                  metric_threshold {
-                    value_threshold {
-                      upper_bound {
-                        value: 1.0
-                      }
-                    }
-                  }
-                  metric_value {
-                    double_value {
-                      value: 1.5
-                    }
-                  }
-                  """,
-                  validation_result_pb2.ValidationFailure(),
-              ),
-              text_format.Parse(
-                  """
-                  metric_key {
-                    name: "mean_label"
-                    model_name: "candidate"
-                    is_diff: true
-                    example_weighted { value: true }
-                  }
-                  metric_threshold {
-                    change_threshold {
-                      absolute {
-                        value: 1.0
-                      }
-                      relative {
-                        value: 1.0
-                      }
-                      direction: HIGHER_IS_BETTER
-                    }
-                  }
-                  metric_value {
-                    double_value {
-                      value: 0.0
-                    }
-                  }
-                  """,
-                  validation_result_pb2.ValidationFailure(),
-              ),
-          ]
-          # Loss not supported in TFv1
-          if _TF_MAJOR_VERSION < 2:
-            expected_metric_validations_per_slice[0].ClearField('metric_value')
-            expected_metric_validations_per_slice[0].message = (
-                'Metric not found.'
-            )
-          self.assertFalse(got.validation_ok)
-          self.assertLen(got.metric_validations_per_slice, 1)
-          self.assertLen(
-              got.metric_validations_per_slice[0].failures,
-              len(expected_metric_validations_per_slice),
-          )
-          self.assertCountEqual(
-              got.metric_validations_per_slice[0].failures,
-              expected_metric_validations_per_slice,
-          )
-
-        except AssertionError as err:
-          raise util.BeamAssertException(err)
-
-      util.assert_that(
-          evaluations[constants.VALIDATIONS_KEY], check_validations
-      )
-
-      metric_filter = beam.metrics.metric.MetricsFilter().with_name(
-          'metric_computed_ExampleCount_v2_' + constants.TF_KERAS
-      )
-      actual_metrics_count = (
-          pipeline.run()
-          .metrics()
-          .query(filter=metric_filter)['counters'][0]
-          .committed
-      )
-      self.assertEqual(actual_metrics_count, 1)
-
   def testEvaluateWithKerasAndDiffMetrics(self):
     model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
     eval_shared_model = self._build_keras_model('candidate', model_dir, mul=0)
@@ -1080,31 +756,19 @@ def check_metrics(got):
               name='mean_label', example_weighted=False
           )
           binary_accuracy_key = metric_types.MetricKey(
-              name='binary_accuracy', example_weighted=None
+              name='binary_accuracy', example_weighted=False
          )
          self.assertIn(binary_accuracy_key, got_metrics)
          binary_accuracy_unweighted_key = metric_types.MetricKey(
              name='binary_accuracy', example_weighted=False
          )
          self.assertIn(binary_accuracy_unweighted_key, got_metrics)
-          # Loss not supported in TFv1
-          if _TF_MAJOR_VERSION > 1:
-            loss_key = metric_types.MetricKey(
-                name='loss', example_weighted=None
-            )
-            self.assertIn(loss_key, got_metrics)
          expected_values = {
              example_count_key: 2,
              weighted_example_count_key: 1.0 + 0.5,
              label_key: (1.0 * 1.0 + 0.0 * 0.5) / (1.0 + 0.5),
              label_unweighted_key: (1.0 + 0.0) / (1.0 + 1.0),
          }
-          if add_custom_metrics:
-            custom_key = metric_types.MetricKey(
-                name='custom', example_weighted=None
-            )
-            self.assertIn(custom_key, got_metrics)
-            expected_values[custom_key] = 0.0 + 1.0 + 0.0 + 1.0
          self.assertDictElementsAlmostEqual(got_metrics, expected_values)

        except AssertionError as err: