diff --git a/ann/src/main/python/dataflow/faiss_index_bq_dataset.py b/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
index dd45070db..2205675f5 100644
--- a/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
+++ b/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
@@ -6,227 +6,246 @@
 from urllib.parse import urlsplit
 
 import apache_beam as beam
-from apache_beam.options.pipeline_options import PipelineOptions
 import faiss
+from apache_beam.options.pipeline_options import PipelineOptions
 
 
 def parse_d6w_config(argv=None):
-  """Parse d6w config.
-  :param argv: d6w config
-  :return: dictionary containing d6w config
-  """
+    """Parse d6w config.
+    :param argv: d6w config
+    :return: dictionary containing d6w config
+    """
 
-  parser = argparse.ArgumentParser(
-    description="See https://docbird.twitter.biz/d6w/model.html for any parameters inherited from d6w job config"
-  )
-  parser.add_argument("--job_name", dest="job_name", required=True, help="d6w attribute")
-  parser.add_argument("--project", dest="project", required=True, help="d6w attribute")
-  parser.add_argument(
-    "--staging_location", dest="staging_location", required=True, help="d6w attribute"
-  )
-  parser.add_argument("--temp_location", dest="temp_location", required=True, help="d6w attribute")
-  parser.add_argument(
-    "--output_location",
-    dest="output_location",
-    required=True,
-    help="GCS bucket and path where resulting artifacts are uploaded",
-  )
-  parser.add_argument(
-    "--service_account_email", dest="service_account_email", required=True, help="d6w attribute"
-  )
-  parser.add_argument(
-    "--factory_string",
-    dest="factory_string",
-    required=False,
-    help="FAISS factory string describing index to build. See https://github.com/facebookresearch/faiss/wiki/The-index-factory",
-  )
-  parser.add_argument(
-    "--metric",
-    dest="metric",
-    required=True,
-    help="Metric used to compute distance between embeddings. Valid values are 'l2', 'ip', 'l1', 'linf'",
-  )
-  parser.add_argument(
-    "--use_gpu",
-    dest="gpu",
-    required=True,
-    help="--use_gpu=yes if you want to use GPU during index building",
-  )
-
-  known_args, unknown_args = parser.parse_known_args(argv)
-  d6w_config = vars(known_args)
-  d6w_config["gpu"] = d6w_config["gpu"].lower() == "yes"
-  d6w_config["metric"] = parse_metric(d6w_config)
+    parser = argparse.ArgumentParser(
+        description="See https://docbird.twitter.biz/d6w/model.html for any parameters inherited from d6w job config"
+    )
+    parser.add_argument(
+        "--job_name", dest="job_name", required=True, help="d6w attribute"
+    )
+    parser.add_argument(
+        "--project", dest="project", required=True, help="d6w attribute"
+    )
+    parser.add_argument(
+        "--staging_location",
+        dest="staging_location",
+        required=True,
+        help="d6w attribute",
+    )
+    parser.add_argument(
+        "--temp_location", dest="temp_location", required=True, help="d6w attribute"
+    )
+    parser.add_argument(
+        "--output_location",
+        dest="output_location",
+        required=True,
+        help="GCS bucket and path where resulting artifacts are uploaded",
+    )
+    parser.add_argument(
+        "--service_account_email",
+        dest="service_account_email",
+        required=True,
+        help="d6w attribute",
+    )
+    parser.add_argument(
+        "--factory_string",
+        dest="factory_string",
+        required=False,
+        help="FAISS factory string describing index to build. See https://github.com/facebookresearch/faiss/wiki/The-index-factory",
+    )
+    parser.add_argument(
+        "--metric",
+        dest="metric",
+        required=True,
+        help="Metric used to compute distance between embeddings. Valid values are 'l2', 'ip', 'l1', 'linf'",
+    )
+    parser.add_argument(
+        "--use_gpu",
+        dest="gpu",
+        required=True,
+        help="--use_gpu=yes if you want to use GPU during index building",
+    )
 
-  """
+    known_args, unknown_args = parser.parse_known_args(argv)
+    d6w_config = vars(known_args)
+    d6w_config["gpu"] = d6w_config["gpu"].lower() == "yes"
+    d6w_config["metric"] = parse_metric(d6w_config)
+
+    """
     WARNING: Currently, d6w (a Twitter tool used to deploy Dataflow jobs to GCP) and
     PipelineOptions.for_dataflow_runner (a helper method in twitter.ml.common.apache_beam) do not
     play nicely together. The helper method will overwrite some of the config specified in the d6w
    file using the defaults in https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24.'
    However, the d6w output message will still report that the config specified in the d6w file
    was used.
    """
-  logging.warning(
-    f"The following d6w config parameters will be overwritten by the defaults in "
-    f"https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24\n"
-    f"{str(unknown_args)}"
-  )
-  return d6w_config
+    logging.warning(
+        f"The following d6w config parameters will be overwritten by the defaults in "
+        f"https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24\n"
+        f"{str(unknown_args)}"
+    )
+    return d6w_config
 
 
 def get_bq_query():
-  """
-  Query is expected to return rows with unique entityId
-  """
-  return pkgutil.get_data(__name__, "bq.sql").decode("utf-8")
+    """
+    Query is expected to return rows with unique entityId
+    """
+    return pkgutil.get_data(__name__, "bq.sql").decode("utf-8")
 
 
 def parse_metric(config):
-  metric_str = config["metric"].lower()
-  if metric_str == "l2":
-    return faiss.METRIC_L2
-  elif metric_str == "ip":
-    return faiss.METRIC_INNER_PRODUCT
-  elif metric_str == "l1":
-    return faiss.METRIC_L1
-  elif metric_str == "linf":
-    return faiss.METRIC_Linf
-  else:
-    raise Exception(f"Unknown metric: {metric_str}")
+    metric_str = config["metric"].lower()
+    if metric_str == "l2":
+        return faiss.METRIC_L2
+    elif metric_str == "ip":
+        return faiss.METRIC_INNER_PRODUCT
+    elif metric_str == "l1":
+        return faiss.METRIC_L1
+    elif metric_str == "linf":
+        return faiss.METRIC_Linf
+    else:
+        raise Exception(f"Unknown metric: {metric_str}")
 
 
 def run_pipeline(argv=[]):
-  config = parse_d6w_config(argv)
-  argv_with_extras = argv
-  if config["gpu"]:
-    argv_with_extras.extend(["--experiments", "use_runner_v2"])
-    argv_with_extras.extend(
-      ["--experiments", "worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver"]
-    )
-    argv_with_extras.extend(
-      [
-        "--worker_harness_container_image",
-        "gcr.io/twttr-recos-ml-prod/dataflow-gpu/beam2_39_0_py3_7",
-      ]
-    )
-
-  options = PipelineOptions(argv_with_extras)
-  output_bucket_name = urlsplit(config["output_location"]).netloc
-
-  with beam.Pipeline(options=options) as p:
-    input_data = p | "Read from BigQuery" >> beam.io.ReadFromBigQuery(
-      method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
-      query=get_bq_query(),
-      use_standard_sql=True,
-    )
-
-    index_built = input_data | "Build and upload index" >> beam.CombineGlobally(
-      MergeAndBuildIndex(
-        output_bucket_name,
-        config["output_location"],
-        config["factory_string"],
-        config["metric"],
-        config["gpu"],
-      )
-    )
-
-    # Make linter happy
-    index_built
+    config = parse_d6w_config(argv)
+    argv_with_extras = argv
+    if config["gpu"]:
+        argv_with_extras.extend(["--experiments", "use_runner_v2"])
+        argv_with_extras.extend(
+            [
+                "--experiments",
+                "worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver",
+            ]
+        )
+        argv_with_extras.extend(
+            [
+                "--worker_harness_container_image",
+                "gcr.io/twttr-recos-ml-prod/dataflow-gpu/beam2_39_0_py3_7",
+            ]
+        )
+
+    options = PipelineOptions(argv_with_extras)
+    output_bucket_name = urlsplit(config["output_location"]).netloc
+
+    with beam.Pipeline(options=options) as p:
+        input_data = p | "Read from BigQuery" >> beam.io.ReadFromBigQuery(
+            method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
+            query=get_bq_query(),
+            use_standard_sql=True,
+        )
+
+        index_built = input_data | "Build and upload index" >> beam.CombineGlobally(
+            MergeAndBuildIndex(
+                output_bucket_name,
+                config["output_location"],
+                config["factory_string"],
+                config["metric"],
+                config["gpu"],
+            )
+        )
+
+        # Make linter happy
+        index_built
 
 
 class MergeAndBuildIndex(beam.CombineFn):
-  def __init__(self, bucket_name, gcs_output_path, factory_string, metric, gpu):
-    self.bucket_name = bucket_name
-    self.gcs_output_path = gcs_output_path
-    self.factory_string = factory_string
-    self.metric = metric
-    self.gpu = gpu
-
-  def create_accumulator(self):
-    return []
-
-  def add_input(self, accumulator, element):
-    accumulator.append(element)
-    return accumulator
-
-  def merge_accumulators(self, accumulators):
-    merged = []
-    for accum in accumulators:
-      merged.extend(accum)
-    return merged
-
-  def extract_output(self, rows):
-    # Reimports are needed on workers
-    import glob
-    import subprocess
-
-    import faiss
-    from google.cloud import storage
-    import numpy as np
-
-    client = storage.Client()
-    bucket = client.get_bucket(self.bucket_name)
-
-    logging.info("Building FAISS index")
-    logging.info(f"There are {len(rows)} rows")
-
-    ids = np.array([x["entityId"] for x in rows]).astype("long")
-    embeds = np.array([x["embedding"] for x in rows]).astype("float32")
-    dimensions = len(embeds[0])
-    N = ids.shape[0]
-    logging.info(f"There are {dimensions} dimensions")
-
-    if self.factory_string is None:
-      M = 48
-
-      divideable_dimensions = (dimensions // M) * M
-      if divideable_dimensions != dimensions:
-        opq_prefix = f"OPQ{M}_{divideable_dimensions}"
-      else:
-        opq_prefix = f"OPQ{M}"
-
-      clusters = N // 20
-      self.factory_string = f"{opq_prefix},IVF{clusters},PQ{M}"
-
-    logging.info(f"Factory string is {self.factory_string}, metric={self.metric}")
-
-    if self.gpu:
-      logging.info("Using GPU")
-
-      res = faiss.StandardGpuResources()
-      cpu_index = faiss.index_factory(dimensions, self.factory_string, self.metric)
-      cpu_index = faiss.IndexIDMap(cpu_index)
-      gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
-      gpu_index.train(embeds)
-      gpu_index.add_with_ids(embeds, ids)
-      cpu_index = faiss.index_gpu_to_cpu(gpu_index)
-    else:
-      logging.info("Using CPU")
-
-      cpu_index = faiss.index_factory(dimensions, self.factory_string, self.metric)
-      cpu_index = faiss.IndexIDMap(cpu_index)
-      cpu_index.train(embeds)
-      cpu_index.add_with_ids(embeds, ids)
-
-    logging.info("Built faiss index")
-
-    local_path = "/indices"
-    logging.info(f"Writing indices to local {local_path}")
-    subprocess.run(f"mkdir -p {local_path}".strip().split())
-    local_index_path = os.path.join(local_path, "result.index")
-
-    faiss.write_index(cpu_index, local_index_path)
-    logging.info(f"Done writing indices to local {local_path}")
-
-    logging.info(f"Uploading to GCS with path {self.gcs_output_path}")
-    assert os.path.isdir(local_path)
-    for local_file in glob.glob(local_path + "/*"):
-      remote_path = os.path.join(
-        self.gcs_output_path.split("/")[-1], local_file[1 + len(local_path) :]
-      )
-      blob = bucket.blob(remote_path)
-      blob.upload_from_filename(local_file)
+    def __init__(self, bucket_name, gcs_output_path, factory_string, metric, gpu):
+        self.bucket_name = bucket_name
+        self.gcs_output_path = gcs_output_path
+        self.factory_string = factory_string
+        self.metric = metric
+        self.gpu = gpu
+
+    def create_accumulator(self):
+        return []
+
+    def add_input(self, accumulator, element):
+        accumulator.append(element)
+        return accumulator
+
+    def merge_accumulators(self, accumulators):
+        merged = []
+        for accum in accumulators:
+            merged.extend(accum)
+        return merged
+
+    def extract_output(self, rows):
+        # Reimports are needed on workers
+        import glob
+        import subprocess
+
+        import faiss
+        import numpy as np
+        from google.cloud import storage
+
+        client = storage.Client()
+        bucket = client.get_bucket(self.bucket_name)
+
+        logging.info("Building FAISS index")
+        logging.info(f"There are {len(rows)} rows")
+
+        ids = np.array([x["entityId"] for x in rows]).astype("long")
+        embeds = np.array([x["embedding"] for x in rows]).astype("float32")
+        dimensions = len(embeds[0])
+        N = ids.shape[0]
+        logging.info(f"There are {dimensions} dimensions")
+
+        if self.factory_string is None:
+            M = 48
+
+            divideable_dimensions = (dimensions // M) * M
+            if divideable_dimensions != dimensions:
+                opq_prefix = f"OPQ{M}_{divideable_dimensions}"
+            else:
+                opq_prefix = f"OPQ{M}"
+
+            clusters = N // 20
+            self.factory_string = f"{opq_prefix},IVF{clusters},PQ{M}"
+
+        logging.info(f"Factory string is {self.factory_string}, metric={self.metric}")
+
+        if self.gpu:
+            logging.info("Using GPU")
+
+            res = faiss.StandardGpuResources()
+            cpu_index = faiss.index_factory(
+                dimensions, self.factory_string, self.metric
+            )
+            cpu_index = faiss.IndexIDMap(cpu_index)
+            gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
+            gpu_index.train(embeds)
+            gpu_index.add_with_ids(embeds, ids)
+            cpu_index = faiss.index_gpu_to_cpu(gpu_index)
+        else:
+            logging.info("Using CPU")
+
+            cpu_index = faiss.index_factory(
+                dimensions, self.factory_string, self.metric
+            )
+            cpu_index = faiss.IndexIDMap(cpu_index)
+            cpu_index.train(embeds)
+            cpu_index.add_with_ids(embeds, ids)
+
+        logging.info("Built faiss index")
+
+        local_path = "/indices"
+        logging.info(f"Writing indices to local {local_path}")
+        subprocess.run(f"mkdir -p {local_path}".strip().split())
+        local_index_path = os.path.join(local_path, "result.index")
+
+        faiss.write_index(cpu_index, local_index_path)
+        logging.info(f"Done writing indices to local {local_path}")
+
+        logging.info(f"Uploading to GCS with path {self.gcs_output_path}")
+        assert os.path.isdir(local_path)
+        for local_file in glob.glob(local_path + "/*"):
+            remote_path = os.path.join(
+                self.gcs_output_path.split("/")[-1], local_file[1 + len(local_path) :]
+            )
+            blob = bucket.blob(remote_path)
+            blob.upload_from_filename(local_file)
 
 
 if __name__ == "__main__":
-  logging.getLogger().setLevel(logging.INFO)
-  run_pipeline(sys.argv)
+    logging.getLogger().setLevel(logging.INFO)
+    run_pipeline(sys.argv)
diff --git a/pushservice/src/main/python/models/heavy_ranking/deep_norm.py b/pushservice/src/main/python/models/heavy_ranking/deep_norm.py
index 7db281b4a..e0157e294 100644
--- a/pushservice/src/main/python/models/heavy_ranking/deep_norm.py
+++ b/pushservice/src/main/python/models/heavy_ranking/deep_norm.py
@@ -1,136 +1,147 @@
 """
 Training job for the heavy ranker of the push notification service.
""" -from datetime import datetime import json import os +from datetime import datetime + +import tensorflow.compat.v1 as tf +from tensorflow.compat.v1 import logging import twml from ..libs.metric_fn_utils import flip_disliked_labels, get_metric_fn from ..libs.model_utils import read_config -from ..libs.warm_start_utils import get_feature_list_for_heavy_ranking, warm_start_checkpoint +from ..libs.warm_start_utils import ( + get_feature_list_for_heavy_ranking, + warm_start_checkpoint, +) from .features import get_feature_config from .model_pools import ALL_MODELS from .params import load_graph_params from .run_args import get_training_arg_parser -import tensorflow.compat.v1 as tf -from tensorflow.compat.v1 import logging - def main() -> None: - args, _ = get_training_arg_parser().parse_known_args() - logging.info(f"Parsed args: {args}") - - params = load_graph_params(args) - logging.info(f"Loaded graph params: {params}") - - param_file = os.path.join(args.save_dir, "params.json") - logging.info(f"Saving graph params to: {param_file}") - with tf.io.gfile.GFile(param_file, mode="w") as file: - json.dump(params.json(), file, ensure_ascii=False, indent=4) - - logging.info(f"Get Feature Config: {args.feature_list}") - feature_list = read_config(args.feature_list).items() - feature_config = get_feature_config( - data_spec_path=args.data_spec, - params=params, - feature_list_provided=feature_list, - ) - feature_list_path = args.feature_list - - warm_start_from = args.warm_start_from - if args.warm_start_base_dir: - logging.info(f"Get warm started model from: {args.warm_start_base_dir}.") + args, _ = get_training_arg_parser().parse_known_args() + logging.info(f"Parsed args: {args}") + + params = load_graph_params(args) + logging.info(f"Loaded graph params: {params}") + + param_file = os.path.join(args.save_dir, "params.json") + logging.info(f"Saving graph params to: {param_file}") + with tf.io.gfile.GFile(param_file, mode="w") as file: + json.dump(params.json(), file, ensure_ascii=False, indent=4) + + logging.info(f"Get Feature Config: {args.feature_list}") + feature_list = read_config(args.feature_list).items() + feature_config = get_feature_config( + data_spec_path=args.data_spec, + params=params, + feature_list_provided=feature_list, + ) + feature_list_path = args.feature_list + + warm_start_from = args.warm_start_from + if args.warm_start_base_dir: + logging.info(f"Get warm started model from: {args.warm_start_base_dir}.") + + continuous_binary_feat_list_save_path = os.path.join( + args.warm_start_base_dir, "continuous_binary_feat_list.json" + ) + warm_start_folder = os.path.join(args.warm_start_base_dir, "best_checkpoint") + job_name = os.path.basename(args.save_dir) + ws_output_ckpt_folder = os.path.join( + args.warm_start_base_dir, f"warm_start_for_{job_name}" + ) + if tf.io.gfile.exists(ws_output_ckpt_folder): + tf.io.gfile.rmtree(ws_output_ckpt_folder) + + tf.io.gfile.mkdir(ws_output_ckpt_folder) + + warm_start_from = warm_start_checkpoint( + warm_start_folder, + continuous_binary_feat_list_save_path, + feature_list_path, + args.data_spec, + ws_output_ckpt_folder, + ) + logging.info(f"Created warm_start_from_ckpt {warm_start_from}.") + + logging.info("Build Trainer.") + metric_fn = get_metric_fn( + "OONC_Engagement" if len(params.tasks) == 2 else "OONC", False + ) + + trainer = twml.trainers.DataRecordTrainer( + name="magic_recs", + params=args, + build_graph_fn=lambda *args: ALL_MODELS[params.model.name](params=params)( + *args + ), + save_dir=args.save_dir, + run_config=None, + 
feature_config=feature_config, + metric_fn=flip_disliked_labels(metric_fn), + warm_start_from=warm_start_from, + ) + + logging.info("Build train and eval input functions.") + train_input_fn = trainer.get_train_input_fn(shuffle=True) + eval_input_fn = trainer.get_eval_input_fn(repeat=False, shuffle=False) + + learn = trainer.learn + if args.distributed or args.num_workers is not None: + learn = trainer.train_and_evaluate + + if not args.directly_export_best: + logging.info("Starting training") + start = datetime.now() + learn( + early_stop_minimize=False, + early_stop_metric="pr_auc_unweighted_OONC", + early_stop_patience=args.early_stop_patience, + early_stop_tolerance=args.early_stop_tolerance, + eval_input_fn=eval_input_fn, + train_input_fn=train_input_fn, + ) + logging.info(f"Total training time: {datetime.now() - start}") + else: + logging.info("Directly exporting the model") + + if not args.export_dir: + args.export_dir = os.path.join(args.save_dir, "exported_models") + + logging.info(f"Exporting the model to {args.export_dir}.") + start = datetime.now() + twml.contrib.export.export_fn.export_all_models( + trainer=trainer, + export_dir=args.export_dir, + parse_fn=feature_config.get_parse_fn(), + serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn(), + export_output_fn=twml.export_output_fns.batch_prediction_continuous_output_fn, + ) + + logging.info(f"Total model export time: {datetime.now() - start}") + logging.info(f"The MLP directory is: {args.save_dir}") continuous_binary_feat_list_save_path = os.path.join( - args.warm_start_base_dir, "continuous_binary_feat_list.json" + args.save_dir, "continuous_binary_feat_list.json" ) - warm_start_folder = os.path.join(args.warm_start_base_dir, "best_checkpoint") - job_name = os.path.basename(args.save_dir) - ws_output_ckpt_folder = os.path.join(args.warm_start_base_dir, f"warm_start_for_{job_name}") - if tf.io.gfile.exists(ws_output_ckpt_folder): - tf.io.gfile.rmtree(ws_output_ckpt_folder) - - tf.io.gfile.mkdir(ws_output_ckpt_folder) - - warm_start_from = warm_start_checkpoint( - warm_start_folder, - continuous_binary_feat_list_save_path, - feature_list_path, - args.data_spec, - ws_output_ckpt_folder, + logging.info( + f"Saving the list of continuous and binary features to {continuous_binary_feat_list_save_path}." 
     )
-    logging.info(f"Created warm_start_from_ckpt {warm_start_from}.")
-
-  logging.info("Build Trainer.")
-  metric_fn = get_metric_fn("OONC_Engagement" if len(params.tasks) == 2 else "OONC", False)
-
-  trainer = twml.trainers.DataRecordTrainer(
-    name="magic_recs",
-    params=args,
-    build_graph_fn=lambda *args: ALL_MODELS[params.model.name](params=params)(*args),
-    save_dir=args.save_dir,
-    run_config=None,
-    feature_config=feature_config,
-    metric_fn=flip_disliked_labels(metric_fn),
-    warm_start_from=warm_start_from,
-  )
-
-  logging.info("Build train and eval input functions.")
-  train_input_fn = trainer.get_train_input_fn(shuffle=True)
-  eval_input_fn = trainer.get_eval_input_fn(repeat=False, shuffle=False)
-
-  learn = trainer.learn
-  if args.distributed or args.num_workers is not None:
-    learn = trainer.train_and_evaluate
-
-  if not args.directly_export_best:
-    logging.info("Starting training")
-    start = datetime.now()
-    learn(
-      early_stop_minimize=False,
-      early_stop_metric="pr_auc_unweighted_OONC",
-      early_stop_patience=args.early_stop_patience,
-      early_stop_tolerance=args.early_stop_tolerance,
-      eval_input_fn=eval_input_fn,
-      train_input_fn=train_input_fn,
+    continuous_binary_feat_list = get_feature_list_for_heavy_ranking(
+        feature_list_path, args.data_spec
+    )
+    twml.util.write_file(
+        continuous_binary_feat_list_save_path,
+        continuous_binary_feat_list,
+        encode="json",
     )
-    logging.info(f"Total training time: {datetime.now() - start}")
-  else:
-    logging.info("Directly exporting the model")
-
-    if not args.export_dir:
-      args.export_dir = os.path.join(args.save_dir, "exported_models")
-
-    logging.info(f"Exporting the model to {args.export_dir}.")
-    start = datetime.now()
-    twml.contrib.export.export_fn.export_all_models(
-      trainer=trainer,
-      export_dir=args.export_dir,
-      parse_fn=feature_config.get_parse_fn(),
-      serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn(),
-      export_output_fn=twml.export_output_fns.batch_prediction_continuous_output_fn,
-    )
-
-    logging.info(f"Total model export time: {datetime.now() - start}")
-    logging.info(f"The MLP directory is: {args.save_dir}")
-
-  continuous_binary_feat_list_save_path = os.path.join(
-    args.save_dir, "continuous_binary_feat_list.json"
-  )
-  logging.info(
-    f"Saving the list of continuous and binary features to {continuous_binary_feat_list_save_path}."
-  )
-  continuous_binary_feat_list = get_feature_list_for_heavy_ranking(
-    feature_list_path, args.data_spec
-  )
-  twml.util.write_file(
-    continuous_binary_feat_list_save_path, continuous_binary_feat_list, encode="json"
-  )
 
 
 if __name__ == "__main__":
-  main()
-  logging.info("Done.")
+    main()
+    logging.info("Done.")
diff --git a/pushservice/src/main/python/models/heavy_ranking/eval.py b/pushservice/src/main/python/models/heavy_ranking/eval.py
index 7f74472fb..893a39fe6 100644
--- a/pushservice/src/main/python/models/heavy_ranking/eval.py
+++ b/pushservice/src/main/python/models/heavy_ranking/eval.py
@@ -4,6 +4,8 @@
 
 from datetime import datetime
 
+from tensorflow.compat.v1 import logging
+
 import twml
 
 from ..libs.metric_fn_utils import get_metric_fn
@@ -13,47 +15,51 @@
 from .params import load_graph_params
 from .run_args import get_eval_arg_parser
 
-from tensorflow.compat.v1 import logging
-
 
 def main():
-  args, _ = get_eval_arg_parser().parse_known_args()
-  logging.info(f"Parsed args: {args}")
-
-  params = load_graph_params(args)
-  logging.info(f"Loaded graph params: {params}")
-
-  logging.info(f"Get Feature Config: {args.feature_list}")
-  feature_list = read_config(args.feature_list).items()
-  feature_config = get_feature_config(
-    data_spec_path=args.data_spec,
-    params=params,
-    feature_list_provided=feature_list,
-  )
-
-  logging.info("Build DataRecordTrainer.")
-  metric_fn = get_metric_fn("OONC_Engagement" if len(params.tasks) == 2 else "OONC", False)
-
-  trainer = twml.trainers.DataRecordTrainer(
-    name="magic_recs",
-    params=args,
-    build_graph_fn=lambda *args: ALL_MODELS[params.model.name](params=params)(*args),
-    save_dir=args.save_dir,
-    run_config=None,
-    feature_config=feature_config,
-    metric_fn=metric_fn,
-  )
-
-  logging.info("Run the evaluation.")
-  start = datetime.now()
-  trainer._estimator.evaluate(
-    input_fn=trainer.get_eval_input_fn(repeat=False, shuffle=False),
-    steps=None if (args.eval_steps is not None and args.eval_steps < 0) else args.eval_steps,
-    checkpoint_path=args.eval_checkpoint,
-  )
-  logging.info(f"Evaluating time: {datetime.now() - start}.")
+    args, _ = get_eval_arg_parser().parse_known_args()
+    logging.info(f"Parsed args: {args}")
+
+    params = load_graph_params(args)
+    logging.info(f"Loaded graph params: {params}")
+
+    logging.info(f"Get Feature Config: {args.feature_list}")
+    feature_list = read_config(args.feature_list).items()
+    feature_config = get_feature_config(
+        data_spec_path=args.data_spec,
+        params=params,
+        feature_list_provided=feature_list,
+    )
+
+    logging.info("Build DataRecordTrainer.")
+    metric_fn = get_metric_fn(
+        "OONC_Engagement" if len(params.tasks) == 2 else "OONC", False
+    )
+
+    trainer = twml.trainers.DataRecordTrainer(
+        name="magic_recs",
+        params=args,
+        build_graph_fn=lambda *args: ALL_MODELS[params.model.name](params=params)(
+            *args
+        ),
+        save_dir=args.save_dir,
+        run_config=None,
+        feature_config=feature_config,
+        metric_fn=metric_fn,
+    )
+
+    logging.info("Run the evaluation.")
+    start = datetime.now()
+    trainer._estimator.evaluate(
+        input_fn=trainer.get_eval_input_fn(repeat=False, shuffle=False),
+        steps=None
+        if (args.eval_steps is not None and args.eval_steps < 0)
+        else args.eval_steps,
+        checkpoint_path=args.eval_checkpoint,
+    )
+    logging.info(f"Evaluating time: {datetime.now() - start}.")
 
 
 if __name__ == "__main__":
-  main()
-  logging.info("Job done.")
+    main()
+    logging.info("Job done.")
diff --git a/pushservice/src/main/python/models/heavy_ranking/features.py b/pushservice/src/main/python/models/heavy_ranking/features.py
index ce6a2686a..bf91f1d77 100644
--- a/pushservice/src/main/python/models/heavy_ranking/features.py
+++ b/pushservice/src/main/python/models/heavy_ranking/features.py
@@ -1,138 +1,152 @@
 import os
 from typing import Dict
 
+import tensorflow as tf
+import tensorflow.compat.v1 as tf1
+from tensorflow import Tensor
 from twitter.deepbird.projects.magic_recs.libs.model_utils import filter_nans_and_infs
+
 import twml
 from twml.layers import full_sparse, sparse_max_norm
 
 from .params import FeaturesParams, GraphParams, SparseFeaturesParams
 
-import tensorflow as tf
-from tensorflow import Tensor
-import tensorflow.compat.v1 as tf1
-
-
 FEAT_CONFIG_DEFAULT_VAL = 0
 DEFAULT_FEATURE_LIST_PATH = "./feature_list_default.yaml"
 FEATURE_LIST_DEFAULT_PATH = os.path.join(
-  os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_PATH
+    os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_PATH
 )
 
 
-def get_feature_config(data_spec_path=None, feature_list_provided=[], params: GraphParams = None):
+def get_feature_config(
+    data_spec_path=None, feature_list_provided=[], params: GraphParams = None
+):
+    a_string_feat_list = [
+        feat for feat, feat_type in feature_list_provided if feat_type != "S"
+    ]
 
-  a_string_feat_list = [feat for feat, feat_type in feature_list_provided if feat_type != "S"]
-
-  builder = twml.contrib.feature_config.FeatureConfigBuilder(
-    data_spec_path=data_spec_path, debug=False
-  )
-
-  builder = builder.extract_feature_group(
-    feature_regexes=a_string_feat_list,
-    group_name="continuous_features",
-    default_value=FEAT_CONFIG_DEFAULT_VAL,
-    type_filter=["CONTINUOUS"],
-  )
-
-  builder = builder.extract_feature_group(
-    feature_regexes=a_string_feat_list,
-    group_name="binary_features",
-    type_filter=["BINARY"],
-  )
+    builder = twml.contrib.feature_config.FeatureConfigBuilder(
+        data_spec_path=data_spec_path, debug=False
+    )
 
-  if params.model.features.sparse_features:
-    builder = builder.extract_features_as_hashed_sparse(
-      feature_regexes=a_string_feat_list,
-      hash_space_size_bits=params.model.features.sparse_features.bits,
-      type_filter=["DISCRETE", "STRING", "SPARSE_BINARY"],
-      output_tensor_name="sparse_not_continuous",
+    builder = builder.extract_feature_group(
+        feature_regexes=a_string_feat_list,
+        group_name="continuous_features",
+        default_value=FEAT_CONFIG_DEFAULT_VAL,
+        type_filter=["CONTINUOUS"],
     )
 
-    builder = builder.extract_features_as_hashed_sparse(
-      feature_regexes=[feat for feat, feat_type in feature_list_provided if feat_type == "S"],
-      hash_space_size_bits=params.model.features.sparse_features.bits,
-      type_filter=["SPARSE_CONTINUOUS"],
-      output_tensor_name="sparse_continuous",
+    builder = builder.extract_feature_group(
+        feature_regexes=a_string_feat_list,
+        group_name="binary_features",
+        type_filter=["BINARY"],
     )
 
-  builder = builder.add_labels([task.label for task in params.tasks] + ["label.ntabDislike"])
+    if params.model.features.sparse_features:
+        builder = builder.extract_features_as_hashed_sparse(
+            feature_regexes=a_string_feat_list,
+            hash_space_size_bits=params.model.features.sparse_features.bits,
+            type_filter=["DISCRETE", "STRING", "SPARSE_BINARY"],
+            output_tensor_name="sparse_not_continuous",
+        )
+
+        builder = builder.extract_features_as_hashed_sparse(
+            feature_regexes=[
+                feat for feat, feat_type in feature_list_provided if feat_type == "S"
+            ],
+            hash_space_size_bits=params.model.features.sparse_features.bits,
+            type_filter=["SPARSE_CONTINUOUS"],
+            output_tensor_name="sparse_continuous",
+        )
+
+    builder = builder.add_labels(
+        [task.label for task in params.tasks] + ["label.ntabDislike"]
+    )
 
-  if params.weight:
-    builder = builder.define_weight(params.weight)
+    if params.weight:
+        builder = builder.define_weight(params.weight)
 
-  return builder.build()
+    return builder.build()
 
 
 def dense_features(features: Dict[str, Tensor], training: bool) -> Tensor:
-  """
-  Performs feature transformations on the raw dense features (continuous and binary).
-  """
-  with tf.name_scope("dense_features"):
-    x = filter_nans_and_infs(features["continuous_features"])
-
-    x = tf.sign(x) * tf.math.log(tf.abs(x) + 1)
-    x = tf1.layers.batch_normalization(
-      x, momentum=0.9999, training=training, renorm=training, axis=1
-    )
-    x = tf.clip_by_value(x, -5, 5)
+    """
+    Performs feature transformations on the raw dense features (continuous and binary).
+    """
+    with tf.name_scope("dense_features"):
+        x = filter_nans_and_infs(features["continuous_features"])
 
-    transformed_continous_features = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)
+        x = tf.sign(x) * tf.math.log(tf.abs(x) + 1)
+        x = tf1.layers.batch_normalization(
+            x, momentum=0.9999, training=training, renorm=training, axis=1
+        )
+        x = tf.clip_by_value(x, -5, 5)
 
-    binary_features = filter_nans_and_infs(features["binary_features"])
-    binary_features = tf.dtypes.cast(binary_features, tf.float32)
+        transformed_continous_features = tf.where(
+            tf.math.is_nan(x), tf.zeros_like(x), x
+        )
 
-    output = tf.concat([transformed_continous_features, binary_features], axis=1)
+        binary_features = filter_nans_and_infs(features["binary_features"])
+        binary_features = tf.dtypes.cast(binary_features, tf.float32)
 
-    return output
+        output = tf.concat([transformed_continous_features, binary_features], axis=1)
+
+        return output
 
 
 def sparse_features(
-  features: Dict[str, Tensor], training: bool, params: SparseFeaturesParams
+    features: Dict[str, Tensor], training: bool, params: SparseFeaturesParams
+) -> Tensor:
+    """
+    Performs feature transformations on the raw sparse features.
+    """
+
+    with tf.name_scope("sparse_features"):
+        with tf.name_scope("sparse_not_continuous"):
+            sparse_not_continuous = full_sparse(
+                inputs=features["sparse_not_continuous"],
+                output_size=params.embedding_size,
+                use_sparse_grads=training,
+                use_binary_values=False,
+            )
+
+        with tf.name_scope("sparse_continuous"):
+            shape_enforced_input = twml.util.limit_sparse_tensor_size(
+                sparse_tf=features["sparse_continuous"],
+                input_size_bits=params.bits,
+                mask_indices=False,
+            )
+
+            normalized_continuous_sparse = sparse_max_norm(
+                inputs=shape_enforced_input, is_training=training
+            )
+
+            sparse_continuous = full_sparse(
+                inputs=normalized_continuous_sparse,
+                output_size=params.embedding_size,
+                use_sparse_grads=training,
+                use_binary_values=False,
+            )
+
+        output = tf.concat([sparse_not_continuous, sparse_continuous], axis=1)
+
+        return output
+
+
+def get_features(
+    features: Dict[str, Tensor], training: bool, params: FeaturesParams
 ) -> Tensor:
-  """
-  Performs feature transformations on the raw sparse features.
- """ - - with tf.name_scope("sparse_features"): - with tf.name_scope("sparse_not_continuous"): - sparse_not_continuous = full_sparse( - inputs=features["sparse_not_continuous"], - output_size=params.embedding_size, - use_sparse_grads=training, - use_binary_values=False, - ) - - with tf.name_scope("sparse_continuous"): - shape_enforced_input = twml.util.limit_sparse_tensor_size( - sparse_tf=features["sparse_continuous"], input_size_bits=params.bits, mask_indices=False - ) - - normalized_continuous_sparse = sparse_max_norm( - inputs=shape_enforced_input, is_training=training - ) - - sparse_continuous = full_sparse( - inputs=normalized_continuous_sparse, - output_size=params.embedding_size, - use_sparse_grads=training, - use_binary_values=False, - ) - - output = tf.concat([sparse_not_continuous, sparse_continuous], axis=1) - - return output - - -def get_features(features: Dict[str, Tensor], training: bool, params: FeaturesParams) -> Tensor: - """ - Performs feature transformations on the dense and sparse features and combine the resulting - tensors into a single one. - """ - with tf.name_scope("features"): - x = dense_features(features, training) - tf1.logging.info(f"Dense features: {x.shape}") - - if params.sparse_features: - x = tf.concat([x, sparse_features(features, training, params.sparse_features)], axis=1) - - return x + """ + Performs feature transformations on the dense and sparse features and combine the resulting + tensors into a single one. + """ + with tf.name_scope("features"): + x = dense_features(features, training) + tf1.logging.info(f"Dense features: {x.shape}") + + if params.sparse_features: + x = tf.concat( + [x, sparse_features(features, training, params.sparse_features)], axis=1 + ) + + return x diff --git a/pushservice/src/main/python/models/heavy_ranking/graph.py b/pushservice/src/main/python/models/heavy_ranking/graph.py index 4188736ac..7cd974ff0 100644 --- a/pushservice/src/main/python/models/heavy_ranking/graph.py +++ b/pushservice/src/main/python/models/heavy_ranking/graph.py @@ -11,119 +11,130 @@ from abc import ABC, abstractmethod from typing import Any, Dict +import tensorflow as tf +import tensorflow.compat.v1 as tf1 from twitter.deepbird.hparam import HParams + import twml from ..libs.model_utils import generate_disliked_mask from .params import GraphParams -import tensorflow as tf -import tensorflow.compat.v1 as tf1 - class Graph(ABC): - def __init__(self, params: GraphParams): - self.params = params - - @abstractmethod - def get_logits(self, features: Dict[str, tf.Tensor], mode: tf.estimator.ModeKeys) -> tf.Tensor: - pass - - def get_probabilities(self, logits: tf.Tensor) -> tf.Tensor: - return tf.math.cumprod(tf.nn.sigmoid(logits), axis=1, name="probabilities") - - def get_task_weights(self, labels: tf.Tensor) -> tf.Tensor: - oonc_label = tf.reshape(labels[:, 0], shape=(-1, 1)) - task_weights = tf.concat([tf.ones_like(oonc_label), oonc_label], axis=1) - - n_labels = len(self.params.tasks) - task_weights = tf.reshape(task_weights[:, 0:n_labels], shape=(-1, n_labels)) - - return task_weights - - def get_loss(self, labels: tf.Tensor, logits: tf.Tensor, **kwargs: Any) -> tf.Tensor: - with tf.name_scope("weights"): - disliked_mask = generate_disliked_mask(labels) - - labels = tf.reshape(labels[:, 0:2], shape=[-1, 2]) - - labels = labels * tf.cast(tf.logical_not(disliked_mask), dtype=labels.dtype) - - with tf.name_scope("task_weight"): - task_weights = self.get_task_weights(labels) - - with tf.name_scope("batch_size"): - batch_size = 
tf.cast(tf.shape(labels)[0], dtype=tf.float32, name="batch_size") - - weights = task_weights / batch_size - - with tf.name_scope("loss"): - loss = tf.reduce_sum( - tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) * weights, - ) - - return loss - - def get_score(self, probabilities: tf.Tensor) -> tf.Tensor: - with tf.name_scope("score_weight"): - score_weights = tf.constant([task.score_weight for task in self.params.tasks]) - score_weights = score_weights / tf.reduce_sum(score_weights, axis=0) - - with tf.name_scope("score"): - score = tf.reshape(tf.reduce_sum(probabilities * score_weights, axis=1), shape=[-1, 1]) - - return score - - def get_train_op(self, loss: tf.Tensor, twml_params) -> Any: - with tf.name_scope("optimizer"): - learning_rate = twml_params.learning_rate - optimizer = tf1.train.GradientDescentOptimizer(learning_rate=learning_rate) - - update_ops = set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS)) - with tf.control_dependencies(update_ops): - train_op = twml.optimizers.optimize_loss( - loss=loss, - variables=tf1.trainable_variables(), - global_step=tf1.train.get_global_step(), - optimizer=optimizer, - learning_rate=None, - ) - - return train_op - - def __call__( - self, - features: Dict[str, tf.Tensor], - labels: tf.Tensor, - mode: tf.estimator.ModeKeys, - params: HParams, - config=None, - ) -> Dict[str, tf.Tensor]: - training = mode == tf.estimator.ModeKeys.TRAIN - logits = self.get_logits(features=features, training=training) - probabilities = self.get_probabilities(logits=logits) - score = None - loss = None - train_op = None - - if mode == tf.estimator.ModeKeys.PREDICT: - score = self.get_score(probabilities=probabilities) - output = {"loss": loss, "train_op": train_op, "prediction": score} - - elif mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): - loss = self.get_loss(labels=labels, logits=logits) - - if mode == tf.estimator.ModeKeys.TRAIN: - train_op = self.get_train_op(loss=loss, twml_params=params) - - output = {"loss": loss, "train_op": train_op, "output": probabilities} - - else: - raise ValueError( - f""" + def __init__(self, params: GraphParams): + self.params = params + + @abstractmethod + def get_logits( + self, features: Dict[str, tf.Tensor], mode: tf.estimator.ModeKeys + ) -> tf.Tensor: + pass + + def get_probabilities(self, logits: tf.Tensor) -> tf.Tensor: + return tf.math.cumprod(tf.nn.sigmoid(logits), axis=1, name="probabilities") + + def get_task_weights(self, labels: tf.Tensor) -> tf.Tensor: + oonc_label = tf.reshape(labels[:, 0], shape=(-1, 1)) + task_weights = tf.concat([tf.ones_like(oonc_label), oonc_label], axis=1) + + n_labels = len(self.params.tasks) + task_weights = tf.reshape(task_weights[:, 0:n_labels], shape=(-1, n_labels)) + + return task_weights + + def get_loss( + self, labels: tf.Tensor, logits: tf.Tensor, **kwargs: Any + ) -> tf.Tensor: + with tf.name_scope("weights"): + disliked_mask = generate_disliked_mask(labels) + + labels = tf.reshape(labels[:, 0:2], shape=[-1, 2]) + + labels = labels * tf.cast(tf.logical_not(disliked_mask), dtype=labels.dtype) + + with tf.name_scope("task_weight"): + task_weights = self.get_task_weights(labels) + + with tf.name_scope("batch_size"): + batch_size = tf.cast( + tf.shape(labels)[0], dtype=tf.float32, name="batch_size" + ) + + weights = task_weights / batch_size + + with tf.name_scope("loss"): + loss = tf.reduce_sum( + tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) + * weights, + ) + + return loss + + def get_score(self, probabilities: 
tf.Tensor) -> tf.Tensor: + with tf.name_scope("score_weight"): + score_weights = tf.constant( + [task.score_weight for task in self.params.tasks] + ) + score_weights = score_weights / tf.reduce_sum(score_weights, axis=0) + + with tf.name_scope("score"): + score = tf.reshape( + tf.reduce_sum(probabilities * score_weights, axis=1), shape=[-1, 1] + ) + + return score + + def get_train_op(self, loss: tf.Tensor, twml_params) -> Any: + with tf.name_scope("optimizer"): + learning_rate = twml_params.learning_rate + optimizer = tf1.train.GradientDescentOptimizer(learning_rate=learning_rate) + + update_ops = set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS)) + with tf.control_dependencies(update_ops): + train_op = twml.optimizers.optimize_loss( + loss=loss, + variables=tf1.trainable_variables(), + global_step=tf1.train.get_global_step(), + optimizer=optimizer, + learning_rate=None, + ) + + return train_op + + def __call__( + self, + features: Dict[str, tf.Tensor], + labels: tf.Tensor, + mode: tf.estimator.ModeKeys, + params: HParams, + config=None, + ) -> Dict[str, tf.Tensor]: + training = mode == tf.estimator.ModeKeys.TRAIN + logits = self.get_logits(features=features, training=training) + probabilities = self.get_probabilities(logits=logits) + score = None + loss = None + train_op = None + + if mode == tf.estimator.ModeKeys.PREDICT: + score = self.get_score(probabilities=probabilities) + output = {"loss": loss, "train_op": train_op, "prediction": score} + + elif mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): + loss = self.get_loss(labels=labels, logits=logits) + + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = self.get_train_op(loss=loss, twml_params=params) + + output = {"loss": loss, "train_op": train_op, "output": probabilities} + + else: + raise ValueError( + f""" Invalid mode. Possible values are: {tf.estimator.ModeKeys.PREDICT}, {tf.estimator.ModeKeys.TRAIN}, and {tf.estimator.ModeKeys.EVAL} . Passed: {mode} """ - ) + ) - return output + return output diff --git a/pushservice/src/main/python/models/heavy_ranking/lib/layers.py b/pushservice/src/main/python/models/heavy_ranking/lib/layers.py index 33dd6f012..84d44aa20 100644 --- a/pushservice/src/main/python/models/heavy_ranking/lib/layers.py +++ b/pushservice/src/main/python/models/heavy_ranking/lib/layers.py @@ -7,122 +7,127 @@ class KerasConv1D(tf.keras.layers.Layer): - """ - Basic Conv1D layer in a wrapper to be compatible with ClemNet. - """ - - def __init__( - self, - kernel_size: int, - filters: int, - strides: int, - padding: str, - use_bias: bool = True, - kernel_initializer: str = "glorot_uniform", - bias_initializer: str = "zeros", - **kwargs: Any, - ): - super(KerasConv1D, self).__init__(**kwargs) - self.kernel_size = kernel_size - self.filters = filters - self.use_bias = use_bias - self.kernel_initializer = kernel_initializer - self.bias_initializer = bias_initializer - self.strides = strides - self.padding = padding - - def build(self, input_shape: tf.TensorShape) -> None: - assert ( - len(input_shape) == 3 - ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}." 
-
-    self.features = input_shape[1]
-
-    self.w = tf.keras.layers.Conv1D(
-      kernel_size=self.kernel_size,
-      filters=self.filters,
-      strides=self.strides,
-      padding=self.padding,
-      use_bias=self.use_bias,
-      kernel_initializer=self.kernel_initializer,
-      bias_initializer=self.bias_initializer,
-      name=self.name,
-    )
-
-  def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
-    return self.w(inputs)
+    """
+    Basic Conv1D layer in a wrapper to be compatible with ClemNet.
+    """
+
+    def __init__(
+        self,
+        kernel_size: int,
+        filters: int,
+        strides: int,
+        padding: str,
+        use_bias: bool = True,
+        kernel_initializer: str = "glorot_uniform",
+        bias_initializer: str = "zeros",
+        **kwargs: Any,
+    ):
+        super(KerasConv1D, self).__init__(**kwargs)
+        self.kernel_size = kernel_size
+        self.filters = filters
+        self.use_bias = use_bias
+        self.kernel_initializer = kernel_initializer
+        self.bias_initializer = bias_initializer
+        self.strides = strides
+        self.padding = padding
+
+    def build(self, input_shape: tf.TensorShape) -> None:
+        assert (
+            len(input_shape) == 3
+        ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}."
+
+        self.features = input_shape[1]
+
+        self.w = tf.keras.layers.Conv1D(
+            kernel_size=self.kernel_size,
+            filters=self.filters,
+            strides=self.strides,
+            padding=self.padding,
+            use_bias=self.use_bias,
+            kernel_initializer=self.kernel_initializer,
+            bias_initializer=self.bias_initializer,
+            name=self.name,
+        )
+
+    def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
+        return self.w(inputs)
 
 
 class ChannelWiseDense(tf.keras.layers.Layer):
-  """
-  Dense layer is applied to each channel separately. This is more memory and computationally
-  efficient than flattening the channels and performing single dense layers over it which is the
-  default behavior in tf1.
-  """
-
-  def __init__(
-    self,
-    output_size: int,
-    use_bias: bool,
-    kernel_initializer: str = "uniform_glorot",
-    bias_initializer: str = "zeros",
-    **kwargs: Any,
-  ):
-    super(ChannelWiseDense, self).__init__(**kwargs)
-    self.output_size = output_size
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-
-  def build(self, input_shape: tf.TensorShape) -> None:
-    assert (
-      len(input_shape) == 3
-    ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}."
-
-    input_size = input_shape[1]
-    channels = input_shape[2]
-
-    self.kernel = self.add_weight(
-      name="kernel",
-      shape=(channels, input_size, self.output_size),
-      initializer=self.kernel_initializer,
-      trainable=True,
-    )
-
-    self.bias = self.add_weight(
-      name="bias",
-      shape=(channels, self.output_size),
-      initializer=self.bias_initializer,
-      trainable=self.use_bias,
-    )
-
-  def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
-    x = inputs
-
-    transposed_x = tf.transpose(x, perm=[2, 0, 1])
-    transposed_residual = (
-      tf.transpose(tf.matmul(transposed_x, self.kernel), perm=[1, 0, 2]) + self.bias
-    )
-    output = tf.transpose(transposed_residual, perm=[0, 2, 1])
-
-    return output
+    """
+    Dense layer is applied to each channel separately. This is more memory and computationally
+    efficient than flattening the channels and performing single dense layers over it which is the
+    default behavior in tf1.
+ """ + + def __init__( + self, + output_size: int, + use_bias: bool, + kernel_initializer: str = "uniform_glorot", + bias_initializer: str = "zeros", + **kwargs: Any, + ): + super(ChannelWiseDense, self).__init__(**kwargs) + self.output_size = output_size + self.use_bias = use_bias + self.kernel_initializer = kernel_initializer + self.bias_initializer = bias_initializer + + def build(self, input_shape: tf.TensorShape) -> None: + assert ( + len(input_shape) == 3 + ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}." + + input_size = input_shape[1] + channels = input_shape[2] + + self.kernel = self.add_weight( + name="kernel", + shape=(channels, input_size, self.output_size), + initializer=self.kernel_initializer, + trainable=True, + ) + + self.bias = self.add_weight( + name="bias", + shape=(channels, self.output_size), + initializer=self.bias_initializer, + trainable=self.use_bias, + ) + + def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor: + x = inputs + + transposed_x = tf.transpose(x, perm=[2, 0, 1]) + transposed_residual = ( + tf.transpose(tf.matmul(transposed_x, self.kernel), perm=[1, 0, 2]) + + self.bias + ) + output = tf.transpose(transposed_residual, perm=[0, 2, 1]) + + return output class ResidualLayer(tf.keras.layers.Layer): - """ - Layer implementing a 3D-residual connection. - """ - - def build(self, input_shape: tf.TensorShape) -> None: - assert ( - len(input_shape) == 3 - ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}." - - def call(self, inputs: tf.Tensor, residual: tf.Tensor, **kwargs: Any) -> tf.Tensor: - shortcut = tf.keras.layers.Conv1D( - filters=int(residual.shape[2]), strides=1, kernel_size=1, padding="SAME", use_bias=False - )(inputs) - - output = tf.add(shortcut, residual) - - return output + """ + Layer implementing a 3D-residual connection. + """ + + def build(self, input_shape: tf.TensorShape) -> None: + assert ( + len(input_shape) == 3 + ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}." + + def call(self, inputs: tf.Tensor, residual: tf.Tensor, **kwargs: Any) -> tf.Tensor: + shortcut = tf.keras.layers.Conv1D( + filters=int(residual.shape[2]), + strides=1, + kernel_size=1, + padding="SAME", + use_bias=False, + )(inputs) + + output = tf.add(shortcut, residual) + + return output diff --git a/pushservice/src/main/python/models/heavy_ranking/lib/model.py b/pushservice/src/main/python/models/heavy_ranking/lib/model.py index c6c8b1c6b..25b85df0d 100644 --- a/pushservice/src/main/python/models/heavy_ranking/lib/model.py +++ b/pushservice/src/main/python/models/heavy_ranking/lib/model.py @@ -3,74 +3,80 @@ """ from typing import Any -from .layers import ChannelWiseDense, KerasConv1D, ResidualLayer -from .params import BlockParams, ClemNetParams - import tensorflow as tf import tensorflow.compat.v1 as tf1 +from .layers import ChannelWiseDense, KerasConv1D, ResidualLayer +from .params import BlockParams, ClemNetParams -class Block2(tf.keras.layers.Layer): - """ - Possible ClemNet block. Architecture is as follow: - Optional(DenseLayer + BN + Act) - Optional(ConvLayer + BN + Act) - Optional(Residual Layer) - - """ - - def __init__(self, params: BlockParams, **kwargs: Any): - super(Block2, self).__init__(**kwargs) - self.params = params - - def build(self, input_shape: tf.TensorShape) -> None: - assert ( - len(input_shape) == 3 - ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}." 
-
-  def call(self, inputs: tf.Tensor, training: bool) -> tf.Tensor:
-    x = inputs
-    if self.params.dense:
-      x = ChannelWiseDense(**self.params.dense.dict())(inputs=x, training=training)
-      x = tf1.layers.batch_normalization(x, momentum=0.9999, training=training, axis=1)
-      x = tf.keras.layers.Activation(self.params.activation)(x)
-
-    if self.params.conv:
-      x = KerasConv1D(**self.params.conv.dict())(inputs=x, training=training)
-      x = tf1.layers.batch_normalization(x, momentum=0.9999, training=training, axis=1)
-      x = tf.keras.layers.Activation(self.params.activation)(x)
-
-    if self.params.residual:
-      x = ResidualLayer()(inputs=inputs, residual=x)
-    return x
+class Block2(tf.keras.layers.Layer):
+    """
+    Possible ClemNet block. Architecture is as follow:
+      Optional(DenseLayer + BN + Act)
+      Optional(ConvLayer + BN + Act)
+      Optional(Residual Layer)
+
+    """
+
+    def __init__(self, params: BlockParams, **kwargs: Any):
+        super(Block2, self).__init__(**kwargs)
+        self.params = params
+
+    def build(self, input_shape: tf.TensorShape) -> None:
+        assert (
+            len(input_shape) == 3
+        ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}."
+
+    def call(self, inputs: tf.Tensor, training: bool) -> tf.Tensor:
+        x = inputs
+        if self.params.dense:
+            x = ChannelWiseDense(**self.params.dense.dict())(
+                inputs=x, training=training
+            )
+            x = tf1.layers.batch_normalization(
+                x, momentum=0.9999, training=training, axis=1
+            )
+            x = tf.keras.layers.Activation(self.params.activation)(x)
+
+        if self.params.conv:
+            x = KerasConv1D(**self.params.conv.dict())(inputs=x, training=training)
+            x = tf1.layers.batch_normalization(
+                x, momentum=0.9999, training=training, axis=1
+            )
+            x = tf.keras.layers.Activation(self.params.activation)(x)
+
+        if self.params.residual:
+            x = ResidualLayer()(inputs=inputs, residual=x)
+
+        return x
 
 
 class ClemNet(tf.keras.layers.Layer):
-  """
-  A residual network stacking residual blocks composed of dense layers and convolutions.
-  """
+    """
+    A residual network stacking residual blocks composed of dense layers and convolutions.
+    """
 
-  def __init__(self, params: ClemNetParams, **kwargs: Any):
-    super(ClemNet, self).__init__(**kwargs)
-    self.params = params
+    def __init__(self, params: ClemNetParams, **kwargs: Any):
+        super(ClemNet, self).__init__(**kwargs)
+        self.params = params
 
-  def build(self, input_shape: tf.TensorShape) -> None:
-    assert len(input_shape) in (
-      2,
-      3,
-    ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}."
+    def build(self, input_shape: tf.TensorShape) -> None:
+        assert len(input_shape) in (
+            2,
+            3,
+        ), f"Tensor shape must be of length 3. Passed tensor of shape {input_shape}."
 
-  def call(self, inputs: tf.Tensor, training: bool) -> tf.Tensor:
-    if len(inputs.shape) < 3:
-      inputs = tf.expand_dims(inputs, axis=-1)
+    def call(self, inputs: tf.Tensor, training: bool) -> tf.Tensor:
+        if len(inputs.shape) < 3:
+            inputs = tf.expand_dims(inputs, axis=-1)
 
-    x = inputs
-    for block_params in self.params.blocks:
-      x = Block2(block_params)(inputs=x, training=training)
+        x = inputs
+        for block_params in self.params.blocks:
+            x = Block2(block_params)(inputs=x, training=training)
 
-    x = tf.keras.layers.Flatten(name="flattened")(x)
-    if self.params.top:
-      x = tf.keras.layers.Dense(units=self.params.top.n_labels, name="logits")(x)
+        x = tf.keras.layers.Flatten(name="flattened")(x)
+        if self.params.top:
+            x = tf.keras.layers.Dense(units=self.params.top.n_labels, name="logits")(x)
 
-    return x
+        return x
diff --git a/pushservice/src/main/python/models/heavy_ranking/lib/params.py b/pushservice/src/main/python/models/heavy_ranking/lib/params.py
index 721d6ed95..211aebf86 100644
--- a/pushservice/src/main/python/models/heavy_ranking/lib/params.py
+++ b/pushservice/src/main/python/models/heavy_ranking/lib/params.py
@@ -5,45 +5,44 @@
 
 from pydantic import BaseModel, Extra, Field, PositiveInt
 
-
 # checkstyle: noqa
 
 
 class ExtendedBaseModel(BaseModel):
-  class Config:
-    extra = Extra.forbid
+    class Config:
+        extra = Extra.forbid
 
 
 class DenseParams(ExtendedBaseModel):
-  name: Optional[str]
-  bias_initializer: str = "zeros"
-  kernel_initializer: str = "glorot_uniform"
-  output_size: PositiveInt
-  use_bias: bool = Field(True)
+    name: Optional[str]
+    bias_initializer: str = "zeros"
+    kernel_initializer: str = "glorot_uniform"
+    output_size: PositiveInt
+    use_bias: bool = Field(True)
 
 
 class ConvParams(ExtendedBaseModel):
-  name: Optional[str]
-  bias_initializer: str = "zeros"
-  filters: PositiveInt
-  kernel_initializer: str = "glorot_uniform"
-  kernel_size: PositiveInt
-  padding: str = "SAME"
-  strides: PositiveInt = 1
-  use_bias: bool = Field(True)
+    name: Optional[str]
+    bias_initializer: str = "zeros"
+    filters: PositiveInt
+    kernel_initializer: str = "glorot_uniform"
+    kernel_size: PositiveInt
+    padding: str = "SAME"
+    strides: PositiveInt = 1
+    use_bias: bool = Field(True)
 
 
 class BlockParams(ExtendedBaseModel):
-  activation: Optional[str]
-  conv: Optional[ConvParams]
-  dense: Optional[DenseParams]
-  residual: Optional[bool]
+    activation: Optional[str]
+    conv: Optional[ConvParams]
+    dense: Optional[DenseParams]
+    residual: Optional[bool]
 
 
 class TopLayerParams(ExtendedBaseModel):
-  n_labels: PositiveInt
+    n_labels: PositiveInt
 
 
 class ClemNetParams(ExtendedBaseModel):
-  blocks: List[BlockParams] = []
-  top: Optional[TopLayerParams]
+    blocks: List[BlockParams] = []
+    top: Optional[TopLayerParams]
diff --git a/pushservice/src/main/python/models/heavy_ranking/model_pools.py b/pushservice/src/main/python/models/heavy_ranking/model_pools.py
index de59ee1a6..d7d886311 100644
--- a/pushservice/src/main/python/models/heavy_ranking/model_pools.py
+++ b/pushservice/src/main/python/models/heavy_ranking/model_pools.py
@@ -6,29 +6,30 @@
 
 from typing import Dict
 
+import tensorflow as tf
+
 from .features import get_features
 from .graph import Graph
 from .lib.model import ClemNet
 from .params import ModelTypeEnum
 
-import tensorflow as tf
-
 
 class MagicRecsClemNet(Graph):
-  def get_logits(self, features: Dict[str, tf.Tensor], training: bool) -> tf.Tensor:
-
-    with tf.name_scope("logits"):
-      inputs = get_features(features=features, training=training, params=self.params.model.features)
+    def get_logits(self, features: Dict[str, tf.Tensor], training: bool) -> tf.Tensor:
+        with tf.name_scope("logits"):
+            inputs = get_features(
+                features=features, training=training, params=self.params.model.features
+            )
 
-    with tf.name_scope("OONC_logits"):
-      model = ClemNet(params=self.params.model.architecture)
-      oonc_logit = model(inputs=inputs, training=training)
+            with tf.name_scope("OONC_logits"):
+                model = ClemNet(params=self.params.model.architecture)
+                oonc_logit = model(inputs=inputs, training=training)
 
-    with tf.name_scope("EngagementGivenOONC_logits"):
-      model = ClemNet(params=self.params.model.architecture)
-      eng_logits = model(inputs=inputs, training=training)
+            with tf.name_scope("EngagementGivenOONC_logits"):
+                model = ClemNet(params=self.params.model.architecture)
+                eng_logits = model(inputs=inputs, training=training)
 
-    return tf.concat([oonc_logit, eng_logits], axis=1)
+        return tf.concat([oonc_logit, eng_logits], axis=1)
 
 
 ALL_MODELS = {ModelTypeEnum.clemnet: MagicRecsClemNet}
diff --git a/pushservice/src/main/python/models/heavy_ranking/params.py b/pushservice/src/main/python/models/heavy_ranking/params.py
index 64a7de2b1..408c6e8c3 100644
--- a/pushservice/src/main/python/models/heavy_ranking/params.py
+++ b/pushservice/src/main/python/models/heavy_ranking/params.py
@@ -2,88 +2,95 @@
 import json
 from typing import List, Optional
 
-from .lib.params import BlockParams, ClemNetParams, ConvParams, DenseParams, TopLayerParams
-
-from pydantic import BaseModel, Extra, NonNegativeFloat
 import tensorflow.compat.v1 as tf
+from pydantic import BaseModel, Extra, NonNegativeFloat
 
+from .lib.params import (
+    BlockParams,
+    ClemNetParams,
+    ConvParams,
+    DenseParams,
+    TopLayerParams,
+)
 
 # checkstyle: noqa
 
 
 class ExtendedBaseModel(BaseModel):
-  class Config:
-    extra = Extra.forbid
+    class Config:
+        extra = Extra.forbid
 
 
 class SparseFeaturesParams(ExtendedBaseModel):
-  bits: int
-  embedding_size: int
+    bits: int
+    embedding_size: int
 
 
 class FeaturesParams(ExtendedBaseModel):
-  sparse_features: Optional[SparseFeaturesParams]
+    sparse_features: Optional[SparseFeaturesParams]
 
 
 class ModelTypeEnum(str, enum.Enum):
-  clemnet: str = "clemnet"
+    clemnet: str = "clemnet"
 
 
 class ModelParams(ExtendedBaseModel):
-  name: ModelTypeEnum
-  features: FeaturesParams
-  architecture: ClemNetParams
+    name: ModelTypeEnum
+    features: FeaturesParams
+    architecture: ClemNetParams
 
 
 class TaskNameEnum(str, enum.Enum):
-  oonc: str = "OONC"
-  engagement: str = "Engagement"
+    oonc: str = "OONC"
+    engagement: str = "Engagement"
 
 
 class Task(ExtendedBaseModel):
-  name: TaskNameEnum
-  label: str
-  score_weight: NonNegativeFloat
+    name: TaskNameEnum
+    label: str
+    score_weight: NonNegativeFloat
 
 
 DEFAULT_TASKS = [
-  Task(name=TaskNameEnum.oonc, label="label", score_weight=0.9),
-  Task(name=TaskNameEnum.engagement, label="label.engagement", score_weight=0.1),
+    Task(name=TaskNameEnum.oonc, label="label", score_weight=0.9),
+    Task(name=TaskNameEnum.engagement, label="label.engagement", score_weight=0.1),
 ]
 
 
class GraphParams(ExtendedBaseModel):
-  tasks: List[Task] = DEFAULT_TASKS
-  model: ModelParams
-  weight: Optional[str]
+    tasks: List[Task] = DEFAULT_TASKS
+    model: ModelParams
+    weight: Optional[str]
 
 
 DEFAULT_ARCHITECTURE_PARAMS = ClemNetParams(
-  blocks=[
-    BlockParams(
-      activation="relu",
-      conv=ConvParams(kernel_size=3, filters=5),
-      dense=DenseParams(output_size=output_size),
-      residual=False,
-    )
-    for output_size in [1024, 512, 256, 128]
-  ],
-  top=TopLayerParams(n_labels=1),
+    blocks=[
+        BlockParams(
+            activation="relu",
+            conv=ConvParams(kernel_size=3, filters=5),
+            dense=DenseParams(output_size=output_size),
+            residual=False,
+        )
+        for output_size in [1024, 512, 256, 128]
+    ],
+    top=TopLayerParams(n_labels=1),
 )
 
 DEFAULT_GRAPH_PARAMS = GraphParams(
-  model=ModelParams(
-    name=ModelTypeEnum.clemnet,
-    architecture=DEFAULT_ARCHITECTURE_PARAMS,
-    features=FeaturesParams(sparse_features=SparseFeaturesParams(bits=18, embedding_size=50)),
-  ),
+    model=ModelParams(
+        name=ModelTypeEnum.clemnet,
+        architecture=DEFAULT_ARCHITECTURE_PARAMS,
+        features=FeaturesParams(
+            sparse_features=SparseFeaturesParams(bits=18, embedding_size=50)
+        ),
+    ),
 )
 
 
 def load_graph_params(args) -> GraphParams:
-  params = DEFAULT_GRAPH_PARAMS
-  if args.param_file:
-    with tf.io.gfile.GFile(args.param_file, mode="r+") as file:
-      params = GraphParams.parse_obj(json.load(file))
+    params = DEFAULT_GRAPH_PARAMS
+    if args.param_file:
+        with tf.io.gfile.GFile(args.param_file, mode="r+") as file:
+            params = GraphParams.parse_obj(json.load(file))
 
-  return params
+    return params
diff --git a/pushservice/src/main/python/models/heavy_ranking/run_args.py b/pushservice/src/main/python/models/heavy_ranking/run_args.py
index 1cc33a8e0..c8d5f1941 100644
--- a/pushservice/src/main/python/models/heavy_ranking/run_args.py
+++ b/pushservice/src/main/python/models/heavy_ranking/run_args.py
@@ -4,56 +4,56 @@
 
 
 def get_training_arg_parser():
-  parser = DataRecordTrainer.add_parser_arguments()
-
-  parser.add_argument(
-    "--feature_list",
-    default=FEATURE_LIST_DEFAULT_PATH,
-    type=str,
-    help="Which features to use for training",
-  )
-
-  parser.add_argument(
-    "--param_file",
-    default=None,
-    type=str,
-    help="Path to JSON file containing the graph parameters. If None, model will load default parameters.",
-  )
-
-  parser.add_argument(
-    "--directly_export_best",
-    default=False,
-    action="store_true",
-    help="whether to directly_export best_checkpoint",
-  )
-
-  parser.add_argument(
-    "--warm_start_from", default=None, type=str, help="model dir to warm start from"
-  )
-
-  parser.add_argument(
-    "--warm_start_base_dir",
-    default=None,
-    type=str,
-    help="latest ckpt in this folder will be used to ",
-  )
-
-  parser.add_argument(
-    "--model_type",
-    default=None,
-    type=str,
-    help="Which type of model to train.",
-  )
-  return parser
+    parser = DataRecordTrainer.add_parser_arguments()
+
+    parser.add_argument(
+        "--feature_list",
+        default=FEATURE_LIST_DEFAULT_PATH,
+        type=str,
+        help="Which features to use for training",
+    )
+
+    parser.add_argument(
+        "--param_file",
+        default=None,
+        type=str,
+        help="Path to JSON file containing the graph parameters. If None, model will load default parameters.",
+    )
+
+    parser.add_argument(
+        "--directly_export_best",
+        default=False,
+        action="store_true",
+        help="whether to directly_export best_checkpoint",
+    )
+
+    parser.add_argument(
+        "--warm_start_from", default=None, type=str, help="model dir to warm start from"
+    )
+
+    parser.add_argument(
+        "--warm_start_base_dir",
+        default=None,
+        type=str,
+        help="latest ckpt in this folder will be used to ",
+    )
+
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        help="Which type of model to train.",
+    )
+    return parser
 
 
 def get_eval_arg_parser():
-  parser = get_training_arg_parser()
-  parser.add_argument(
-    "--eval_checkpoint",
-    default=None,
-    type=str,
-    help="Which checkpoint to use for evaluation",
-  )
-
-  return parser
+    parser = get_training_arg_parser()
+    parser.add_argument(
+        "--eval_checkpoint",
+        default=None,
+        type=str,
+        help="Which checkpoint to use for evaluation",
+    )
+
+    return parser
diff --git a/pushservice/src/main/python/models/heavy_ranking/update_warm_start_checkpoint.py b/pushservice/src/main/python/models/heavy_ranking/update_warm_start_checkpoint.py
index 04887b9cf..8176260ce 100644
--- a/pushservice/src/main/python/models/heavy_ranking/update_warm_start_checkpoint.py
+++ b/pushservice/src/main/python/models/heavy_ranking/update_warm_start_checkpoint.py
@@ -5,142 +5,161 @@
 
 import os
 
-from twitter.deepbird.projects.magic_recs.libs.get_feat_config import FEATURE_LIST_DEFAULT_PATH
+import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import logging
+from twitter.deepbird.projects.magic_recs.libs.get_feat_config import (
+    FEATURE_LIST_DEFAULT_PATH,
+)
 from twitter.deepbird.projects.magic_recs.libs.warm_start_utils_v11 import (
-  get_feature_list_for_heavy_ranking,
-  mkdirp,
-  rename_dir,
-  rmdir,
-  warm_start_checkpoint,
+    get_feature_list_for_heavy_ranking,
+    mkdirp,
+    rename_dir,
+    rmdir,
+    warm_start_checkpoint,
 )
+
 import twml
 from twml.trainers import DataRecordTrainer
 
-import tensorflow.compat.v1 as tf
-from tensorflow.compat.v1 import logging
-
 
 def get_arg_parser():
-  parser = DataRecordTrainer.add_parser_arguments()
-  parser.add_argument(
-    "--model_type",
-    default="deepnorm_gbdt_inputdrop2_rescale",
-    type=str,
-    help="specify the model type to use.",
-  )
-
-  parser.add_argument(
-    "--model_trainer_name",
-    default="None",
-    type=str,
-    help="deprecated, added here just for api compatibility.",
-  )
-
-  parser.add_argument(
-    "--warm_start_base_dir",
-    default="none",
-    type=str,
-    help="latest ckpt in this folder will be used.",
-  )
-
-  parser.add_argument(
-    "--output_checkpoint_dir",
-    default="none",
-    type=str,
-    help="Output folder for warm started ckpt. 
If none, it will move warm_start_base_dir to backup, and overwrite it", - ) - - parser.add_argument( - "--feature_list", - default="none", - type=str, - help="Which features to use for training", - ) - - parser.add_argument( - "--old_feature_list", - default="none", - type=str, - help="Which features to use for training", - ) - - return parser - - -def get_params(args=None): - parser = get_arg_parser() - if args is None: - return parser.parse_args() - else: - return parser.parse_args(args) - - -def _main(): - opt = get_params() - logging.info("parse is: ") - logging.info(opt) - - if opt.feature_list == "none": - feature_list_path = FEATURE_LIST_DEFAULT_PATH - else: - feature_list_path = opt.feature_list - - if opt.warm_start_base_dir != "none" and tf.io.gfile.exists(opt.warm_start_base_dir): - if opt.output_checkpoint_dir == "none" or opt.output_checkpoint_dir == opt.warm_start_base_dir: - _warm_start_base_dir = os.path.normpath(opt.warm_start_base_dir) + "_backup_warm_start" - _output_folder_dir = opt.warm_start_base_dir - - rename_dir(opt.warm_start_base_dir, _warm_start_base_dir) - tf.logging.info(f"moved {opt.warm_start_base_dir} to {_warm_start_base_dir}") - else: - _warm_start_base_dir = opt.warm_start_base_dir - _output_folder_dir = opt.output_checkpoint_dir + parser = DataRecordTrainer.add_parser_arguments() + parser.add_argument( + "--model_type", + default="deepnorm_gbdt_inputdrop2_rescale", + type=str, + help="specify the model type to use.", + ) - continuous_binary_feat_list_save_path = os.path.join( - _warm_start_base_dir, "continuous_binary_feat_list.json" + parser.add_argument( + "--model_trainer_name", + default="None", + type=str, + help="deprecated, added here just for api compatibility.", ) - if opt.old_feature_list != "none": - tf.logging.info("getting old continuous_binary_feat_list") - continuous_binary_feat_list = get_feature_list_for_heavy_ranking( - opt.old_feature_list, opt.data_spec - ) - rmdir(continuous_binary_feat_list_save_path) - twml.util.write_file( - continuous_binary_feat_list_save_path, continuous_binary_feat_list, encode="json" - ) - tf.logging.info(f"Finish writting files to {continuous_binary_feat_list_save_path}") - - warm_start_folder = os.path.join(_warm_start_base_dir, "best_checkpoint") - if not tf.io.gfile.exists(warm_start_folder): - warm_start_folder = _warm_start_base_dir - - rmdir(_output_folder_dir) - mkdirp(_output_folder_dir) - - new_ckpt = warm_start_checkpoint( - warm_start_folder, - continuous_binary_feat_list_save_path, - feature_list_path, - opt.data_spec, - _output_folder_dir, - opt.model_type, + parser.add_argument( + "--warm_start_base_dir", + default="none", + type=str, + help="latest ckpt in this folder will be used.", ) - logging.info(f"Created new ckpt {new_ckpt} from {warm_start_folder}") - tf.logging.info("getting new continuous_binary_feat_list") - new_continuous_binary_feat_list_save_path = os.path.join( - _output_folder_dir, "continuous_binary_feat_list.json" + parser.add_argument( + "--output_checkpoint_dir", + default="none", + type=str, + help="Output folder for warm started ckpt. 
If none, it will move warm_start_base_dir to backup, and overwrite it", ) - continuous_binary_feat_list = get_feature_list_for_heavy_ranking( - feature_list_path, opt.data_spec + + parser.add_argument( + "--feature_list", + default="none", + type=str, + help="Which features to use for training", ) - rmdir(new_continuous_binary_feat_list_save_path) - twml.util.write_file( - new_continuous_binary_feat_list_save_path, continuous_binary_feat_list, encode="json" + + parser.add_argument( + "--old_feature_list", + default="none", + type=str, + help="Which features to use for training", ) - tf.logging.info(f"Finish writting files to {new_continuous_binary_feat_list_save_path}") + + return parser + + +def get_params(args=None): + parser = get_arg_parser() + if args is None: + return parser.parse_args() + else: + return parser.parse_args(args) + + +def _main(): + opt = get_params() + logging.info("parse is: ") + logging.info(opt) + + if opt.feature_list == "none": + feature_list_path = FEATURE_LIST_DEFAULT_PATH + else: + feature_list_path = opt.feature_list + + if opt.warm_start_base_dir != "none" and tf.io.gfile.exists( + opt.warm_start_base_dir + ): + if ( + opt.output_checkpoint_dir == "none" + or opt.output_checkpoint_dir == opt.warm_start_base_dir + ): + _warm_start_base_dir = ( + os.path.normpath(opt.warm_start_base_dir) + "_backup_warm_start" + ) + _output_folder_dir = opt.warm_start_base_dir + + rename_dir(opt.warm_start_base_dir, _warm_start_base_dir) + tf.logging.info( + f"moved {opt.warm_start_base_dir} to {_warm_start_base_dir}" + ) + else: + _warm_start_base_dir = opt.warm_start_base_dir + _output_folder_dir = opt.output_checkpoint_dir + + continuous_binary_feat_list_save_path = os.path.join( + _warm_start_base_dir, "continuous_binary_feat_list.json" + ) + + if opt.old_feature_list != "none": + tf.logging.info("getting old continuous_binary_feat_list") + continuous_binary_feat_list = get_feature_list_for_heavy_ranking( + opt.old_feature_list, opt.data_spec + ) + rmdir(continuous_binary_feat_list_save_path) + twml.util.write_file( + continuous_binary_feat_list_save_path, + continuous_binary_feat_list, + encode="json", + ) + tf.logging.info( + f"Finish writting files to {continuous_binary_feat_list_save_path}" + ) + + warm_start_folder = os.path.join(_warm_start_base_dir, "best_checkpoint") + if not tf.io.gfile.exists(warm_start_folder): + warm_start_folder = _warm_start_base_dir + + rmdir(_output_folder_dir) + mkdirp(_output_folder_dir) + + new_ckpt = warm_start_checkpoint( + warm_start_folder, + continuous_binary_feat_list_save_path, + feature_list_path, + opt.data_spec, + _output_folder_dir, + opt.model_type, + ) + logging.info(f"Created new ckpt {new_ckpt} from {warm_start_folder}") + + tf.logging.info("getting new continuous_binary_feat_list") + new_continuous_binary_feat_list_save_path = os.path.join( + _output_folder_dir, "continuous_binary_feat_list.json" + ) + continuous_binary_feat_list = get_feature_list_for_heavy_ranking( + feature_list_path, opt.data_spec + ) + rmdir(new_continuous_binary_feat_list_save_path) + twml.util.write_file( + new_continuous_binary_feat_list_save_path, + continuous_binary_feat_list, + encode="json", + ) + tf.logging.info( + f"Finish writting files to {new_continuous_binary_feat_list_save_path}" + ) if __name__ == "__main__": - _main() + _main() diff --git a/pushservice/src/main/python/models/libs/customized_full_sparse.py b/pushservice/src/main/python/models/libs/customized_full_sparse.py index b41f7d694..c5f10c4b9 100644 --- 
a/pushservice/src/main/python/models/libs/customized_full_sparse.py +++ b/pushservice/src/main/python/models/libs/customized_full_sparse.py @@ -4,53 +4,55 @@ overide default action. """ +import tensorflow.compat.v1 as tf + from twml.layers import FullSparse as defaultFullSparse from twml.layers.full_sparse import sparse_dense_matmul -import tensorflow.compat.v1 as tf - class FullSparse(defaultFullSparse): - def call(self, inputs, use_binary_values=None, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A SparseTensor or a list of SparseTensors. - If `inputs` is a list, all tensors must have same `dense_shape`. - - Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + add_n([sp_a * dense_b for sp_a in inputs])`. - """ - - if use_binary_values is not None: - default_use_binary_values = use_binary_values - else: - default_use_binary_values = self.use_binary_values - - if isinstance(default_use_binary_values, (list, tuple)): - raise ValueError( - "use_binary_values can not be %s when inputs is %s" - % (type(default_use_binary_values), type(inputs)) - ) - - outputs = sparse_dense_matmul( - inputs, - self.weight, - self.use_sparse_grads, - default_use_binary_values, - name="sparse_mm", - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - ) - - if self.bias is not None: - outputs = tf.nn.bias_add(outputs, self.bias) - - if self.activation is not None: - return self.activation(outputs) # pylint: disable=not-callable - return outputs + def call( + self, inputs, use_binary_values=None, **kwargs + ): # pylint: disable=unused-argument + """The logic of the layer lives here. + + Arguments: + inputs: + A SparseTensor or a list of SparseTensors. + If `inputs` is a list, all tensors must have same `dense_shape`. + + Returns: + - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. + - If `inputs` is a `list[SparseTensor`, then returns + `bias + add_n([sp_a * dense_b for sp_a in inputs])`. 
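+
+        Example (an illustrative sketch only; the constructor arguments below
+        are assumptions, not the verified twml `FullSparse` signature):
+
+            sp = tf.SparseTensor(
+                indices=[[0, 0], [1, 2]], values=[1.0, 2.0], dense_shape=[2, 4]
+            )
+            layer = FullSparse(output_size=8, input_size=4)
+            out = layer(sp)  # dense Tensor of shape [2, 8]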
+ """ + + if use_binary_values is not None: + default_use_binary_values = use_binary_values + else: + default_use_binary_values = self.use_binary_values + + if isinstance(default_use_binary_values, (list, tuple)): + raise ValueError( + "use_binary_values can not be %s when inputs is %s" + % (type(default_use_binary_values), type(inputs)) + ) + + outputs = sparse_dense_matmul( + inputs, + self.weight, + self.use_sparse_grads, + default_use_binary_values, + name="sparse_mm", + partition_axis=self.partition_axis, + num_partitions=self.num_partitions, + compress_ids=self._use_compression, + cast_indices_dtype=self._cast_indices_dtype, + ) + + if self.bias is not None: + outputs = tf.nn.bias_add(outputs, self.bias) + + if self.activation is not None: + return self.activation(outputs) # pylint: disable=not-callable + return outputs diff --git a/pushservice/src/main/python/models/libs/get_feat_config.py b/pushservice/src/main/python/models/libs/get_feat_config.py index 4d8b3e93c..62f87391b 100644 --- a/pushservice/src/main/python/models/libs/get_feat_config.py +++ b/pushservice/src/main/python/models/libs/get_feat_config.py @@ -1,9 +1,11 @@ import os -from twitter.deepbird.projects.magic_recs.libs.metric_fn_utils import USER_AGE_FEATURE_NAME +from twitter.deepbird.projects.magic_recs.libs.metric_fn_utils import ( + USER_AGE_FEATURE_NAME, +) from twitter.deepbird.projects.magic_recs.libs.model_utils import read_config -from twml.contrib import feature_config as contrib_feature_config +from twml.contrib import feature_config as contrib_feature_config # checkstyle: noqa @@ -13,164 +15,172 @@ DEFAULT_FEATURE_LIST_PATH = "./feature_list_default.yaml" FEATURE_LIST_DEFAULT_PATH = os.path.join( - os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_PATH + os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_PATH ) DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH = "./feature_list_light_ranking.yaml" FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH = os.path.join( - os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH + os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH ) FEATURE_LIST_DEFAULT = read_config(FEATURE_LIST_DEFAULT_PATH).items() -FEATURE_LIST_LIGHT_RANKING_DEFAULT = read_config(FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH).items() +FEATURE_LIST_LIGHT_RANKING_DEFAULT = read_config( + FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH +).items() LABELS = ["label"] LABELS_MTL = {"OONC": ["label"], "OONC_Engagement": ["label", "label.engagement"]} LABELS_LR = { - "Sent": ["label.sent"], - "HeavyRankPosition": ["meta.ranking.is_top3"], - "HeavyRankProbability": ["meta.ranking.weighted_oonc_model_score"], + "Sent": ["label.sent"], + "HeavyRankPosition": ["meta.ranking.is_top3"], + "HeavyRankProbability": ["meta.ranking.weighted_oonc_model_score"], } def _get_new_feature_config_base( - data_spec_path, - labels, - add_sparse_continous=True, - add_gbdt=True, - add_user_id=False, - add_timestamp=False, - add_user_age=False, - feature_list_provided=[], - opt=None, - run_light_ranking_group_metrics_in_bq=False, + data_spec_path, + labels, + add_sparse_continous=True, + add_gbdt=True, + add_user_id=False, + add_timestamp=False, + add_user_age=False, + feature_list_provided=[], + opt=None, + run_light_ranking_group_metrics_in_bq=False, ): - """ - Getter of the feature config based on specification. - - Args: - data_spec_path: A string indicating the path of the data_spec.json file, which could be - either a local path or a hdfs path. 
-    labels: A list of strings indicating the name of the label in the data spec.
-    add_sparse_continous: A bool indicating if sparse_continuous feature needs to be included.
-    add_gbdt: A bool indicating if gbdt feature needs to be included.
-    add_user_id: A bool indicating if user_id feature needs to be included.
-    add_timestamp: A bool indicating if timestamp feature needs to be included. This will be useful
-      for sequential models and meta learning models.
-    add_user_age: A bool indicating if the user age feature needs to be included.
-    feature_list_provided: A list of features that need to be included. If not specified, will use
-      FEATURE_LIST_DEFAULT by default.
-    opt: A namespace of arguments indicating the hyperparameters.
-    run_light_ranking_group_metrics_in_bq: A bool indicating if heavy ranker score info needs to be included to compute group metrics in BigQuery.
-
-  Returns:
-    A twml feature config object.
-  """
-
-  input_size_bits = DEFAULT_INPUT_SIZE_BITS if opt is None else opt.input_size_bits
-
-  feature_list = feature_list_provided if feature_list_provided != [] else FEATURE_LIST_DEFAULT
-  a_string_feat_list = [f[0] for f in feature_list if f[1] != "S"]
-
-  builder = contrib_feature_config.FeatureConfigBuilder(data_spec_path=data_spec_path)
-
-  builder = builder.extract_feature_group(
-    feature_regexes=a_string_feat_list,
-    group_name="continuous",
-    default_value=FEAT_CONFIG_DEFAULT_VAL,
-    type_filter=["CONTINUOUS"],
-  )
-
-  builder = builder.extract_features_as_hashed_sparse(
-    feature_regexes=a_string_feat_list,
-    output_tensor_name="sparse_no_continuous",
-    hash_space_size_bits=input_size_bits,
-    type_filter=["BINARY", "DISCRETE", "STRING", "SPARSE_BINARY"],
-  )
-
-  if add_gbdt:
-    builder = builder.extract_features_as_hashed_sparse(
-      feature_regexes=["ads\..*"],
-      output_tensor_name="gbdt_sparse",
-      hash_space_size_bits=input_size_bits,
+    """
+    Getter of the feature config based on specification.
+
+    Args:
+        data_spec_path: A string indicating the path of the data_spec.json file, which could be
+            either a local path or an HDFS path.
+        labels: A list of strings indicating the name of the label in the data spec.
+        add_sparse_continous: A bool indicating if sparse_continuous feature needs to be included.
+        add_gbdt: A bool indicating if gbdt feature needs to be included.
+        add_user_id: A bool indicating if user_id feature needs to be included.
+        add_timestamp: A bool indicating if timestamp feature needs to be included. This will be useful
+            for sequential models and meta learning models.
+        add_user_age: A bool indicating if the user age feature needs to be included.
+        feature_list_provided: A list of features that need to be included. If not specified, will use
+            FEATURE_LIST_DEFAULT by default.
+        opt: A namespace of arguments indicating the hyperparameters.
+        run_light_ranking_group_metrics_in_bq: A bool indicating if heavy ranker score info needs to be included to compute group metrics in BigQuery.
+
+    Returns:
+        A twml feature config object.
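+
+    Example (illustrative only; the data spec path is a placeholder):
+
+        config = _get_new_feature_config_base(
+            data_spec_path="hdfs://path/to/data_spec.json",
+            labels=["label"],
+        )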
+ """ + + input_size_bits = DEFAULT_INPUT_SIZE_BITS if opt is None else opt.input_size_bits + + feature_list = ( + feature_list_provided if feature_list_provided != [] else FEATURE_LIST_DEFAULT ) + a_string_feat_list = [f[0] for f in feature_list if f[1] != "S"] + + builder = contrib_feature_config.FeatureConfigBuilder(data_spec_path=data_spec_path) - if add_sparse_continous: - s_string_feat_list = [f[0] for f in feature_list if f[1] == "S"] + builder = builder.extract_feature_group( + feature_regexes=a_string_feat_list, + group_name="continuous", + default_value=FEAT_CONFIG_DEFAULT_VAL, + type_filter=["CONTINUOUS"], + ) builder = builder.extract_features_as_hashed_sparse( - feature_regexes=s_string_feat_list, - output_tensor_name="sparse_continuous", - hash_space_size_bits=input_size_bits, - type_filter=["SPARSE_CONTINUOUS"], + feature_regexes=a_string_feat_list, + output_tensor_name="sparse_no_continuous", + hash_space_size_bits=input_size_bits, + type_filter=["BINARY", "DISCRETE", "STRING", "SPARSE_BINARY"], ) - if add_user_id: - builder = builder.extract_feature("meta.user_id") - if add_timestamp: - builder = builder.extract_feature("meta.timestamp") - if add_user_age: - builder = builder.extract_feature(USER_AGE_FEATURE_NAME) + if add_gbdt: + builder = builder.extract_features_as_hashed_sparse( + feature_regexes=["ads\..*"], + output_tensor_name="gbdt_sparse", + hash_space_size_bits=input_size_bits, + ) + + if add_sparse_continous: + s_string_feat_list = [f[0] for f in feature_list if f[1] == "S"] - if run_light_ranking_group_metrics_in_bq: - builder = builder.extract_feature("meta.trace_id") - builder = builder.extract_feature("meta.ranking.weighted_oonc_model_score") + builder = builder.extract_features_as_hashed_sparse( + feature_regexes=s_string_feat_list, + output_tensor_name="sparse_continuous", + hash_space_size_bits=input_size_bits, + type_filter=["SPARSE_CONTINUOUS"], + ) - builder = builder.add_labels(labels).define_weight("meta.weight") + if add_user_id: + builder = builder.extract_feature("meta.user_id") + if add_timestamp: + builder = builder.extract_feature("meta.timestamp") + if add_user_age: + builder = builder.extract_feature(USER_AGE_FEATURE_NAME) - return builder.build() + if run_light_ranking_group_metrics_in_bq: + builder = builder.extract_feature("meta.trace_id") + builder = builder.extract_feature("meta.ranking.weighted_oonc_model_score") + + builder = builder.add_labels(labels).define_weight("meta.weight") + + return builder.build() def get_feature_config_with_sparse_continuous( - data_spec_path, - feature_list_provided=[], - opt=None, - add_user_id=False, - add_timestamp=False, - add_user_age=False, + data_spec_path, + feature_list_provided=[], + opt=None, + add_user_id=False, + add_timestamp=False, + add_user_age=False, ): - task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "OONC" - if task_name not in LABELS_MTL: - raise ValueError("Invalid Task Name !") - - return _get_new_feature_config_base( - data_spec_path=data_spec_path, - labels=LABELS_MTL[task_name], - add_sparse_continous=True, - add_user_id=add_user_id, - add_timestamp=add_timestamp, - add_user_age=add_user_age, - feature_list_provided=feature_list_provided, - opt=opt, - ) + task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "OONC" + if task_name not in LABELS_MTL: + raise ValueError("Invalid Task Name !") + + return _get_new_feature_config_base( + data_spec_path=data_spec_path, + labels=LABELS_MTL[task_name], + add_sparse_continous=True, + 
add_user_id=add_user_id, + add_timestamp=add_timestamp, + add_user_age=add_user_age, + feature_list_provided=feature_list_provided, + opt=opt, + ) def get_feature_config_light_ranking( - data_spec_path, - feature_list_provided=[], - opt=None, - add_user_id=True, - add_timestamp=False, - add_user_age=False, - add_gbdt=False, - run_light_ranking_group_metrics_in_bq=False, + data_spec_path, + feature_list_provided=[], + opt=None, + add_user_id=True, + add_timestamp=False, + add_user_age=False, + add_gbdt=False, + run_light_ranking_group_metrics_in_bq=False, ): - task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "HeavyRankPosition" - if task_name not in LABELS_LR: - raise ValueError("Invalid Task Name !") - if not feature_list_provided: - feature_list_provided = FEATURE_LIST_LIGHT_RANKING_DEFAULT - - return _get_new_feature_config_base( - data_spec_path=data_spec_path, - labels=LABELS_LR[task_name], - add_sparse_continous=False, - add_gbdt=add_gbdt, - add_user_id=add_user_id, - add_timestamp=add_timestamp, - add_user_age=add_user_age, - feature_list_provided=feature_list_provided, - opt=opt, - run_light_ranking_group_metrics_in_bq=run_light_ranking_group_metrics_in_bq, - ) + task_name = ( + opt.task_name + if getattr(opt, "task_name", None) is not None + else "HeavyRankPosition" + ) + if task_name not in LABELS_LR: + raise ValueError("Invalid Task Name !") + if not feature_list_provided: + feature_list_provided = FEATURE_LIST_LIGHT_RANKING_DEFAULT + + return _get_new_feature_config_base( + data_spec_path=data_spec_path, + labels=LABELS_LR[task_name], + add_sparse_continous=False, + add_gbdt=add_gbdt, + add_user_id=add_user_id, + add_timestamp=add_timestamp, + add_user_age=add_user_age, + feature_list_provided=feature_list_provided, + opt=opt, + run_light_ranking_group_metrics_in_bq=run_light_ranking_group_metrics_in_bq, + ) diff --git a/pushservice/src/main/python/models/libs/graph_utils.py b/pushservice/src/main/python/models/libs/graph_utils.py index 4a4626a59..a587a850f 100644 --- a/pushservice/src/main/python/models/libs/graph_utils.py +++ b/pushservice/src/main/python/models/libs/graph_utils.py @@ -8,35 +8,39 @@ def get_trainable_variables(all_trainable_variables, trainable_regexes): - """Returns a subset of trainable variables for training. - - Given a collection of trainable variables, this will return all those that match the given regexes. - Will also log those variables. - - Args: - all_trainable_variables (a collection of trainable tf.Variable): The variables to search through. - trainable_regexes (a collection of regexes): Variables that match any regex will be included. - - Returns a list of tf.Variable - """ - if trainable_regexes is None or len(trainable_regexes) == 0: - tf.logging.info("No trainable regexes found. Not using get_trainable_variables behavior.") - return None - - assert any( - tf.is_tensor(var) for var in all_trainable_variables - ), f"Non TF variable found: {all_trainable_variables}" - trainable_variables = list( - filter( - lambda var: any(re.match(regex, var.name, re.IGNORECASE) for regex in trainable_regexes), - all_trainable_variables, + """Returns a subset of trainable variables for training. + + Given a collection of trainable variables, this will return all those that match the given regexes. + Will also log those variables. + + Args: + all_trainable_variables (a collection of trainable tf.Variable): The variables to search through. + trainable_regexes (a collection of regexes): Variables that match any regex will be included. 
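+
+    Example (illustrative; the regex is an assumption about variable naming):
+
+        get_trainable_variables(tf.trainable_variables(), ["logits/.*"])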
+
+    Returns a list of tf.Variable
+    """
+    if trainable_regexes is None or len(trainable_regexes) == 0:
+        tf.logging.info(
+            "No trainable regexes found. Not using get_trainable_variables behavior."
+        )
+        return None
+
+    assert any(
+        tf.is_tensor(var) for var in all_trainable_variables
+    ), f"Non TF variable found: {all_trainable_variables}"
+    trainable_variables = list(
+        filter(
+            lambda var: any(
+                re.match(regex, var.name, re.IGNORECASE) for regex in trainable_regexes
+            ),
+            all_trainable_variables,
+        )
+    )
-    )
-  )
-  tf.logging.info(f"Using filtered trainable variables: {trainable_variables}")
-
-  assert (
-    trainable_variables
-  ), "Did not find trainable variables after filtering from {} vars originally. All vars: {} and train regexes: {}".format(
-    len(all_trainable_variables), all_trainable_variables, trainable_regexes
-  )
-  return trainable_variables
+    tf.logging.info(f"Using filtered trainable variables: {trainable_variables}")
+
+    assert (
+        trainable_variables
+    ), "Did not find trainable variables after filtering from {} vars originally. All vars: {} and train regexes: {}".format(
+        len(all_trainable_variables), all_trainable_variables, trainable_regexes
+    )
+    return trainable_variables
diff --git a/pushservice/src/main/python/models/libs/group_metrics.py b/pushservice/src/main/python/models/libs/group_metrics.py
index eeef3c501..681fdd28a 100644
--- a/pushservice/src/main/python/models/libs/group_metrics.py
+++ b/pushservice/src/main/python/models/libs/group_metrics.py
@@ -1,114 +1,123 @@
 import os
 import time
+import numpy as np
+import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import logging
 from twitter.cortex.ml.embeddings.deepbird.grouped_metrics.computation import (
-  write_grouped_metrics_to_mldash,
+    write_grouped_metrics_to_mldash,
 )
 from twitter.cortex.ml.embeddings.deepbird.grouped_metrics.configuration import (
-  ClassificationGroupedMetricsConfiguration,
-  NDCGGroupedMetricsConfiguration,
+    ClassificationGroupedMetricsConfiguration,
+    NDCGGroupedMetricsConfiguration,
 )
+
 import twml

 from .light_ranking_metrics import (
-  CGRGroupedMetricsConfiguration,
-  ExpectedLossGroupedMetricsConfiguration,
-  RecallGroupedMetricsConfiguration,
+    CGRGroupedMetricsConfiguration,
+    ExpectedLossGroupedMetricsConfiguration,
+    RecallGroupedMetricsConfiguration,
 )
-import numpy as np
-import tensorflow.compat.v1 as tf
-from tensorflow.compat.v1 import logging
-
-
 # checkstyle: noqa
-def run_group_metrics(trainer, data_dir, model_path, parse_fn, group_feature_name="meta.user_id"):
-
-  start_time = time.time()
-  logging.info("Evaluating with group metrics.")
-
-  metrics = write_grouped_metrics_to_mldash(
-    trainer=trainer,
-    data_dir=data_dir,
-    model_path=model_path,
-    group_fn=lambda datarecord: str(
-      datarecord.discreteFeatures[twml.feature_id(group_feature_name)[0]]
-    ),
-    parse_fn=parse_fn,
-    metric_configurations=[
-      ClassificationGroupedMetricsConfiguration(),
-      NDCGGroupedMetricsConfiguration(k=[5, 10, 20]),
-    ],
-    total_records_to_read=1000000000,
-    shuffle=False,
-    mldash_metrics_name="grouped_metrics",
-  )
-
-  end_time = time.time()
-  logging.info(f"Evaluated Group Metrics: {metrics}.")
-  logging.info(f"Group metrics evaluation time {end_time - start_time}.")
+def run_group_metrics(
+    trainer, data_dir, model_path, parse_fn, group_feature_name="meta.user_id"
+):
+    start_time = time.time()
+    logging.info("Evaluating with group metrics.")
+
+    metrics = write_grouped_metrics_to_mldash(
+        trainer=trainer,
+        data_dir=data_dir,
+        model_path=model_path,
+        group_fn=lambda datarecord: str(
+            datarecord.discreteFeatures[twml.feature_id(group_feature_name)[0]]
+        ),
+        parse_fn=parse_fn,
+        metric_configurations=[
+            ClassificationGroupedMetricsConfiguration(),
+            NDCGGroupedMetricsConfiguration(k=[5, 10, 20]),
+        ],
+        total_records_to_read=1000000000,
+        shuffle=False,
+        mldash_metrics_name="grouped_metrics",
+    )
+
+    end_time = time.time()
+    logging.info(f"Evaluated Group Metrics: {metrics}.")
+    logging.info(f"Group metrics evaluation time {end_time - start_time}.")


 def run_group_metrics_light_ranking(
-  trainer, data_dir, model_path, parse_fn, group_feature_name="meta.trace_id"
+    trainer, data_dir, model_path, parse_fn, group_feature_name="meta.trace_id"
 ):
-
-  start_time = time.time()
-  logging.info("Evaluating with group metrics.")
-
-  metrics = write_grouped_metrics_to_mldash(
-    trainer=trainer,
-    data_dir=data_dir,
-    model_path=model_path,
-    group_fn=lambda datarecord: str(
-      datarecord.discreteFeatures[twml.feature_id(group_feature_name)[0]]
-    ),
-    parse_fn=parse_fn,
-    metric_configurations=[
-      CGRGroupedMetricsConfiguration(lightNs=[50, 100, 200], heavyKs=[1, 3, 10, 20, 50]),
-      RecallGroupedMetricsConfiguration(n=[50, 100, 200], k=[1, 3, 10, 20, 50]),
-      ExpectedLossGroupedMetricsConfiguration(lightNs=[50, 100, 200]),
-    ],
-    total_records_to_read=10000000,
-    num_batches_to_load=50,
-    batch_size=1024,
-    shuffle=False,
-    mldash_metrics_name="grouped_metrics_for_light_ranking",
-  )
-
-  end_time = time.time()
-  logging.info(f"Evaluated Group Metrics for Light Ranking: {metrics}.")
-  logging.info(f"Group metrics evaluation time {end_time - start_time}.")
+    start_time = time.time()
+    logging.info("Evaluating with group metrics.")
+
+    metrics = write_grouped_metrics_to_mldash(
+        trainer=trainer,
+        data_dir=data_dir,
+        model_path=model_path,
+        group_fn=lambda datarecord: str(
+            datarecord.discreteFeatures[twml.feature_id(group_feature_name)[0]]
+        ),
+        parse_fn=parse_fn,
+        metric_configurations=[
+            CGRGroupedMetricsConfiguration(
+                lightNs=[50, 100, 200], heavyKs=[1, 3, 10, 20, 50]
+            ),
+            RecallGroupedMetricsConfiguration(n=[50, 100, 200], k=[1, 3, 10, 20, 50]),
+            ExpectedLossGroupedMetricsConfiguration(lightNs=[50, 100, 200]),
+        ],
+        total_records_to_read=10000000,
+        num_batches_to_load=50,
+        batch_size=1024,
+        shuffle=False,
+        mldash_metrics_name="grouped_metrics_for_light_ranking",
+    )
+
+    end_time = time.time()
+    logging.info(f"Evaluated Group Metrics for Light Ranking: {metrics}.")
+    logging.info(f"Group metrics evaluation time {end_time - start_time}.")


 def run_group_metrics_light_ranking_in_bq(trainer, params, checkpoint_path):
-  logging.info("getting Test Predictions for Light Ranking Group Metrics in BigQuery !!!")
-  eval_input_fn = trainer.get_eval_input_fn(repeat=False, shuffle=False)
-  info_pool = []
-
-  for result in trainer.estimator.predict(
-    eval_input_fn, checkpoint_path=checkpoint_path, yield_single_examples=False
-  ):
-    traceID = result["trace_id"]
-    pred = result["prediction"]
-    label = result["target"]
-    info = np.concatenate([traceID, pred, label], axis=1)
-    info_pool.append(info)
-
-  info_pool = np.concatenate(info_pool)
-
-  locname = "/tmp/000/"
-  if not os.path.exists(locname):
-    os.makedirs(locname)
-
-  locfile = locname + params.pred_file_name
-  columns = ["trace_id", "model_prediction", "meta__ranking__weighted_oonc_model_score"]
-  np.savetxt(locfile, info_pool, delimiter=",", header=",".join(columns))
-  tf.io.gfile.copy(locfile, params.pred_file_path + params.pred_file_name,
overwrite=True) - - if os.path.isfile(locfile): - os.remove(locfile) - - logging.info("Done Prediction for Light Ranking Group Metrics in BigQuery.") + logging.info( + "getting Test Predictions for Light Ranking Group Metrics in BigQuery !!!" + ) + eval_input_fn = trainer.get_eval_input_fn(repeat=False, shuffle=False) + info_pool = [] + + for result in trainer.estimator.predict( + eval_input_fn, checkpoint_path=checkpoint_path, yield_single_examples=False + ): + traceID = result["trace_id"] + pred = result["prediction"] + label = result["target"] + info = np.concatenate([traceID, pred, label], axis=1) + info_pool.append(info) + + info_pool = np.concatenate(info_pool) + + locname = "/tmp/000/" + if not os.path.exists(locname): + os.makedirs(locname) + + locfile = locname + params.pred_file_name + columns = [ + "trace_id", + "model_prediction", + "meta__ranking__weighted_oonc_model_score", + ] + np.savetxt(locfile, info_pool, delimiter=",", header=",".join(columns)) + tf.io.gfile.copy( + locfile, params.pred_file_path + params.pred_file_name, overwrite=True + ) + + if os.path.isfile(locfile): + os.remove(locfile) + + logging.info("Done Prediction for Light Ranking Group Metrics in BigQuery.") diff --git a/pushservice/src/main/python/models/libs/initializer.py b/pushservice/src/main/python/models/libs/initializer.py index 8bba00216..e5ec13a9c 100644 --- a/pushservice/src/main/python/models/libs/initializer.py +++ b/pushservice/src/main/python/models/libs/initializer.py @@ -3,116 +3,120 @@ class VarianceScaling(object): - """Initializer capable of adapting its scale to the shape of weights. - With `distribution="normal"`, samples are drawn from a truncated normal - distribution centered on zero, with `stddev = sqrt(scale / n)` where n is: - - number of input units in the weight tensor, if mode = "fan_in" - - number of output units, if mode = "fan_out" - - average of the numbers of input and output units, if mode = "fan_avg" - With `distribution="uniform"`, - samples are drawn from a uniform distribution - within [-limit, limit], with `limit = sqrt(3 * scale / n)`. - # Arguments - scale: Scaling factor (positive float). - mode: One of "fan_in", "fan_out", "fan_avg". - distribution: Random distribution to use. One of "normal", "uniform". - seed: A Python integer. Used to seed the random generator. - # Raises - ValueError: In case of an invalid value for the "scale", mode" or - "distribution" arguments.""" + """Initializer capable of adapting its scale to the shape of weights. + With `distribution="normal"`, samples are drawn from a truncated normal + distribution centered on zero, with `stddev = sqrt(scale / n)` where n is: + - number of input units in the weight tensor, if mode = "fan_in" + - number of output units, if mode = "fan_out" + - average of the numbers of input and output units, if mode = "fan_avg" + With `distribution="uniform"`, + samples are drawn from a uniform distribution + within [-limit, limit], with `limit = sqrt(3 * scale / n)`. + # Arguments + scale: Scaling factor (positive float). + mode: One of "fan_in", "fan_out", "fan_avg". + distribution: Random distribution to use. One of "normal", "uniform". + seed: A Python integer. Used to seed the random generator. 
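+    # Example (an illustrative sketch)
+        init = VarianceScaling(scale=2.0, mode="fan_in", distribution="normal")
+        kernel = init(shape=(128, 64))  # stddev ~ sqrt(2.0 / 128), truncation-corrected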
+ # Raises + ValueError: In case of an invalid value for the "scale", mode" or + "distribution" arguments.""" - def __init__( - self, - scale=1.0, - mode="fan_in", - distribution="normal", - seed=None, - fan_in=None, - fan_out=None, - ): - self.fan_in = fan_in - self.fan_out = fan_out - if scale <= 0.0: - raise ValueError("`scale` must be a positive float. Got:", scale) - mode = mode.lower() - if mode not in {"fan_in", "fan_out", "fan_avg"}: - raise ValueError( - "Invalid `mode` argument: " 'expected on of {"fan_in", "fan_out", "fan_avg"} ' "but got", - mode, - ) - distribution = distribution.lower() - if distribution not in {"normal", "uniform"}: - raise ValueError( - "Invalid `distribution` argument: " 'expected one of {"normal", "uniform"} ' "but got", - distribution, - ) - self.scale = scale - self.mode = mode - self.distribution = distribution - self.seed = seed + def __init__( + self, + scale=1.0, + mode="fan_in", + distribution="normal", + seed=None, + fan_in=None, + fan_out=None, + ): + self.fan_in = fan_in + self.fan_out = fan_out + if scale <= 0.0: + raise ValueError("`scale` must be a positive float. Got:", scale) + mode = mode.lower() + if mode not in {"fan_in", "fan_out", "fan_avg"}: + raise ValueError( + "Invalid `mode` argument: " + 'expected on of {"fan_in", "fan_out", "fan_avg"} ' + "but got", + mode, + ) + distribution = distribution.lower() + if distribution not in {"normal", "uniform"}: + raise ValueError( + "Invalid `distribution` argument: " + 'expected one of {"normal", "uniform"} ' + "but got", + distribution, + ) + self.scale = scale + self.mode = mode + self.distribution = distribution + self.seed = seed - def __call__(self, shape, dtype=None, partition_info=None): - fan_in = shape[-2] if self.fan_in is None else self.fan_in - fan_out = shape[-1] if self.fan_out is None else self.fan_out + def __call__(self, shape, dtype=None, partition_info=None): + fan_in = shape[-2] if self.fan_in is None else self.fan_in + fan_out = shape[-1] if self.fan_out is None else self.fan_out - scale = self.scale - if self.mode == "fan_in": - scale /= max(1.0, fan_in) - elif self.mode == "fan_out": - scale /= max(1.0, fan_out) - else: - scale /= max(1.0, float(fan_in + fan_out) / 2) - if self.distribution == "normal": - stddev = np.sqrt(scale) / 0.87962566103423978 - return K.truncated_normal(shape, 0.0, stddev, dtype=dtype, seed=self.seed) - else: - limit = np.sqrt(3.0 * scale) - return K.random_uniform(shape, -limit, limit, dtype=dtype, seed=self.seed) + scale = self.scale + if self.mode == "fan_in": + scale /= max(1.0, fan_in) + elif self.mode == "fan_out": + scale /= max(1.0, fan_out) + else: + scale /= max(1.0, float(fan_in + fan_out) / 2) + if self.distribution == "normal": + stddev = np.sqrt(scale) / 0.87962566103423978 + return K.truncated_normal(shape, 0.0, stddev, dtype=dtype, seed=self.seed) + else: + limit = np.sqrt(3.0 * scale) + return K.random_uniform(shape, -limit, limit, dtype=dtype, seed=self.seed) - def get_config(self): - return { - "scale": self.scale, - "mode": self.mode, - "distribution": self.distribution, - "seed": self.seed, - } + def get_config(self): + return { + "scale": self.scale, + "mode": self.mode, + "distribution": self.distribution, + "seed": self.seed, + } def customized_glorot_uniform(seed=None, fan_in=None, fan_out=None): - """Glorot uniform initializer, also called Xavier uniform initializer. 
- It draws samples from a uniform distribution within [-limit, limit] - where `limit` is `sqrt(6 / (fan_in + fan_out))` - where `fan_in` is the number of input units in the weight tensor - and `fan_out` is the number of output units in the weight tensor. - # Arguments - seed: A Python integer. Used to seed the random generator. - # Returns - An initializer.""" - return VarianceScaling( - scale=1.0, - mode="fan_avg", - distribution="uniform", - seed=seed, - fan_in=fan_in, - fan_out=fan_out, - ) + """Glorot uniform initializer, also called Xavier uniform initializer. + It draws samples from a uniform distribution within [-limit, limit] + where `limit` is `sqrt(6 / (fan_in + fan_out))` + where `fan_in` is the number of input units in the weight tensor + and `fan_out` is the number of output units in the weight tensor. + # Arguments + seed: A Python integer. Used to seed the random generator. + # Returns + An initializer.""" + return VarianceScaling( + scale=1.0, + mode="fan_avg", + distribution="uniform", + seed=seed, + fan_in=fan_in, + fan_out=fan_out, + ) def customized_glorot_norm(seed=None, fan_in=None, fan_out=None): - """Glorot norm initializer, also called Xavier uniform initializer. - It draws samples from a uniform distribution within [-limit, limit] - where `limit` is `sqrt(6 / (fan_in + fan_out))` - where `fan_in` is the number of input units in the weight tensor - and `fan_out` is the number of output units in the weight tensor. - # Arguments - seed: A Python integer. Used to seed the random generator. - # Returns - An initializer.""" - return VarianceScaling( - scale=1.0, - mode="fan_avg", - distribution="normal", - seed=seed, - fan_in=fan_in, - fan_out=fan_out, - ) + """Glorot norm initializer, also called Xavier uniform initializer. + It draws samples from a uniform distribution within [-limit, limit] + where `limit` is `sqrt(6 / (fan_in + fan_out))` + where `fan_in` is the number of input units in the weight tensor + and `fan_out` is the number of output units in the weight tensor. + # Arguments + seed: A Python integer. Used to seed the random generator. + # Returns + An initializer.""" + return VarianceScaling( + scale=1.0, + mode="fan_avg", + distribution="normal", + seed=seed, + fan_in=fan_in, + fan_out=fan_out, + ) diff --git a/pushservice/src/main/python/models/libs/light_ranking_metrics.py b/pushservice/src/main/python/models/libs/light_ranking_metrics.py index b83fcf3ae..7e4122011 100644 --- a/pushservice/src/main/python/models/libs/light_ranking_metrics.py +++ b/pushservice/src/main/python/models/libs/light_ranking_metrics.py @@ -1,255 +1,265 @@ from functools import partial from twitter.cortex.ml.embeddings.deepbird.grouped_metrics.configuration import ( - GroupedMetricsConfiguration, + GroupedMetricsConfiguration, ) from twitter.cortex.ml.embeddings.deepbird.grouped_metrics.helpers import ( - extract_prediction_from_prediction_record, + extract_prediction_from_prediction_record, ) - # checkstyle: noqa def score_loss_at_n(labels, predictions, lightN): - """ - Compute the absolute ScoreLoss ranking metric - Args: - labels (list) : A list of label values (HeavyRanking Reference) - predictions (list): A list of prediction values (LightRanking Predictions) - lightN (int): size of the list at which of Initial candidates to compute ScoreLoss. 
(LightRanking) - """ - assert len(labels) == len(predictions) + """ + Compute the absolute ScoreLoss ranking metric + Args: + labels (list) : A list of label values (HeavyRanking Reference) + predictions (list): A list of prediction values (LightRanking Predictions) + lightN (int): size of the list at which of Initial candidates to compute ScoreLoss. (LightRanking) + """ + assert len(labels) == len(predictions) - if lightN <= 0: - return None + if lightN <= 0: + return None - labels_with_predictions = zip(labels, predictions) - labels_with_sorted_predictions = sorted( - labels_with_predictions, key=lambda x: x[1], reverse=True - )[:lightN] - labels_top1_light = max([label for label, _ in labels_with_sorted_predictions]) - labels_top1_heavy = max(labels) + labels_with_predictions = zip(labels, predictions) + labels_with_sorted_predictions = sorted( + labels_with_predictions, key=lambda x: x[1], reverse=True + )[:lightN] + labels_top1_light = max([label for label, _ in labels_with_sorted_predictions]) + labels_top1_heavy = max(labels) - return labels_top1_heavy - labels_top1_light + return labels_top1_heavy - labels_top1_light def cgr_at_nk(labels, predictions, lightN, heavyK): - """ - Compute Cumulative Gain Ratio (CGR) ranking metric - Args: - labels (list) : A list of label values (HeavyRanking Reference) - predictions (list): A list of prediction values (LightRanking Predictions) - lightN (int): size of the list at which of Initial candidates to compute CGR. (LightRanking) - heavyK (int): size of the list at which of Refined candidates to compute CGR. (HeavyRanking) - """ - assert len(labels) == len(predictions) - - if (not lightN) or (not heavyK): - out = None - elif lightN <= 0 or heavyK <= 0: - out = None - else: - - labels_with_predictions = zip(labels, predictions) - labels_with_sorted_predictions = sorted( - labels_with_predictions, key=lambda x: x[1], reverse=True - )[:lightN] - labels_topN_light = [label for label, _ in labels_with_sorted_predictions] + """ + Compute Cumulative Gain Ratio (CGR) ranking metric + Args: + labels (list) : A list of label values (HeavyRanking Reference) + predictions (list): A list of prediction values (LightRanking Predictions) + lightN (int): size of the list at which of Initial candidates to compute CGR. (LightRanking) + heavyK (int): size of the list at which of Refined candidates to compute CGR. 
(HeavyRanking) + """ + assert len(labels) == len(predictions) - if lightN <= heavyK: - cg_light = sum(labels_topN_light) + if (not lightN) or (not heavyK): + out = None + elif lightN <= 0 or heavyK <= 0: + out = None else: - labels_topK_heavy_from_light = sorted(labels_topN_light, reverse=True)[:heavyK] - cg_light = sum(labels_topK_heavy_from_light) + labels_with_predictions = zip(labels, predictions) + labels_with_sorted_predictions = sorted( + labels_with_predictions, key=lambda x: x[1], reverse=True + )[:lightN] + labels_topN_light = [label for label, _ in labels_with_sorted_predictions] + + if lightN <= heavyK: + cg_light = sum(labels_topN_light) + else: + labels_topK_heavy_from_light = sorted(labels_topN_light, reverse=True)[ + :heavyK + ] + cg_light = sum(labels_topK_heavy_from_light) - ideal_ordering = sorted(labels, reverse=True) - cg_heavy = sum(ideal_ordering[: min(lightN, heavyK)]) + ideal_ordering = sorted(labels, reverse=True) + cg_heavy = sum(ideal_ordering[: min(lightN, heavyK)]) - out = 0.0 - if cg_heavy != 0: - out = max(cg_light / cg_heavy, 0) + out = 0.0 + if cg_heavy != 0: + out = max(cg_light / cg_heavy, 0) - return out + return out def _get_weight(w, atK): - if not w: - return 1.0 - elif len(w) <= atK: - return 0.0 - else: - return w[atK] + if not w: + return 1.0 + elif len(w) <= atK: + return 0.0 + else: + return w[atK] def recall_at_nk(labels, predictions, n=None, k=None, w=None): - """ - Recall at N-K ranking metric - Args: - labels (list): A list of label values - predictions (list): A list of prediction values - n (int): size of the list at which of predictions to compute recall. (Light Ranking Predictions) - The default is None in which case the length of the provided predictions is used as L - k (int): size of the list at which of labels to compute recall. (Heavy Ranking Predictions) - The default is None in which case the length of the provided labels is used as L - w (list): weight vector sorted by labels - """ - assert len(labels) == len(predictions) - - if not any(labels): - out = None - else: - - safe_n = len(predictions) if not n else min(len(predictions), n) - safe_k = len(labels) if not k else min(len(labels), k) + """ + Recall at N-K ranking metric + Args: + labels (list): A list of label values + predictions (list): A list of prediction values + n (int): size of the list at which of predictions to compute recall. (Light Ranking Predictions) + The default is None in which case the length of the provided predictions is used as L + k (int): size of the list at which of labels to compute recall. 
(Heavy Ranking Predictions) + The default is None in which case the length of the provided labels is used as L + w (list): weight vector sorted by labels + """ + assert len(labels) == len(predictions) - labels_with_predictions = zip(labels, predictions) - sorted_labels_with_predictions = sorted( - labels_with_predictions, key=lambda x: x[0], reverse=True - ) + if not any(labels): + out = None + else: + safe_n = len(predictions) if not n else min(len(predictions), n) + safe_k = len(labels) if not k else min(len(labels), k) - order_sorted_labels_predictions = zip(range(len(labels)), *zip(*sorted_labels_with_predictions)) + labels_with_predictions = zip(labels, predictions) + sorted_labels_with_predictions = sorted( + labels_with_predictions, key=lambda x: x[0], reverse=True + ) - order_with_predictions = [ - (order, pred) for order, label, pred in order_sorted_labels_predictions - ] - order_with_sorted_predictions = sorted(order_with_predictions, key=lambda x: x[1], reverse=True) + order_sorted_labels_predictions = zip( + range(len(labels)), *zip(*sorted_labels_with_predictions) + ) - pred_sorted_order_at_n = [order for order, _ in order_with_sorted_predictions][:safe_n] + order_with_predictions = [ + (order, pred) for order, label, pred in order_sorted_labels_predictions + ] + order_with_sorted_predictions = sorted( + order_with_predictions, key=lambda x: x[1], reverse=True + ) - intersection_weight = [ - _get_weight(w, order) if order < safe_k else 0 for order in pred_sorted_order_at_n - ] + pred_sorted_order_at_n = [order for order, _ in order_with_sorted_predictions][ + :safe_n + ] - intersection_score = sum(intersection_weight) - full_score = sum(w) if w else float(safe_k) + intersection_weight = [ + _get_weight(w, order) if order < safe_k else 0 + for order in pred_sorted_order_at_n + ] - out = 0.0 - if full_score != 0: - out = intersection_score / full_score + intersection_score = sum(intersection_weight) + full_score = sum(w) if w else float(safe_k) - return out + out = 0.0 + if full_score != 0: + out = intersection_score / full_score + return out -class ExpectedLossGroupedMetricsConfiguration(GroupedMetricsConfiguration): - """ - This is the Expected Loss Grouped metric computation configuration. - """ - def __init__(self, lightNs=[]): +class ExpectedLossGroupedMetricsConfiguration(GroupedMetricsConfiguration): """ - Args: - lightNs (list): size of the list at which of Initial candidates to compute Expected Loss. (LightRanking) + This is the Expected Loss Grouped metric computation configuration. """ - self.lightNs = lightNs - @property - def name(self): - return "ExpectedLoss" + def __init__(self, lightNs=[]): + """ + Args: + lightNs (list): size of the list at which of Initial candidates to compute Expected Loss. 
(LightRanking) + """ + self.lightNs = lightNs - @property - def metrics_dict(self): - metrics_to_compute = {} - for lightN in self.lightNs: - metric_name = "ExpectedLoss_atLight_" + str(lightN) - metrics_to_compute[metric_name] = partial(score_loss_at_n, lightN=lightN) - return metrics_to_compute + @property + def name(self): + return "ExpectedLoss" - def extract_label(self, prec, drec, drec_label): - return drec_label + @property + def metrics_dict(self): + metrics_to_compute = {} + for lightN in self.lightNs: + metric_name = "ExpectedLoss_atLight_" + str(lightN) + metrics_to_compute[metric_name] = partial(score_loss_at_n, lightN=lightN) + return metrics_to_compute - def extract_prediction(self, prec, drec, drec_label): - return extract_prediction_from_prediction_record(prec) + def extract_label(self, prec, drec, drec_label): + return drec_label + def extract_prediction(self, prec, drec, drec_label): + return extract_prediction_from_prediction_record(prec) -class CGRGroupedMetricsConfiguration(GroupedMetricsConfiguration): - """ - This is the Cumulative Gain Ratio (CGR) Grouped metric computation configuration. - CGR at the max length of each session is the default. - CGR at additional positions can be computed by specifying a list of 'n's and 'k's - """ - def __init__(self, lightNs=[], heavyKs=[]): +class CGRGroupedMetricsConfiguration(GroupedMetricsConfiguration): """ - Args: - lightNs (list): size of the list at which of Initial candidates to compute CGR. (LightRanking) - heavyK (int): size of the list at which of Refined candidates to compute CGR. (HeavyRanking) + This is the Cumulative Gain Ratio (CGR) Grouped metric computation configuration. + CGR at the max length of each session is the default. + CGR at additional positions can be computed by specifying a list of 'n's and 'k's """ - self.lightNs = lightNs - self.heavyKs = heavyKs - - @property - def name(self): - return "cgr" - - @property - def metrics_dict(self): - metrics_to_compute = {} - for lightN in self.lightNs: - for heavyK in self.heavyKs: - metric_name = "cgr_atLight_" + str(lightN) + "_atHeavy_" + str(heavyK) - metrics_to_compute[metric_name] = partial(cgr_at_nk, lightN=lightN, heavyK=heavyK) - return metrics_to_compute - - def extract_label(self, prec, drec, drec_label): - return drec_label - def extract_prediction(self, prec, drec, drec_label): - return extract_prediction_from_prediction_record(prec) + def __init__(self, lightNs=[], heavyKs=[]): + """ + Args: + lightNs (list): size of the list at which of Initial candidates to compute CGR. (LightRanking) + heavyK (int): size of the list at which of Refined candidates to compute CGR. (HeavyRanking) + """ + self.lightNs = lightNs + self.heavyKs = heavyKs + + @property + def name(self): + return "cgr" + + @property + def metrics_dict(self): + metrics_to_compute = {} + for lightN in self.lightNs: + for heavyK in self.heavyKs: + metric_name = "cgr_atLight_" + str(lightN) + "_atHeavy_" + str(heavyK) + metrics_to_compute[metric_name] = partial( + cgr_at_nk, lightN=lightN, heavyK=heavyK + ) + return metrics_to_compute + + def extract_label(self, prec, drec, drec_label): + return drec_label + + def extract_prediction(self, prec, drec, drec_label): + return extract_prediction_from_prediction_record(prec) class RecallGroupedMetricsConfiguration(GroupedMetricsConfiguration): - """ - This is the Recall Grouped metric computation configuration. - Recall at the max length of each session is the default. 
- Recall at additional positions can be computed by specifying a list of 'n's and 'k's - """ - - def __init__(self, n=[], k=[], w=[]): """ - Args: - n (list): A list of ints. List of prediction rank thresholds (for light) - k (list): A list of ints. List of label rank thresholds (for heavy) + This is the Recall Grouped metric computation configuration. + Recall at the max length of each session is the default. + Recall at additional positions can be computed by specifying a list of 'n's and 'k's """ - self.predN = n - self.labelK = k - self.weight = w - - @property - def name(self): - return "group_recall" - - @property - def metrics_dict(self): - metrics_to_compute = {"group_recall_unweighted": recall_at_nk} - if not self.weight: - metrics_to_compute["group_recall_weighted"] = partial(recall_at_nk, w=self.weight) - - if self.predN and self.labelK: - for n in self.predN: - for k in self.labelK: - if n >= k: - metrics_to_compute[ - "group_recall_unweighted_at_L" + str(n) + "_at_H" + str(k) - ] = partial(recall_at_nk, n=n, k=k) - if self.weight: - metrics_to_compute[ - "group_recall_weighted_at_L" + str(n) + "_at_H" + str(k) - ] = partial(recall_at_nk, n=n, k=k, w=self.weight) - - if self.labelK and not self.predN: - for k in self.labelK: - metrics_to_compute["group_recall_unweighted_at_full_at_H" + str(k)] = partial( - recall_at_nk, k=k - ) - if self.weight: - metrics_to_compute["group_recall_weighted_at_full_at_H" + str(k)] = partial( - recall_at_nk, k=k, w=self.weight - ) - return metrics_to_compute - - def extract_label(self, prec, drec, drec_label): - return drec_label - def extract_prediction(self, prec, drec, drec_label): - return extract_prediction_from_prediction_record(prec) + def __init__(self, n=[], k=[], w=[]): + """ + Args: + n (list): A list of ints. List of prediction rank thresholds (for light) + k (list): A list of ints. List of label rank thresholds (for heavy) + """ + self.predN = n + self.labelK = k + self.weight = w + + @property + def name(self): + return "group_recall" + + @property + def metrics_dict(self): + metrics_to_compute = {"group_recall_unweighted": recall_at_nk} + if not self.weight: + metrics_to_compute["group_recall_weighted"] = partial( + recall_at_nk, w=self.weight + ) + + if self.predN and self.labelK: + for n in self.predN: + for k in self.labelK: + if n >= k: + metrics_to_compute[ + "group_recall_unweighted_at_L" + str(n) + "_at_H" + str(k) + ] = partial(recall_at_nk, n=n, k=k) + if self.weight: + metrics_to_compute[ + "group_recall_weighted_at_L" + str(n) + "_at_H" + str(k) + ] = partial(recall_at_nk, n=n, k=k, w=self.weight) + + if self.labelK and not self.predN: + for k in self.labelK: + metrics_to_compute[ + "group_recall_unweighted_at_full_at_H" + str(k) + ] = partial(recall_at_nk, k=k) + if self.weight: + metrics_to_compute[ + "group_recall_weighted_at_full_at_H" + str(k) + ] = partial(recall_at_nk, k=k, w=self.weight) + return metrics_to_compute + + def extract_label(self, prec, drec, drec_label): + return drec_label + + def extract_prediction(self, prec, drec, drec_label): + return extract_prediction_from_prediction_record(prec) diff --git a/pushservice/src/main/python/models/libs/metric_fn_utils.py b/pushservice/src/main/python/models/libs/metric_fn_utils.py index fc26a1305..c1d25ed70 100644 --- a/pushservice/src/main/python/models/libs/metric_fn_utils.py +++ b/pushservice/src/main/python/models/libs/metric_fn_utils.py @@ -2,24 +2,23 @@ Utilties for constructing a metric_fn for magic recs. 
""" +import tensorflow.compat.v1 as tf + from twml.contrib.metrics.metrics import ( - get_dual_binary_tasks_metric_fn, - get_numeric_metric_fn, - get_partial_multi_binary_class_metric_fn, - get_single_binary_task_metric_fn, + get_dual_binary_tasks_metric_fn, + get_numeric_metric_fn, + get_partial_multi_binary_class_metric_fn, + get_single_binary_task_metric_fn, ) from .model_utils import generate_disliked_mask -import tensorflow.compat.v1 as tf - - METRIC_BOOK = { - "OONC": ["OONC"], - "OONC_Engagement": ["OONC", "Engagement"], - "Sent": ["Sent"], - "HeavyRankPosition": ["HeavyRankPosition"], - "HeavyRankProbability": ["HeavyRankProbability"], + "OONC": ["OONC"], + "OONC_Engagement": ["OONC", "Engagement"], + "Sent": ["Sent"], + "HeavyRankPosition": ["HeavyRankPosition"], + "HeavyRankProbability": ["HeavyRankProbability"], } USER_AGE_FEATURE_NAME = "accountAge" @@ -27,268 +26,288 @@ def remove_padding_and_flatten(tensor, valid_batch_size): - """Remove the padding of the input padded tensor given the valid batch size tensor, - then flatten the output with respect to the first dimension. - Args: - tensor: A tensor of size [META_BATCH_SIZE, BATCH_SIZE, FEATURE_DIM]. - valid_batch_size: A tensor of size [META_BATCH_SIZE], with each element indicating - the effective batch size of the BATCH_SIZE dimension. + """Remove the padding of the input padded tensor given the valid batch size tensor, + then flatten the output with respect to the first dimension. + Args: + tensor: A tensor of size [META_BATCH_SIZE, BATCH_SIZE, FEATURE_DIM]. + valid_batch_size: A tensor of size [META_BATCH_SIZE], with each element indicating + the effective batch size of the BATCH_SIZE dimension. - Returns: - A tesnor of size [tf.reduce_sum(valid_batch_size), FEATURE_DIM]. - """ - unpadded_ragged_tensor = tf.RaggedTensor.from_tensor(tensor=tensor, lengths=valid_batch_size) + Returns: + A tesnor of size [tf.reduce_sum(valid_batch_size), FEATURE_DIM]. + """ + unpadded_ragged_tensor = tf.RaggedTensor.from_tensor( + tensor=tensor, lengths=valid_batch_size + ) - return unpadded_ragged_tensor.flat_values + return unpadded_ragged_tensor.flat_values def safe_mask(values, mask): - """Mask values if possible. + """Mask values if possible. - Boolean mask inputed values if and only if values is a tensor of the same dimension as mask (or can be broadcasted to that dimension). + Boolean mask inputed values if and only if values is a tensor of the same dimension as mask (or can be broadcasted to that dimension). - Args: - values (Any or Tensor): Input tensor to mask. Dim 0 should be size N. - mask (boolean tensor): A boolean tensor of size N. + Args: + values (Any or Tensor): Input tensor to mask. Dim 0 should be size N. + mask (boolean tensor): A boolean tensor of size N. - Returns Values or Values masked. - """ - if values is None: - return values - if not tf.is_tensor(values): - return values - values_shape = values.get_shape() - if not values_shape or len(values_shape) == 0: - return values - if not mask.get_shape().is_compatible_with(values_shape[0]): - return values - return tf.boolean_mask(values, mask) + Returns Values or Values masked. 
+ """ + if values is None: + return values + if not tf.is_tensor(values): + return values + values_shape = values.get_shape() + if not values_shape or len(values_shape) == 0: + return values + if not mask.get_shape().is_compatible_with(values_shape[0]): + return values + return tf.boolean_mask(values, mask) def add_new_user_metrics(metric_fn): - """Will stratify the metric_fn by adding new user metrics. + """Will stratify the metric_fn by adding new user metrics. - Given an input metric_fn, double every metric: One will be the orignal and the other will only include those for new users. + Given an input metric_fn, double every metric: One will be the orignal and the other will only include those for new users. - Args: - metric_fn (python function): Base twml metric_fn. + Args: + metric_fn (python function): Base twml metric_fn. - Returns a metric_fn with new user metrics included. - """ + Returns a metric_fn with new user metrics included. + """ - def metric_fn_with_new_users(graph_output, labels, weights): - if USER_AGE_FEATURE_NAME not in graph_output: - raise ValueError( - "In order to get metrics stratified by user age, {name} feature should be added to model graph output. However, only the following output keys were found: {keys}.".format( - name=USER_AGE_FEATURE_NAME, keys=graph_output.keys() + def metric_fn_with_new_users(graph_output, labels, weights): + if USER_AGE_FEATURE_NAME not in graph_output: + raise ValueError( + "In order to get metrics stratified by user age, {name} feature should be added to model graph output. However, only the following output keys were found: {keys}.".format( + name=USER_AGE_FEATURE_NAME, keys=graph_output.keys() + ) + ) + + metric_ops = metric_fn(graph_output, labels, weights) + + is_new = tf.reshape( + tf.math.less_equal( + tf.cast(graph_output[USER_AGE_FEATURE_NAME], tf.int64), + tf.cast(NEW_USER_AGE_CUTOFF, tf.int64), + ), + [-1], ) - ) - metric_ops = metric_fn(graph_output, labels, weights) + labels = safe_mask(labels, is_new) + weights = safe_mask(weights, is_new) + graph_output = { + key: safe_mask(values, is_new) for key, values in graph_output.items() + } - is_new = tf.reshape( - tf.math.less_equal( - tf.cast(graph_output[USER_AGE_FEATURE_NAME], tf.int64), - tf.cast(NEW_USER_AGE_CUTOFF, tf.int64), - ), - [-1], - ) + new_user_metric_ops = metric_fn(graph_output, labels, weights) + new_user_metric_ops = { + name + "_new_users": ops for name, ops in new_user_metric_ops.items() + } + metric_ops.update(new_user_metric_ops) + return metric_ops - labels = safe_mask(labels, is_new) - weights = safe_mask(weights, is_new) - graph_output = {key: safe_mask(values, is_new) for key, values in graph_output.items()} - - new_user_metric_ops = metric_fn(graph_output, labels, weights) - new_user_metric_ops = {name + "_new_users": ops for name, ops in new_user_metric_ops.items()} - metric_ops.update(new_user_metric_ops) - return metric_ops - - return metric_fn_with_new_users + return metric_fn_with_new_users def get_meta_learn_single_binary_task_metric_fn( - metrics, classnames, top_k=(5, 5, 5), use_top_k=False + metrics, classnames, top_k=(5, 5, 5), use_top_k=False ): - """Wrapper function to use the metric_fn with meta learning evaluation scheme. - - Args: - metrics: A list of string representing metric names. - classnames: A list of string repsenting class names, In case of multiple binary class models, - the names for each class or label. - top_k: A tuple of int to specify top K metrics. - use_top_k: A boolean value indicating of top K of metrics is used. 
- - Returns: - A customized metric_fn function. - """ - - def get_eval_metric_ops(graph_output, labels, weights): - """The op func of the eval_metrics. Comparing with normal version, - the difference is we flatten the output, label, and weights. + """Wrapper function to use the metric_fn with meta learning evaluation scheme. Args: - graph_output: A dict of tensors. - labels: A tensor of int32 be the value of either 0 or 1. - weights: A tensor of float32 to indicate the per record weight. + metrics: A list of string representing metric names. + classnames: A list of string repsenting class names, In case of multiple binary class models, + the names for each class or label. + top_k: A tuple of int to specify top K metrics. + use_top_k: A boolean value indicating of top K of metrics is used. Returns: - A dict of metric names and values. + A customized metric_fn function. """ - metric_op_weighted = get_partial_multi_binary_class_metric_fn( - metrics, predcols=0, classes=classnames - ) - classnames_unweighted = ["unweighted_" + classname for classname in classnames] - metric_op_unweighted = get_partial_multi_binary_class_metric_fn( - metrics, predcols=0, classes=classnames_unweighted - ) - valid_batch_size = graph_output["valid_batch_size"] - graph_output["output"] = remove_padding_and_flatten(graph_output["output"], valid_batch_size) - labels = remove_padding_and_flatten(labels, valid_batch_size) - weights = remove_padding_and_flatten(weights, valid_batch_size) + def get_eval_metric_ops(graph_output, labels, weights): + """The op func of the eval_metrics. Comparing with normal version, + the difference is we flatten the output, label, and weights. - tf.ensure_shape(graph_output["output"], [None, 1]) - tf.ensure_shape(labels, [None, 1]) - tf.ensure_shape(weights, [None, 1]) + Args: + graph_output: A dict of tensors. + labels: A tensor of int32 be the value of either 0 or 1. + weights: A tensor of float32 to indicate the per record weight. - metrics_weighted = metric_op_weighted(graph_output, labels, weights) - metrics_unweighted = metric_op_unweighted(graph_output, labels, None) - metrics_weighted.update(metrics_unweighted) + Returns: + A dict of metric names and values. 
+ """ + metric_op_weighted = get_partial_multi_binary_class_metric_fn( + metrics, predcols=0, classes=classnames + ) + classnames_unweighted = ["unweighted_" + classname for classname in classnames] + metric_op_unweighted = get_partial_multi_binary_class_metric_fn( + metrics, predcols=0, classes=classnames_unweighted + ) - if use_top_k: - metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=top_k, predcol=0, labelcol=1) - metrics_numeric = metric_op_numeric(graph_output, labels, weights) - metrics_weighted.update(metrics_numeric) - return metrics_weighted + valid_batch_size = graph_output["valid_batch_size"] + graph_output["output"] = remove_padding_and_flatten( + graph_output["output"], valid_batch_size + ) + labels = remove_padding_and_flatten(labels, valid_batch_size) + weights = remove_padding_and_flatten(weights, valid_batch_size) - return get_eval_metric_ops + tf.ensure_shape(graph_output["output"], [None, 1]) + tf.ensure_shape(labels, [None, 1]) + tf.ensure_shape(weights, [None, 1]) + metrics_weighted = metric_op_weighted(graph_output, labels, weights) + metrics_unweighted = metric_op_unweighted(graph_output, labels, None) + metrics_weighted.update(metrics_unweighted) -def get_meta_learn_dual_binary_tasks_metric_fn( - metrics, classnames, top_k=(5, 5, 5), use_top_k=False -): - """Wrapper function to use the metric_fn with meta learning evaluation scheme. + if use_top_k: + metric_op_numeric = get_numeric_metric_fn( + metrics=None, topK=top_k, predcol=0, labelcol=1 + ) + metrics_numeric = metric_op_numeric(graph_output, labels, weights) + metrics_weighted.update(metrics_numeric) + return metrics_weighted - Args: - metrics: A list of string representing metric names. - classnames: A list of string repsenting class names, In case of multiple binary class models, - the names for each class or label. - top_k: A tuple of int to specify top K metrics. - use_top_k: A boolean value indicating of top K of metrics is used. + return get_eval_metric_ops - Returns: - A customized metric_fn function. - """ - def get_eval_metric_ops(graph_output, labels, weights): - """The op func of the eval_metrics. Comparing with normal version, - the difference is we flatten the output, label, and weights. +def get_meta_learn_dual_binary_tasks_metric_fn( + metrics, classnames, top_k=(5, 5, 5), use_top_k=False +): + """Wrapper function to use the metric_fn with meta learning evaluation scheme. Args: - graph_output: A dict of tensors. - labels: A tensor of int32 be the value of either 0 or 1. - weights: A tensor of float32 to indicate the per record weight. + metrics: A list of string representing metric names. + classnames: A list of string repsenting class names, In case of multiple binary class models, + the names for each class or label. + top_k: A tuple of int to specify top K metrics. + use_top_k: A boolean value indicating of top K of metrics is used. Returns: - A dict of metric names and values. + A customized metric_fn function. 
""" - metric_op_weighted = get_partial_multi_binary_class_metric_fn( - metrics, predcols=[0, 1], classes=classnames - ) - classnames_unweighted = ["unweighted_" + classname for classname in classnames] - metric_op_unweighted = get_partial_multi_binary_class_metric_fn( - metrics, predcols=[0, 1], classes=classnames_unweighted - ) - valid_batch_size = graph_output["valid_batch_size"] - graph_output["output"] = remove_padding_and_flatten(graph_output["output"], valid_batch_size) - labels = remove_padding_and_flatten(labels, valid_batch_size) - weights = remove_padding_and_flatten(weights, valid_batch_size) + def get_eval_metric_ops(graph_output, labels, weights): + """The op func of the eval_metrics. Comparing with normal version, + the difference is we flatten the output, label, and weights. - tf.ensure_shape(graph_output["output"], [None, 2]) - tf.ensure_shape(labels, [None, 2]) - tf.ensure_shape(weights, [None, 1]) + Args: + graph_output: A dict of tensors. + labels: A tensor of int32 be the value of either 0 or 1. + weights: A tensor of float32 to indicate the per record weight. - metrics_weighted = metric_op_weighted(graph_output, labels, weights) - metrics_unweighted = metric_op_unweighted(graph_output, labels, None) - metrics_weighted.update(metrics_unweighted) + Returns: + A dict of metric names and values. + """ + metric_op_weighted = get_partial_multi_binary_class_metric_fn( + metrics, predcols=[0, 1], classes=classnames + ) + classnames_unweighted = ["unweighted_" + classname for classname in classnames] + metric_op_unweighted = get_partial_multi_binary_class_metric_fn( + metrics, predcols=[0, 1], classes=classnames_unweighted + ) - if use_top_k: - metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=top_k, predcol=2, labelcol=2) - metrics_numeric = metric_op_numeric(graph_output, labels, weights) - metrics_weighted.update(metrics_numeric) - return metrics_weighted + valid_batch_size = graph_output["valid_batch_size"] + graph_output["output"] = remove_padding_and_flatten( + graph_output["output"], valid_batch_size + ) + labels = remove_padding_and_flatten(labels, valid_batch_size) + weights = remove_padding_and_flatten(weights, valid_batch_size) - return get_eval_metric_ops + tf.ensure_shape(graph_output["output"], [None, 2]) + tf.ensure_shape(labels, [None, 2]) + tf.ensure_shape(weights, [None, 1]) + metrics_weighted = metric_op_weighted(graph_output, labels, weights) + metrics_unweighted = metric_op_unweighted(graph_output, labels, None) + metrics_weighted.update(metrics_unweighted) -def get_metric_fn(task_name, use_stratify_metrics, use_meta_batch=False): - """Will retrieve the metric_fn for magic recs. - - Args: - task_name (string): Which task is being used for this model. - use_stratify_metrics (boolean): Should we add stratified metrics (new user metrics). - use_meta_batch (boolean): If the output/label/weights are passed in 3D shape instead of - 2D shape. - - Returns: - A metric_fn function to pass in twml Trainer. - """ - if task_name not in METRIC_BOOK: - raise ValueError( - "Task name of {task_name} not recognized. 
Unable to retrieve metrics.".format( - task_name=task_name - ) - ) - class_names = METRIC_BOOK[task_name] - if use_meta_batch: - get_n_binary_task_metric_fn = ( - get_meta_learn_single_binary_task_metric_fn - if len(class_names) == 1 - else get_meta_learn_dual_binary_tasks_metric_fn - ) - else: - get_n_binary_task_metric_fn = ( - get_single_binary_task_metric_fn if len(class_names) == 1 else get_dual_binary_tasks_metric_fn - ) + if use_top_k: + metric_op_numeric = get_numeric_metric_fn( + metrics=None, topK=top_k, predcol=2, labelcol=2 + ) + metrics_numeric = metric_op_numeric(graph_output, labels, weights) + metrics_weighted.update(metrics_numeric) + return metrics_weighted - metric_fn = get_n_binary_task_metric_fn(metrics=None, classnames=METRIC_BOOK[task_name]) + return get_eval_metric_ops - if use_stratify_metrics: - metric_fn = add_new_user_metrics(metric_fn) - return metric_fn +def get_metric_fn(task_name, use_stratify_metrics, use_meta_batch=False): + """Will retrieve the metric_fn for magic recs. + Args: + task_name (string): Which task is being used for this model. + use_stratify_metrics (boolean): Should we add stratified metrics (new user metrics). + use_meta_batch (boolean): If the output/label/weights are passed in 3D shape instead of + 2D shape. -def flip_disliked_labels(metric_fn): - """This function returns an adapted metric_fn which flips the labels of the OONCed evaluation data to 0 if it is disliked. - Args: - metric_fn: A metric_fn function to pass in twml Trainer. + Returns: + A metric_fn function to pass in twml Trainer. + """ + if task_name not in METRIC_BOOK: + raise ValueError( + "Task name of {task_name} not recognized. Unable to retrieve metrics.".format( + task_name=task_name + ) + ) + class_names = METRIC_BOOK[task_name] + if use_meta_batch: + get_n_binary_task_metric_fn = ( + get_meta_learn_single_binary_task_metric_fn + if len(class_names) == 1 + else get_meta_learn_dual_binary_tasks_metric_fn + ) + else: + get_n_binary_task_metric_fn = ( + get_single_binary_task_metric_fn + if len(class_names) == 1 + else get_dual_binary_tasks_metric_fn + ) + + metric_fn = get_n_binary_task_metric_fn( + metrics=None, classnames=METRIC_BOOK[task_name] + ) - Returns: - _adapted_metric_fn: A customized metric_fn function with disliked OONC labels flipped. - """ + if use_stratify_metrics: + metric_fn = add_new_user_metrics(metric_fn) - def _adapted_metric_fn(graph_output, labels, weights): - """A customized metric_fn function with disliked OONC labels flipped. + return metric_fn + +def flip_disliked_labels(metric_fn): + """This function returns an adapted metric_fn which flips the labels of the OONCed evaluation data to 0 if it is disliked. Args: - graph_output: A dict of tensors. - labels: labels of training samples, which is a 2D tensor of shape batch_size x 3: [OONCs, engagements, dislikes] - weights: A tensor of float32 to indicate the per record weight. + metric_fn: A metric_fn function to pass in twml Trainer. Returns: - A dict of metric names and values. + _adapted_metric_fn: A customized metric_fn function with disliked OONC labels flipped. """ - # We want to multiply the label of the observation by 0 only when it is disliked - disliked_mask = generate_disliked_mask(labels) - # Extract OONC and engagement labels only. - labels = tf.reshape(labels[:, 0:2], shape=[-1, 2]) + def _adapted_metric_fn(graph_output, labels, weights): + """A customized metric_fn function with disliked OONC labels flipped. + + Args: + graph_output: A dict of tensors. 
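get_metric_fn, above, is a pure dispatch: METRIC_BOOK maps the task name to its class names, one class name selects the single-task builder and two select the dual-task builder, with the meta-learning variants substituted when use_meta_batch is set. flip_disliked_labels then post-processes evaluation labels; a small numeric sketch of the flip it performs, using the 3-column [OONC, engagement, dislike] layout documented just below:

labels = tf.constant([[1.0, 1.0, 0.0],   # OONC'd, engaged, not disliked
                      [1.0, 0.0, 1.0]])  # OONC'd but disliked
disliked = generate_disliked_mask(labels)        # [[False], [True]]
kept = tf.reshape(labels[:, 0:2], [-1, 2])       # drop the dislike column
flipped = kept * tf.cast(tf.logical_not(disliked), kept.dtype)
# flipped == [[1., 1.], [0., 0.]]: the disliked record's labels are zeroed.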
+ labels: labels of training samples, which is a 2D tensor of shape batch_size x 3: [OONCs, engagements, dislikes] + weights: A tensor of float32 to indicate the per record weight. - # Labels will be set to 0 if it is disliked. - adapted_labels = labels * tf.cast(tf.logical_not(disliked_mask), dtype=labels.dtype) + Returns: + A dict of metric names and values. + """ + # We want to multiply the label of the observation by 0 only when it is disliked + disliked_mask = generate_disliked_mask(labels) + + # Extract OONC and engagement labels only. + labels = tf.reshape(labels[:, 0:2], shape=[-1, 2]) + + # Labels will be set to 0 if it is disliked. + adapted_labels = labels * tf.cast( + tf.logical_not(disliked_mask), dtype=labels.dtype + ) - return metric_fn(graph_output, adapted_labels, weights) + return metric_fn(graph_output, adapted_labels, weights) - return _adapted_metric_fn + return _adapted_metric_fn diff --git a/pushservice/src/main/python/models/libs/model_args.py b/pushservice/src/main/python/models/libs/model_args.py index ae142d818..4642d7974 100644 --- a/pushservice/src/main/python/models/libs/model_args.py +++ b/pushservice/src/main/python/models/libs/model_args.py @@ -1,231 +1,256 @@ from twml.trainers import DataRecordTrainer - # checkstyle: noqa def get_arg_parser(): - parser = DataRecordTrainer.add_parser_arguments() - - parser.add_argument( - "--input_size_bits", - type=int, - default=18, - help="number of bits allocated to the input size", - ) - parser.add_argument( - "--model_trainer_name", - default="magic_recs_mlp_calibration_MTL_OONC_Engagement", - type=str, - help="specify the model trainer name.", - ) - - parser.add_argument( - "--model_type", - default="deepnorm_gbdt_inputdrop2_rescale", - type=str, - help="specify the model type to use.", - ) - parser.add_argument( - "--feat_config_type", - default="get_feature_config_with_sparse_continuous", - type=str, - help="specify the feature configure function to use.", - ) - - parser.add_argument( - "--directly_export_best", - default=False, - action="store_true", - help="whether to directly_export best_checkpoint", - ) - - parser.add_argument( - "--warm_start_base_dir", - default="none", - type=str, - help="latest ckpt in this folder will be used to ", - ) - - parser.add_argument( - "--feature_list", - default="none", - type=str, - help="Which features to use for training", - ) - parser.add_argument( - "--warm_start_from", default=None, type=str, help="model dir to warm start from" - ) - - parser.add_argument( - "--momentum", default=0.99999, type=float, help="Momentum term for batch normalization" - ) - parser.add_argument( - "--dropout", - default=0.2, - type=float, - help="input_dropout_rate to rescale output by (1 - input_dropout_rate)", - ) - parser.add_argument( - "--out_layer_1_size", default=256, type=int, help="Size of MLP_branch layer 1" - ) - parser.add_argument( - "--out_layer_2_size", default=128, type=int, help="Size of MLP_branch layer 2" - ) - parser.add_argument("--out_layer_3_size", default=64, type=int, help="Size of MLP_branch layer 3") - parser.add_argument( - "--sparse_embedding_size", default=50, type=int, help="Dimensionality of sparse embedding layer" - ) - parser.add_argument( - "--dense_embedding_size", default=128, type=int, help="Dimensionality of dense embedding layer" - ) - - parser.add_argument( - "--use_uam_label", - default=False, - type=str, - help="Whether to use uam_label or not", - ) - - parser.add_argument( - "--task_name", - default="OONC_Engagement", - type=str, - help="specify the 
task name to use: OONC or OONC_Engagement.", - ) - parser.add_argument( - "--init_weight", - default=0.9, - type=float, - help="Initial OONC Task Weight MTL: OONC+Engagement.", - ) - parser.add_argument( - "--use_engagement_weight", - default=False, - action="store_true", - help="whether to use engagement weight for base model.", - ) - parser.add_argument( - "--mtl_num_extra_layers", - type=int, - default=1, - help="Number of Hidden Layers for each TaskBranch.", - ) - parser.add_argument( - "--mtl_neuron_scale", type=int, default=4, help="Scaling Factor of Neurons in MTL Extra Layers." - ) - parser.add_argument( - "--use_oonc_score", - default=False, - action="store_true", - help="whether to use oonc score only or combined score.", - ) - parser.add_argument( - "--use_stratified_metrics", - default=False, - action="store_true", - help="Use stratified metrics: Break out new-user metrics.", - ) - parser.add_argument( - "--run_group_metrics", - default=False, - action="store_true", - help="Will run evaluation metrics grouped by user.", - ) - parser.add_argument( - "--use_full_scope", - default=False, - action="store_true", - help="Will add extra scope and naming to graph.", - ) - parser.add_argument( - "--trainable_regexes", - default=None, - nargs="*", - help="The union of variables specified by the list of regexes will be considered trainable.", - ) - parser.add_argument( - "--fine_tuning.ckpt_to_initialize_from", - dest="fine_tuning_ckpt_to_initialize_from", - type=str, - default=None, - help="Checkpoint path from which to warm start. Indicates the pre-trained model.", - ) - parser.add_argument( - "--fine_tuning.warm_start_scope_regex", - dest="fine_tuning_warm_start_scope_regex", - type=str, - default=None, - help="All variables matching this will be restored.", - ) - - return parser + parser = DataRecordTrainer.add_parser_arguments() + + parser.add_argument( + "--input_size_bits", + type=int, + default=18, + help="number of bits allocated to the input size", + ) + parser.add_argument( + "--model_trainer_name", + default="magic_recs_mlp_calibration_MTL_OONC_Engagement", + type=str, + help="specify the model trainer name.", + ) + + parser.add_argument( + "--model_type", + default="deepnorm_gbdt_inputdrop2_rescale", + type=str, + help="specify the model type to use.", + ) + parser.add_argument( + "--feat_config_type", + default="get_feature_config_with_sparse_continuous", + type=str, + help="specify the feature configure function to use.", + ) + + parser.add_argument( + "--directly_export_best", + default=False, + action="store_true", + help="whether to directly_export best_checkpoint", + ) + + parser.add_argument( + "--warm_start_base_dir", + default="none", + type=str, + help="latest ckpt in this folder will be used to ", + ) + + parser.add_argument( + "--feature_list", + default="none", + type=str, + help="Which features to use for training", + ) + parser.add_argument( + "--warm_start_from", default=None, type=str, help="model dir to warm start from" + ) + + parser.add_argument( + "--momentum", + default=0.99999, + type=float, + help="Momentum term for batch normalization", + ) + parser.add_argument( + "--dropout", + default=0.2, + type=float, + help="input_dropout_rate to rescale output by (1 - input_dropout_rate)", + ) + parser.add_argument( + "--out_layer_1_size", default=256, type=int, help="Size of MLP_branch layer 1" + ) + parser.add_argument( + "--out_layer_2_size", default=128, type=int, help="Size of MLP_branch layer 2" + ) + parser.add_argument( + "--out_layer_3_size", 
default=64, type=int, help="Size of MLP_branch layer 3" + ) + parser.add_argument( + "--sparse_embedding_size", + default=50, + type=int, + help="Dimensionality of sparse embedding layer", + ) + parser.add_argument( + "--dense_embedding_size", + default=128, + type=int, + help="Dimensionality of dense embedding layer", + ) + + parser.add_argument( + "--use_uam_label", + default=False, + type=str, + help="Whether to use uam_label or not", + ) + + parser.add_argument( + "--task_name", + default="OONC_Engagement", + type=str, + help="specify the task name to use: OONC or OONC_Engagement.", + ) + parser.add_argument( + "--init_weight", + default=0.9, + type=float, + help="Initial OONC Task Weight MTL: OONC+Engagement.", + ) + parser.add_argument( + "--use_engagement_weight", + default=False, + action="store_true", + help="whether to use engagement weight for base model.", + ) + parser.add_argument( + "--mtl_num_extra_layers", + type=int, + default=1, + help="Number of Hidden Layers for each TaskBranch.", + ) + parser.add_argument( + "--mtl_neuron_scale", + type=int, + default=4, + help="Scaling Factor of Neurons in MTL Extra Layers.", + ) + parser.add_argument( + "--use_oonc_score", + default=False, + action="store_true", + help="whether to use oonc score only or combined score.", + ) + parser.add_argument( + "--use_stratified_metrics", + default=False, + action="store_true", + help="Use stratified metrics: Break out new-user metrics.", + ) + parser.add_argument( + "--run_group_metrics", + default=False, + action="store_true", + help="Will run evaluation metrics grouped by user.", + ) + parser.add_argument( + "--use_full_scope", + default=False, + action="store_true", + help="Will add extra scope and naming to graph.", + ) + parser.add_argument( + "--trainable_regexes", + default=None, + nargs="*", + help="The union of variables specified by the list of regexes will be considered trainable.", + ) + parser.add_argument( + "--fine_tuning.ckpt_to_initialize_from", + dest="fine_tuning_ckpt_to_initialize_from", + type=str, + default=None, + help="Checkpoint path from which to warm start. Indicates the pre-trained model.", + ) + parser.add_argument( + "--fine_tuning.warm_start_scope_regex", + dest="fine_tuning_warm_start_scope_regex", + type=str, + default=None, + help="All variables matching this will be restored.", + ) + + return parser def get_params(args=None): - parser = get_arg_parser() - if args is None: - return parser.parse_args() - else: - return parser.parse_args(args) + parser = get_arg_parser() + if args is None: + return parser.parse_args() + else: + return parser.parse_args(args) def get_arg_parser_light_ranking(): - parser = get_arg_parser() - - parser.add_argument( - "--use_record_weight", - default=False, - action="store_true", - help="whether to use record weight for base model.", - ) - parser.add_argument( - "--min_record_weight", default=0.0, type=float, help="Minimum record weight to use." - ) - parser.add_argument( - "--smooth_weight", default=0.0, type=float, help="Factor to smooth Rank Position Weight." - ) - - parser.add_argument( - "--num_mlp_layers", type=int, default=3, help="Number of Hidden Layers for MLP model." - ) - parser.add_argument( - "--mlp_neuron_scale", type=int, default=4, help="Scaling Factor of Neurons in MLP Layers." 
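All of these flags funnel through get_params, which parses sys.argv when called with no arguments. A hedged usage sketch (DataRecordTrainer.add_parser_arguments may mark additional twml flags as required, in which case those must be supplied as well):

params = get_params([
    "--task_name", "OONC",
    "--input_size_bits", "20",
    "--use_stratified_metrics",
])
# params is an argparse.Namespace; unspecified flags keep their defaults,
# e.g. params.model_type == "deepnorm_gbdt_inputdrop2_rescale".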
- ) - parser.add_argument( - "--run_light_ranking_group_metrics", - default=False, - action="store_true", - help="Will run evaluation metrics grouped by user for Light Ranking.", - ) - parser.add_argument( - "--use_missing_sub_branch", - default=False, - action="store_true", - help="Whether to use missing value sub-branch for Light Ranking.", - ) - parser.add_argument( - "--use_gbdt_features", - default=False, - action="store_true", - help="Whether to use GBDT features for Light Ranking.", - ) - parser.add_argument( - "--run_light_ranking_group_metrics_in_bq", - default=False, - action="store_true", - help="Whether to get_predictions for Light Ranking to compute group metrics in BigQuery.", - ) - parser.add_argument( - "--pred_file_path", - default=None, - type=str, - help="path", - ) - parser.add_argument( - "--pred_file_name", - default=None, - type=str, - help="path", - ) - return parser + parser = get_arg_parser() + + parser.add_argument( + "--use_record_weight", + default=False, + action="store_true", + help="whether to use record weight for base model.", + ) + parser.add_argument( + "--min_record_weight", + default=0.0, + type=float, + help="Minimum record weight to use.", + ) + parser.add_argument( + "--smooth_weight", + default=0.0, + type=float, + help="Factor to smooth Rank Position Weight.", + ) + + parser.add_argument( + "--num_mlp_layers", + type=int, + default=3, + help="Number of Hidden Layers for MLP model.", + ) + parser.add_argument( + "--mlp_neuron_scale", + type=int, + default=4, + help="Scaling Factor of Neurons in MLP Layers.", + ) + parser.add_argument( + "--run_light_ranking_group_metrics", + default=False, + action="store_true", + help="Will run evaluation metrics grouped by user for Light Ranking.", + ) + parser.add_argument( + "--use_missing_sub_branch", + default=False, + action="store_true", + help="Whether to use missing value sub-branch for Light Ranking.", + ) + parser.add_argument( + "--use_gbdt_features", + default=False, + action="store_true", + help="Whether to use GBDT features for Light Ranking.", + ) + parser.add_argument( + "--run_light_ranking_group_metrics_in_bq", + default=False, + action="store_true", + help="Whether to get_predictions for Light Ranking to compute group metrics in BigQuery.", + ) + parser.add_argument( + "--pred_file_path", + default=None, + type=str, + help="path", + ) + parser.add_argument( + "--pred_file_name", + default=None, + type=str, + help="path", + ) + return parser diff --git a/pushservice/src/main/python/models/libs/model_utils.py b/pushservice/src/main/python/models/libs/model_utils.py index 1c5306911..ec12c98ca 100644 --- a/pushservice/src/main/python/models/libs/model_utils.py +++ b/pushservice/src/main/python/models/libs/model_utils.py @@ -1,339 +1,352 @@ import sys -import twml - -from .initializer import customized_glorot_uniform - import tensorflow.compat.v1 as tf import yaml +import twml + +from .initializer import customized_glorot_uniform # checkstyle: noqa def read_config(whitelist_yaml_file): - with tf.gfile.FastGFile(whitelist_yaml_file) as f: - try: - return yaml.safe_load(f) - except yaml.YAMLError as exc: - print(exc) - sys.exit(1) + with tf.gfile.FastGFile(whitelist_yaml_file) as f: + try: + return yaml.safe_load(f) + except yaml.YAMLError as exc: + print(exc) + sys.exit(1) def _sparse_feature_fixup(features, input_size_bits): - """Rebuild a sparse tensor feature so that its dense shape attribute is present. + """Rebuild a sparse tensor feature so that its dense shape attribute is present. 
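The body of _sparse_feature_fixup follows in this hunk; the whole fix is to rebuild the SparseTensor with an explicit column count of 2**input_size_bits, since the dense-shape attribute can arrive unset from the input pipeline. A minimal sketch:

st = tf.SparseTensor(indices=[[0, 7], [1, 42]],
                     values=[1.0, 2.0],
                     dense_shape=[2, 50])
fixed = _sparse_feature_fixup(st, input_size_bits=18)
# fixed.dense_shape == [2, 2**18]; indices and values are untouched.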
- Arguments: - features (SparseTensor): Sparse feature tensor of shape ``(B, sparse_feature_dim)``. - input_size_bits (int): Number of columns in ``log2`` scale. Must be positive. + Arguments: + features (SparseTensor): Sparse feature tensor of shape ``(B, sparse_feature_dim)``. + input_size_bits (int): Number of columns in ``log2`` scale. Must be positive. - Returns: - SparseTensor: Rebuilt and non-faulty version of `features`.""" - sparse_feature_dim = tf.constant(2**input_size_bits, dtype=tf.int64) - sparse_shape = tf.stack([features.dense_shape[0], sparse_feature_dim]) - sparse_tf = tf.SparseTensor(features.indices, features.values, sparse_shape) - return sparse_tf + Returns: + SparseTensor: Rebuilt and non-faulty version of `features`.""" + sparse_feature_dim = tf.constant(2**input_size_bits, dtype=tf.int64) + sparse_shape = tf.stack([features.dense_shape[0], sparse_feature_dim]) + sparse_tf = tf.SparseTensor(features.indices, features.values, sparse_shape) + return sparse_tf def self_atten_dense(input, out_dim, activation=None, use_bias=True, name=None): - def safe_concat(base, suffix): - """Concats variables name components if base is given.""" - if not base: - return base - return f"{base}:{suffix}" - - input_dim = input.shape.as_list()[1] - - sigmoid_out = twml.layers.FullDense( - input_dim, dtype=tf.float32, activation=tf.nn.sigmoid, name=safe_concat(name, "sigmoid_out") - )(input) - atten_input = sigmoid_out * input - mlp_out = twml.layers.FullDense( - out_dim, - dtype=tf.float32, - activation=activation, - use_bias=use_bias, - name=safe_concat(name, "mlp_out"), - )(atten_input) - return mlp_out + def safe_concat(base, suffix): + """Concats variables name components if base is given.""" + if not base: + return base + return f"{base}:{suffix}" + + input_dim = input.shape.as_list()[1] + + sigmoid_out = twml.layers.FullDense( + input_dim, + dtype=tf.float32, + activation=tf.nn.sigmoid, + name=safe_concat(name, "sigmoid_out"), + )(input) + atten_input = sigmoid_out * input + mlp_out = twml.layers.FullDense( + out_dim, + dtype=tf.float32, + activation=activation, + use_bias=use_bias, + name=safe_concat(name, "mlp_out"), + )(atten_input) + return mlp_out def get_dense_out(input, out_dim, activation, dense_type): - if dense_type == "full_dense": - out = twml.layers.FullDense(out_dim, dtype=tf.float32, activation=activation)(input) - elif dense_type == "self_atten_dense": - out = self_atten_dense(input, out_dim, activation=activation) - return out + if dense_type == "full_dense": + out = twml.layers.FullDense(out_dim, dtype=tf.float32, activation=activation)( + input + ) + elif dense_type == "self_atten_dense": + out = self_atten_dense(input, out_dim, activation=activation) + return out def get_input_trans_func(bn_normalized_dense, is_training): - gw_normalized_dense = tf.expand_dims(bn_normalized_dense, -1) - group_num = bn_normalized_dense.shape.as_list()[1] + gw_normalized_dense = tf.expand_dims(bn_normalized_dense, -1) + group_num = bn_normalized_dense.shape.as_list()[1] - gw_normalized_dense = GroupWiseTrans(group_num, 1, 8, name="groupwise_1", activation=tf.tanh)( - gw_normalized_dense - ) - gw_normalized_dense = GroupWiseTrans(group_num, 8, 4, name="groupwise_2", activation=tf.tanh)( - gw_normalized_dense - ) - gw_normalized_dense = GroupWiseTrans(group_num, 4, 1, name="groupwise_3", activation=tf.tanh)( - gw_normalized_dense - ) + gw_normalized_dense = GroupWiseTrans( + group_num, 1, 8, name="groupwise_1", activation=tf.tanh + )(gw_normalized_dense) + gw_normalized_dense = 
GroupWiseTrans( + group_num, 8, 4, name="groupwise_2", activation=tf.tanh + )(gw_normalized_dense) + gw_normalized_dense = GroupWiseTrans( + group_num, 4, 1, name="groupwise_3", activation=tf.tanh + )(gw_normalized_dense) - gw_normalized_dense = tf.squeeze(gw_normalized_dense, [-1]) + gw_normalized_dense = tf.squeeze(gw_normalized_dense, [-1]) - bn_gw_normalized_dense = tf.layers.batch_normalization( - gw_normalized_dense, - training=is_training, - renorm_momentum=0.9999, - momentum=0.9999, - renorm=is_training, - trainable=True, - ) + bn_gw_normalized_dense = tf.layers.batch_normalization( + gw_normalized_dense, + training=is_training, + renorm_momentum=0.9999, + momentum=0.9999, + renorm=is_training, + trainable=True, + ) - return bn_gw_normalized_dense + return bn_gw_normalized_dense def tensor_dropout( - input_tensor, - rate, - is_training, - sparse_tensor=None, + input_tensor, + rate, + is_training, + sparse_tensor=None, ): - """ - Implements dropout layer for both dense and sparse input_tensor - - Arguments: - input_tensor: - B x D dense tensor, or a sparse tensor - rate (float32): - dropout rate - is_training (bool): - training stage or not. - sparse_tensor (bool): - whether the input_tensor is sparse tensor or not. Default to be None, this value has to be passed explicitly. - rescale_sparse_dropout (bool): - Do we need to do rescaling or not. - Returns: - tensor dropped out""" - if sparse_tensor == True: - if is_training: - with tf.variable_scope("sparse_dropout"): - values = input_tensor.values - keep_mask = tf.keras.backend.random_binomial( - tf.shape(values), p=1 - rate, dtype=tf.float32, seed=None - ) - keep_mask.set_shape([None]) - keep_mask = tf.cast(keep_mask, tf.bool) - - keep_indices = tf.boolean_mask(input_tensor.indices, keep_mask, axis=0) - keep_values = tf.boolean_mask(values, keep_mask, axis=0) - - dropped_tensor = tf.SparseTensor(keep_indices, keep_values, input_tensor.dense_shape) - return dropped_tensor - else: - return input_tensor - elif sparse_tensor == False: - return tf.layers.dropout(input_tensor, rate=rate, training=is_training) + """ + Implements dropout layer for both dense and sparse input_tensor + + Arguments: + input_tensor: + B x D dense tensor, or a sparse tensor + rate (float32): + dropout rate + is_training (bool): + training stage or not. + sparse_tensor (bool): + whether the input_tensor is sparse tensor or not. Default to be None, this value has to be passed explicitly. + rescale_sparse_dropout (bool): + Do we need to do rescaling or not. 
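Two things worth noting about tensor_dropout, whose body follows: the sparse path samples a Bernoulli keep-mask over the stored values and filters indices/values without rescaling the survivors (unlike standard inverted dropout), and sparse_tensor must be passed explicitly as True or False; leaving it at the default None falls through both branches and silently returns None. A sketch:

st = tf.sparse.from_dense(tf.ones([4, 8]))
dropped = tensor_dropout(st, rate=0.3, is_training=True, sparse_tensor=True)
# ~70% of the nonzero entries survive, unscaled; with is_training=False the
# tensor passes through untouched.
dense = tensor_dropout(tf.ones([4, 8]), rate=0.3, is_training=True,
                       sparse_tensor=False)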
+ Returns: + tensor dropped out""" + if sparse_tensor == True: + if is_training: + with tf.variable_scope("sparse_dropout"): + values = input_tensor.values + keep_mask = tf.keras.backend.random_binomial( + tf.shape(values), p=1 - rate, dtype=tf.float32, seed=None + ) + keep_mask.set_shape([None]) + keep_mask = tf.cast(keep_mask, tf.bool) + + keep_indices = tf.boolean_mask(input_tensor.indices, keep_mask, axis=0) + keep_values = tf.boolean_mask(values, keep_mask, axis=0) + + dropped_tensor = tf.SparseTensor( + keep_indices, keep_values, input_tensor.dense_shape + ) + return dropped_tensor + else: + return input_tensor + elif sparse_tensor == False: + return tf.layers.dropout(input_tensor, rate=rate, training=is_training) def adaptive_transformation(bn_normalized_dense, is_training, func_type="default"): - assert func_type in [ - "default", - "tiny", - ], f"fun_type can only be one of default and tiny, but get {func_type}" - - gw_normalized_dense = tf.expand_dims(bn_normalized_dense, -1) - group_num = bn_normalized_dense.shape.as_list()[1] - - if func_type == "default": - gw_normalized_dense = FastGroupWiseTrans( - group_num, 1, 8, name="groupwise_1", activation=tf.tanh, init_multiplier=8 - )(gw_normalized_dense) - - gw_normalized_dense = FastGroupWiseTrans( - group_num, 8, 4, name="groupwise_2", activation=tf.tanh, init_multiplier=8 - )(gw_normalized_dense) - - gw_normalized_dense = FastGroupWiseTrans( - group_num, 4, 1, name="groupwise_3", activation=tf.tanh, init_multiplier=8 - )(gw_normalized_dense) - elif func_type == "tiny": - gw_normalized_dense = FastGroupWiseTrans( - group_num, 1, 2, name="groupwise_1", activation=tf.tanh, init_multiplier=8 - )(gw_normalized_dense) - - gw_normalized_dense = FastGroupWiseTrans( - group_num, 2, 1, name="groupwise_2", activation=tf.tanh, init_multiplier=8 - )(gw_normalized_dense) - - gw_normalized_dense = FastGroupWiseTrans( - group_num, 1, 1, name="groupwise_3", activation=tf.tanh, init_multiplier=8 - )(gw_normalized_dense) - - gw_normalized_dense = tf.squeeze(gw_normalized_dense, [-1]) - bn_gw_normalized_dense = tf.layers.batch_normalization( - gw_normalized_dense, - training=is_training, - renorm_momentum=0.9999, - momentum=0.9999, - renorm=is_training, - trainable=True, - ) + assert func_type in [ + "default", + "tiny", + ], f"fun_type can only be one of default and tiny, but get {func_type}" + + gw_normalized_dense = tf.expand_dims(bn_normalized_dense, -1) + group_num = bn_normalized_dense.shape.as_list()[1] + + if func_type == "default": + gw_normalized_dense = FastGroupWiseTrans( + group_num, 1, 8, name="groupwise_1", activation=tf.tanh, init_multiplier=8 + )(gw_normalized_dense) + + gw_normalized_dense = FastGroupWiseTrans( + group_num, 8, 4, name="groupwise_2", activation=tf.tanh, init_multiplier=8 + )(gw_normalized_dense) + + gw_normalized_dense = FastGroupWiseTrans( + group_num, 4, 1, name="groupwise_3", activation=tf.tanh, init_multiplier=8 + )(gw_normalized_dense) + elif func_type == "tiny": + gw_normalized_dense = FastGroupWiseTrans( + group_num, 1, 2, name="groupwise_1", activation=tf.tanh, init_multiplier=8 + )(gw_normalized_dense) + + gw_normalized_dense = FastGroupWiseTrans( + group_num, 2, 1, name="groupwise_2", activation=tf.tanh, init_multiplier=8 + )(gw_normalized_dense) + + gw_normalized_dense = FastGroupWiseTrans( + group_num, 1, 1, name="groupwise_3", activation=tf.tanh, init_multiplier=8 + )(gw_normalized_dense) + + gw_normalized_dense = tf.squeeze(gw_normalized_dense, [-1]) + bn_gw_normalized_dense = 
tf.layers.batch_normalization( + gw_normalized_dense, + training=is_training, + renorm_momentum=0.9999, + momentum=0.9999, + renorm=is_training, + trainable=True, + ) - return bn_gw_normalized_dense + return bn_gw_normalized_dense class FastGroupWiseTrans(object): - """ - used to apply group-wise fully connected layers to the input. - it applies a tiny, unique MLP to each individual feature.""" - - def __init__(self, group_num, input_dim, out_dim, name, activation=None, init_multiplier=1): - self.group_num = group_num - self.input_dim = input_dim - self.out_dim = out_dim - self.activation = activation - self.init_multiplier = init_multiplier - - self.w = tf.get_variable( - name + "_group_weight", - [1, group_num, input_dim, out_dim], - initializer=customized_glorot_uniform( - fan_in=input_dim * init_multiplier, fan_out=out_dim * init_multiplier - ), - trainable=True, - ) - self.b = tf.get_variable( - name + "_group_bias", - [1, group_num, out_dim], - initializer=tf.constant_initializer(0.0), - trainable=True, - ) - - def __call__(self, input_tensor): """ - input_tensor: batch_size x group_num x input_dim - output_tensor: batch_size x group_num x out_dim""" - input_tensor_expand = tf.expand_dims(input_tensor, axis=-1) + used to apply group-wise fully connected layers to the input. + it applies a tiny, unique MLP to each individual feature.""" + + def __init__( + self, group_num, input_dim, out_dim, name, activation=None, init_multiplier=1 + ): + self.group_num = group_num + self.input_dim = input_dim + self.out_dim = out_dim + self.activation = activation + self.init_multiplier = init_multiplier + + self.w = tf.get_variable( + name + "_group_weight", + [1, group_num, input_dim, out_dim], + initializer=customized_glorot_uniform( + fan_in=input_dim * init_multiplier, fan_out=out_dim * init_multiplier + ), + trainable=True, + ) + self.b = tf.get_variable( + name + "_group_bias", + [1, group_num, out_dim], + initializer=tf.constant_initializer(0.0), + trainable=True, + ) - output_tensor = tf.add( - tf.reduce_sum(tf.multiply(input_tensor_expand, self.w), axis=-2, keepdims=False), - self.b, - ) + def __call__(self, input_tensor): + """ + input_tensor: batch_size x group_num x input_dim + output_tensor: batch_size x group_num x out_dim""" + input_tensor_expand = tf.expand_dims(input_tensor, axis=-1) + + output_tensor = tf.add( + tf.reduce_sum( + tf.multiply(input_tensor_expand, self.w), axis=-2, keepdims=False + ), + self.b, + ) - if self.activation is not None: - output_tensor = self.activation(output_tensor) - return output_tensor + if self.activation is not None: + output_tensor = self.activation(output_tensor) + return output_tensor class GroupWiseTrans(object): - """ - Used to apply group fully connected layers to the input. 
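FastGroupWiseTrans holds a distinct input_dim x out_dim affine map per feature group (G * input_dim * out_dim weights in one [1, G, in, out] variable) and applies all of them in a single broadcast multiply plus reduce_sum; GroupWiseTrans, next, computes the same result one output column at a time. The core computation, with plain tensors standing in for the class's variables:

B, G, d_in, d_out = 2, 5, 1, 8
x = tf.random.normal([B, G, d_in])
w = tf.random.normal([1, G, d_in, d_out])   # per-group weight, as in self.w
b = tf.zeros([1, G, d_out])                 # per-group bias, as in self.b
y = tf.reduce_sum(tf.expand_dims(x, -1) * w, axis=-2) + b  # [B, G, d_out]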
- """ - - def __init__(self, group_num, input_dim, out_dim, name, activation=None): - self.group_num = group_num - self.input_dim = input_dim - self.out_dim = out_dim - self.activation = activation - - w_list, b_list = [], [] - for idx in range(out_dim): - this_w = tf.get_variable( - name + f"_group_weight_{idx}", - [1, group_num, input_dim], - initializer=tf.keras.initializers.glorot_uniform(), - trainable=True, - ) - this_b = tf.get_variable( - name + f"_group_bias_{idx}", - [1, group_num, 1], - initializer=tf.constant_initializer(0.0), - trainable=True, - ) - w_list.append(this_w) - b_list.append(this_b) - self.w_list = w_list - self.b_list = b_list - - def __call__(self, input_tensor): """ - input_tensor: batch_size x group_num x input_dim - output_tensor: batch_size x group_num x out_dim + Used to apply group fully connected layers to the input. """ - out_tensor_list = [] - for idx in range(self.out_dim): - this_res = ( - tf.reduce_sum(input_tensor * self.w_list[idx], axis=-1, keepdims=True) + self.b_list[idx] - ) - out_tensor_list.append(this_res) - output_tensor = tf.concat(out_tensor_list, axis=-1) - if self.activation is not None: - output_tensor = self.activation(output_tensor) - return output_tensor + def __init__(self, group_num, input_dim, out_dim, name, activation=None): + self.group_num = group_num + self.input_dim = input_dim + self.out_dim = out_dim + self.activation = activation + + w_list, b_list = [], [] + for idx in range(out_dim): + this_w = tf.get_variable( + name + f"_group_weight_{idx}", + [1, group_num, input_dim], + initializer=tf.keras.initializers.glorot_uniform(), + trainable=True, + ) + this_b = tf.get_variable( + name + f"_group_bias_{idx}", + [1, group_num, 1], + initializer=tf.constant_initializer(0.0), + trainable=True, + ) + w_list.append(this_w) + b_list.append(this_b) + self.w_list = w_list + self.b_list = b_list + + def __call__(self, input_tensor): + """ + input_tensor: batch_size x group_num x input_dim + output_tensor: batch_size x group_num x out_dim + """ + out_tensor_list = [] + for idx in range(self.out_dim): + this_res = ( + tf.reduce_sum(input_tensor * self.w_list[idx], axis=-1, keepdims=True) + + self.b_list[idx] + ) + out_tensor_list.append(this_res) + output_tensor = tf.concat(out_tensor_list, axis=-1) + + if self.activation is not None: + output_tensor = self.activation(output_tensor) + return output_tensor def add_scalar_summary(var, name, name_scope="hist_dense_feature/"): - with tf.name_scope("summaries/"): - with tf.name_scope(name_scope): - tf.summary.scalar(name, var) + with tf.name_scope("summaries/"): + with tf.name_scope(name_scope): + tf.summary.scalar(name, var) def add_histogram_summary(var, name, name_scope="hist_dense_feature/"): - with tf.name_scope("summaries/"): - with tf.name_scope(name_scope): - tf.summary.histogram(name, tf.reshape(var, [-1])) + with tf.name_scope("summaries/"): + with tf.name_scope(name_scope): + tf.summary.histogram(name, tf.reshape(var, [-1])) def sparse_clip_by_value(sparse_tf, min_val, max_val): - new_vals = tf.clip_by_value(sparse_tf.values, min_val, max_val) - return tf.SparseTensor(sparse_tf.indices, new_vals, sparse_tf.dense_shape) + new_vals = tf.clip_by_value(sparse_tf.values, min_val, max_val) + return tf.SparseTensor(sparse_tf.indices, new_vals, sparse_tf.dense_shape) def check_numerics_with_msg(tensor, message="", sparse_tensor=False): - if sparse_tensor: - values = tf.debugging.check_numerics(tensor.values, message=message) - return tf.SparseTensor(tensor.indices, values, 
tensor.dense_shape) - else: - return tf.debugging.check_numerics(tensor, message=message) + if sparse_tensor: + values = tf.debugging.check_numerics(tensor.values, message=message) + return tf.SparseTensor(tensor.indices, values, tensor.dense_shape) + else: + return tf.debugging.check_numerics(tensor, message=message) def pad_empty_sparse_tensor(tensor): - dummy_tensor = tf.SparseTensor( - indices=[[0, 0]], - values=[0.00001], - dense_shape=tensor.dense_shape, - ) - result = tf.cond( - tf.equal(tf.size(tensor.values), 0), - lambda: dummy_tensor, - lambda: tensor, - ) - return result + dummy_tensor = tf.SparseTensor( + indices=[[0, 0]], + values=[0.00001], + dense_shape=tensor.dense_shape, + ) + result = tf.cond( + tf.equal(tf.size(tensor.values), 0), + lambda: dummy_tensor, + lambda: tensor, + ) + return result def filter_nans_and_infs(tensor, sparse_tensor=False): - if sparse_tensor: - sparse_values = tensor.values - filtered_val = tf.where( - tf.logical_or(tf.is_nan(sparse_values), tf.is_inf(sparse_values)), - tf.zeros_like(sparse_values), - sparse_values, - ) - return tf.SparseTensor(tensor.indices, filtered_val, tensor.dense_shape) - else: - return tf.where( - tf.logical_or(tf.is_nan(tensor), tf.is_inf(tensor)), tf.zeros_like(tensor), tensor - ) + if sparse_tensor: + sparse_values = tensor.values + filtered_val = tf.where( + tf.logical_or(tf.is_nan(sparse_values), tf.is_inf(sparse_values)), + tf.zeros_like(sparse_values), + sparse_values, + ) + return tf.SparseTensor(tensor.indices, filtered_val, tensor.dense_shape) + else: + return tf.where( + tf.logical_or(tf.is_nan(tensor), tf.is_inf(tensor)), + tf.zeros_like(tensor), + tensor, + ) def generate_disliked_mask(labels): - """Generate a disliked mask where only samples with dislike labels are set to 1 otherwise set to 0. - Args: - labels: labels of training samples, which is a 2D tensor of shape batch_size x 3: [OONCs, engagements, dislikes] - Returns: - 1D tensor of shape batch_size x 1: [dislikes (booleans)] - """ - return tf.equal(tf.reshape(labels[:, 2], shape=[-1, 1]), 1) + """Generate a disliked mask where only samples with dislike labels are set to 1 otherwise set to 0. 
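These guards differ in failure mode: check_numerics_with_msg raises on NaN/Inf, filter_nans_and_infs silently zeroes them, and pad_empty_sparse_tensor swaps a fully empty SparseTensor for a single epsilon entry at [0, 0] so downstream ops never see a zero-length values vector. A quick sketch of the silent variant:

x = tf.constant([1.0, float("nan"), float("inf"), -2.0])
clean = filter_nans_and_infs(x)
# clean == [1.0, 0.0, 0.0, -2.0]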
+ Args: + labels: labels of training samples, which is a 2D tensor of shape batch_size x 3: [OONCs, engagements, dislikes] + Returns: + 1D tensor of shape batch_size x 1: [dislikes (booleans)] + """ + return tf.equal(tf.reshape(labels[:, 2], shape=[-1, 1]), 1) diff --git a/pushservice/src/main/python/models/libs/warm_start_utils.py b/pushservice/src/main/python/models/libs/warm_start_utils.py index ca83df585..415a4b608 100644 --- a/pushservice/src/main/python/models/libs/warm_start_utils.py +++ b/pushservice/src/main/python/models/libs/warm_start_utils.py @@ -1,309 +1,325 @@ -from collections import OrderedDict import json import os +from collections import OrderedDict from os.path import join +import numpy as np +import tensorflow.compat.v1 as tf +from scipy import stats from twitter.magicpony.common import file_access + import twml from .model_utils import read_config -import numpy as np -from scipy import stats -import tensorflow.compat.v1 as tf - - # checkstyle: noqa def get_model_type_to_tensors_to_change_axis(): - model_type_to_tensors_to_change_axis = { - "magic_recs/model/batch_normalization/beta": ([0], "continuous"), - "magic_recs/model/batch_normalization/gamma": ([0], "continuous"), - "magic_recs/model/batch_normalization/moving_mean": ([0], "continuous"), - "magic_recs/model/batch_normalization/moving_stddev": ([0], "continuous"), - "magic_recs/model/batch_normalization/moving_variance": ([0], "continuous"), - "magic_recs/model/batch_normalization/renorm_mean": ([0], "continuous"), - "magic_recs/model/batch_normalization/renorm_stddev": ([0], "continuous"), - "magic_recs/model/logits/EngagementGivenOONC_logits/clem_net_1/block2_4/channel_wise_dense_4/kernel": ( - [1], - "all", - ), - "magic_recs/model/logits/OONC_logits/clem_net/block2/channel_wise_dense/kernel": ([1], "all"), - } - - return model_type_to_tensors_to_change_axis + model_type_to_tensors_to_change_axis = { + "magic_recs/model/batch_normalization/beta": ([0], "continuous"), + "magic_recs/model/batch_normalization/gamma": ([0], "continuous"), + "magic_recs/model/batch_normalization/moving_mean": ([0], "continuous"), + "magic_recs/model/batch_normalization/moving_stddev": ([0], "continuous"), + "magic_recs/model/batch_normalization/moving_variance": ([0], "continuous"), + "magic_recs/model/batch_normalization/renorm_mean": ([0], "continuous"), + "magic_recs/model/batch_normalization/renorm_stddev": ([0], "continuous"), + "magic_recs/model/logits/EngagementGivenOONC_logits/clem_net_1/block2_4/channel_wise_dense_4/kernel": ( + [1], + "all", + ), + "magic_recs/model/logits/OONC_logits/clem_net/block2/channel_wise_dense/kernel": ( + [1], + "all", + ), + } + + return model_type_to_tensors_to_change_axis def mkdirp(dirname): - if not tf.io.gfile.exists(dirname): - tf.io.gfile.makedirs(dirname) + if not tf.io.gfile.exists(dirname): + tf.io.gfile.makedirs(dirname) def rename_dir(dirname, dst): - file_access.hdfs.mv(dirname, dst) + file_access.hdfs.mv(dirname, dst) def rmdir(dirname): - if tf.io.gfile.exists(dirname): - if tf.io.gfile.isdir(dirname): - tf.io.gfile.rmtree(dirname) - else: - tf.io.gfile.remove(dirname) + if tf.io.gfile.exists(dirname): + if tf.io.gfile.isdir(dirname): + tf.io.gfile.rmtree(dirname) + else: + tf.io.gfile.remove(dirname) def get_var_dict(checkpoint_path): - checkpoint = tf.train.get_checkpoint_state(checkpoint_path) - var_dict = OrderedDict() - with tf.Session() as sess: - all_var_list = tf.train.list_variables(checkpoint_path) - for var_name, _ in all_var_list: - # Load the variable - var = 
tf.train.load_variable(checkpoint_path, var_name) - var_dict[var_name] = var - return var_dict + checkpoint = tf.train.get_checkpoint_state(checkpoint_path) + var_dict = OrderedDict() + with tf.Session() as sess: + all_var_list = tf.train.list_variables(checkpoint_path) + for var_name, _ in all_var_list: + # Load the variable + var = tf.train.load_variable(checkpoint_path, var_name) + var_dict[var_name] = var + return var_dict def get_continunous_mapping_from_feat_list(old_feature_list, new_feature_list): - """ - get var_ind for old_feature and corresponding var_ind for new_feature - """ - new_var_ind, old_var_ind = [], [] - for this_new_id, this_new_name in enumerate(new_feature_list): - if this_new_name in old_feature_list: - this_old_id = old_feature_list.index(this_new_name) - new_var_ind.append(this_new_id) - old_var_ind.append(this_old_id) - return np.asarray(old_var_ind), np.asarray(new_var_ind) + """ + get var_ind for old_feature and corresponding var_ind for new_feature + """ + new_var_ind, old_var_ind = [], [] + for this_new_id, this_new_name in enumerate(new_feature_list): + if this_new_name in old_feature_list: + this_old_id = old_feature_list.index(this_new_name) + new_var_ind.append(this_new_id) + old_var_ind.append(this_old_id) + return np.asarray(old_var_ind), np.asarray(new_var_ind) def get_continuous_mapping_from_feat_dict(old_feature_dict, new_feature_dict): - """ - get var_ind for old_feature and corresponding var_ind for new_feature - """ - old_cont = old_feature_dict["continuous"] - old_bin = old_feature_dict["binary"] + """ + get var_ind for old_feature and corresponding var_ind for new_feature + """ + old_cont = old_feature_dict["continuous"] + old_bin = old_feature_dict["binary"] - new_cont = new_feature_dict["continuous"] - new_bin = new_feature_dict["binary"] + new_cont = new_feature_dict["continuous"] + new_bin = new_feature_dict["binary"] - _dummy_sparse_feat = [f"sparse_feature_{_idx}" for _idx in range(100)] + _dummy_sparse_feat = [f"sparse_feature_{_idx}" for _idx in range(100)] - cont_old_var_ind, cont_new_var_ind = get_continunous_mapping_from_feat_list(old_cont, new_cont) + cont_old_var_ind, cont_new_var_ind = get_continunous_mapping_from_feat_list( + old_cont, new_cont + ) - all_old_var_ind, all_new_var_ind = get_continunous_mapping_from_feat_list( - old_cont + old_bin + _dummy_sparse_feat, new_cont + new_bin + _dummy_sparse_feat - ) + all_old_var_ind, all_new_var_ind = get_continunous_mapping_from_feat_list( + old_cont + old_bin + _dummy_sparse_feat, new_cont + new_bin + _dummy_sparse_feat + ) - _res = { - "continuous": (cont_old_var_ind, cont_new_var_ind), - "all": (all_old_var_ind, all_new_var_ind), - } + _res = { + "continuous": (cont_old_var_ind, cont_new_var_ind), + "all": (all_old_var_ind, all_new_var_ind), + } - return _res + return _res def warm_start_from_var_dict( - old_ckpt_path, - var_ind_dict, - output_dir, - new_len_var, - var_to_change_dict_fn=get_model_type_to_tensors_to_change_axis, + old_ckpt_path, + var_ind_dict, + output_dir, + new_len_var, + var_to_change_dict_fn=get_model_type_to_tensors_to_change_axis, +): + """ + Parameters: + old_ckpt_path (str): path to the old checkpoint path + new_var_ind (array of int): index to overlapping features in new var between old and new feature list. + old_var_ind (array of int): index to overlapping features in old var between old and new feature list. + + output_dir (str): dir that used to write modified checkpoint + new_len_var ({str:int}): number of feature in the new feature list. 
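get_continunous_mapping_from_feat_list (name sic, as in the source) returns the aligned index pairs for features present in both the old and the new feature list; the dict variant above extends both lists with 100 dummy sparse slots so the binary and sparse blocks keep their relative layout. A small sketch:

old = ["a", "b", "c"]
new = ["b", "d", "a"]
old_ind, new_ind = get_continunous_mapping_from_feat_list(old, new)
# old_ind == [1, 0] and new_ind == [0, 2]: "b" moves from slot 1 to slot 0,
# "a" from slot 0 to slot 2; "c" and "d" have no counterpart and are dropped.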
+ var_to_change_dict_fn (dict): A function to get the dictionary of format {var_name: dim_to_change} + """ + old_var_dict = get_var_dict(old_ckpt_path) + + ckpt_file_name = os.path.basename(old_ckpt_path) + mkdirp(output_dir) + output_path = join(output_dir, ckpt_file_name) + + tensors_to_change = var_to_change_dict_fn() + tf.compat.v1.reset_default_graph() + + with tf.Session() as sess: + var_name_shape_list = tf.train.list_variables(old_ckpt_path) + count = 0 + + for var_name, var_shape in var_name_shape_list: + old_var = old_var_dict[var_name] + if var_name in tensors_to_change.keys(): + _info_tuple = tensors_to_change[var_name] + dims_to_remove_from, var_type = _info_tuple + + new_var_ind, old_var_ind = var_ind_dict[var_type] + + this_shape = list(old_var.shape) + for this_dim in dims_to_remove_from: + this_shape[this_dim] = new_len_var[var_type] + + stddev = np.std(old_var) + truncated_norm_generator = stats.truncnorm( + -0.5, 0.5, loc=0, scale=stddev + ) + size = np.prod(this_shape) + new_var = truncated_norm_generator.rvs(size).reshape(this_shape) + new_var = new_var.astype(old_var.dtype) + + new_var = copy_feat_based_on_mapping( + new_var, old_var, dims_to_remove_from, new_var_ind, old_var_ind + ) + count = count + 1 + else: + new_var = old_var + var = tf.Variable(new_var, name=var_name) + assert count == len( + tensors_to_change.keys() + ), "not all variables are exchanged.\n" + saver = tf.train.Saver() + sess.run(tf.global_variables_initializer()) + saver.save(sess, output_path) + return output_path + + +def copy_feat_based_on_mapping( + new_array, old_array, dims_to_remove_from, new_var_ind, old_var_ind ): - """ - Parameters: - old_ckpt_path (str): path to the old checkpoint path - new_var_ind (array of int): index to overlapping features in new var between old and new feature list. - old_var_ind (array of int): index to overlapping features in old var between old and new feature list. - - output_dir (str): dir that used to write modified checkpoint - new_len_var ({str:int}): number of feature in the new feature list. 
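copy_feat_based_on_mapping, defined at the end of this hunk, overwrites only the overlapping slices of the freshly initialized tensor and leaves the rest at their random values. A toy NumPy sketch of the dims_to_remove_from == [1] branch (shapes hypothetical):

import numpy as np

old = np.arange(12, dtype=np.float32).reshape(3, 4)   # old kernel: 4 input slots
new = np.zeros((3, 5), dtype=np.float32)              # re-initialized: 5 input slots
old_ind = np.asarray([1, 2])    # slots present in both feature lists
new_ind = np.asarray([0, 4])    # where those slots now live

new[:, new_ind] = old[:, old_ind]   # columns for shared features carry over

The [0] branch does the same on rows, and [0, 1] remaps both axes at once.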
- var_to_change_dict_fn (dict): A function to get the dictionary of format {var_name: dim_to_change} - """ - old_var_dict = get_var_dict(old_ckpt_path) - - ckpt_file_name = os.path.basename(old_ckpt_path) - mkdirp(output_dir) - output_path = join(output_dir, ckpt_file_name) - - tensors_to_change = var_to_change_dict_fn() - tf.compat.v1.reset_default_graph() - - with tf.Session() as sess: - var_name_shape_list = tf.train.list_variables(old_ckpt_path) - count = 0 - - for var_name, var_shape in var_name_shape_list: - old_var = old_var_dict[var_name] - if var_name in tensors_to_change.keys(): - _info_tuple = tensors_to_change[var_name] - dims_to_remove_from, var_type = _info_tuple - - new_var_ind, old_var_ind = var_ind_dict[var_type] - - this_shape = list(old_var.shape) - for this_dim in dims_to_remove_from: - this_shape[this_dim] = new_len_var[var_type] - - stddev = np.std(old_var) - truncated_norm_generator = stats.truncnorm(-0.5, 0.5, loc=0, scale=stddev) - size = np.prod(this_shape) - new_var = truncated_norm_generator.rvs(size).reshape(this_shape) - new_var = new_var.astype(old_var.dtype) - - new_var = copy_feat_based_on_mapping( - new_var, old_var, dims_to_remove_from, new_var_ind, old_var_ind + if dims_to_remove_from == [0, 1]: + for this_new_ind, this_old_ind in zip(new_var_ind, old_var_ind): + new_array[this_new_ind, new_var_ind] = old_array[this_old_ind, old_var_ind] + elif dims_to_remove_from == [0]: + new_array[new_var_ind] = old_array[old_var_ind] + elif dims_to_remove_from == [1]: + new_array[:, new_var_ind] = old_array[:, old_var_ind] + else: + raise RuntimeError( + f"undefined dims_to_remove_from pattern: ({dims_to_remove_from})" ) - count = count + 1 - else: - new_var = old_var - var = tf.Variable(new_var, name=var_name) - assert count == len(tensors_to_change.keys()), "not all variables are exchanged.\n" - saver = tf.train.Saver() - sess.run(tf.global_variables_initializer()) - saver.save(sess, output_path) - return output_path - - -def copy_feat_based_on_mapping(new_array, old_array, dims_to_remove_from, new_var_ind, old_var_ind): - if dims_to_remove_from == [0, 1]: - for this_new_ind, this_old_ind in zip(new_var_ind, old_var_ind): - new_array[this_new_ind, new_var_ind] = old_array[this_old_ind, old_var_ind] - elif dims_to_remove_from == [0]: - new_array[new_var_ind] = old_array[old_var_ind] - elif dims_to_remove_from == [1]: - new_array[:, new_var_ind] = old_array[:, old_var_ind] - else: - raise RuntimeError(f"undefined dims_to_remove_from pattern: ({dims_to_remove_from})") - return new_array + return new_array def read_file(filename, decode=False): - """ - Reads contents from a file and optionally decodes it. + """ + Reads contents from a file and optionally decodes it. - Arguments: - filename: - path to file where the contents will be loaded from. - Accepts HDFS and local paths. - decode: - False or 'json'. When decode='json', contents is decoded - with json.loads. When False, contents is returned as is. - """ - graph = tf.Graph() - with graph.as_default(): - read = tf.read_file(filename) + Arguments: + filename: + path to file where the contents will be loaded from. + Accepts HDFS and local paths. + decode: + False or 'json'. When decode='json', contents is decoded + with json.loads. When False, contents is returned as is. 
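The re-initialization above draws new values from a truncated normal whose scale is the old tensor's standard deviation; in scipy's parameterization the (-0.5, 0.5) bounds are expressed in units of scale, so samples fall within half a standard deviation of zero. A standalone sketch:

import numpy as np
from scipy import stats

old_var = np.random.randn(128, 64).astype(np.float32)
new_shape = [128, 80]                 # axis 1 grew with the feature list

stddev = np.std(old_var)
gen = stats.truncnorm(-0.5, 0.5, loc=0, scale=stddev)   # bounds are (x - loc) / scale
new_var = gen.rvs(np.prod(new_shape)).reshape(new_shape).astype(old_var.dtype)
# Shared features are then copied in on top via copy_feat_based_on_mapping.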
+ """ + graph = tf.Graph() + with graph.as_default(): + read = tf.read_file(filename) - with tf.Session(graph=graph) as sess: - contents = sess.run(read) - if not isinstance(contents, str): - contents = contents.decode() + with tf.Session(graph=graph) as sess: + contents = sess.run(read) + if not isinstance(contents, str): + contents = contents.decode() - if decode == "json": - contents = json.loads(contents) + if decode == "json": + contents = json.loads(contents) - return contents + return contents def read_feat_list_from_disk(file_path): - return read_file(file_path, decode="json") + return read_file(file_path, decode="json") def get_feature_list_for_light_ranking(feature_list_path, data_spec_path): - feature_list = read_config(feature_list_path).items() - string_feat_list = [f[0] for f in feature_list if f[1] != "S"] - - feature_config_builder = twml.contrib.feature_config.FeatureConfigBuilder( - data_spec_path=data_spec_path - ) - feature_config_builder = feature_config_builder.extract_feature_group( - feature_regexes=string_feat_list, - group_name="continuous", - default_value=-1, - type_filter=["CONTINUOUS"], - ) - feature_config = feature_config_builder.build() - feature_list = feature_config_builder._feature_group_extraction_configs[0].feature_map[ - "CONTINUOUS" - ] - return feature_list + feature_list = read_config(feature_list_path).items() + string_feat_list = [f[0] for f in feature_list if f[1] != "S"] + + feature_config_builder = twml.contrib.feature_config.FeatureConfigBuilder( + data_spec_path=data_spec_path + ) + feature_config_builder = feature_config_builder.extract_feature_group( + feature_regexes=string_feat_list, + group_name="continuous", + default_value=-1, + type_filter=["CONTINUOUS"], + ) + feature_config = feature_config_builder.build() + feature_list = feature_config_builder._feature_group_extraction_configs[ + 0 + ].feature_map["CONTINUOUS"] + return feature_list def get_feature_list_for_heavy_ranking(feature_list_path, data_spec_path): - feature_list = read_config(feature_list_path).items() - string_feat_list = [f[0] for f in feature_list if f[1] != "S"] - - feature_config_builder = twml.contrib.feature_config.FeatureConfigBuilder( - data_spec_path=data_spec_path - ) - feature_config_builder = feature_config_builder.extract_feature_group( - feature_regexes=string_feat_list, - group_name="continuous", - default_value=-1, - type_filter=["CONTINUOUS"], - ) - - feature_config_builder = feature_config_builder.extract_feature_group( - feature_regexes=string_feat_list, - group_name="binary", - default_value=False, - type_filter=["BINARY"], - ) - - feature_config_builder = feature_config_builder.build() - - continuous_feature_list = feature_config_builder._feature_group_extraction_configs[0].feature_map[ - "CONTINUOUS" - ] - - binary_feature_list = feature_config_builder._feature_group_extraction_configs[1].feature_map[ - "BINARY" - ] - return {"continuous": continuous_feature_list, "binary": binary_feature_list} + feature_list = read_config(feature_list_path).items() + string_feat_list = [f[0] for f in feature_list if f[1] != "S"] + + feature_config_builder = twml.contrib.feature_config.FeatureConfigBuilder( + data_spec_path=data_spec_path + ) + feature_config_builder = feature_config_builder.extract_feature_group( + feature_regexes=string_feat_list, + group_name="continuous", + default_value=-1, + type_filter=["CONTINUOUS"], + ) + + feature_config_builder = feature_config_builder.extract_feature_group( + feature_regexes=string_feat_list, + group_name="binary", + 
default_value=False, + type_filter=["BINARY"], + ) + + feature_config_builder = feature_config_builder.build() + + continuous_feature_list = feature_config_builder._feature_group_extraction_configs[ + 0 + ].feature_map["CONTINUOUS"] + + binary_feature_list = feature_config_builder._feature_group_extraction_configs[ + 1 + ].feature_map["BINARY"] + return {"continuous": continuous_feature_list, "binary": binary_feature_list} def warm_start_checkpoint( - old_best_ckpt_folder, - old_feature_list_path, - feature_allow_list_path, - data_spec_path, - output_ckpt_folder, - *args, + old_best_ckpt_folder, + old_feature_list_path, + feature_allow_list_path, + data_spec_path, + output_ckpt_folder, + *args, ): - """ - Reads old checkpoint and the old feature list, and create a new ckpt warm started from old ckpt using new features . - - Arguments: - old_best_ckpt_folder: - path to the best_checkpoint_folder for old model - old_feature_list_path: - path to the json file that stores the list of continuous features used in old models. - feature_allow_list_path: - yaml file that contain the feature allow list. - data_spec_path: - path to the data_spec file - output_ckpt_folder: - folder that contains the modified ckpt. - - Returns: - path to the modified ckpt.""" - old_ckpt_path = tf.train.latest_checkpoint(old_best_ckpt_folder, latest_filename=None) - - new_feature_dict = get_feature_list(feature_allow_list_path, data_spec_path) - old_feature_dict = read_feat_list_from_disk(old_feature_list_path) - - var_ind_dict = get_continuous_mapping_from_feat_dict(new_feature_dict, old_feature_dict) - - new_len_var = { - "continuous": len(new_feature_dict["continuous"]), - "all": len(new_feature_dict["continuous"] + new_feature_dict["binary"]) + 100, - } - - warm_started_ckpt_path = warm_start_from_var_dict( - old_ckpt_path, - var_ind_dict, - output_dir=output_ckpt_folder, - new_len_var=new_len_var, - ) - - return warm_started_ckpt_path + """ + Reads old checkpoint and the old feature list, and create a new ckpt warm started from old ckpt using new features . + + Arguments: + old_best_ckpt_folder: + path to the best_checkpoint_folder for old model + old_feature_list_path: + path to the json file that stores the list of continuous features used in old models. + feature_allow_list_path: + yaml file that contain the feature allow list. + data_spec_path: + path to the data_spec file + output_ckpt_folder: + folder that contains the modified ckpt. 
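get_feature_list_for_heavy_ranking returns a {"continuous": [...], "binary": [...]} dict, and warm_start_checkpoint (below) turns its lengths into new_len_var. The "+ 100" matches the _dummy_sparse_feat padding used when the index mapping over "all" features was built, so both sides count the same 100 placeholder sparse slots. With hypothetical sizes:

new_feature_dict = {"continuous": ["f1", "f2", "f3"], "binary": ["b1", "b2"]}
new_len_var = {
    "continuous": len(new_feature_dict["continuous"]),                              # 3
    "all": len(new_feature_dict["continuous"] + new_feature_dict["binary"]) + 100,  # 105
}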
+ + Returns: + path to the modified ckpt.""" + old_ckpt_path = tf.train.latest_checkpoint( + old_best_ckpt_folder, latest_filename=None + ) + + new_feature_dict = get_feature_list(feature_allow_list_path, data_spec_path) + old_feature_dict = read_feat_list_from_disk(old_feature_list_path) + + var_ind_dict = get_continuous_mapping_from_feat_dict( + new_feature_dict, old_feature_dict + ) + + new_len_var = { + "continuous": len(new_feature_dict["continuous"]), + "all": len(new_feature_dict["continuous"] + new_feature_dict["binary"]) + 100, + } + + warm_started_ckpt_path = warm_start_from_var_dict( + old_ckpt_path, + var_ind_dict, + output_dir=output_ckpt_folder, + new_len_var=new_len_var, + ) + + return warm_started_ckpt_path diff --git a/pushservice/src/main/python/models/light_ranking/deep_norm.py b/pushservice/src/main/python/models/light_ranking/deep_norm.py index bc90deba4..035f6d24b 100644 --- a/pushservice/src/main/python/models/light_ranking/deep_norm.py +++ b/pushservice/src/main/python/models/light_ranking/deep_norm.py @@ -1,16 +1,19 @@ +import os from datetime import datetime from functools import partial -import os +import tensorflow.compat.v1 as tf +from tensorflow.compat.v1 import logging from twitter.cortex.ml.embeddings.common.helpers import decode_str_or_unicode + import twml from twml.trainers import DataRecordTrainer -from ..libs.get_feat_config import get_feature_config_light_ranking, LABELS_LR +from ..libs.get_feat_config import LABELS_LR, get_feature_config_light_ranking from ..libs.graph_utils import get_trainable_variables from ..libs.group_metrics import ( - run_group_metrics_light_ranking, - run_group_metrics_light_ranking_in_bq, + run_group_metrics_light_ranking, + run_group_metrics_light_ranking_in_bq, ) from ..libs.metric_fn_utils import get_metric_fn from ..libs.model_args import get_arg_parser_light_ranking @@ -18,209 +21,230 @@ from ..libs.warm_start_utils import get_feature_list_for_light_ranking from .model_pools_mlp import light_ranking_mlp_ngbdt -import tensorflow.compat.v1 as tf -from tensorflow.compat.v1 import logging - - # checkstyle: noqa def build_graph( - features, label, mode, params, config=None, run_light_ranking_group_metrics_in_bq=False + features, + label, + mode, + params, + config=None, + run_light_ranking_group_metrics_in_bq=False, ): - is_training = mode == tf.estimator.ModeKeys.TRAIN - this_model_func = light_ranking_mlp_ngbdt - model_output = this_model_func(features, is_training, params, label) - - logits = model_output["output"] - graph_output = {} - # -------------------------------------------------------- - # define graph output dict - # -------------------------------------------------------- - if mode == tf.estimator.ModeKeys.PREDICT: - loss = None - output_label = "prediction" - if params.task_name in LABELS_LR: - output = tf.nn.sigmoid(logits) - output = tf.clip_by_value(output, 0, 1) - - if run_light_ranking_group_metrics_in_bq: - graph_output["trace_id"] = features["meta.trace_id"] - graph_output["target"] = features["meta.ranking.weighted_oonc_model_score"] + is_training = mode == tf.estimator.ModeKeys.TRAIN + this_model_func = light_ranking_mlp_ngbdt + model_output = this_model_func(features, is_training, params, label) - else: - raise ValueError("Invalid Task Name !") - - else: - output_label = "output" - weights = tf.cast(features["weights"], dtype=tf.float32, name="RecordWeights") + logits = model_output["output"] + graph_output = {} + # -------------------------------------------------------- + # define graph output dict 
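Two things are easy to miss in warm_start_checkpoint. First, it calls get_continuous_mapping_from_feat_dict(new_feature_dict, old_feature_dict), passing the new dict as the nominally "old" first parameter, while warm_start_from_var_dict unpacks each stored (old_ind, new_ind) tuple as new_var_ind, old_var_ind; the two swaps appear to cancel out, but consistent naming would make that obvious. Second, get_feature_list is not imported anywhere in the visible hunks, which looks like a latent NameError. A hypothetical end-to-end call (paths are illustrative only):

warm_started = warm_start_checkpoint(
    old_best_ckpt_folder="hdfs:///user/frigate/old_model/best_checkpoint",
    old_feature_list_path="hdfs:///user/frigate/old_model/feature_dict.json",
    feature_allow_list_path="./feature_allow_list.yaml",
    data_spec_path="./data_spec.json",
    output_ckpt_folder="hdfs:///user/frigate/warm_start_out",
)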
+ # -------------------------------------------------------- + if mode == tf.estimator.ModeKeys.PREDICT: + loss = None + output_label = "prediction" + if params.task_name in LABELS_LR: + output = tf.nn.sigmoid(logits) + output = tf.clip_by_value(output, 0, 1) + + if run_light_ranking_group_metrics_in_bq: + graph_output["trace_id"] = features["meta.trace_id"] + graph_output["target"] = features[ + "meta.ranking.weighted_oonc_model_score" + ] + + else: + raise ValueError("Invalid Task Name !") - if params.task_name in LABELS_LR: - if params.use_record_weight: - weights = tf.clip_by_value( - 1.0 / (1.0 + weights + params.smooth_weight), params.min_record_weight, 1.0 + else: + output_label = "output" + weights = tf.cast(features["weights"], dtype=tf.float32, name="RecordWeights") + + if params.task_name in LABELS_LR: + if params.use_record_weight: + weights = tf.clip_by_value( + 1.0 / (1.0 + weights + params.smooth_weight), + params.min_record_weight, + 1.0, + ) + + loss = tf.reduce_sum( + tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) + * weights + ) / (tf.reduce_sum(weights)) + else: + loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) + ) + output = tf.nn.sigmoid(logits) + + else: + raise ValueError("Invalid Task Name !") + + train_op = None + if mode == tf.estimator.ModeKeys.TRAIN: + # -------------------------------------------------------- + # get train_op + # -------------------------------------------------------- + optimizer = tf.train.GradientDescentOptimizer( + learning_rate=params.learning_rate ) + update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) + variables = get_trainable_variables( + all_trainable_variables=tf.trainable_variables(), + trainable_regexes=params.trainable_regexes, + ) + with tf.control_dependencies(update_ops): + train_op = twml.optimizers.optimize_loss( + loss=loss, + variables=variables, + global_step=tf.train.get_global_step(), + optimizer=optimizer, + learning_rate=params.learning_rate, + learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn( + params + ), + ) + + graph_output[output_label] = output + graph_output["loss"] = loss + graph_output["train_op"] = train_op + return graph_output - loss = tf.reduce_sum( - tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) * weights - ) / (tf.reduce_sum(weights)) - else: - loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits)) - output = tf.nn.sigmoid(logits) +def get_params(args=None): + parser = get_arg_parser_light_ranking() + if args is None: + return parser.parse_args() else: - raise ValueError("Invalid Task Name !") + return parser.parse_args(args) + + +def _main(): + opt = get_params() + logging.info("parse is: ") + logging.info(opt) + + feature_list = read_config(opt.feature_list).items() + feature_config = get_feature_config_light_ranking( + data_spec_path=opt.data_spec, + feature_list_provided=feature_list, + opt=opt, + add_gbdt=opt.use_gbdt_features, + run_light_ranking_group_metrics_in_bq=opt.run_light_ranking_group_metrics_in_bq, + ) + feature_list_path = opt.feature_list - train_op = None - if mode == tf.estimator.ModeKeys.TRAIN: # -------------------------------------------------------- - # get train_op + # Create Trainer # -------------------------------------------------------- - optimizer = tf.train.GradientDescentOptimizer(learning_rate=params.learning_rate) - update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - variables = 
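When use_record_weight is set, the raw record weight is squashed into [min_record_weight, 1.0] via 1 / (1 + weight + smooth_weight), and the loss becomes a weighted mean of per-example sigmoid cross-entropy rather than a plain mean. A minimal sketch with hypothetical parameter values:

import tensorflow.compat.v1 as tf

labels = tf.constant([[1.0], [0.0]])
logits = tf.constant([[2.0], [-1.0]])
weights = tf.constant([[3.0], [0.0]])          # raw record weights
smooth_weight, min_record_weight = 1.0, 0.1

w = tf.clip_by_value(1.0 / (1.0 + weights + smooth_weight), min_record_weight, 1.0)
sce = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
loss = tf.reduce_sum(sce * w) / tf.reduce_sum(w)   # normalized by total weight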
get_trainable_variables( - all_trainable_variables=tf.trainable_variables(), trainable_regexes=params.trainable_regexes + trainer = DataRecordTrainer( + name=opt.model_trainer_name, + params=opt, + build_graph_fn=build_graph, + save_dir=opt.save_dir, + run_config=None, + feature_config=feature_config, + metric_fn=get_metric_fn(opt.task_name, use_stratify_metrics=False), ) - with tf.control_dependencies(update_ops): - train_op = twml.optimizers.optimize_loss( - loss=loss, - variables=variables, - global_step=tf.train.get_global_step(), - optimizer=optimizer, - learning_rate=params.learning_rate, - learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn(params), - ) - - graph_output[output_label] = output - graph_output["loss"] = loss - graph_output["train_op"] = train_op - return graph_output + if opt.directly_export_best: + logging.info("Directly exporting the model without training") + else: + # ---------------------------------------------------- + # Model Training & Evaluation + # ---------------------------------------------------- + eval_input_fn = trainer.get_eval_input_fn(repeat=False, shuffle=False) + train_input_fn = trainer.get_train_input_fn(shuffle=True) + + if opt.distributed or opt.num_workers is not None: + learn = trainer.train_and_evaluate + else: + learn = trainer.learn + logging.info("Training...") + start = datetime.now() + + early_stop_metric = "rce_unweighted_" + opt.task_name + learn( + early_stop_minimize=False, + early_stop_metric=early_stop_metric, + early_stop_patience=opt.early_stop_patience, + early_stop_tolerance=opt.early_stop_tolerance, + eval_input_fn=eval_input_fn, + train_input_fn=train_input_fn, + ) + end = datetime.now() + logging.info("Training time: " + str(end - start)) -def get_params(args=None): - parser = get_arg_parser_light_ranking() - if args is None: - return parser.parse_args() - else: - return parser.parse_args(args) + logging.info("Exporting the models...") - -def _main(): - opt = get_params() - logging.info("parse is: ") - logging.info(opt) - - feature_list = read_config(opt.feature_list).items() - feature_config = get_feature_config_light_ranking( - data_spec_path=opt.data_spec, - feature_list_provided=feature_list, - opt=opt, - add_gbdt=opt.use_gbdt_features, - run_light_ranking_group_metrics_in_bq=opt.run_light_ranking_group_metrics_in_bq, - ) - feature_list_path = opt.feature_list - - # -------------------------------------------------------- - # Create Trainer - # -------------------------------------------------------- - trainer = DataRecordTrainer( - name=opt.model_trainer_name, - params=opt, - build_graph_fn=build_graph, - save_dir=opt.save_dir, - run_config=None, - feature_config=feature_config, - metric_fn=get_metric_fn(opt.task_name, use_stratify_metrics=False), - ) - if opt.directly_export_best: - logging.info("Directly exporting the model without training") - else: - # ---------------------------------------------------- - # Model Training & Evaluation - # ---------------------------------------------------- - eval_input_fn = trainer.get_eval_input_fn(repeat=False, shuffle=False) - train_input_fn = trainer.get_train_input_fn(shuffle=True) - - if opt.distributed or opt.num_workers is not None: - learn = trainer.train_and_evaluate - else: - learn = trainer.learn - logging.info("Training...") + # -------------------------------------------------------- + # Do the model exporting + # -------------------------------------------------------- start = datetime.now() - - early_stop_metric = "rce_unweighted_" + 
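The early-stop metric is "rce_unweighted_" + task_name, and early_stop_minimize=False because RCE measures gain over a baseline, so larger is better; patience and tolerance bound how long training may continue without a material improvement. The contract, reduced to a generic loop (numbers hypothetical):

best, stale = float("-inf"), 0
for metric in [0.12, 0.15, 0.149, 0.151, 0.150]:    # per-eval rce_unweighted_<task>
    if metric > best + 1e-4:     # early_stop_tolerance
        best, stale = metric, 0
    else:
        stale += 1
    if stale >= 2:               # early_stop_patience
        break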
opt.task_name - learn( - early_stop_minimize=False, - early_stop_metric=early_stop_metric, - early_stop_patience=opt.early_stop_patience, - early_stop_tolerance=opt.early_stop_tolerance, - eval_input_fn=eval_input_fn, - train_input_fn=train_input_fn, + if not opt.export_dir: + opt.export_dir = os.path.join(opt.save_dir, "exported_models") + + raw_model_path = twml.contrib.export.export_fn.export_all_models( + trainer=trainer, + export_dir=opt.export_dir, + parse_fn=feature_config.get_parse_fn(), + serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn(), + export_output_fn=twml.export_output_fns.batch_prediction_continuous_output_fn, ) + export_model_dir = decode_str_or_unicode(raw_model_path) - end = datetime.now() - logging.info("Training time: " + str(end - start)) - - logging.info("Exporting the models...") - - # -------------------------------------------------------- - # Do the model exporting - # -------------------------------------------------------- - start = datetime.now() - if not opt.export_dir: - opt.export_dir = os.path.join(opt.save_dir, "exported_models") - - raw_model_path = twml.contrib.export.export_fn.export_all_models( - trainer=trainer, - export_dir=opt.export_dir, - parse_fn=feature_config.get_parse_fn(), - serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn(), - export_output_fn=twml.export_output_fns.batch_prediction_continuous_output_fn, - ) - export_model_dir = decode_str_or_unicode(raw_model_path) - - logging.info("Model export time: " + str(datetime.now() - start)) - logging.info("The saved model directory is: " + opt.save_dir) - - tf.logging.info("getting default continuous_feature_list") - continuous_feature_list = get_feature_list_for_light_ranking(feature_list_path, opt.data_spec) - continous_feature_list_save_path = os.path.join(opt.save_dir, "continuous_feature_list.json") - twml.util.write_file(continous_feature_list_save_path, continuous_feature_list, encode="json") - tf.logging.info(f"Finish writting files to {continous_feature_list_save_path}") - - if opt.run_light_ranking_group_metrics: - # -------------------------------------------- - # Run Light Ranking Group Metrics - # -------------------------------------------- - run_group_metrics_light_ranking( - trainer=trainer, - data_dir=os.path.join(opt.eval_data_dir, opt.eval_start_datetime), - model_path=export_model_dir, - parse_fn=feature_config.get_parse_fn(), - ) + logging.info("Model export time: " + str(datetime.now() - start)) + logging.info("The saved model directory is: " + opt.save_dir) - if opt.run_light_ranking_group_metrics_in_bq: - # ---------------------------------------------------------------------------------------- - # Get Light/Heavy Ranker Predictions for Light Ranking Group Metrics in BigQuery - # ---------------------------------------------------------------------------------------- - trainer_pred = DataRecordTrainer( - name=opt.model_trainer_name, - params=opt, - build_graph_fn=partial(build_graph, run_light_ranking_group_metrics_in_bq=True), - save_dir=opt.save_dir + "/tmp/", - run_config=None, - feature_config=feature_config, - metric_fn=get_metric_fn(opt.task_name, use_stratify_metrics=False), + tf.logging.info("getting default continuous_feature_list") + continuous_feature_list = get_feature_list_for_light_ranking( + feature_list_path, opt.data_spec + ) + continous_feature_list_save_path = os.path.join( + opt.save_dir, "continuous_feature_list.json" ) - checkpoint_folder = os.path.join(opt.save_dir, "best_checkpoint") - checkpoint = 
tf.train.latest_checkpoint(checkpoint_folder, latest_filename=None) - tf.logging.info("\n\nPrediction from Checkpoint: {:}.\n\n".format(checkpoint)) - run_group_metrics_light_ranking_in_bq( - trainer=trainer_pred, params=opt, checkpoint_path=checkpoint + twml.util.write_file( + continous_feature_list_save_path, continuous_feature_list, encode="json" ) + tf.logging.info(f"Finish writting files to {continous_feature_list_save_path}") + + if opt.run_light_ranking_group_metrics: + # -------------------------------------------- + # Run Light Ranking Group Metrics + # -------------------------------------------- + run_group_metrics_light_ranking( + trainer=trainer, + data_dir=os.path.join(opt.eval_data_dir, opt.eval_start_datetime), + model_path=export_model_dir, + parse_fn=feature_config.get_parse_fn(), + ) + + if opt.run_light_ranking_group_metrics_in_bq: + # ---------------------------------------------------------------------------------------- + # Get Light/Heavy Ranker Predictions for Light Ranking Group Metrics in BigQuery + # ---------------------------------------------------------------------------------------- + trainer_pred = DataRecordTrainer( + name=opt.model_trainer_name, + params=opt, + build_graph_fn=partial( + build_graph, run_light_ranking_group_metrics_in_bq=True + ), + save_dir=opt.save_dir + "/tmp/", + run_config=None, + feature_config=feature_config, + metric_fn=get_metric_fn(opt.task_name, use_stratify_metrics=False), + ) + checkpoint_folder = os.path.join(opt.save_dir, "best_checkpoint") + checkpoint = tf.train.latest_checkpoint(checkpoint_folder, latest_filename=None) + tf.logging.info("\n\nPrediction from Checkpoint: {:}.\n\n".format(checkpoint)) + run_group_metrics_light_ranking_in_bq( + trainer=trainer_pred, params=opt, checkpoint_path=checkpoint + ) - tf.logging.info("Done Training & Prediction.") + tf.logging.info("Done Training & Prediction.") if __name__ == "__main__": - _main() + _main() diff --git a/pushservice/src/main/python/models/light_ranking/eval_model.py b/pushservice/src/main/python/models/light_ranking/eval_model.py index 1726685cf..e53d75277 100644 --- a/pushservice/src/main/python/models/light_ranking/eval_model.py +++ b/pushservice/src/main/python/models/light_ranking/eval_model.py @@ -1,89 +1,94 @@ +import os from datetime import datetime from functools import partial -import os from ..libs.group_metrics import ( - run_group_metrics_light_ranking, - run_group_metrics_light_ranking_in_bq, + run_group_metrics_light_ranking, + run_group_metrics_light_ranking_in_bq, ) from ..libs.metric_fn_utils import get_metric_fn from ..libs.model_args import get_arg_parser_light_ranking from ..libs.model_utils import read_config -from .deep_norm import build_graph, DataRecordTrainer, get_config_func, logging - +from .deep_norm import DataRecordTrainer, build_graph, get_config_func, logging # checkstyle: noqa if __name__ == "__main__": - parser = get_arg_parser_light_ranking() - parser.add_argument( - "--eval_checkpoint", - default=None, - type=str, - help="Which checkpoint to use for evaluation", - ) - parser.add_argument( - "--saved_model_path", - default=None, - type=str, - help="Path to saved model for evaluation", - ) - parser.add_argument( - "--run_binary_metrics", - default=False, - action="store_true", - help="Whether to compute the basic binary metrics for Light Ranking.", - ) + parser = get_arg_parser_light_ranking() + parser.add_argument( + "--eval_checkpoint", + default=None, + type=str, + help="Which checkpoint to use for evaluation", + ) + 
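Both here and in eval_model.py, the BQ-metrics flag is frozen into build_graph with functools.partial, so DataRecordTrainer still sees the standard build_graph_fn signature while PREDICT-mode output gains trace_id and the heavy-ranker score. The pattern, in isolation:

from functools import partial

def build_graph(features, label, mode, params, config=None,
                run_light_ranking_group_metrics_in_bq=False):
    ...

# The trainer only supplies (features, label, mode, params, config); the
# extra keyword is bound ahead of time:
build_graph_fn = partial(build_graph, run_light_ranking_group_metrics_in_bq=True)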
parser.add_argument( + "--saved_model_path", + default=None, + type=str, + help="Path to saved model for evaluation", + ) + parser.add_argument( + "--run_binary_metrics", + default=False, + action="store_true", + help="Whether to compute the basic binary metrics for Light Ranking.", + ) - opt = parser.parse_args() - logging.info("parse is: ") - logging.info(opt) + opt = parser.parse_args() + logging.info("parse is: ") + logging.info(opt) - feature_list = read_config(opt.feature_list).items() - feature_config = get_config_func(opt.feat_config_type)( - data_spec_path=opt.data_spec, - feature_list_provided=feature_list, - opt=opt, - add_gbdt=opt.use_gbdt_features, - run_light_ranking_group_metrics_in_bq=opt.run_light_ranking_group_metrics_in_bq, - ) + feature_list = read_config(opt.feature_list).items() + feature_config = get_config_func(opt.feat_config_type)( + data_spec_path=opt.data_spec, + feature_list_provided=feature_list, + opt=opt, + add_gbdt=opt.use_gbdt_features, + run_light_ranking_group_metrics_in_bq=opt.run_light_ranking_group_metrics_in_bq, + ) - # ----------------------------------------------- - # Create Trainer - # ----------------------------------------------- - trainer = DataRecordTrainer( - name=opt.model_trainer_name, - params=opt, - build_graph_fn=partial(build_graph, run_light_ranking_group_metrics_in_bq=True), - save_dir=opt.save_dir, - run_config=None, - feature_config=feature_config, - metric_fn=get_metric_fn(opt.task_name, use_stratify_metrics=False), - ) + # ----------------------------------------------- + # Create Trainer + # ----------------------------------------------- + trainer = DataRecordTrainer( + name=opt.model_trainer_name, + params=opt, + build_graph_fn=partial(build_graph, run_light_ranking_group_metrics_in_bq=True), + save_dir=opt.save_dir, + run_config=None, + feature_config=feature_config, + metric_fn=get_metric_fn(opt.task_name, use_stratify_metrics=False), + ) - # ----------------------------------------------- - # Model Evaluation - # ----------------------------------------------- - logging.info("Evaluating...") - start = datetime.now() + # ----------------------------------------------- + # Model Evaluation + # ----------------------------------------------- + logging.info("Evaluating...") + start = datetime.now() - if opt.run_binary_metrics: - eval_input_fn = trainer.get_eval_input_fn(repeat=False, shuffle=False) - eval_steps = None if (opt.eval_steps is not None and opt.eval_steps < 0) else opt.eval_steps - trainer.estimator.evaluate(eval_input_fn, steps=eval_steps, checkpoint_path=opt.eval_checkpoint) + if opt.run_binary_metrics: + eval_input_fn = trainer.get_eval_input_fn(repeat=False, shuffle=False) + eval_steps = ( + None + if (opt.eval_steps is not None and opt.eval_steps < 0) + else opt.eval_steps + ) + trainer.estimator.evaluate( + eval_input_fn, steps=eval_steps, checkpoint_path=opt.eval_checkpoint + ) - if opt.run_light_ranking_group_metrics_in_bq: - run_group_metrics_light_ranking_in_bq( - trainer=trainer, params=opt, checkpoint_path=opt.eval_checkpoint - ) + if opt.run_light_ranking_group_metrics_in_bq: + run_group_metrics_light_ranking_in_bq( + trainer=trainer, params=opt, checkpoint_path=opt.eval_checkpoint + ) - if opt.run_light_ranking_group_metrics: - run_group_metrics_light_ranking( - trainer=trainer, - data_dir=os.path.join(opt.eval_data_dir, opt.eval_start_datetime), - model_path=opt.saved_model_path, - parse_fn=feature_config.get_parse_fn(), - ) + if opt.run_light_ranking_group_metrics: + run_group_metrics_light_ranking( 
+ trainer=trainer, + data_dir=os.path.join(opt.eval_data_dir, opt.eval_start_datetime), + model_path=opt.saved_model_path, + parse_fn=feature_config.get_parse_fn(), + ) - end = datetime.now() - logging.info("Evaluating time: " + str(end - start)) + end = datetime.now() + logging.info("Evaluating time: " + str(end - start)) diff --git a/pushservice/src/main/python/models/light_ranking/model_pools_mlp.py b/pushservice/src/main/python/models/light_ranking/model_pools_mlp.py index b45c85e47..435504a6f 100644 --- a/pushservice/src/main/python/models/light_ranking/model_pools_mlp.py +++ b/pushservice/src/main/python/models/light_ranking/model_pools_mlp.py @@ -1,187 +1,215 @@ import warnings +import tensorflow.compat.v1 as tf + from twml.contrib.layers import ZscoreNormalization from ...libs.customized_full_sparse import FullSparse from ...libs.get_feat_config import FEAT_CONFIG_DEFAULT_VAL as MISSING_VALUE_MARKER from ...libs.model_utils import ( - _sparse_feature_fixup, - adaptive_transformation, - filter_nans_and_infs, - get_dense_out, - tensor_dropout, + _sparse_feature_fixup, + adaptive_transformation, + filter_nans_and_infs, + get_dense_out, + tensor_dropout, ) -import tensorflow.compat.v1 as tf # checkstyle: noqa + def light_ranking_mlp_ngbdt(features, is_training, params, label=None): - return deepnorm_light_ranking( + return deepnorm_light_ranking( + features, + is_training, + params, + label=label, + decay=params.momentum, + dense_emb_size=params.dense_embedding_size, + base_activation=tf.keras.layers.LeakyReLU(), + input_dropout_rate=params.dropout, + use_gbdt=False, + ) + + +def deepnorm_light_ranking( features, is_training, params, - label=label, - decay=params.momentum, - dense_emb_size=params.dense_embedding_size, - base_activation=tf.keras.layers.LeakyReLU(), - input_dropout_rate=params.dropout, + label=None, + decay=0.99999, + dense_emb_size=128, + base_activation=None, + input_dropout_rate=None, + input_dense_type="self_atten_dense", + emb_dense_type="self_atten_dense", + mlp_dense_type="self_atten_dense", use_gbdt=False, - ) - - -def deepnorm_light_ranking( - features, - is_training, - params, - label=None, - decay=0.99999, - dense_emb_size=128, - base_activation=None, - input_dropout_rate=None, - input_dense_type="self_atten_dense", - emb_dense_type="self_atten_dense", - mlp_dense_type="self_atten_dense", - use_gbdt=False, ): - # -------------------------------------------------------- - # Initial Parameter Checking - # -------------------------------------------------------- - if base_activation is None: - base_activation = tf.keras.layers.LeakyReLU() - - if label is not None: - warnings.warn( - "Label is unused in deepnorm_gbdt. 
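A small API detail in deepnorm_light_ranking's new signature: base_activation defaults to None and a LeakyReLU is constructed inside the function, rather than placing tf.keras.layers.LeakyReLU() in the signature. Python evaluates default arguments once, so a layer instance used as a default would be shared across every call; LeakyReLU is stateless, which makes sharing harmless here, but the None-then-construct pattern is the safer habit for layers that carry variables:

import tensorflow.compat.v1 as tf

def build(base_activation=None):
    if base_activation is None:
        base_activation = tf.keras.layers.LeakyReLU()   # fresh instance per call
    ...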
Stop using this argument.", - DeprecationWarning, - ) - - with tf.variable_scope("helper_layers"): - full_sparse_layer = FullSparse( - output_size=params.sparse_embedding_size, - activation=base_activation, - use_sparse_grads=is_training, - use_binary_values=False, - dtype=tf.float32, - ) - input_normalizing_layer = ZscoreNormalization(decay=decay, name="input_normalizing_layer") - - # -------------------------------------------------------- - # Feature Selection & Embedding - # -------------------------------------------------------- - if use_gbdt: - sparse_gbdt_features = _sparse_feature_fixup(features["gbdt_sparse"], params.input_size_bits) - if input_dropout_rate is not None: - sparse_gbdt_features = tensor_dropout( - sparse_gbdt_features, input_dropout_rate, is_training, sparse_tensor=True - ) - - total_embed = full_sparse_layer(sparse_gbdt_features, use_binary_values=True) - - if (input_dropout_rate is not None) and is_training: - total_embed = total_embed / (1 - input_dropout_rate) - - else: - with tf.variable_scope("dense_branch"): - dense_continuous_features = filter_nans_and_infs(features["continuous"]) - - if params.use_missing_sub_branch: - is_missing = tf.equal(dense_continuous_features, MISSING_VALUE_MARKER) - continuous_features_filled = tf.where( - is_missing, - tf.zeros_like(dense_continuous_features), - dense_continuous_features, - ) - normalized_features = input_normalizing_layer( - continuous_features_filled, is_training, tf.math.logical_not(is_missing) + # -------------------------------------------------------- + # Initial Parameter Checking + # -------------------------------------------------------- + if base_activation is None: + base_activation = tf.keras.layers.LeakyReLU() + + if label is not None: + warnings.warn( + "Label is unused in deepnorm_gbdt. 
Stop using this argument.", + DeprecationWarning, ) - with tf.variable_scope("missing_sub_branch"): - missing_feature_embed = get_dense_out( - tf.cast(is_missing, tf.float32), - dense_emb_size, + with tf.variable_scope("helper_layers"): + full_sparse_layer = FullSparse( + output_size=params.sparse_embedding_size, activation=base_activation, - dense_type=input_dense_type, - ) - - else: - continuous_features_filled = dense_continuous_features - normalized_features = input_normalizing_layer(continuous_features_filled, is_training) - - with tf.variable_scope("continuous_sub_branch"): - normalized_features = adaptive_transformation( - normalized_features, is_training, func_type="tiny" + use_sparse_grads=is_training, + use_binary_values=False, + dtype=tf.float32, ) - - if input_dropout_rate is not None: - normalized_features = tensor_dropout( - normalized_features, - input_dropout_rate, - is_training, - sparse_tensor=False, - ) - filled_feature_embed = get_dense_out( - normalized_features, - dense_emb_size, - activation=base_activation, - dense_type=input_dense_type, + input_normalizing_layer = ZscoreNormalization( + decay=decay, name="input_normalizing_layer" ) - if params.use_missing_sub_branch: - dense_embed = tf.concat( - [filled_feature_embed, missing_feature_embed], axis=1, name="merge_dense_emb" - ) - else: - dense_embed = filled_feature_embed - - with tf.variable_scope("sparse_branch"): - sparse_discrete_features = _sparse_feature_fixup( - features["sparse_no_continuous"], params.input_size_bits - ) - if input_dropout_rate is not None: - sparse_discrete_features = tensor_dropout( - sparse_discrete_features, input_dropout_rate, is_training, sparse_tensor=True + # -------------------------------------------------------- + # Feature Selection & Embedding + # -------------------------------------------------------- + if use_gbdt: + sparse_gbdt_features = _sparse_feature_fixup( + features["gbdt_sparse"], params.input_size_bits ) + if input_dropout_rate is not None: + sparse_gbdt_features = tensor_dropout( + sparse_gbdt_features, + input_dropout_rate, + is_training, + sparse_tensor=True, + ) - discrete_features_embed = full_sparse_layer(sparse_discrete_features, use_binary_values=True) - - if (input_dropout_rate is not None) and is_training: - discrete_features_embed = discrete_features_embed / (1 - input_dropout_rate) - - total_embed = tf.concat( - [dense_embed, discrete_features_embed], - axis=1, - name="total_embed", - ) + total_embed = full_sparse_layer(sparse_gbdt_features, use_binary_values=True) - total_embed = tf.layers.batch_normalization( - total_embed, - training=is_training, - renorm_momentum=decay, - momentum=decay, - renorm=is_training, - trainable=True, - ) - - # -------------------------------------------------------- - # MLP Layers - # -------------------------------------------------------- - with tf.variable_scope("MLP_branch"): - - assert params.num_mlp_layers >= 0 - embed_list = [total_embed] + [None for _ in range(params.num_mlp_layers)] - dense_types = [emb_dense_type] + [mlp_dense_type for _ in range(params.num_mlp_layers - 1)] - - for xl in range(1, params.num_mlp_layers + 1): - neurons = params.mlp_neuron_scale ** (params.num_mlp_layers + 1 - xl) - embed_list[xl] = get_dense_out( - embed_list[xl - 1], neurons, activation=base_activation, dense_type=dense_types[xl - 1] - ) - - if params.task_name in ["Sent", "HeavyRankPosition", "HeavyRankProbability"]: - logits = get_dense_out(embed_list[-1], 1, activation=None, dense_type=mlp_dense_type) + if (input_dropout_rate is 
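Because tensor_dropout zeroes a fraction of the sparse inputs during training, the aggregated embedding loses mass; dividing by (1 - input_dropout_rate) restores its expected magnitude so train-time and serving-time activations match, assuming the aggregation is linear in the inputs. This is the same correction tf.nn.dropout applies internally. In isolation:

import tensorflow.compat.v1 as tf

input_dropout_rate, is_training = 0.2, True
total_embed = tf.ones([4, 8])     # stand-in for the aggregated sparse embedding
if is_training:
    total_embed = total_embed / (1 - input_dropout_rate)   # E[embed] preserved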
not None) and is_training: + total_embed = total_embed / (1 - input_dropout_rate) else: - raise ValueError("Invalid Task Name !") + with tf.variable_scope("dense_branch"): + dense_continuous_features = filter_nans_and_infs(features["continuous"]) + + if params.use_missing_sub_branch: + is_missing = tf.equal(dense_continuous_features, MISSING_VALUE_MARKER) + continuous_features_filled = tf.where( + is_missing, + tf.zeros_like(dense_continuous_features), + dense_continuous_features, + ) + normalized_features = input_normalizing_layer( + continuous_features_filled, + is_training, + tf.math.logical_not(is_missing), + ) + + with tf.variable_scope("missing_sub_branch"): + missing_feature_embed = get_dense_out( + tf.cast(is_missing, tf.float32), + dense_emb_size, + activation=base_activation, + dense_type=input_dense_type, + ) + + else: + continuous_features_filled = dense_continuous_features + normalized_features = input_normalizing_layer( + continuous_features_filled, is_training + ) + + with tf.variable_scope("continuous_sub_branch"): + normalized_features = adaptive_transformation( + normalized_features, is_training, func_type="tiny" + ) + + if input_dropout_rate is not None: + normalized_features = tensor_dropout( + normalized_features, + input_dropout_rate, + is_training, + sparse_tensor=False, + ) + filled_feature_embed = get_dense_out( + normalized_features, + dense_emb_size, + activation=base_activation, + dense_type=input_dense_type, + ) + + if params.use_missing_sub_branch: + dense_embed = tf.concat( + [filled_feature_embed, missing_feature_embed], + axis=1, + name="merge_dense_emb", + ) + else: + dense_embed = filled_feature_embed + + with tf.variable_scope("sparse_branch"): + sparse_discrete_features = _sparse_feature_fixup( + features["sparse_no_continuous"], params.input_size_bits + ) + if input_dropout_rate is not None: + sparse_discrete_features = tensor_dropout( + sparse_discrete_features, + input_dropout_rate, + is_training, + sparse_tensor=True, + ) + + discrete_features_embed = full_sparse_layer( + sparse_discrete_features, use_binary_values=True + ) + + if (input_dropout_rate is not None) and is_training: + discrete_features_embed = discrete_features_embed / ( + 1 - input_dropout_rate + ) + + total_embed = tf.concat( + [dense_embed, discrete_features_embed], + axis=1, + name="total_embed", + ) + + total_embed = tf.layers.batch_normalization( + total_embed, + training=is_training, + renorm_momentum=decay, + momentum=decay, + renorm=is_training, + trainable=True, + ) - output_dict = {"output": logits} - return output_dict + # -------------------------------------------------------- + # MLP Layers + # -------------------------------------------------------- + with tf.variable_scope("MLP_branch"): + assert params.num_mlp_layers >= 0 + embed_list = [total_embed] + [None for _ in range(params.num_mlp_layers)] + dense_types = [emb_dense_type] + [ + mlp_dense_type for _ in range(params.num_mlp_layers - 1) + ] + + for xl in range(1, params.num_mlp_layers + 1): + neurons = params.mlp_neuron_scale ** (params.num_mlp_layers + 1 - xl) + embed_list[xl] = get_dense_out( + embed_list[xl - 1], + neurons, + activation=base_activation, + dense_type=dense_types[xl - 1], + ) + + if params.task_name in ["Sent", "HeavyRankPosition", "HeavyRankProbability"]: + logits = get_dense_out( + embed_list[-1], 1, activation=None, dense_type=mlp_dense_type + ) + + else: + raise ValueError("Invalid Task Name !") + + output_dict = {"output": logits} + return output_dict diff --git 
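The MLP widths above form a geometric pyramid: layer xl gets mlp_neuron_scale ** (num_mlp_layers + 1 - xl) units, so widths shrink by a factor of the scale per layer before the final 1-unit logit layer. With hypothetical hyperparameters:

num_mlp_layers, mlp_neuron_scale = 3, 4
widths = [mlp_neuron_scale ** (num_mlp_layers + 1 - xl)
          for xl in range(1, num_mlp_layers + 1)]
# widths == [64, 16, 4]; get_dense_out(embed_list[-1], 1, ...) then emits the logit.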
a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py index 167756c01..971eb5a45 100644 --- a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py +++ b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py @@ -3,81 +3,81 @@ def get_feature_config(data_spec_path, label): - return ( - FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) - .batch_add_features( - [ - ("ebd.author_specific_score", "A"), - ("ebd.has_diff_lang", "A"), - ("ebd.has_english_tweet_diff_ui_lang", "A"), - ("ebd.has_english_ui_diff_tweet_lang", "A"), - ("ebd.is_self_tweet", "A"), - ("ebd.tweet_age_in_secs", "A"), - ("encoded_tweet_features.favorite_count", "A"), - ("encoded_tweet_features.from_verified_account_flag", "A"), - ("encoded_tweet_features.has_card_flag", "A"), - # ("encoded_tweet_features.has_consumer_video_flag", "A"), - ("encoded_tweet_features.has_image_url_flag", "A"), - ("encoded_tweet_features.has_link_flag", "A"), - ("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"), - # ("encoded_tweet_features.has_multiple_media_flag", "A"), - ("encoded_tweet_features.has_native_image_flag", "A"), - ("encoded_tweet_features.has_news_url_flag", "A"), - ("encoded_tweet_features.has_periscope_flag", "A"), - ("encoded_tweet_features.has_pro_video_flag", "A"), - ("encoded_tweet_features.has_quote_flag", "A"), - ("encoded_tweet_features.has_trend_flag", "A"), - ("encoded_tweet_features.has_video_url_flag", "A"), - ("encoded_tweet_features.has_vine_flag", "A"), - ("encoded_tweet_features.has_visible_link_flag", "A"), - ("encoded_tweet_features.is_offensive_flag", "A"), - ("encoded_tweet_features.is_reply_flag", "A"), - ("encoded_tweet_features.is_retweet_flag", "A"), - ("encoded_tweet_features.is_sensitive_content", "A"), - # ("encoded_tweet_features.is_user_new_flag", "A"), - ("encoded_tweet_features.language", "A"), - ("encoded_tweet_features.link_language", "A"), - ("encoded_tweet_features.num_hashtags", "A"), - ("encoded_tweet_features.num_mentions", "A"), - # ("encoded_tweet_features.profile_is_egg_flag", "A"), - ("encoded_tweet_features.reply_count", "A"), - ("encoded_tweet_features.retweet_count", "A"), - ("encoded_tweet_features.text_score", "A"), - ("encoded_tweet_features.user_reputation", "A"), - ("extended_encoded_tweet_features.embeds_impression_count", "A"), - ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), - ("extended_encoded_tweet_features.embeds_url_count", "A"), - ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), - ("extended_encoded_tweet_features.favorite_count_v2", "A"), - ("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_dup_content_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_spam_flag", "A"), - ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.quote_count", "A"), - ("extended_encoded_tweet_features.reply_count_v2", "A"), - ("extended_encoded_tweet_features.retweet_count_v2", "A"), - ("extended_encoded_tweet_features.weighted_favorite_count", "A"), - ("extended_encoded_tweet_features.weighted_quote_count", "A"), - ("extended_encoded_tweet_features.weighted_reply_count", "A"), - 
("extended_encoded_tweet_features.weighted_retweet_count", "A"), - ] + return ( + FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) + .batch_add_features( + [ + ("ebd.author_specific_score", "A"), + ("ebd.has_diff_lang", "A"), + ("ebd.has_english_tweet_diff_ui_lang", "A"), + ("ebd.has_english_ui_diff_tweet_lang", "A"), + ("ebd.is_self_tweet", "A"), + ("ebd.tweet_age_in_secs", "A"), + ("encoded_tweet_features.favorite_count", "A"), + ("encoded_tweet_features.from_verified_account_flag", "A"), + ("encoded_tweet_features.has_card_flag", "A"), + # ("encoded_tweet_features.has_consumer_video_flag", "A"), + ("encoded_tweet_features.has_image_url_flag", "A"), + ("encoded_tweet_features.has_link_flag", "A"), + ("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"), + # ("encoded_tweet_features.has_multiple_media_flag", "A"), + ("encoded_tweet_features.has_native_image_flag", "A"), + ("encoded_tweet_features.has_news_url_flag", "A"), + ("encoded_tweet_features.has_periscope_flag", "A"), + ("encoded_tweet_features.has_pro_video_flag", "A"), + ("encoded_tweet_features.has_quote_flag", "A"), + ("encoded_tweet_features.has_trend_flag", "A"), + ("encoded_tweet_features.has_video_url_flag", "A"), + ("encoded_tweet_features.has_vine_flag", "A"), + ("encoded_tweet_features.has_visible_link_flag", "A"), + ("encoded_tweet_features.is_offensive_flag", "A"), + ("encoded_tweet_features.is_reply_flag", "A"), + ("encoded_tweet_features.is_retweet_flag", "A"), + ("encoded_tweet_features.is_sensitive_content", "A"), + # ("encoded_tweet_features.is_user_new_flag", "A"), + ("encoded_tweet_features.language", "A"), + ("encoded_tweet_features.link_language", "A"), + ("encoded_tweet_features.num_hashtags", "A"), + ("encoded_tweet_features.num_mentions", "A"), + # ("encoded_tweet_features.profile_is_egg_flag", "A"), + ("encoded_tweet_features.reply_count", "A"), + ("encoded_tweet_features.retweet_count", "A"), + ("encoded_tweet_features.text_score", "A"), + ("encoded_tweet_features.user_reputation", "A"), + ("extended_encoded_tweet_features.embeds_impression_count", "A"), + ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), + ("extended_encoded_tweet_features.embeds_url_count", "A"), + ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), + ("extended_encoded_tweet_features.favorite_count_v2", "A"), + ("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_dup_content_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_spam_flag", "A"), + ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.quote_count", "A"), + ("extended_encoded_tweet_features.reply_count_v2", "A"), + ("extended_encoded_tweet_features.retweet_count_v2", "A"), + ("extended_encoded_tweet_features.weighted_favorite_count", "A"), + ("extended_encoded_tweet_features.weighted_quote_count", "A"), + ("extended_encoded_tweet_features.weighted_reply_count", "A"), + ("extended_encoded_tweet_features.weighted_retweet_count", "A"), + ] + ) + .add_labels( + [ + label, # Tensor index: 0 + "recap.engagement.is_clicked", # Tensor index: 1 + "recap.engagement.is_favorited", # Tensor index: 2 + "recap.engagement.is_open_linked", # Tensor index: 3 + "recap.engagement.is_photo_expanded", # Tensor index: 4 + "recap.engagement.is_profile_clicked", # Tensor index: 5 + 
"recap.engagement.is_replied", # Tensor index: 6 + "recap.engagement.is_retweeted", # Tensor index: 7 + "recap.engagement.is_video_playback_50", # Tensor index: 8 + "timelines.earlybird_score", # Tensor index: 9 + ] + ) + .define_weight("meta.record_weight/type=earlybird") + .build() ) - .add_labels( - [ - label, # Tensor index: 0 - "recap.engagement.is_clicked", # Tensor index: 1 - "recap.engagement.is_favorited", # Tensor index: 2 - "recap.engagement.is_open_linked", # Tensor index: 3 - "recap.engagement.is_photo_expanded", # Tensor index: 4 - "recap.engagement.is_profile_clicked", # Tensor index: 5 - "recap.engagement.is_replied", # Tensor index: 6 - "recap.engagement.is_retweeted", # Tensor index: 7 - "recap.engagement.is_video_playback_50", # Tensor index: 8 - "timelines.earlybird_score", # Tensor index: 9 - ] - ) - .define_weight("meta.record_weight/type=earlybird") - .build() - ) diff --git a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py index 85b7d7f10..b3224eb63 100644 --- a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py +++ b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py @@ -3,72 +3,83 @@ def get_feature_config(data_spec_path, label): - return FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) \ - .batch_add_features( - [ - ("ebd.has_diff_lang", "A"), - ("ebd.tweet_age_in_secs", "A"), - ("encoded_tweet_features.composer_source_is_camera_flag", "A"), - ("encoded_tweet_features.favorite_count", "A"), - ("encoded_tweet_features.has_card_flag", "A"), - ("encoded_tweet_features.has_image_url_flag", "A"), - ("encoded_tweet_features.has_native_image_flag", "A"), - ("encoded_tweet_features.has_news_url_flag", "A"), - ("encoded_tweet_features.has_periscope_flag", "A"), - ("encoded_tweet_features.has_pro_video_flag", "A"), - ("encoded_tweet_features.has_quote_flag", "A"), - ("encoded_tweet_features.has_video_url_flag", "A"), - ("encoded_tweet_features.has_vine_flag", "A"), - ("encoded_tweet_features.has_visible_link_flag", "A"), - ("encoded_tweet_features.is_sensitive_content", "A"), - ("encoded_tweet_features.is_user_spam_flag", "A"), - ("encoded_tweet_features.link_language", "A"), - ("encoded_tweet_features.num_hashtags", "A"), - ("encoded_tweet_features.num_mentions", "A"), - ("encoded_tweet_features.reply_count", "A"), - ("encoded_tweet_features.retweet_count", "A"), - ("encoded_tweet_features.text_score", "A"), - ("encoded_tweet_features.user_reputation", "A"), - ("extended_encoded_tweet_features.decayed_favorite_count", "A"), - ("extended_encoded_tweet_features.decayed_quote_count", "A"), - ("extended_encoded_tweet_features.decayed_reply_count", "A"), - ("extended_encoded_tweet_features.decayed_retweet_count", "A"), - ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), - ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), - ("extended_encoded_tweet_features.fake_favorite_count", "A"), - ("extended_encoded_tweet_features.fake_quote_count", "A"), - ("extended_encoded_tweet_features.fake_reply_count", "A"), - ("extended_encoded_tweet_features.fake_retweet_count", "A"), - ("extended_encoded_tweet_features.favorite_count_v2", "A"), - ("extended_encoded_tweet_features.label_dup_content_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), - 
("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.periscope_exists", "A"), - ("extended_encoded_tweet_features.periscope_has_been_featured", "A"), - ("extended_encoded_tweet_features.periscope_is_currently_featured", "A"), - ("extended_encoded_tweet_features.periscope_is_from_quality_source", "A"), - ("extended_encoded_tweet_features.periscope_is_live", "A"), - ("extended_encoded_tweet_features.quote_count", "A"), - ("extended_encoded_tweet_features.reply_count_v2", "A"), - ("extended_encoded_tweet_features.retweet_count_v2", "A"), - ("extended_encoded_tweet_features.weighted_favorite_count", "A"), - ("extended_encoded_tweet_features.weighted_quote_count", "A"), - ("extended_encoded_tweet_features.weighted_reply_count", "A"), - ("extended_encoded_tweet_features.weighted_retweet_count", "A"), - ("timelines.earlybird.visible_token_ratio", "A") - ] - ).add_labels([ - label, # Tensor index: 0 - "itl.engagement.is_clicked", # Tensor index: 1 - "itl.engagement.is_favorited", # Tensor index: 2 - "itl.engagement.is_open_linked", # Tensor index: 3 - "itl.engagement.is_photo_expanded", # Tensor index: 4 - "itl.engagement.is_profile_clicked", # Tensor index: 5 - "itl.engagement.is_replied", # Tensor index: 6 - "itl.engagement.is_retweeted", # Tensor index: 7 - "itl.engagement.is_video_playback_50", # Tensor index: 8 - "timelines.earlybird_score", # Tensor index: 9 - ]) \ - .define_weight("meta.record_weight/type=earlybird") \ - .build() + return ( + FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) + .batch_add_features( + [ + ("ebd.has_diff_lang", "A"), + ("ebd.tweet_age_in_secs", "A"), + ("encoded_tweet_features.composer_source_is_camera_flag", "A"), + ("encoded_tweet_features.favorite_count", "A"), + ("encoded_tweet_features.has_card_flag", "A"), + ("encoded_tweet_features.has_image_url_flag", "A"), + ("encoded_tweet_features.has_native_image_flag", "A"), + ("encoded_tweet_features.has_news_url_flag", "A"), + ("encoded_tweet_features.has_periscope_flag", "A"), + ("encoded_tweet_features.has_pro_video_flag", "A"), + ("encoded_tweet_features.has_quote_flag", "A"), + ("encoded_tweet_features.has_video_url_flag", "A"), + ("encoded_tweet_features.has_vine_flag", "A"), + ("encoded_tweet_features.has_visible_link_flag", "A"), + ("encoded_tweet_features.is_sensitive_content", "A"), + ("encoded_tweet_features.is_user_spam_flag", "A"), + ("encoded_tweet_features.link_language", "A"), + ("encoded_tweet_features.num_hashtags", "A"), + ("encoded_tweet_features.num_mentions", "A"), + ("encoded_tweet_features.reply_count", "A"), + ("encoded_tweet_features.retweet_count", "A"), + ("encoded_tweet_features.text_score", "A"), + ("encoded_tweet_features.user_reputation", "A"), + ("extended_encoded_tweet_features.decayed_favorite_count", "A"), + ("extended_encoded_tweet_features.decayed_quote_count", "A"), + ("extended_encoded_tweet_features.decayed_reply_count", "A"), + ("extended_encoded_tweet_features.decayed_retweet_count", "A"), + ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), + ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), + ("extended_encoded_tweet_features.fake_favorite_count", "A"), + ("extended_encoded_tweet_features.fake_quote_count", "A"), + ("extended_encoded_tweet_features.fake_reply_count", "A"), + ("extended_encoded_tweet_features.fake_retweet_count", "A"), + ("extended_encoded_tweet_features.favorite_count_v2", "A"), + 
("extended_encoded_tweet_features.label_dup_content_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.periscope_exists", "A"), + ("extended_encoded_tweet_features.periscope_has_been_featured", "A"), + ( + "extended_encoded_tweet_features.periscope_is_currently_featured", + "A", + ), + ( + "extended_encoded_tweet_features.periscope_is_from_quality_source", + "A", + ), + ("extended_encoded_tweet_features.periscope_is_live", "A"), + ("extended_encoded_tweet_features.quote_count", "A"), + ("extended_encoded_tweet_features.reply_count_v2", "A"), + ("extended_encoded_tweet_features.retweet_count_v2", "A"), + ("extended_encoded_tweet_features.weighted_favorite_count", "A"), + ("extended_encoded_tweet_features.weighted_quote_count", "A"), + ("extended_encoded_tweet_features.weighted_reply_count", "A"), + ("extended_encoded_tweet_features.weighted_retweet_count", "A"), + ("timelines.earlybird.visible_token_ratio", "A"), + ] + ) + .add_labels( + [ + label, # Tensor index: 0 + "itl.engagement.is_clicked", # Tensor index: 1 + "itl.engagement.is_favorited", # Tensor index: 2 + "itl.engagement.is_open_linked", # Tensor index: 3 + "itl.engagement.is_photo_expanded", # Tensor index: 4 + "itl.engagement.is_profile_clicked", # Tensor index: 5 + "itl.engagement.is_replied", # Tensor index: 6 + "itl.engagement.is_retweeted", # Tensor index: 7 + "itl.engagement.is_video_playback_50", # Tensor index: 8 + "timelines.earlybird_score", # Tensor index: 9 + ] + ) + .define_weight("meta.record_weight/type=earlybird") + .build() + ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py index 57178b92c..e6a3ee940 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py @@ -1,21 +1,29 @@ # checkstyle: noqa INDEX_BY_LABEL = { - "is_clicked": 1, - "is_favorited": 2, - "is_open_linked": 3, - "is_photo_expanded": 4, - "is_profile_clicked": 5, - "is_replied": 6, - "is_retweeted": 7, - "is_video_playback_50": 8 + "is_clicked": 1, + "is_favorited": 2, + "is_open_linked": 3, + "is_photo_expanded": 4, + "is_profile_clicked": 5, + "is_replied": 6, + "is_retweeted": 7, + "is_video_playback_50": 8, } TARGET_LABEL_IDX = 0 EB_SCORE_IDX = 9 -LABEL_NAMES = [label_name for label_name, _ in sorted(INDEX_BY_LABEL.items(), key=lambda item: item[1])] +LABEL_NAMES = [ + label_name + for label_name, _ in sorted(INDEX_BY_LABEL.items(), key=lambda item: item[1]) +] -PREDICTED_CLASSES = \ - ["tf_target"] + ["tf_" + label_name for label_name in LABEL_NAMES] + ["tf_timelines.earlybird_score"] + \ - ["lolly_target"] + ["lolly_" + label_name for label_name in LABEL_NAMES] + ["lolly_timelines.earlybird_score"] +PREDICTED_CLASSES = ( + ["tf_target"] + + ["tf_" + label_name for label_name in LABEL_NAMES] + + ["tf_timelines.earlybird_score"] + + ["lolly_target"] + + ["lolly_" + label_name for label_name in LABEL_NAMES] + + ["lolly_timelines.earlybird_score"] +) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py index cf0c38ecc..da21e52ad 100644 --- 
a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py @@ -1,43 +1,56 @@ # checkstyle: noqa import tensorflow.compat.v1 as tf + from .constants import INDEX_BY_LABEL, LABEL_NAMES # TODO: Read these from command line arguments, since they specify the existing example weights in the input data. DEFAULT_WEIGHT_BY_LABEL = { - "is_clicked": 0.3, - "is_favorited": 1.0, - "is_open_linked": 0.1, - "is_photo_expanded": 0.03, - "is_profile_clicked": 1.0, - "is_replied": 9.0, - "is_retweeted": 1.0, - "is_video_playback_50": 0.01 + "is_clicked": 0.3, + "is_favorited": 1.0, + "is_open_linked": 0.1, + "is_photo_expanded": 0.03, + "is_profile_clicked": 1.0, + "is_replied": 9.0, + "is_retweeted": 1.0, + "is_video_playback_50": 0.01, } + def add_weight_arguments(parser): - for label_name in LABEL_NAMES: - parser.add_argument( - _make_weight_cli_argument_name(label_name), - type=float, - default=DEFAULT_WEIGHT_BY_LABEL[label_name], - dest=_make_weight_param_name(label_name) - ) + for label_name in LABEL_NAMES: + parser.add_argument( + _make_weight_cli_argument_name(label_name), + type=float, + default=DEFAULT_WEIGHT_BY_LABEL[label_name], + dest=_make_weight_param_name(label_name), + ) + def make_weights_tensor(input_weights, label, params): - ''' - Replaces the weights for each positive engagement and keeps the input weights for negative examples. - ''' - weight_tensors = [input_weights] - for label_name in LABEL_NAMES: - index, default_weight = INDEX_BY_LABEL[label_name], DEFAULT_WEIGHT_BY_LABEL[label_name] - weight_param_name =_make_weight_param_name(label_name) - weight_tensors.append( - tf.reshape(tf.math.scalar_mul(getattr(params, weight_param_name) - default_weight, label[:, index]), [-1, 1]) - ) - return tf.math.accumulate_n(weight_tensors) + """ + Replaces the weights for each positive engagement and keeps the input weights for negative examples. + """ + weight_tensors = [input_weights] + for label_name in LABEL_NAMES: + index, default_weight = ( + INDEX_BY_LABEL[label_name], + DEFAULT_WEIGHT_BY_LABEL[label_name], + ) + weight_param_name = _make_weight_param_name(label_name) + weight_tensors.append( + tf.reshape( + tf.math.scalar_mul( + getattr(params, weight_param_name) - default_weight, label[:, index] + ), + [-1, 1], + ) + ) + return tf.math.accumulate_n(weight_tensors) + def _make_weight_cli_argument_name(label_name): - return f"--weight.{label_name}" + return f"--weight.{label_name}" + def _make_weight_param_name(label_name): - return f"weight_{label_name}" + return f"weight_{label_name}" diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py index 723dd626c..f121bd039 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py @@ -1,23 +1,28 @@ # checkstyle: noqa import tensorflow.compat.v1 as tf + from ..constants import EB_SCORE_IDX + # The rationale behind this logic is available at TQ-9678. def get_lolly_logits(labels): - ''' - :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. - :return: tf.Tensor of shape (batch size) with the extracted lolly logits. 
- ''' - eb_lolly_scores = get_lolly_scores(labels) - inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores) - lolly_activations = tf.math.subtract(tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores)) - return lolly_activations + """ + :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. + :return: tf.Tensor of shape (batch size) with the extracted lolly logits. + """ + eb_lolly_scores = get_lolly_scores(labels) + inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores) + lolly_activations = tf.math.subtract( + tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores) + ) + return lolly_activations + def get_lolly_scores(labels): - ''' - :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. - :return: tf.Tensor of shape (batch size) with the extracted lolly scores. - ''' - logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1)) - eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0) - return eb_lolly_scores + """ + :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. + :return: tf.Tensor of shape (batch size) with the extracted lolly scores. + """ + logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1)) + eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0) + return eb_lolly_scores diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py index cb39c67a7..790a34556 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py @@ -4,142 +4,151 @@ class Parser(object): - def parse(self, line): - match = re.search(self.pattern(), line) - if match: - return self._parse_match(match) - return None + def parse(self, line): + match = re.search(self.pattern(), line) + if match: + return self._parse_match(match) + return None - def pattern(self): - raise NotImplementedError + def pattern(self): + raise NotImplementedError - def _parse_match(self, match): - raise NotImplementedError + def _parse_match(self, match): + raise NotImplementedError class BiasParser(Parser): - ''' - Parses the bias feature available in lolly model tsv files. - ''' + """ + Parses the bias feature available in lolly model tsv files. + """ - def pattern(self): - ''' - Matches lines like: - unified_engagement bias -0.935945 - :return: a RegEx that extracts feature weight. - ''' - return r"\t(bias)\t([^\s]+)" + def pattern(self): + """ + Matches lines like: + unified_engagement bias -0.935945 + :return: a RegEx that extracts feature weight. + """ + return r"\t(bias)\t([^\s]+)" - def _parse_match(self, match): - return float(match.group(2)) + def _parse_match(self, match): + return float(match.group(2)) class BinaryFeatureParser(Parser): - ''' - Parses binary features available in lolly model tsv files. - ''' + """ + Parses binary features available in lolly model tsv files. + """ - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130 - :return: a RegEx that extracts feature name and weight. 
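
# For reference, the bias and binary-feature patterns in this file can be
# exercised standalone against the sample lines quoted in the docstrings
# (fields are tab-separated in the lolly model tsv files):

import re

bias_line = "unified_engagement\tbias\t-0.935945"
binary_line = "unified_engagement\tencoded_tweet_features.is_user_spam_flag\t-0.181130"

print(float(re.search(r"\t(bias)\t([^\s]+)", bias_line).group(2)))  # -0.935945

m = re.search(r"\t([\w\.]+)\t([^\s]+)", binary_line)
print(m.group(1), float(m.group(2)))  # encoded_tweet_features.is_user_spam_flag -0.18113

# Note that the binary pattern also matches the bias line, which is why
# LollyModelFeaturesParser.parse tries BiasParser first and returns early.
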
- ''' - return r"\t([\w\.]+)\t([^\s]+)" + def pattern(self): + """ + Matches lines like: + unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130 + :return: a RegEx that extracts feature name and weight. + """ + return r"\t([\w\.]+)\t([^\s]+)" - def _parse_match(self, match): - return (match.group(1), float(match.group(2))) + def _parse_match(self, match): + return (match.group(1), float(match.group(2))) class DiscretizedFeatureParser(Parser): - ''' - Parses discretized features available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004 - :return: a RegEx that extracts feature name, bin boundaries and weight. - ''' - return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)" - - def _parse_match(self, match): - left_bin_side, right_bin_side = [float(number) for number in match.group(2).split("_")] - return ( - match.group(1), - left_bin_side, - right_bin_side, - float(match.group(3)) - ) + """ + Parses discretized features available in lolly model tsv files. + """ + + def pattern(self): + """ + Matches lines like: + unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004 + :return: a RegEx that extracts feature name, bin boundaries and weight. + """ + return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)" + + def _parse_match(self, match): + left_bin_side, right_bin_side = [ + float(number) for number in match.group(2).split("_") + ] + return (match.group(1), left_bin_side, right_bin_side, float(match.group(3))) class LollyModelFeaturesParser(Parser): - def __init__(self, bias_parser=BiasParser(), binary_feature_parser=BinaryFeatureParser(), discretized_feature_parser=DiscretizedFeatureParser()): - self._bias_parser = bias_parser - self._binary_feature_parser = binary_feature_parser - self._discretized_feature_parser = discretized_feature_parser - - def parse(self, lolly_model_reader): - parsed_features = { - "bias": None, - "binary": {}, - "discretized": {} - } - def process_line_fn(line): - bias_parser_result = self._bias_parser.parse(line) - if bias_parser_result: - parsed_features["bias"] = bias_parser_result - return - - binary_feature_parser_result = self._binary_feature_parser.parse(line) - if binary_feature_parser_result: - name, value = binary_feature_parser_result - parsed_features["binary"][name] = value - return - - discretized_feature_parser_result = self._discretized_feature_parser.parse(line) - if discretized_feature_parser_result: - name, left_bin, right_bin, weight = discretized_feature_parser_result - discretized_features = parsed_features["discretized"] - if name not in discretized_features: - discretized_features[name] = [] - discretized_features[name].append((left_bin, right_bin, weight)) - - lolly_model_reader.read(process_line_fn) - - return parsed_features + def __init__( + self, + bias_parser=BiasParser(), + binary_feature_parser=BinaryFeatureParser(), + discretized_feature_parser=DiscretizedFeatureParser(), + ): + self._bias_parser = bias_parser + self._binary_feature_parser = binary_feature_parser + self._discretized_feature_parser = discretized_feature_parser + + def parse(self, lolly_model_reader): + parsed_features = {"bias": None, "binary": {}, "discretized": {}} + + def process_line_fn(line): + bias_parser_result = self._bias_parser.parse(line) + if bias_parser_result: + parsed_features["bias"] = 
bias_parser_result + return + + binary_feature_parser_result = self._binary_feature_parser.parse(line) + if binary_feature_parser_result: + name, value = binary_feature_parser_result + parsed_features["binary"][name] = value + return + + discretized_feature_parser_result = self._discretized_feature_parser.parse( + line + ) + if discretized_feature_parser_result: + name, left_bin, right_bin, weight = discretized_feature_parser_result + discretized_features = parsed_features["discretized"] + if name not in discretized_features: + discretized_features[name] = [] + discretized_features[name].append((left_bin, right_bin, weight)) + + lolly_model_reader.read(process_line_fn) + + return parsed_features class DBv2DataExampleParser(Parser): - ''' - Parses data records printed by the DBv2 train.py build_graph function. - Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]] - ''' - - def __init__(self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser()): - self.features = lolly_model_features_parser.parse(lolly_model_reader) - self.feature_name_by_dbv2_id = {} - - for feature_name in list(self.features["binary"].keys()) + list(self.features["discretized"].keys()): - self.feature_name_by_dbv2_id[str(_get_feature_id(feature_name))] = feature_name - - def pattern(self): - ''' - :return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values. - ''' - return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. ]+)\]\]" - - def _parse_match(self, match): - feature_ids = match.group(3).split(" ") - feature_values = match.group(4).split(" ") - - value_by_feature_name = {} - for index in range(len(feature_ids)): - feature_id = feature_ids[index] - if feature_id not in self.feature_name_by_dbv2_id: - print("Missing feature with id: " + str(feature_id)) - continue - value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float(feature_values[index]) - - return value_by_feature_name + """ + Parses data records printed by the DBv2 train.py build_graph function. + Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]] + """ + + def __init__( + self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser() + ): + self.features = lolly_model_features_parser.parse(lolly_model_reader) + self.feature_name_by_dbv2_id = {} + + for feature_name in list(self.features["binary"].keys()) + list( + self.features["discretized"].keys() + ): + self.feature_name_by_dbv2_id[ + str(_get_feature_id(feature_name)) + ] = feature_name + + def pattern(self): + """ + :return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values. + """ + return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. 
]+)\]\]" + + def _parse_match(self, match): + feature_ids = match.group(3).split(" ") + feature_values = match.group(4).split(" ") + + value_by_feature_name = {} + for index in range(len(feature_ids)): + feature_id = feature_ids[index] + if feature_id not in self.feature_name_by_dbv2_id: + print("Missing feature with id: " + str(feature_id)) + continue + value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float( + feature_values[index] + ) + + return value_by_feature_name diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py index ab33ee4e7..2ff3ca79b 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py @@ -1,8 +1,8 @@ class LollyModelReader(object): - def __init__(self, lolly_model_file_path): - self._lolly_model_file_path = lolly_model_file_path + def __init__(self, lolly_model_file_path): + self._lolly_model_file_path = lolly_model_file_path - def read(self, process_line_fn): - with open(self._lolly_model_file_path, "r") as file: - for line in file: - process_line_fn(line) + def read(self, process_line_fn): + with open(self._lolly_model_file_path, "r") as file: + for line in file: + process_line_fn(line) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py index 5692616c2..1ed012cd2 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py @@ -4,10 +4,11 @@ from .reader import LollyModelReader from .scorer import LollyModelScorer - if __name__ == "__main__": - lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1]) - lolly_model_scorer = LollyModelScorer(data_example_parser=DBv2DataExampleParser(lolly_model_reader)) + lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1]) + lolly_model_scorer = LollyModelScorer( + data_example_parser=DBv2DataExampleParser(lolly_model_reader) + ) - score = lolly_model_scorer.score(data_example=sys.argv[2]) - print(score) + score = lolly_model_scorer.score(data_example=sys.argv[2]) + print(score) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py index 621c43388..6a9ca4204 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py @@ -1,37 +1,41 @@ class LollyModelScorer(object): - def __init__(self, data_example_parser): - self._data_example_parser = data_example_parser + def __init__(self, data_example_parser): + self._data_example_parser = data_example_parser - def score(self, data_example): - value_by_feature_name = self._data_example_parser.parse(data_example) - features = self._data_example_parser.features - return self._score(value_by_feature_name, features) + def score(self, data_example): + value_by_feature_name = self._data_example_parser.parse(data_example) + features = self._data_example_parser.features + return self._score(value_by_feature_name, features) - def _score(self, 
value_by_feature_name, features): - score = features["bias"] - score += self._score_binary_features(features["binary"], value_by_feature_name) - score += self._score_discretized_features(features["discretized"], value_by_feature_name) - return score + def _score(self, value_by_feature_name, features): + score = features["bias"] + score += self._score_binary_features(features["binary"], value_by_feature_name) + score += self._score_discretized_features( + features["discretized"], value_by_feature_name + ) + return score - def _score_binary_features(self, binary_features, value_by_feature_name): - score = 0.0 - for binary_feature_name, binary_feature_weight in binary_features.items(): - if binary_feature_name in value_by_feature_name: - score += binary_feature_weight - return score + def _score_binary_features(self, binary_features, value_by_feature_name): + score = 0.0 + for binary_feature_name, binary_feature_weight in binary_features.items(): + if binary_feature_name in value_by_feature_name: + score += binary_feature_weight + return score - def _score_discretized_features(self, discretized_features, value_by_feature_name): - score = 0.0 - for discretized_feature_name, buckets in discretized_features.items(): - if discretized_feature_name in value_by_feature_name: - feature_value = value_by_feature_name[discretized_feature_name] - score += self._find_matching_bucket_weight(buckets, feature_value) - return score + def _score_discretized_features(self, discretized_features, value_by_feature_name): + score = 0.0 + for discretized_feature_name, buckets in discretized_features.items(): + if discretized_feature_name in value_by_feature_name: + feature_value = value_by_feature_name[discretized_feature_name] + score += self._find_matching_bucket_weight(buckets, feature_value) + return score - def _find_matching_bucket_weight(self, buckets, feature_value): - for left_side, right_side, weight in buckets: - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - if feature_value >= left_side and feature_value < right_side: - return weight + def _find_matching_bucket_weight(self, buckets, feature_value): + for left_side, right_side, weight in buckets: + # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) + if feature_value >= left_side and feature_value < right_side: + return weight - raise LookupError("Couldn't find a matching bucket for the given feature value.") + raise LookupError( + "Couldn't find a matching bucket for the given feature value." + ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py index 2d0342551..4e0156983 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py @@ -2,90 +2,91 @@ class TFModelInitializerBuilder: - - def __init__(self, model_features_parser=LollyModelFeaturesParser()): - self._model_features_parser = model_features_parser - - def build(self, lolly_model_reader): - ''' - :param lolly_model_reader: LollyModelReader instance - :return: tf_model_initializer dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... 
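
# A quick standalone check of the [a, b) bucket semantics used by
# _find_matching_bucket_weight above, with made-up buckets:

buckets = [(0.0, 1.0, 0.5), (1.0, 2.0, 0.7), (2.0, float("inf"), 0.9)]

def find_weight(buckets, value):
    for left, right, weight in buckets:
        if left <= value < right:  # closed on the left, open on the right
            return weight
    raise LookupError("Couldn't find a matching bucket for the given feature value.")

assert find_weight(buckets, 1.0) == 0.7  # a boundary value falls in the next bucket
assert find_weight(buckets, 2.5) == 0.9
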
- "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... - "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] + def __init__(self, model_features_parser=LollyModelFeaturesParser()): + self._model_features_parser = model_features_parser + + def build(self, lolly_model_reader): + """ + :param lolly_model_reader: LollyModelReader instance + :return: tf_model_initializer dictionary of the following format: + { + "features": { + "bias": 0.0, + "binary": { + # (feature name : feature weight) pairs + "feature_name_1": 0.0, + ... + "feature_nameN": 0.0 + }, + "discretized": { + # (feature name : index aligned lists of bin_boundaries and weights + "feature_name_1": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + ... + "feature_name_K": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + } } } - } - } - ''' - tf_model_initializer = { - "features": {} - } + """ + tf_model_initializer = {"features": {}} - features = self._model_features_parser.parse(lolly_model_reader) - tf_model_initializer["features"]["bias"] = features["bias"] - self._set_discretized_features(features["discretized"], tf_model_initializer) + features = self._model_features_parser.parse(lolly_model_reader) + tf_model_initializer["features"]["bias"] = features["bias"] + self._set_discretized_features(features["discretized"], tf_model_initializer) - self._dedup_binary_features(features["binary"], features["discretized"]) - tf_model_initializer["features"]["binary"] = features["binary"] + self._dedup_binary_features(features["binary"], features["discretized"]) + tf_model_initializer["features"]["binary"] = features["binary"] - return tf_model_initializer + return tf_model_initializer - def _set_discretized_features(self, discretized_features, tf_model_initializer): - if len(discretized_features) == 0: - return + def _set_discretized_features(self, discretized_features, tf_model_initializer): + if len(discretized_features) == 0: + return - num_bins = max([len(bins) for bins in discretized_features.values()]) + num_bins = max([len(bins) for bins in discretized_features.values()]) - bin_boundaries_and_weights = {} - for feature_name in discretized_features: - bin_boundaries_and_weights[feature_name] = self._extract_bin_boundaries_and_weights( - discretized_features[feature_name], num_bins) + bin_boundaries_and_weights = {} + for feature_name in discretized_features: + bin_boundaries_and_weights[ + feature_name + ] = self._extract_bin_boundaries_and_weights( + discretized_features[feature_name], num_bins + ) - tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights + tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights - def _dedup_binary_features(self, binary_features, discretized_features): - [binary_features.pop(feature_name) for feature_name in discretized_features] + def _dedup_binary_features(self, binary_features, discretized_features): + [binary_features.pop(feature_name) for feature_name in discretized_features] - def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins): - bin_boundary_weight_pairs = [] + def _extract_bin_boundaries_and_weights( + self, discretized_feature_buckets, num_bins + ): + bin_boundary_weight_pairs = [] - for bucket in discretized_feature_buckets: - bin_boundary_weight_pairs.append([bucket[0], bucket[2]]) + 
for bucket in discretized_feature_buckets: + bin_boundary_weight_pairs.append([bucket[0], bucket[2]]) - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # Thus, convert (a, b] to [a, b) by inverting the bin boundaries. - for bin_boundary_weight_pair in bin_boundary_weight_pairs: - if bin_boundary_weight_pair[0] < float("inf"): - bin_boundary_weight_pair[0] *= -1 + # The default DBv2 HashingDiscretizer bin membership interval is (a, b] + # + # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) + # + # Thus, convert (a, b] to [a, b) by inverting the bin boundaries. + for bin_boundary_weight_pair in bin_boundary_weight_pairs: + if bin_boundary_weight_pair[0] < float("inf"): + bin_boundary_weight_pair[0] *= -1 - while len(bin_boundary_weight_pairs) < num_bins: - bin_boundary_weight_pairs.append([float("inf"), float(0)]) + while len(bin_boundary_weight_pairs) < num_bins: + bin_boundary_weight_pairs.append([float("inf"), float(0)]) - bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0]) + bin_boundary_weight_pairs.sort( + key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0] + ) - bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs)) + bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs)) - return { - "bin_boundaries": bin_boundaries, - "weights": weights - } + return {"bin_boundaries": bin_boundaries, "weights": weights} diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py index 6919914f8..7f8241ca4 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py @@ -1,120 +1,136 @@ # checkstyle: noqa -import tensorflow.compat.v1 as tf from collections import OrderedDict + +import tensorflow.compat.v1 as tf + +import twml + from .constants import EB_SCORE_IDX from .lolly.data_helpers import get_lolly_scores -import twml def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1): - """ - This function was copied from twml/metrics.py with the following adjustments: - - Override example weights with the ones set in graph_output. - - Tile labels in order to support per engagement metrics for both TF and Lolly scores. - - Add lolly_tf_score_MSE metric. - Note: All custom lines have a comment that starts with 'Added' - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys()) - metrics.remove('pr_curve') - - def get_eval_metric_ops(graph_output, labels, weights): """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + This function was copied from twml/metrics.py with the following adjustments: + - Override example weights with the ones set in graph_output. + - Tile labels in order to support per engagement metrics for both TF and Lolly scores. + - Add lolly_tf_score_MSE metric. + Note: All custom lines have a comment that starts with 'Added' """ - - # Added to support the example weights overriding. 
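
# The override pulls weights from graph_output because build_graph recomputes
# them with make_weights_tensor. That computation reduces to per-example
# arithmetic; a NumPy sketch with made-up values (one engagement, default
# weight 1.0, and an assumed --weight.is_favorited=4.0 on the command line):

import numpy as np

input_weights = np.array([[1.0], [1.0]])  # logged per-example weights
label = np.array([[1.0], [0.0]])          # is_favorited column only
default_w, cli_w = 1.0, 4.0

weights = input_weights + ((cli_w - default_w) * label[:, 0]).reshape(-1, 1)
print(weights)  # [[4.], [1.]] -- positives re-weighted, negatives untouched
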
- weights = graph_output["weights"] - # Added to support per engagement metrics for both TF and Lolly scores. - labels = tf.tile(labels, [1, 2]) - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if not hard_preds: - hard_preds = tf.greater_equal(preds, threshold) - - shape = labels.get_shape() - - # basic sanity check: multi_metric dimension must exist - assert len(shape) > class_dim, "Dimension specified by class_dim does not exist." - - num_labels = shape[class_dim] - # If we are doing multi-class / multi-label metric, the number of classes / labels must - # be know at graph construction time. This dimension cannot have size None. - assert num_labels is not None, "The multi-metric dimension cannot be None." - assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - metric_factory, requires_threshold = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) + # pylint: disable=invalid-name,dict-keys-not-iterating + if metrics is None: + # remove expensive metrics by default for faster eval + metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys()) + metrics.remove("pr_curve") + + def get_eval_metric_ops(graph_output, labels, weights): + """ + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples.. + """ + + # Added to support the example weights overriding. + weights = graph_output["weights"] + # Added to support per engagement metrics for both TF and Lolly scores. 
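# Shape sketch of the tiling below, assuming batch size B and the 10 label
# columns defined in the feature config:
#
#     labels: (B, 10)  --tf.tile(labels, [1, 2])-->  (B, 20)
#
# Columns i and i + 10 then carry the same target, lining up with the 20
# PREDICTED_CLASSES (tf_* followed by lolly_*) that graph_output["output"]
# produces, so the per-class loop further down scores both heads.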
+        labels = tf.tile(labels, [1, 2])
+
+        eval_metric_ops = OrderedDict()
+
+        preds = graph_output["output"]
+
+        threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5
+
+        hard_preds = graph_output.get("hard_output")
+        if not hard_preds:
+            hard_preds = tf.greater_equal(preds, threshold)
+
+        shape = labels.get_shape()
+
+        # basic sanity check: multi_metric dimension must exist
+        assert (
+            len(shape) > class_dim
+        ), "Dimension specified by class_dim does not exist."
+
+        num_labels = shape[class_dim]
+        # If we are doing multi-class / multi-label metric, the number of classes / labels must
+        # be known at graph construction time. This dimension cannot have size None.
+        assert num_labels is not None, "The multi-metric dimension cannot be None."
+        assert (
+            classes is None or len(classes) == num_labels
+        ), "Number of classes must match the number of labels"
+
+        weights_shape = weights.get_shape() if weights is not None else None
+        if weights_shape is None:
+            num_weights = None
+        elif len(weights_shape) > 1:
+            num_weights = weights_shape[class_dim]
         else:
-          raise ValueError('Cannot find the metric named ' + metric_name)
-
-      # Added to compare TF and Lolly scores.
-      eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels)
-
-      return eval_metric_ops
-
-  return get_eval_metric_ops
+            num_weights = 1
+
+        for i in range(num_labels):
+            # add metrics to eval_metric_ops dict
+            for metric_name in metrics:
+                metric_name = metric_name.lower()  # metric names are case-insensitive.
+
+                class_metric_name = (
+                    metric_name + "_" + (classes[i] if classes is not None else str(i))
+                )
+
+                if class_metric_name in eval_metric_ops:
+                    # avoid adding duplicate metrics.
+                    continue
+
+                class_labels = tf.gather(labels, indices=[i], axis=class_dim)
+                class_preds = tf.gather(preds, indices=[i], axis=class_dim)
+                class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim)
+
+                if num_weights is None:
+                    class_weights = None
+                elif num_weights == num_labels:
+                    class_weights = tf.gather(weights, indices=[i], axis=class_dim)
+                elif num_weights == 1:
+                    class_weights = weights
+                else:
+                    raise ValueError(
+                        "num_weights (%d) and num_labels (%d) do not match"
+                        % (num_weights, num_labels)
+                    )
+
+                (
+                    metric_factory,
+                    requires_threshold,
+                ) = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
+                if metric_factory:
+                    value_op, update_op = metric_factory(
+                        labels=class_labels,
+                        predictions=(
+                            class_hard_preds if requires_threshold else class_preds
+                        ),
+                        weights=class_weights,
+                        name=class_metric_name,
+                    )
+                    eval_metric_ops[class_metric_name] = (value_op, update_op)
+                else:
+                    raise ValueError("Cannot find the metric named " + metric_name)
+
+        # Added to compare TF and Lolly scores.
+ eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels) + + return eval_metric_ops + + return get_eval_metric_ops def get_mse(predictions, labels): - lolly_scores = get_lolly_scores(labels) - tf_scores = predictions[:, EB_SCORE_IDX] - squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores)) + lolly_scores = get_lolly_scores(labels) + tf_scores = predictions[:, EB_SCORE_IDX] + squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores)) - value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op") - update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op") + value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op") + update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op") - return value_op, update_op + return value_op, update_op diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py index 82c31bde0..22b2cbe75 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py @@ -1,62 +1,65 @@ -from .hashing_utils import make_feature_id +import numpy as np from twml.contrib.layers.hashing_discretizer import HashingDiscretizer -import numpy as np + +from .hashing_utils import make_feature_id class TFModelDiscretizerBuilder(object): - def __init__(self, num_bits): - self.num_bits = num_bits - - def build(self, tf_model_initializer): - ''' - :param tf_model_initializer: dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... - "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... - "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] + def __init__(self, num_bits): + self.num_bits = num_bits + + def build(self, tf_model_initializer): + """ + :param tf_model_initializer: dictionary of the following format: + { + "features": { + "bias": 0.0, + "binary": { + # (feature name : feature weight) pairs + "feature_name_1": 0.0, + ... + "feature_nameN": 0.0 + }, + "discretized": { + # (feature name : index aligned lists of bin_boundaries and weights + "feature_name_1": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + ... + "feature_name_K": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + } } } - } - } - :return: a HashingDiscretizer instance. 
- ''' - discretized_features = tf_model_initializer["features"]["discretized"] - - max_bins = 0 - - feature_ids = [] - bin_vals = [] - for feature_name in discretized_features: - bin_boundaries = discretized_features[feature_name]["bin_boundaries"] - feature_id = make_feature_id(feature_name, self.num_bits) - feature_ids.append(feature_id) - np_bin_boundaries = [np.float(bin_boundary) for bin_boundary in bin_boundaries] - bin_vals.append(np_bin_boundaries) - - max_bins = max(max_bins, len(np_bin_boundaries)) - - feature_ids_np = np.array(feature_ids) - bin_vals_np = np.array(bin_vals).flatten() - - return HashingDiscretizer( - feature_ids=feature_ids_np, - bin_vals=bin_vals_np, - n_bin=max_bins, - out_bits=self.num_bits - ) + :return: a HashingDiscretizer instance. + """ + discretized_features = tf_model_initializer["features"]["discretized"] + + max_bins = 0 + + feature_ids = [] + bin_vals = [] + for feature_name in discretized_features: + bin_boundaries = discretized_features[feature_name]["bin_boundaries"] + feature_id = make_feature_id(feature_name, self.num_bits) + feature_ids.append(feature_id) + np_bin_boundaries = [ + np.float(bin_boundary) for bin_boundary in bin_boundaries + ] + bin_vals.append(np_bin_boundaries) + + max_bins = max(max_bins, len(np_bin_boundaries)) + + feature_ids_np = np.array(feature_ids) + bin_vals_np = np.array(bin_vals).flatten() + + return HashingDiscretizer( + feature_ids=feature_ids_np, + bin_vals=bin_vals_np, + n_bin=max_bins, + out_bits=self.num_bits, + ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py index 2c57f8d63..638e5d18f 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py @@ -1,29 +1,28 @@ -from twitter.deepbird.io.util import _get_feature_id - import numpy as np +from twitter.deepbird.io.util import _get_feature_id def numpy_hashing_uniform(the_id, bin_idx, output_bits): - """ - integer_multiplicative_hashing - This is a reimplementation, for testing purposes, of the - c++ version found in hashing_discretizer_impl.cpp - """ - hashing_constant = 2654435761 - N = 32 - with np.errstate(over='ignore'): - the_id *= hashing_constant - the_id += bin_idx - the_id *= hashing_constant - the_id >>= N - output_bits - the_id &= (1 << output_bits) - 1 - return the_id + """ + integer_multiplicative_hashing + This is a reimplementation, for testing purposes, of the + c++ version found in hashing_discretizer_impl.cpp + """ + hashing_constant = 2654435761 + N = 32 + with np.errstate(over="ignore"): + the_id *= hashing_constant + the_id += bin_idx + the_id *= hashing_constant + the_id >>= N - output_bits + the_id &= (1 << output_bits) - 1 + return the_id def make_feature_id(name, num_bits): - feature_id = _get_feature_id(name) - return np.int64(limit_bits(feature_id, num_bits)) + feature_id = _get_feature_id(name) + return np.int64(limit_bits(feature_id, num_bits)) def limit_bits(value, num_bits): - return value & ((2 ** num_bits) - 1) + return value & ((2**num_bits) - 1) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py index 63491ea38..a05edfe32 100644 --- 
a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py @@ -1,34 +1,39 @@ -from .hashing_utils import make_feature_id, numpy_hashing_uniform - import numpy as np import tensorflow.compat.v1 as tf + import twml +from .hashing_utils import make_feature_id, numpy_hashing_uniform + class TFModelWeightsInitializerBuilder(object): - def __init__(self, num_bits): - self.num_bits = num_bits - - def build(self, tf_model_initializer): - ''' - :return: (bias_initializer, weight_initializer) - ''' - initial_weights = np.zeros((2 ** self.num_bits, 1)) - - features = tf_model_initializer["features"] - self._set_binary_feature_weights(initial_weights, features["binary"]) - self._set_discretized_feature_weights(initial_weights, features["discretized"]) - - return tf.constant_initializer(features["bias"]), twml.contrib.initializers.PartitionConstant(initial_weights) - - def _set_binary_feature_weights(self, initial_weights, binary_features): - for feature_name, weight in binary_features.items(): - feature_id = make_feature_id(feature_name, self.num_bits) - initial_weights[feature_id][0] = weight - - def _set_discretized_feature_weights(self, initial_weights, discretized_features): - for feature_name, discretized_feature in discretized_features.items(): - feature_id = make_feature_id(feature_name, self.num_bits) - for bin_idx, weight in enumerate(discretized_feature["weights"]): - final_bucket_id = numpy_hashing_uniform(feature_id, bin_idx, self.num_bits) - initial_weights[final_bucket_id][0] = weight + def __init__(self, num_bits): + self.num_bits = num_bits + + def build(self, tf_model_initializer): + """ + :return: (bias_initializer, weight_initializer) + """ + initial_weights = np.zeros((2**self.num_bits, 1)) + + features = tf_model_initializer["features"] + self._set_binary_feature_weights(initial_weights, features["binary"]) + self._set_discretized_feature_weights(initial_weights, features["discretized"]) + + return tf.constant_initializer( + features["bias"] + ), twml.contrib.initializers.PartitionConstant(initial_weights) + + def _set_binary_feature_weights(self, initial_weights, binary_features): + for feature_name, weight in binary_features.items(): + feature_id = make_feature_id(feature_name, self.num_bits) + initial_weights[feature_id][0] = weight + + def _set_discretized_feature_weights(self, initial_weights, discretized_features): + for feature_name, discretized_feature in discretized_features.items(): + feature_id = make_feature_id(feature_name, self.num_bits) + for bin_idx, weight in enumerate(discretized_feature["weights"]): + final_bucket_id = numpy_hashing_uniform( + feature_id, bin_idx, self.num_bits + ) + initial_weights[final_bucket_id][0] = weight diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py index 6ef181f5f..a23d0964d 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py @@ -1,212 +1,266 @@ # checkstyle: noqa +from datetime import datetime + import tensorflow.compat.v1 as tf -from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops 
 import tensorflow_hub as hub
-
-from datetime import datetime
 from tensorflow.compat.v1 import logging
+from tensorflow.python.estimator.export.export import (
+    build_raw_serving_input_receiver_fn,
+)
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from twitter.deepbird.projects.timelines.configs import all_configs
+
+import twml
+from twml.contrib.calibrators.common_calibrators import (
+    build_percentile_discretizer_graph,
+    calibrate_discretizer_and_export,
+)
 from twml.trainers import DataRecordTrainer
-from twml.contrib.calibrators.common_calibrators import build_percentile_discretizer_graph
-from twml.contrib.calibrators.common_calibrators import calibrate_discretizer_and_export
-from .metrics import get_multi_binary_class_metric_fn
-from .constants import TARGET_LABEL_IDX, PREDICTED_CLASSES
+
+from .constants import PREDICTED_CLASSES, TARGET_LABEL_IDX
 from .example_weights import add_weight_arguments, make_weights_tensor
 from .lolly.data_helpers import get_lolly_logits
-from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder
 from .lolly.reader import LollyModelReader
+from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder
+from .metrics import get_multi_binary_class_metric_fn
 from .tf_model.discretizer_builder import TFModelDiscretizerBuilder
 from .tf_model.weights_initializer_builder import TFModelWeightsInitializerBuilder
-import twml


 def get_feature_values(features_values, params):
-  if params.lolly_model_tsv:
-    # The default DBv2 HashingDiscretizer bin membership interval is (a, b]
-    #
-    # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
-    #
-    # TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries.
-    #
-    # Thus, invert the feature values, so that HashingDiscretizer can to find the correct bucket.
-    return tf.multiply(features_values, -1.0)
-  else:
-    return features_values
+    if params.lolly_model_tsv:
+        # The default DBv2 HashingDiscretizer bin membership interval is (a, b]
+        #
+        # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
+        #
+        # TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries.
+        #
+        # Thus, invert the feature values, so that HashingDiscretizer can find the correct bucket.
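# A concrete check of that inversion, using a made-up Lolly bin [1.0, 2.0):
# the initializer stores the left boundary as -1.0, and the value v is negated
# below, so membership becomes
#     v in [1.0, 2.0)  <=>  -v in (-2.0, -1.0]
# e.g. v = 1.0 -> -v = -1.0, inside (-2.0, -1.0];  v = 2.0 -> -v = -2.0,
# outside. The sign flip moves the closed end of DBv2's (a, b] interval to the
# side the Lolly engine expects.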
+ return tf.multiply(features_values, -1.0) + else: + return features_values + def build_graph(features, label, mode, params, config=None): - weights = None - if "weights" in features: - weights = make_weights_tensor(features["weights"], label, params) - - num_bits = params.input_size_bits - - if mode == "infer": - indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits) - dense_shape = tf.stack([features["input_sparse_tensor_shape"][0], 1 << num_bits]) - sparse_tf = tf.SparseTensor( - indices=indices, - values=get_feature_values(features["input_sparse_tensor_values"], params), - dense_shape=dense_shape - ) - else: - features["values"] = get_feature_values(features["values"], params) - sparse_tf = twml.util.convert_to_sparse(features, num_bits) - - if params.lolly_model_tsv: - tf_model_initializer = TFModelInitializerBuilder().build(LollyModelReader(params.lolly_model_tsv)) - bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder(num_bits).build(tf_model_initializer) - discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer) - else: - discretizer = hub.Module(params.discretizer_save_dir) - bias_initializer, weight_initializer = None, None - - input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator") - - logits = twml.layers.full_sparse( - inputs=input_sparse, - output_size=1, - bias_initializer=bias_initializer, - weight_initializer=weight_initializer, - use_sparse_grads=(mode == "train"), - use_binary_values=True, - name="full_sparse_1" - ) - - loss = None - - if mode != "infer": - lolly_activations = get_lolly_logits(label) - - if opt.print_data_examples: - logits = print_data_example(logits, lolly_activations, features) - - if params.replicate_lolly: - loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations)) + weights = None + if "weights" in features: + weights = make_weights_tensor(features["weights"], label, params) + + num_bits = params.input_size_bits + + if mode == "infer": + indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits) + dense_shape = tf.stack( + [features["input_sparse_tensor_shape"][0], 1 << num_bits] + ) + sparse_tf = tf.SparseTensor( + indices=indices, + values=get_feature_values(features["input_sparse_tensor_values"], params), + dense_shape=dense_shape, + ) + else: + features["values"] = get_feature_values(features["values"], params) + sparse_tf = twml.util.convert_to_sparse(features, num_bits) + + if params.lolly_model_tsv: + tf_model_initializer = TFModelInitializerBuilder().build( + LollyModelReader(params.lolly_model_tsv) + ) + bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder( + num_bits + ).build(tf_model_initializer) + discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer) else: - batch_size = tf.shape(label)[0] - target_label = tf.reshape(tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1)) - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=target_label, logits=logits) - loss = twml.util.weighted_average(loss, weights) + discretizer = hub.Module(params.discretizer_save_dir) + bias_initializer, weight_initializer = None, None + + input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator") + + logits = twml.layers.full_sparse( + inputs=input_sparse, + output_size=1, + bias_initializer=bias_initializer, + weight_initializer=weight_initializer, + use_sparse_grads=(mode == "train"), + use_binary_values=True, + name="full_sparse_1", + ) + + loss = None + + if mode 
!= "infer": + lolly_activations = get_lolly_logits(label) + + if opt.print_data_examples: + logits = print_data_example(logits, lolly_activations, features) - num_labels = tf.shape(label)[1] - eb_scores = tf.tile(lolly_activations, [1, num_labels]) - logits = tf.tile(logits, [1, num_labels]) - logits = tf.concat([logits, eb_scores], axis=1) + if params.replicate_lolly: + loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations)) + else: + batch_size = tf.shape(label)[0] + target_label = tf.reshape( + tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1) + ) + loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=target_label, logits=logits + ) + loss = twml.util.weighted_average(loss, weights) - output = tf.nn.sigmoid(logits) + num_labels = tf.shape(label)[1] + eb_scores = tf.tile(lolly_activations, [1, num_labels]) + logits = tf.tile(logits, [1, num_labels]) + logits = tf.concat([logits, eb_scores], axis=1) + + output = tf.nn.sigmoid(logits) + + return {"output": output, "loss": loss, "weights": weights} - return {"output": output, "loss": loss, "weights": weights} def print_data_example(logits, lolly_activations, features): - return tf.Print( - logits, - [logits, lolly_activations, tf.reshape(features['keys'], (1, -1)), tf.reshape(tf.multiply(features['values'], -1.0), (1, -1))], - message="DATA EXAMPLE = ", - summarize=10000 - ) + return tf.Print( + logits, + [ + logits, + lolly_activations, + tf.reshape(features["keys"], (1, -1)), + tf.reshape(tf.multiply(features["values"], -1.0), (1, -1)), + ], + message="DATA EXAMPLE = ", + summarize=10000, + ) + def earlybird_output_fn(graph_output): - export_outputs = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: - tf.estimator.export.PredictOutput( - {"prediction": tf.identity(graph_output["output"], name="output_scores")} - ) - } - return export_outputs + export_outputs = { + tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.estimator.export.PredictOutput( + {"prediction": tf.identity(graph_output["output"], name="output_scores")} + ) + } + return export_outputs + if __name__ == "__main__": - parser = DataRecordTrainer.add_parser_arguments() - - parser = twml.contrib.calibrators.add_discretizer_arguments(parser) - - parser.add_argument("--label", type=str, help="label for the engagement") - parser.add_argument("--model.use_existing_discretizer", action="store_true", - dest="model_use_existing_discretizer", - help="Load a pre-trained calibration or train a new one") - parser.add_argument("--input_size_bits", type=int) - parser.add_argument("--export_module_name", type=str, default="base_mlp", dest="export_module_name") - parser.add_argument("--feature_config", type=str) - parser.add_argument("--replicate_lolly", type=bool, default=False, dest="replicate_lolly", - help="Train a regression model with MSE loss and the logged Earlybird score as a label") - parser.add_argument("--lolly_model_tsv", type=str, required=False, dest="lolly_model_tsv", - help="Initialize with weights and discretizer bins available in the given Lolly model tsv file" - "No discretizer gets trained or loaded if set.") - parser.add_argument("--print_data_examples", type=bool, default=False, dest="print_data_examples", - help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'") - add_weight_arguments(parser) - - opt = parser.parse_args() - - feature_config_module = all_configs.select_feature_config(opt.feature_config) - - feature_config = 
feature_config_module.get_feature_config(data_spec_path=opt.data_spec, label=opt.label) - - parse_fn = twml.parsers.get_sparse_parse_fn( - feature_config, - keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes")) - - if not opt.lolly_model_tsv: - if opt.model_use_existing_discretizer: - logging.info("Skipping discretizer calibration [model.use_existing_discretizer=True]") - logging.info(f"Using calibration at {opt.discretizer_save_dir}") - else: - logging.info("Calibrating new discretizer [model.use_existing_discretizer=False]") - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator( - opt.discretizer_num_bins, - opt.discretizer_output_size_bits - ) - calibrate_discretizer_and_export(name="recap_earlybird_hashing_discretizer", - params=opt, - calibrator=calibrator, - build_graph_fn=build_percentile_discretizer_graph, - feature_config=feature_config) - - trainer = DataRecordTrainer( - name="earlybird", - params=opt, - build_graph_fn=build_graph, - save_dir=opt.save_dir, - feature_config=feature_config, - metric_fn=get_multi_binary_class_metric_fn( - metrics=["roc_auc"], - classes=PREDICTED_CLASSES - ), - warm_start_from=None - ) - - train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn) - eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn) - - logging.info("Training and Evaluation ...") - trainingStartTime = datetime.now() - trainer.train_and_evaluate(train_input_fn=train_input_fn, eval_input_fn=eval_input_fn) - trainingEndTime = datetime.now() - logging.info("Training and Evaluation time: " + str(trainingEndTime - trainingStartTime)) - - if trainer._estimator.config.is_chief: - serving_input_in_earlybird = { - "input_sparse_tensor_indices": array_ops.placeholder( - name="input_sparse_tensor_indices", - shape=[None, 2], - dtype=dtypes.int64), - "input_sparse_tensor_values": array_ops.placeholder( - name="input_sparse_tensor_values", - shape=[None], - dtype=dtypes.float32), - "input_sparse_tensor_shape": array_ops.placeholder( - name="input_sparse_tensor_shape", - shape=[2], - dtype=dtypes.int64) - } - serving_input_receiver_fn = build_raw_serving_input_receiver_fn(serving_input_in_earlybird) - twml.contrib.export.export_fn.export_all_models( - trainer=trainer, - export_dir=opt.export_dir, - parse_fn=parse_fn, - serving_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=earlybird_output_fn, - feature_spec=feature_config.get_feature_spec() + parser = DataRecordTrainer.add_parser_arguments() + + parser = twml.contrib.calibrators.add_discretizer_arguments(parser) + + parser.add_argument("--label", type=str, help="label for the engagement") + parser.add_argument( + "--model.use_existing_discretizer", + action="store_true", + dest="model_use_existing_discretizer", + help="Load a pre-trained calibration or train a new one", + ) + parser.add_argument("--input_size_bits", type=int) + parser.add_argument( + "--export_module_name", type=str, default="base_mlp", dest="export_module_name" + ) + parser.add_argument("--feature_config", type=str) + parser.add_argument( + "--replicate_lolly", + type=bool, + default=False, + dest="replicate_lolly", + help="Train a regression model with MSE loss and the logged Earlybird score as a label", + ) + parser.add_argument( + "--lolly_model_tsv", + type=str, + required=False, + dest="lolly_model_tsv", + help="Initialize with weights and discretizer bins available in the given Lolly model tsv file" + "No discretizer gets trained or loaded if set.", + ) + parser.add_argument( + "--print_data_examples", + 
type=bool, + default=False, + dest="print_data_examples", + help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'", ) - logging.info("The export model path is: " + opt.export_dir) + add_weight_arguments(parser) + + opt = parser.parse_args() + + feature_config_module = all_configs.select_feature_config(opt.feature_config) + + feature_config = feature_config_module.get_feature_config( + data_spec_path=opt.data_spec, label=opt.label + ) + + parse_fn = twml.parsers.get_sparse_parse_fn( + feature_config, + keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes"), + ) + + if not opt.lolly_model_tsv: + if opt.model_use_existing_discretizer: + logging.info( + "Skipping discretizer calibration [model.use_existing_discretizer=True]" + ) + logging.info(f"Using calibration at {opt.discretizer_save_dir}") + else: + logging.info( + "Calibrating new discretizer [model.use_existing_discretizer=False]" + ) + calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator( + opt.discretizer_num_bins, opt.discretizer_output_size_bits + ) + calibrate_discretizer_and_export( + name="recap_earlybird_hashing_discretizer", + params=opt, + calibrator=calibrator, + build_graph_fn=build_percentile_discretizer_graph, + feature_config=feature_config, + ) + + trainer = DataRecordTrainer( + name="earlybird", + params=opt, + build_graph_fn=build_graph, + save_dir=opt.save_dir, + feature_config=feature_config, + metric_fn=get_multi_binary_class_metric_fn( + metrics=["roc_auc"], classes=PREDICTED_CLASSES + ), + warm_start_from=None, + ) + + train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn) + eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn) + + logging.info("Training and Evaluation ...") + trainingStartTime = datetime.now() + trainer.train_and_evaluate( + train_input_fn=train_input_fn, eval_input_fn=eval_input_fn + ) + trainingEndTime = datetime.now() + logging.info( + "Training and Evaluation time: " + str(trainingEndTime - trainingStartTime) + ) + + if trainer._estimator.config.is_chief: + serving_input_in_earlybird = { + "input_sparse_tensor_indices": array_ops.placeholder( + name="input_sparse_tensor_indices", shape=[None, 2], dtype=dtypes.int64 + ), + "input_sparse_tensor_values": array_ops.placeholder( + name="input_sparse_tensor_values", shape=[None], dtype=dtypes.float32 + ), + "input_sparse_tensor_shape": array_ops.placeholder( + name="input_sparse_tensor_shape", shape=[2], dtype=dtypes.int64 + ), + } + serving_input_receiver_fn = build_raw_serving_input_receiver_fn( + serving_input_in_earlybird + ) + twml.contrib.export.export_fn.export_all_models( + trainer=trainer, + export_dir=opt.export_dir, + parse_fn=parse_fn, + serving_input_receiver_fn=serving_input_receiver_fn, + export_output_fn=earlybird_output_fn, + feature_spec=feature_config.get_feature_spec(), + ) + logging.info("The export model path is: " + opt.export_dir) diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/conf.py b/timelines/data_processing/ml_util/aggregation_framework/docs/conf.py index 03996dfd7..e5806d820 100644 --- a/timelines/data_processing/ml_util/aggregation_framework/docs/conf.py +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/conf.py @@ -7,15 +7,14 @@ from os.path import abspath, dirname, isfile, join - extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.intersphinx", - "sphinx.ext.ifconfig", - "sphinx.ext.graphviz", - "twitter.docbird.ext.thriftlexer", - "twitter.docbird.ext.toctree_default_caption", - 
"sphinxcontrib.httpdomain", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.graphviz", + "twitter.docbird.ext.thriftlexer", + "twitter.docbird.ext.toctree_default_caption", + "sphinxcontrib.httpdomain", ] @@ -29,13 +28,13 @@ master_doc = "index" # General information about the project. -project = u"""Aggregation Framework""" -description = u"""""" +project = """Aggregation Framework""" +description = """""" # The short X.Y version. -version = u"""1.0""" +version = """1.0""" # The full version, including alpha/beta/rc tags. -release = u"""1.0""" +release = """1.0""" exclude_patterns = ["_build"] @@ -45,15 +44,20 @@ html_static_path = ["_static"] -html_logo = u"""""" +html_logo = """""" # Automagically add project logo, if it exists # (checks on any build, not just init) # Scan for some common defaults (png or svg format, # called "logo" or project name, in docs folder) if not html_logo: - location = dirname(abspath(__file__)) - for logo_file in ["logo.png", "logo.svg", ("%s.png" % project), ("%s.svg" % project)]: - html_logo = logo_file if isfile(join(location, logo_file)) else html_logo + location = dirname(abspath(__file__)) + for logo_file in [ + "logo.png", + "logo.svg", + ("%s.png" % project), + ("%s.svg" % project), + ]: + html_logo = logo_file if isfile(join(location, logo_file)) else html_logo graphviz_output_format = "svg" diff --git a/trust_and_safety_models/abusive/abusive_model.py b/trust_and_safety_models/abusive/abusive_model.py index 06fff4ed2..5cc7d5086 100644 --- a/trust_and_safety_models/abusive/abusive_model.py +++ b/trust_and_safety_models/abusive/abusive_model.py @@ -1,48 +1,57 @@ import tensorflow as tf -physical_devices = tf.config.list_physical_devices('GPU') +physical_devices = tf.config.list_physical_devices("GPU") for device in physical_devices: tf.config.experimental.set_memory_growth(device, True) -from twitter.hmli.nimbus.modeling.model_config import FeatureType, EncodingType, Feature, Model, LogType -from twitter.hmli.nimbus.modeling.feature_loader import BigQueryFeatureLoader -from twitter.cuad.representation.models.text_encoder import TextEncoder -from twitter.cuad.representation.models.optimization import create_optimizer -from twitter.hmli.nimbus.modeling.feature_encoder import FeatureEncoder - import numpy as np import pandas as pd import utils +from twitter.cuad.representation.models.optimization import create_optimizer +from twitter.cuad.representation.models.text_encoder import TextEncoder +from twitter.hmli.nimbus.modeling.feature_encoder import FeatureEncoder +from twitter.hmli.nimbus.modeling.feature_loader import BigQueryFeatureLoader +from twitter.hmli.nimbus.modeling.model_config import ( + EncodingType, + Feature, + FeatureType, + LogType, + Model, +) -cat_names = [ -... -] +cat_names = [...] 
-category_features = [Feature(name=cat_name, ftype=FeatureType.CONTINUOUS) for cat_name in cat_names] +category_features = [ + Feature(name=cat_name, ftype=FeatureType.CONTINUOUS) for cat_name in cat_names +] features = [ - Feature(name="tweet_text_with_media_annotations", ftype=FeatureType.STRING, encoding=EncodingType.BERT), - Feature(name="precision_nsfw", ftype=FeatureType.CONTINUOUS), - Feature(name="has_media", ftype=FeatureType.BINARY), - Feature(name="num_media", ftype=FeatureType.DISCRETE) + Feature( + name="tweet_text_with_media_annotations", + ftype=FeatureType.STRING, + encoding=EncodingType.BERT, + ), + Feature(name="precision_nsfw", ftype=FeatureType.CONTINUOUS), + Feature(name="has_media", ftype=FeatureType.BINARY), + Feature(name="num_media", ftype=FeatureType.DISCRETE), ] + category_features ptos_prototype = Model( - name='ptos_prototype', - export_path="...", - features=features, + name="ptos_prototype", + export_path="...", + features=features, ) print(ptos_prototype) cq_loader = BigQueryFeatureLoader(gcp_project=COMPUTE_PROJECT) labels = [ - "has_non_punitive_action", - "has_punitive_action", - "has_punitive_action_contains_self_harm", - "has_punitive_action_encourage_self_harm", - "has_punitive_action_episodic", - "has_punitive_action_episodic_hateful_conduct", - "has_punitive_action_other_abuse_policy", - "has_punitive_action_without_self_harm" + "has_non_punitive_action", + "has_punitive_action", + "has_punitive_action_contains_self_harm", + "has_punitive_action_encourage_self_harm", + "has_punitive_action_episodic", + "has_punitive_action_episodic_hateful_conduct", + "has_punitive_action_other_abuse_policy", + "has_punitive_action_without_self_harm", ] train_query = f""" @@ -64,112 +73,128 @@ print(train.describe(model=ptos_prototype)) params = { - 'max_seq_lengths': 128, - 'batch_size': 196, - 'lr': 1e-5, - 'optimizer_type': 'adamw', - 'warmup_steps': 0, - 'cls_dropout_rate': 0.1, - 'epochs': 30, - 'steps_per_epoch': 5000, - 'model_type': 'twitter_multilingual_bert_base_cased_mlm', - 'mixed_precision': True, + "max_seq_lengths": 128, + "batch_size": 196, + "lr": 1e-5, + "optimizer_type": "adamw", + "warmup_steps": 0, + "cls_dropout_rate": 0.1, + "epochs": 30, + "steps_per_epoch": 5000, + "model_type": "twitter_multilingual_bert_base_cased_mlm", + "mixed_precision": True, } params + def parse_labeled_data(row_dict): - label = [row_dict.pop(l) for l in labels] - return row_dict, label + label = [row_dict.pop(l) for l in labels] + return row_dict, label -mirrored_strategy = tf.distribute.MirroredStrategy() -BATCH_SIZE = params['batch_size'] * mirrored_strategy.num_replicas_in_sync -train_ds = train.to_tf_dataset().map(parse_labeled_data).shuffle(BATCH_SIZE*100).batch(BATCH_SIZE).repeat() +mirrored_strategy = tf.distribute.MirroredStrategy() +BATCH_SIZE = params["batch_size"] * mirrored_strategy.num_replicas_in_sync + +train_ds = ( + train.to_tf_dataset() + .map(parse_labeled_data) + .shuffle(BATCH_SIZE * 100) + .batch(BATCH_SIZE) + .repeat() +) val_ds = val.to_tf_dataset().map(parse_labeled_data).batch(BATCH_SIZE) for record in train_ds: - tf.print(record) - break + tf.print(record) + break + def get_positive_weights(): - """Computes positive weights used for class imbalance from training data.""" - label_weights_df = utils.get_label_weights( - "tos-data-media-full", - project_id="twttr-abusive-interact-prod", - dataset_id="tos_policy" - ) - pos_weight_tensor = tf.cast( - label_weights_df.sort_values(by='label').positive_class_weight, - dtype=tf.float32 - ) - return 
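
utils.multilabel_weighted_loss, used to compile the model further down, is not included in this diff; one plausible implementation, consistent with the logits-valued Dense(8) head and the per-label positive weights computed here, is a positively weighted binary cross-entropy:

import tensorflow as tf

def multilabel_weighted_loss_sketch(y_true, y_pred_logits, weights):
    # Up-weight the positive term of each label's binary cross-entropy by the
    # per-label class weight (cf. pos_weight_tensor from get_positive_weights);
    # weights has shape (num_labels,) and broadcasts over the batch.
    per_label = tf.nn.weighted_cross_entropy_with_logits(
        labels=tf.cast(y_true, tf.float32), logits=y_pred_logits, pos_weight=weights
    )
    return tf.reduce_mean(per_label)
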
pos_weight_tensor + """Computes positive weights used for class imbalance from training data.""" + label_weights_df = utils.get_label_weights( + "tos-data-media-full", + project_id="twttr-abusive-interact-prod", + dataset_id="tos_policy", + ) + pos_weight_tensor = tf.cast( + label_weights_df.sort_values(by="label").positive_class_weight, dtype=tf.float32 + ) + return pos_weight_tensor + pos_weight_tensor = get_positive_weights() print(pos_weight_tensor) + class TextEncoderPooledOutput(TextEncoder): - def call(self, x): - return super().call([x])["pooled_output"] + def call(self, x): + return super().call([x])["pooled_output"] + + def get_config(self): + return super().get_config() - def get_config(self): - return super().get_config() with mirrored_strategy.scope(): - text_encoder_pooled_output = TextEncoderPooledOutput( - params['max_seq_lengths'], - model_type=params['model_type'], - trainable=True - ) - - fe = FeatureEncoder(train) - inputs, preprocessing_head = fe.build_model_head(model=ptos_prototype, text_encoder=text_encoder_pooled_output) - - cls_dropout = tf.keras.layers.Dropout(params['cls_dropout_rate'], name="cls_dropout") - outputs = cls_dropout(preprocessing_head) - outputs = tf.keras.layers.Dense(8, name="output", dtype="float32")(outputs) - - model = tf.keras.Model( - inputs=inputs, - outputs=outputs - ) - pr_auc = tf.keras.metrics.AUC(curve="PR", num_thresholds=1000, multi_label=True, from_logits=True) - - custom_loss = lambda y_true, y_pred: utils.multilabel_weighted_loss(y_true, y_pred, weights=pos_weight_tensor) - optimizer = create_optimizer( - init_lr=params["lr"], - num_train_steps=(params["epochs"] * params["steps_per_epoch"]), - num_warmup_steps=params["warmup_steps"], - optimizer_type=params["optimizer_type"], - ) - if params.get("mixed_precision"): - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) - - model.compile( - optimizer=optimizer, - loss=custom_loss, - metrics=[pr_auc] - ) + text_encoder_pooled_output = TextEncoderPooledOutput( + params["max_seq_lengths"], model_type=params["model_type"], trainable=True + ) + + fe = FeatureEncoder(train) + inputs, preprocessing_head = fe.build_model_head( + model=ptos_prototype, text_encoder=text_encoder_pooled_output + ) + + cls_dropout = tf.keras.layers.Dropout( + params["cls_dropout_rate"], name="cls_dropout" + ) + outputs = cls_dropout(preprocessing_head) + outputs = tf.keras.layers.Dense(8, name="output", dtype="float32")(outputs) + + model = tf.keras.Model(inputs=inputs, outputs=outputs) + pr_auc = tf.keras.metrics.AUC( + curve="PR", num_thresholds=1000, multi_label=True, from_logits=True + ) + + custom_loss = lambda y_true, y_pred: utils.multilabel_weighted_loss( + y_true, y_pred, weights=pos_weight_tensor + ) + optimizer = create_optimizer( + init_lr=params["lr"], + num_train_steps=(params["epochs"] * params["steps_per_epoch"]), + num_warmup_steps=params["warmup_steps"], + optimizer_type=params["optimizer_type"], + ) + if params.get("mixed_precision"): + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( + optimizer + ) + + model.compile(optimizer=optimizer, loss=custom_loss, metrics=[pr_auc]) model.weights model.summary() pr_auc.name import getpass + import wandb from wandb.keras import WandbCallback + try: - wandb_key = ... - wandb.login(...) - run = wandb.init(project='ptos_with_media', - group='new-split-trains', - notes='tweet text with only (num_media, precision_nsfw). 
on full train set, new split.', - entity='absv', - config=params, - name='tweet-text-w-nsfw-1.1', - sync_tensorboard=True) + wandb_key = ... + wandb.login(...) + run = wandb.init( + project="ptos_with_media", + group="new-split-trains", + notes="tweet text with only (num_media, precision_nsfw). on full train set, new split.", + entity="absv", + config=params, + name="tweet-text-w-nsfw-1.1", + sync_tensorboard=True, + ) except FileNotFoundError: - print('Wandb key not found') - run = wandb.init(mode='disabled') + print("Wandb key not found") + run = wandb.init(mode="disabled") import datetime import os @@ -179,27 +204,34 @@ def get_config(self): print("Saving model checkpoints here: ", checkpoint_path) cp_callback = tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(checkpoint_path, "model.{epoch:04d}.tf"), - verbose=1, - monitor=f'val_{pr_auc.name}', - mode='max', - save_freq='epoch', - save_best_only=True + filepath=os.path.join(checkpoint_path, "model.{epoch:04d}.tf"), + verbose=1, + monitor=f"val_{pr_auc.name}", + mode="max", + save_freq="epoch", + save_best_only=True, ) -early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=7, - monitor=f"val_{pr_auc.name}", - mode="max") +early_stopping_callback = tf.keras.callbacks.EarlyStopping( + patience=7, monitor=f"val_{pr_auc.name}", mode="max" +) -model.fit(train_ds, epochs=params["epochs"], validation_data=val_ds, callbacks=[cp_callback, early_stopping_callback], - steps_per_epoch=params["steps_per_epoch"], - verbose=2) +model.fit( + train_ds, + epochs=params["epochs"], + validation_data=val_ds, + callbacks=[cp_callback, early_stopping_callback], + steps_per_epoch=params["steps_per_epoch"], + verbose=2, +) import tensorflow_hub as hub gs_model_path = ... reloaded_keras_layer = hub.KerasLayer(gs_model_path) -inputs = tf.keras.layers.Input(name="tweet__core__tweet__text", shape=(1,), dtype=tf.string) +inputs = tf.keras.layers.Input( + name="tweet__core__tweet__text", shape=(1,), dtype=tf.string +) output = reloaded_keras_layer(inputs) v7_model = tf.keras.models.Model(inputs=inputs, outputs=output) pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc") @@ -210,7 +242,7 @@ def get_config(self): candidate_model = model with mirrored_strategy.scope(): - candidate_eval = candidate_model.evaluate(val_ds) + candidate_eval = candidate_model.evaluate(val_ds) test_query = f""" SELECT @@ -229,48 +261,64 @@ def get_config(self): test_only_media = test.filter(lambda x, y: tf.equal(x["has_media"], True)) test_only_nsfw = test.filter(lambda x, y: tf.greater_equal(x["precision_nsfw"], 0.95)) test_no_media = test.filter(lambda x, y: tf.equal(x["has_media"], False)) -test_media_not_nsfw = test.filter(lambda x, y: tf.logical_and(tf.equal(x["has_media"], True), tf.less(x["precision_nsfw"], 0.95))) +test_media_not_nsfw = test.filter( + lambda x, y: tf.logical_and( + tf.equal(x["has_media"], True), tf.less(x["precision_nsfw"], 0.95) + ) +) for d in [test, test_only_media, test_only_nsfw, test_no_media, test_media_not_nsfw]: - print(d.reduce(0, lambda x, _: x + 1).numpy()) + print(d.reduce(0, lambda x, _: x + 1).numpy()) -from notebook_eval_utils import SparseMultilabelEvaluator, EvalConfig from dataclasses import asdict +from notebook_eval_utils import EvalConfig, SparseMultilabelEvaluator + + def display_metrics(probs, targets, labels=labels): - eval_config = EvalConfig(prediction_threshold=0.5, precision_k=0.9) - for eval_mode, y_mask in [("implicit", np.ones(targets.shape))]: - print("Evaluation mode", eval_mode) - metrics = 
SparseMultilabelEvaluator.evaluate( - targets, np.array(probs), y_mask, classes=labels, eval_config=eval_config - ) - metrics_df = pd.DataFrame.from_dict(asdict(metrics)["per_topic_metrics"]).transpose() - metrics_df["pos_to_neg"] = metrics_df["num_pos_samples"] / (metrics_df["num_neg_samples"] + 1) - display(metrics_df.median()) - display(metrics_df) - return metrics_df + eval_config = EvalConfig(prediction_threshold=0.5, precision_k=0.9) + for eval_mode, y_mask in [("implicit", np.ones(targets.shape))]: + print("Evaluation mode", eval_mode) + metrics = SparseMultilabelEvaluator.evaluate( + targets, np.array(probs), y_mask, classes=labels, eval_config=eval_config + ) + metrics_df = pd.DataFrame.from_dict( + asdict(metrics)["per_topic_metrics"] + ).transpose() + metrics_df["pos_to_neg"] = metrics_df["num_pos_samples"] / ( + metrics_df["num_neg_samples"] + 1 + ) + display(metrics_df.median()) + display(metrics_df) + return metrics_df def eval_model(model, df): - with mirrored_strategy.scope(): - targets = np.stack(list(df.map(lambda x, y: y).as_numpy_iterator()), axis=0) - df = df.padded_batch(BATCH_SIZE) - preds = model.predict(df) - return display_metrics(preds, targets) - -subsets = {"test": test, - "test_only_media": test_only_media, - "test_only_nsfw": test_only_nsfw, - "test_no_media": test_no_media, - "test_media_not_nsfw": test_media_not_nsfw} + with mirrored_strategy.scope(): + targets = np.stack(list(df.map(lambda x, y: y).as_numpy_iterator()), axis=0) + df = df.padded_batch(BATCH_SIZE) + preds = model.predict(df) + return display_metrics(preds, targets) + + +subsets = { + "test": test, + "test_only_media": test_only_media, + "test_only_nsfw": test_only_nsfw, + "test_no_media": test_no_media, + "test_media_not_nsfw": test_media_not_nsfw, +} metrics = {} for name, df in subsets.items(): - metrics[name] = eval_model(candidate_model, df) + metrics[name] = eval_model(candidate_model, df) [(name, m.pr_auc) for name, m in metrics.items()] -for name, x in [(name, m.pr_auc.to_string(index=False).strip().split("\n")) for name, m in metrics.items()]: - print(name) - for y in x: - print(y.strip(), end="\t") - print(".") +for name, x in [ + (name, m.pr_auc.to_string(index=False).strip().split("\n")) + for name, m in metrics.items() +]: + print(name) + for y in x: + print(y.strip(), end="\t") + print(".") for d in [test, test_only_media, test_only_nsfw, test_no_media, test_media_not_nsfw]: - print(d.reduce(0, lambda x, _: x + 1).numpy()) \ No newline at end of file + print(d.reduce(0, lambda x, _: x + 1).numpy()) diff --git a/trust_and_safety_models/nsfw/nsfw_media.py b/trust_and_safety_models/nsfw/nsfw_media.py index b5dfebb65..4975b4b32 100644 --- a/trust_and_safety_models/nsfw/nsfw_media.py +++ b/trust_and_safety_models/nsfw/nsfw_media.py @@ -1,51 +1,55 @@ -import kerastuner as kt +import glob import math +import os +import random + +import kerastuner as kt import numpy as np import pandas as pd -import random import sklearn.metrics import tensorflow as tf -import os -import glob - -from tqdm import tqdm +from google.cloud import storage from matplotlib import pyplot as plt -from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense -from google.cloud import storage +from tensorflow.keras.models import Sequential +from tqdm import tqdm -physical_devices = tf.config.list_physical_devices('GPU') +physical_devices = tf.config.list_physical_devices("GPU") physical_devices -tf.config.set_visible_devices([tf.config.PhysicalDevice(name='/physical_device:GPU:1', 
device_type='GPU')], 'GPU') -tf.config.get_visible_devices('GPU') +tf.config.set_visible_devices( + [tf.config.PhysicalDevice(name="/physical_device:GPU:1", device_type="GPU")], "GPU" +) +tf.config.get_visible_devices("GPU") + def decode_fn_embedding(example_proto): - - feature_description = { - "embedding": tf.io.FixedLenFeature([256], dtype=tf.float32), - "labels": tf.io.FixedLenFeature([], dtype=tf.int64), - } - - example = tf.io.parse_single_example( - example_proto, - feature_description - ) - - return example - -def preprocess_embedding_example(example_dict, positive_label=1, features_as_dict=False): - labels = example_dict["labels"] - label = tf.math.reduce_any(labels == positive_label) - label = tf.cast(label, tf.int32) - embedding = example_dict["embedding"] - - if features_as_dict: - features = {"embedding": embedding} - else: - features = embedding - - return features, label + feature_description = { + "embedding": tf.io.FixedLenFeature([256], dtype=tf.float32), + "labels": tf.io.FixedLenFeature([], dtype=tf.int64), + } + + example = tf.io.parse_single_example(example_proto, feature_description) + + return example + + +def preprocess_embedding_example( + example_dict, positive_label=1, features_as_dict=False +): + labels = example_dict["labels"] + label = tf.math.reduce_any(labels == positive_label) + label = tf.cast(label, tf.int32) + embedding = example_dict["embedding"] + + if features_as_dict: + features = {"embedding": embedding} + else: + features = embedding + + return features, label + + input_root = ... sens_prev_input_root = ... @@ -58,161 +62,189 @@ def preprocess_embedding_example(example_dict, positive_label=1, features_as_dic validation_batch_size = 256 do_resample = False + + def class_func(features, label): - return label + return label + resample_fn = tf.data.experimental.rejection_resample( - class_func, target_dist = [0.5, 0.5], seed=0 + class_func, target_dist=[0.5, 0.5], seed=0 ) train_glob = f"{input_root}/train/tfrecord/*.tfrecord" train_files = tf.io.gfile.glob(train_glob) if use_sens_prev_data: - train_sens_prev_glob = f"{sens_prev_input_root}/train/tfrecord/*.tfrecord" - train_sens_prev_files = tf.io.gfile.glob(train_sens_prev_glob) - train_files = train_files + train_sens_prev_files - + train_sens_prev_glob = f"{sens_prev_input_root}/train/tfrecord/*.tfrecord" + train_sens_prev_files = tf.io.gfile.glob(train_sens_prev_glob) + train_files = train_files + train_sens_prev_files + random.shuffle(train_files) if not len(train_files): - raise ValueError(f"Did not find any train files matching {train_glob}") + raise ValueError(f"Did not find any train files matching {train_glob}") test_glob = f"{input_root}/test/tfrecord/*.tfrecord" -test_files = tf.io.gfile.glob(test_glob) +test_files = tf.io.gfile.glob(test_glob) if not len(test_files): - raise ValueError(f"Did not find any eval files matching {test_glob}") - + raise ValueError(f"Did not find any eval files matching {test_glob}") + test_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) -test_ds = test_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=test_batch_size) - +test_ds = test_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) +).batch(batch_size=test_batch_size) + if use_sens_prev_data: - test_sens_prev_glob = f"{sens_prev_input_root}/test/tfrecord/*.tfrecord" - test_sens_prev_files = tf.io.gfile.glob(test_sens_prev_glob) - - if not len(test_sens_prev_files): - raise ValueError(f"Did not find any eval 
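
rejection_resample yields (class, example) pairs rather than bare examples, which is why the training pipeline below strips the class id with .map(lambda _, b: (b)). A toy sketch of the same transformation on a 9:1 imbalanced dataset:

import tensorflow as tf

toy = tf.data.Dataset.from_tensor_slices(
    (tf.zeros(100), tf.constant([1] * 10 + [0] * 90))
)
balanced = toy.apply(
    tf.data.experimental.rejection_resample(
        class_func=lambda features, label: label, target_dist=[0.5, 0.5], seed=0
    )
).map(lambda cls, example: example)  # drop the class id, keep (features, label)
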
files matching {test_sens_prev_glob}") - - test_sens_prev_ds = tf.data.TFRecordDataset(test_sens_prev_files).map(decode_fn_embedding) - test_sens_prev_ds = test_sens_prev_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=test_batch_size) + test_sens_prev_glob = f"{sens_prev_input_root}/test/tfrecord/*.tfrecord" + test_sens_prev_files = tf.io.gfile.glob(test_sens_prev_glob) + + if not len(test_sens_prev_files): + raise ValueError(f"Did not find any eval files matching {test_sens_prev_glob}") + + test_sens_prev_ds = tf.data.TFRecordDataset(test_sens_prev_files).map( + decode_fn_embedding + ) + test_sens_prev_ds = test_sens_prev_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=test_batch_size) train_ds = tf.data.TFRecordDataset(train_files).map(decode_fn_embedding) -train_ds = train_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)) +train_ds = train_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) +) if do_resample: - train_ds = train_ds.apply(resample_fn).map(lambda _,b:(b)) + train_ds = train_ds.apply(resample_fn).map(lambda _, b: (b)) train_ds = train_ds.batch(batch_size=256).shuffle(buffer_size=10) train_ds = train_ds.repeat() - - -if has_validation_data: - eval_glob = f"{input_root}/validation/tfrecord/*.tfrecord" - eval_files = tf.io.gfile.glob(eval_glob) - - if use_sens_prev_data: - eval_sens_prev_glob = f"{sens_prev_input_root}/validation/tfrecord/*.tfrecord" - eval_sens_prev_files = tf.io.gfile.glob(eval_sens_prev_glob) - eval_files = eval_files + eval_sens_prev_files - - - if not len(eval_files): - raise ValueError(f"Did not find any eval files matching {eval_glob}") - - eval_ds = tf.data.TFRecordDataset(eval_files).map(decode_fn_embedding) - eval_ds = eval_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=validation_batch_size) + + +if has_validation_data: + eval_glob = f"{input_root}/validation/tfrecord/*.tfrecord" + eval_files = tf.io.gfile.glob(eval_glob) + + if use_sens_prev_data: + eval_sens_prev_glob = f"{sens_prev_input_root}/validation/tfrecord/*.tfrecord" + eval_sens_prev_files = tf.io.gfile.glob(eval_sens_prev_glob) + eval_files = eval_files + eval_sens_prev_files + + if not len(eval_files): + raise ValueError(f"Did not find any eval files matching {eval_glob}") + + eval_ds = tf.data.TFRecordDataset(eval_files).map(decode_fn_embedding) + eval_ds = eval_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=validation_batch_size) else: - - eval_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) - eval_ds = eval_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=validation_batch_size) + eval_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) + eval_ds = eval_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=validation_batch_size) check_ds = tf.data.TFRecordDataset(train_files).map(decode_fn_embedding) cnt = 0 pos_cnt = 0 for example in tqdm(check_ds): - label = example['labels'] - if label == 1: - pos_cnt += 1 - cnt += 1 -print(f'{cnt} train entries with {pos_cnt} positive') + label = example["labels"] + if label == 1: + pos_cnt += 1 + cnt += 1 +print(f"{cnt} train entries with {pos_cnt} positive") metrics = [] metrics.append( - tf.keras.metrics.PrecisionAtRecall( - recall=0.9, 
num_thresholds=200, class_id=None, name=None, dtype=None - ) + tf.keras.metrics.PrecisionAtRecall( + recall=0.9, num_thresholds=200, class_id=None, name=None, dtype=None + ) ) metrics.append( - tf.keras.metrics.AUC( - num_thresholds=200, - curve="PR", - ) + tf.keras.metrics.AUC( + num_thresholds=200, + curve="PR", + ) ) + + def build_model(hp): - model = Sequential() + model = Sequential() + + optimizer = tf.keras.optimizers.Adam( + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-08, + amsgrad=False, + name="Adam", + ) + + activation = hp.Choice("activation", ["tanh", "gelu"]) + kernel_initializer = hp.Choice( + "kernel_initializer", ["he_uniform", "glorot_uniform"] + ) + for i in range(hp.Int("num_layers", 1, 2)): + model.add(tf.keras.layers.BatchNormalization()) + + units = hp.Int("units", min_value=128, max_value=256, step=128) + + if i == 0: + model.add( + Dense( + units=units, + activation=activation, + kernel_initializer=kernel_initializer, + input_shape=(None, 256), + ) + ) + else: + model.add( + Dense( + units=units, + activation=activation, + kernel_initializer=kernel_initializer, + ) + ) + + model.add(Dense(1, activation="sigmoid", kernel_initializer=kernel_initializer)) + model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics) + + return model - optimizer = tf.keras.optimizers.Adam( - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-08, - amsgrad=False, - name="Adam", - ) - - activation=hp.Choice("activation", ["tanh", "gelu"]) - kernel_initializer=hp.Choice("kernel_initializer", ["he_uniform", "glorot_uniform"]) - for i in range(hp.Int("num_layers", 1, 2)): - model.add(tf.keras.layers.BatchNormalization()) - - units=hp.Int("units", min_value=128, max_value=256, step=128) - - if i == 0: - model.add( - Dense( - units=units, - activation=activation, - kernel_initializer=kernel_initializer, - input_shape=(None, 256) - ) - ) - else: - model.add( - Dense( - units=units, - activation=activation, - kernel_initializer=kernel_initializer, - ) - ) - - model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel_initializer)) - model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics) - - return model tuner = kt.tuners.BayesianOptimization( - build_model, - objective=kt.Objective('val_loss', direction="min"), - max_trials=30, - directory='tuner_dir', - project_name='with_twitter_clip') + build_model, + objective=kt.Objective("val_loss", direction="min"), + max_trials=30, + directory="tuner_dir", + project_name="with_twitter_clip", +) -callbacks = [tf.keras.callbacks.EarlyStopping( - monitor='val_loss', min_delta=0, patience=5, verbose=0, - mode='auto', baseline=None, restore_best_weights=True -)] +callbacks = [ + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", + min_delta=0, + patience=5, + verbose=0, + mode="auto", + baseline=None, + restore_best_weights=True, + ) +] steps_per_epoch = 400 -tuner.search(train_ds, - epochs=100, - batch_size=256, - steps_per_epoch=steps_per_epoch, - verbose=2, - validation_data=eval_ds, - callbacks=callbacks) +tuner.search( + train_ds, + epochs=100, + batch_size=256, + steps_per_epoch=steps_per_epoch, + verbose=2, + validation_data=eval_ds, + callbacks=callbacks, +) tuner.results_summary() models = tuner.get_best_models(num_models=2) @@ -230,109 +262,126 @@ def build_model(hp): epsilon=1e-08, amsgrad=False, name="Adam", - ) -best_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics) +) +best_model.compile(optimizer=optimizer, 
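
Rather than reusing the already-trained model object returned by get_best_models, the winning configuration can also be read back and used to build a fresh, untrained model; a short sketch with the standard keras-tuner API:

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.get("num_layers"), best_hps.get("units"), best_hps.get("activation"))
fresh_model = build_model(best_hps)  # same architecture, freshly initialized weights
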
loss="binary_crossentropy", metrics=metrics) best_model.summary() -callbacks = [tf.keras.callbacks.EarlyStopping( - monitor='val_loss', min_delta=0, patience=10, verbose=0, - mode='auto', baseline=None, restore_best_weights=True -)] -history = best_model.fit(train_ds, epochs=100, validation_data=eval_ds, steps_per_epoch=steps_per_epoch, callbacks=callbacks) +callbacks = [ + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", + min_delta=0, + patience=10, + verbose=0, + mode="auto", + baseline=None, + restore_best_weights=True, + ) +] +history = best_model.fit( + train_ds, + epochs=100, + validation_data=eval_ds, + steps_per_epoch=steps_per_epoch, + callbacks=callbacks, +) -model_name = 'twitter_hypertuned' -model_path = f'models/nsfw_Keras_with_CLIP_{model_name}' +model_name = "twitter_hypertuned" +model_path = f"models/nsfw_Keras_with_CLIP_{model_name}" tf.keras.models.save_model(best_model, model_path) + def copy_local_directory_to_gcs(local_path, bucket, gcs_path): """Recursively copy a directory of files to GCS. local_path should be a directory and not have a trailing slash. """ assert os.path.isdir(local_path) - for local_file in glob.glob(local_path + '/**'): + for local_file in glob.glob(local_path + "/**"): if not os.path.isfile(local_file): dir_name = os.path.basename(os.path.normpath(local_file)) copy_local_directory_to_gcs(local_file, bucket, f"{gcs_path}/{dir_name}") else: - remote_path = os.path.join(gcs_path, local_file[1 + len(local_path) :]) - blob = bucket.blob(remote_path) - blob.upload_from_filename(local_file) + remote_path = os.path.join(gcs_path, local_file[1 + len(local_path) :]) + blob = bucket.blob(remote_path) + blob.upload_from_filename(local_file) + client = storage.Client(project=...) bucket = client.get_bucket(...) copy_local_directory_to_gcs(model_path, bucket, model_path) -copy_local_directory_to_gcs('tuner_dir', bucket, 'tuner_dir') +copy_local_directory_to_gcs("tuner_dir", bucket, "tuner_dir") loaded_model = tf.keras.models.load_model(model_path) print(history.history.keys()) -plt.figure(figsize = (20, 5)) +plt.figure(figsize=(20, 5)) plt.subplot(1, 3, 1) -plt.plot(history.history['auc']) -plt.plot(history.history['val_auc']) -plt.title('model auc') -plt.ylabel('auc') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["auc"]) +plt.plot(history.history["val_auc"]) +plt.title("model auc") +plt.ylabel("auc") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") plt.subplot(1, 3, 2) -plt.plot(history.history['loss']) -plt.plot(history.history['val_loss']) -plt.title('model loss') -plt.ylabel('loss') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["loss"]) +plt.plot(history.history["val_loss"]) +plt.title("model loss") +plt.ylabel("loss") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") plt.subplot(1, 3, 3) -plt.plot(history.history['precision_at_recall']) -plt.plot(history.history['val_precision_at_recall']) -plt.title('model precision at 0.9 recall') -plt.ylabel('precision_at_recall') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["precision_at_recall"]) +plt.plot(history.history["val_precision_at_recall"]) +plt.title("model precision at 0.9 recall") +plt.ylabel("precision_at_recall") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") -plt.savefig('history_with_twitter_clip.pdf') +plt.savefig("history_with_twitter_clip.pdf") test_labels = [] test_preds = [] for 
batch_features, batch_labels in tqdm(test_ds): - test_preds.extend(loaded_model.predict_proba(batch_features)) - test_labels.extend(batch_labels.numpy()) - + test_preds.extend(loaded_model.predict_proba(batch_features)) + test_labels.extend(batch_labels.numpy()) + test_sens_prev_labels = [] test_sens_prev_preds = [] for batch_features, batch_labels in tqdm(test_sens_prev_ds): - test_sens_prev_preds.extend(loaded_model.predict_proba(batch_features)) - test_sens_prev_labels.extend(batch_labels.numpy()) - + test_sens_prev_preds.extend(loaded_model.predict_proba(batch_features)) + test_sens_prev_labels.extend(batch_labels.numpy()) + n_test_pos = 0 n_test_neg = 0 n_test = 0 for label in test_labels: - n_test +=1 - if label == 1: - n_test_pos +=1 - else: - n_test_neg +=1 + n_test += 1 + if label == 1: + n_test_pos += 1 + else: + n_test_neg += 1 -print(f'n_test = {n_test}, n_pos = {n_test_pos}, n_neg = {n_test_neg}') +print(f"n_test = {n_test}, n_pos = {n_test_pos}, n_neg = {n_test_neg}") n_test_sens_prev_pos = 0 n_test_sens_prev_neg = 0 n_test_sens_prev = 0 for label in test_sens_prev_labels: - n_test_sens_prev +=1 - if label == 1: - n_test_sens_prev_pos +=1 - else: - n_test_sens_prev_neg +=1 + n_test_sens_prev += 1 + if label == 1: + n_test_sens_prev_pos += 1 + else: + n_test_sens_prev_neg += 1 -print(f'n_test_sens_prev = {n_test_sens_prev}, n_pos_sens_prev = {n_test_sens_prev_pos}, n_neg = {n_test_sens_prev_neg}') +print( + f"n_test_sens_prev = {n_test_sens_prev}, n_pos_sens_prev = {n_test_sens_prev_pos}, n_neg = {n_test_sens_prev_neg}" +) test_weights = np.ones(np.asarray(test_preds).shape) @@ -340,9 +389,7 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): test_preds = np.asarray(test_preds) test_weights = np.asarray(test_weights) -pr = sklearn.metrics.precision_recall_curve( - test_labels, - test_preds) +pr = sklearn.metrics.precision_recall_curve(test_labels, test_preds) auc = sklearn.metrics.auc(pr[1], pr[0]) plt.plot(pr[1], pr[0]) @@ -355,25 +402,26 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): test_sens_prev_weights = np.asarray(test_sens_prev_weights) pr_sens_prev = sklearn.metrics.precision_recall_curve( - test_sens_prev_labels, - test_sens_prev_preds) + test_sens_prev_labels, test_sens_prev_preds +) auc_sens_prev = sklearn.metrics.auc(pr_sens_prev[1], pr_sens_prev[0]) plt.plot(pr_sens_prev[1], pr_sens_prev[0]) plt.title("nsfw (sens prev test set)") df = pd.DataFrame( - { - "label": test_labels.squeeze(), - "preds_keras": np.asarray(test_preds).flatten(), - }) + { + "label": test_labels.squeeze(), + "preds_keras": np.asarray(test_preds).flatten(), + } +) plt.figure(figsize=(15, 10)) df["preds_keras"].hist() plt.title("Keras predictions", size=20) -plt.xlabel('score') +plt.xlabel("score") plt.ylabel("freq") -plt.figure(figsize = (20, 5)) +plt.figure(figsize=(20, 5)) plt.subplot(1, 3, 1) plt.plot(pr[2], pr[0][0:-1]) @@ -393,15 +441,19 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): plt.xlabel("recall") plt.ylabel("precision") -plt.savefig('with_twitter_clip.pdf') +plt.savefig("with_twitter_clip.pdf") + def get_point_for_recall(recall_value, recall, precision): - idx = np.argmin(np.abs(recall - recall_value)) - return (recall[idx], precision[idx]) + idx = np.argmin(np.abs(recall - recall_value)) + return (recall[idx], precision[idx]) + def get_point_for_precision(precision_value, recall, precision): - idx = np.argmin(np.abs(precision - precision_value)) - return (recall[idx], precision[idx]) + idx = np.argmin(np.abs(precision - 
precision_value)) + return (recall[idx], precision[idx]) + + precision, recall, thresholds = pr auc_precision_recall = sklearn.metrics.auc(recall, precision) @@ -416,23 +468,23 @@ def get_point_for_precision(precision_value, recall, precision): ptAt50 = get_point_for_recall(0.5, recall, precision) print(ptAt50) -plt.plot( [ptAt50[0],ptAt50[0]], [0,ptAt50[1]], 'r') -plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], 'r') +plt.plot([ptAt50[0], ptAt50[0]], [0, ptAt50[1]], "r") +plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], "r") ptAt90 = get_point_for_recall(0.9, recall, precision) print(ptAt90) -plt.plot( [ptAt90[0],ptAt90[0]], [0,ptAt90[1]], 'b') -plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], 'b') +plt.plot([ptAt90[0], ptAt90[0]], [0, ptAt90[1]], "b") +plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], "b") ptAt50fmt = "%.4f" % ptAt50[1] ptAt90fmt = "%.4f" % ptAt90[1] aucFmt = "%.4f" % auc_precision_recall plt.title( - f"Keras (nsfw MU test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...}} ({...} pos), N_test={n_test} ({n_test_pos} pos)", - size=20 + f"Keras (nsfw MU test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test} ({n_test_pos} pos)", + size=20, ) plt.subplots_adjust(top=0.72) -plt.savefig('recall_precision_nsfw_Keras_with_twitter_CLIP_MU_test.pdf') +plt.savefig("recall_precision_nsfw_Keras_with_twitter_CLIP_MU_test.pdf") precision, recall, thresholds = pr_sens_prev @@ -447,20 +499,20 @@ def get_point_for_precision(precision_value, recall, precision): ptAt50 = get_point_for_recall(0.5, recall, precision) print(ptAt50) -plt.plot( [ptAt50[0],ptAt50[0]], [0,ptAt50[1]], 'r') -plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], 'r') +plt.plot([ptAt50[0], ptAt50[0]], [0, ptAt50[1]], "r") +plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], "r") ptAt90 = get_point_for_recall(0.9, recall, precision) print(ptAt90) -plt.plot( [ptAt90[0],ptAt90[0]], [0,ptAt90[1]], 'b') -plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], 'b') +plt.plot([ptAt90[0], ptAt90[0]], [0, ptAt90[1]], "b") +plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], "b") ptAt50fmt = "%.4f" % ptAt50[1] ptAt90fmt = "%.4f" % ptAt90[1] aucFmt = "%.4f" % auc_precision_recall plt.title( - f"Keras (nsfw sens prev test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test_sens_prev} ({n_test_sens_prev_pos} pos)", - size=20 + f"Keras (nsfw sens prev test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test_sens_prev} ({n_test_sens_prev_pos} pos)", + size=20, ) plt.subplots_adjust(top=0.72) -plt.savefig('recall_precision_nsfw_Keras_with_twitter_CLIP_sens_prev_test.pdf') \ No newline at end of file +plt.savefig("recall_precision_nsfw_Keras_with_twitter_CLIP_sens_prev_test.pdf") diff --git a/trust_and_safety_models/nsfw/nsfw_text.py b/trust_and_safety_models/nsfw/nsfw_text.py index 980fc8fd4..0d7735371 100644 --- a/trust_and_safety_models/nsfw/nsfw_text.py +++ b/trust_and_safety_models/nsfw/nsfw_text.py @@ -1,41 +1,47 @@ +import os +import re from datetime import datetime from functools import reduce -import os + +import matplotlib.pyplot as plt import pandas as pd -import re -from sklearn.metrics import average_precision_score, classification_report, precision_recall_curve, PrecisionRecallDisplay -from sklearn.model_selection import train_test_split import tensorflow as tf -import matplotlib.pyplot as plt -import re - +from sklearn.metrics import ( + 
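
A tiny self-contained example of the operating-point helpers used for the plots above: pick the precision-recall point whose recall is closest to a requested value.

import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array([0, 0, 1, 1, 1])
y_score = np.array([0.1, 0.6, 0.35, 0.8, 0.9])
precision, recall, thresholds = precision_recall_curve(y_true, y_score)
idx = np.argmin(np.abs(recall - 0.9))
print(recall[idx], precision[idx])  # the point annotated as "p @ r=0.9"
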
PrecisionRecallDisplay, + average_precision_score, + classification_report, + precision_recall_curve, +) +from sklearn.model_selection import train_test_split from twitter.cuad.representation.models.optimization import create_optimizer from twitter.cuad.representation.models.text_encoder import TextEncoder -pd.set_option('display.max_colwidth', None) -pd.set_option('display.expand_frame_repr', False) +pd.set_option("display.max_colwidth", None) +pd.set_option("display.expand_frame_repr", False) print(tf.__version__) print(tf.config.list_physical_devices()) -log_path = os.path.join('pnsfwtweettext_model_runs', datetime.now().strftime('%Y-%m-%d_%H.%M.%S')) +log_path = os.path.join( + "pnsfwtweettext_model_runs", datetime.now().strftime("%Y-%m-%d_%H.%M.%S") +) -tweet_text_feature = 'text' +tweet_text_feature = "text" params = { - 'batch_size': 32, - 'max_seq_lengths': 256, - 'model_type': 'twitter_bert_base_en_uncased_augmented_mlm', - 'trainable_text_encoder': True, - 'lr': 5e-5, - 'epochs': 10, + "batch_size": 32, + "max_seq_lengths": 256, + "model_type": "twitter_bert_base_en_uncased_augmented_mlm", + "trainable_text_encoder": True, + "lr": 5e-5, + "epochs": 10, } REGEX_PATTERNS = [ - r'^RT @[A-Za-z0-9_]+: ', + r"^RT @[A-Za-z0-9_]+: ", r"@[A-Za-z0-9_]+", - r'https:\/\/t\.co\/[A-Za-z0-9]{10}', - r'@\?\?\?\?\?', + r"https:\/\/t\.co\/[A-Za-z0-9]{10}", + r"@\?\?\?\?\?", ] EMOJI_PATTERN = re.compile( @@ -52,34 +58,40 @@ "\U0001FA70-\U0001FAFF" "\U00002702-\U000027B0" "])" - ) +) + def clean_tweet(text): for pattern in REGEX_PATTERNS: - text = re.sub(pattern, '', text) + text = re.sub(pattern, "", text) + + text = re.sub(EMOJI_PATTERN, r" \1 ", text) + + text = re.sub(r"\n", " ", text) - text = re.sub(EMOJI_PATTERN, r' \1 ', text) - - text = re.sub(r'\n', ' ', text) - return text.strip().lower() -df['processed_text'] = df['text'].astype(str).map(clean_tweet) +df["processed_text"] = df["text"].astype(str).map(clean_tweet) df.sample(10) -X_train, X_val, y_train, y_val = train_test_split(df[['processed_text']], df['is_nsfw'], test_size=0.1, random_state=1) +X_train, X_val, y_train, y_val = train_test_split( + df[["processed_text"]], df["is_nsfw"], test_size=0.1, random_state=1 +) + def df_to_ds(X, y, shuffle=False): - ds = tf.data.Dataset.from_tensor_slices(( - X.values, - tf.one_hot(tf.cast(y.values, tf.int32), depth=2, axis=-1) - )) - - if shuffle: - ds = ds.shuffle(1000, seed=1, reshuffle_each_iteration=True) - - return ds.map(lambda text, label: ({ tweet_text_feature: text }, label)).batch(params['batch_size']) + ds = tf.data.Dataset.from_tensor_slices( + (X.values, tf.one_hot(tf.cast(y.values, tf.int32), depth=2, axis=-1)) + ) + + if shuffle: + ds = ds.shuffle(1000, seed=1, reshuffle_each_iteration=True) + + return ds.map(lambda text, label: ({tweet_text_feature: text}, label)).batch( + params["batch_size"] + ) + ds_train = df_to_ds(X_train, y_train, shuffle=True) ds_val = df_to_ds(X_val, y_val) @@ -87,51 +99,47 @@ def df_to_ds(X, y, shuffle=False): inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name=tweet_text_feature) encoder = TextEncoder( - max_seq_lengths=params['max_seq_lengths'], - model_type=params['model_type'], - trainable=params['trainable_text_encoder'], - local_preprocessor_path='demo-preprocessor' + max_seq_lengths=params["max_seq_lengths"], + model_type=params["model_type"], + trainable=params["trainable_text_encoder"], + local_preprocessor_path="demo-preprocessor", ) embedding = encoder([inputs])["pooled_output"] -predictions = tf.keras.layers.Dense(2, 
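
A worked example of the clean_tweet normalization defined above (output shown approximately, since part of EMOJI_PATTERN's range list sits outside this hunk):

print(clean_tweet("RT @user123: Check this https://t.co/AbCdEfGhIj\nNSFW 🍑"))
# -> roughly "check this  nsfw  🍑": the RT prefix, @handle and t.co URL are
#    stripped, the emoji is space-padded, newlines collapse, text is lowercased.
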
activation='softmax')(embedding) +predictions = tf.keras.layers.Dense(2, activation="softmax")(embedding) model = tf.keras.models.Model(inputs=inputs, outputs=predictions) model.summary() optimizer = create_optimizer( - params['lr'], - params['epochs'] * len(ds_train), - 0, - weight_decay_rate=0.01, - optimizer_type='adamw' + params["lr"], + params["epochs"] * len(ds_train), + 0, + weight_decay_rate=0.01, + optimizer_type="adamw", ) bce = tf.keras.losses.BinaryCrossentropy(from_logits=False) -pr_auc = tf.keras.metrics.AUC(curve='PR', num_thresholds=1000, from_logits=False) +pr_auc = tf.keras.metrics.AUC(curve="PR", num_thresholds=1000, from_logits=False) model.compile(optimizer=optimizer, loss=bce, metrics=[pr_auc]) callbacks = [ - tf.keras.callbacks.EarlyStopping( - monitor='val_loss', - mode='min', - patience=1, - restore_best_weights=True - ), - tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(log_path, 'checkpoints', '{epoch:02d}'), - save_freq='epoch' - ), - tf.keras.callbacks.TensorBoard( - log_dir=os.path.join(log_path, 'scalars'), - update_freq='batch', - write_graph=False - ) + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", mode="min", patience=1, restore_best_weights=True + ), + tf.keras.callbacks.ModelCheckpoint( + filepath=os.path.join(log_path, "checkpoints", "{epoch:02d}"), save_freq="epoch" + ), + tf.keras.callbacks.TensorBoard( + log_dir=os.path.join(log_path, "scalars"), + update_freq="batch", + write_graph=False, + ), ] history = model.fit( - ds_train, - epochs=params['epochs'], - callbacks=callbacks, - validation_data=ds_val, - steps_per_epoch=len(ds_train) + ds_train, + epochs=params["epochs"], + callbacks=callbacks, + validation_data=ds_val, + steps_per_epoch=len(ds_train), ) model.predict(["xxx 🍑"]) diff --git a/trust_and_safety_models/toxicity/data/data_preprocessing.py b/trust_and_safety_models/toxicity/data/data_preprocessing.py index f7da608f6..16ad273c8 100644 --- a/trust_and_safety_models/toxicity/data/data_preprocessing.py +++ b/trust_and_safety_models/toxicity/data/data_preprocessing.py @@ -1,10 +1,8 @@ -from abc import ABC import re - -from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35 +from abc import ABC import numpy as np - +from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35 TOXIC_35_set = set(TOXIC_35) @@ -18,101 +16,119 @@ class DataframeCleaner(ABC): - def __init__(self): - pass + def __init__(self): + pass - def _clean(self, df): - return df + def _clean(self, df): + return df - def _systematic_preprocessing(self, df): - df.reset_index(inplace=True, drop=True) - if "media_url" in df.columns: - print(".... removing tweets with media") - df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0) - else: - print("WARNING you are not removing tweets with media to train a BERT model.") + def _systematic_preprocessing(self, df): + df.reset_index(inplace=True, drop=True) + if "media_url" in df.columns: + print(".... removing tweets with media") + df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0) + else: + print( + "WARNING you are not removing tweets with media to train a BERT model." + ) - print(".... deleting duplicates") - df.drop_duplicates("text", inplace=True, keep="last") - print(f"Got {df.shape[0]} after cleaning") + print(".... 
deleting duplicates") + df.drop_duplicates("text", inplace=True, keep="last") + print(f"Got {df.shape[0]} after cleaning") - return df.reset_index(inplace=False, drop=True) + return df.reset_index(inplace=False, drop=True) - def _postprocess(self, df, *args, **kwargs): - return df + def _postprocess(self, df, *args, **kwargs): + return df - def __call__(self, df, *args, **kwargs): - print(f"Got {df.shape[0]} before cleaning") + def __call__(self, df, *args, **kwargs): + print(f"Got {df.shape[0]} before cleaning") - df["raw_text"] = df.text - df = self._clean(df) + df["raw_text"] = df.text + df = self._clean(df) - df = self._systematic_preprocessing(df) + df = self._systematic_preprocessing(df) - return self._postprocess(df, *args, **kwargs) + return self._postprocess(df, *args, **kwargs) def mapping_func(el): - if el.aggregated_content in TOXIC_35_set: - return 2 - if el.label == 1: - return 1 - return 0 + if el.aggregated_content in TOXIC_35_set: + return 2 + if el.label == 1: + return 1 + return 0 class DefaultENNoPreprocessor(DataframeCleaner): - def _postprocess(self, df, *args, **kwargs): - if "toxic_count" in df.columns and "non_toxic_count" in df.columns: - df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count) - df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0) - - if "label_column" in kwargs and kwargs["label_column"] != "label": - if kwargs["label_column"] == "aggregated_content": - print("Replacing v3 label by v3.5 label.") - if "num_classes" in kwargs and kwargs["num_classes"] < 3: - df["label"] = np.where(df.aggregated_content.isin(TOXIC_35_set), 1, 0) - elif "num_classes" in kwargs and kwargs["num_classes"] == 3: - print("Making it a 3-class pb") - df["label"] = df.apply(mapping_func, axis=1) - else: - raise NotImplementedError - elif kwargs['label_column'] in df.columns: - df['label'] = df[kwargs['label_column']] - if kwargs['class_weight'] is not None: - df["class_weight"] = np.where(df['label'] == 1, 1-kwargs['class_weight'], - kwargs['class_weight']) - else: - raise NotImplementedError - - if "filter_low_agreements" in kwargs and kwargs["filter_low_agreements"] == True: - df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True) - raise NotImplementedError - - return df + def _postprocess(self, df, *args, **kwargs): + if "toxic_count" in df.columns and "non_toxic_count" in df.columns: + df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count) + df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0) + + if "label_column" in kwargs and kwargs["label_column"] != "label": + if kwargs["label_column"] == "aggregated_content": + print("Replacing v3 label by v3.5 label.") + if "num_classes" in kwargs and kwargs["num_classes"] < 3: + df["label"] = np.where( + df.aggregated_content.isin(TOXIC_35_set), 1, 0 + ) + elif "num_classes" in kwargs and kwargs["num_classes"] == 3: + print("Making it a 3-class pb") + df["label"] = df.apply(mapping_func, axis=1) + else: + raise NotImplementedError + elif kwargs["label_column"] in df.columns: + df["label"] = df[kwargs["label_column"]] + if kwargs["class_weight"] is not None: + df["class_weight"] = np.where( + df["label"] == 1, + 1 - kwargs["class_weight"], + kwargs["class_weight"], + ) + else: + raise NotImplementedError + + if ( + "filter_low_agreements" in kwargs + and kwargs["filter_low_agreements"] == True + ): + df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True) + raise NotImplementedError + + return df class DefaultENPreprocessor(DefaultENNoPreprocessor): - def 
_clean(self, adhoc_df): - print( - ".... removing \\n and replacing @mentions and URLs by placeholders. " - "Emoji filtering is not done." - ) - adhoc_df["text"] = [url_re.sub("URL", tweet) for tweet in adhoc_df.raw_text.values] - adhoc_df["text"] = [mention_re.sub("MENTION", tweet) for tweet in adhoc_df.text.values] - adhoc_df["text"] = [ - newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values - ] - adhoc_df["text"] = [and_re.sub("&", tweet) for tweet in adhoc_df.text.values] - - return adhoc_df + def _clean(self, adhoc_df): + print( + ".... removing \\n and replacing @mentions and URLs by placeholders. " + "Emoji filtering is not done." + ) + adhoc_df["text"] = [ + url_re.sub("URL", tweet) for tweet in adhoc_df.raw_text.values + ] + adhoc_df["text"] = [ + mention_re.sub("MENTION", tweet) for tweet in adhoc_df.text.values + ] + adhoc_df["text"] = [ + newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") + for tweet in adhoc_df.text.values + ] + adhoc_df["text"] = [and_re.sub("&", tweet) for tweet in adhoc_df.text.values] + + return adhoc_df class Defaulti18nPreprocessor(DataframeCleaner): - def _clean(self, adhoc_df): - print(".... removing @mentions, \\n and URLs. Emoji filtering is not done.") - adhoc_df["text"] = [urls_mentions_re.sub("", tweet) for tweet in adhoc_df.raw_text.values] - adhoc_df["text"] = [ - newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values - ] - - return adhoc_df + def _clean(self, adhoc_df): + print(".... removing @mentions, \\n and URLs. Emoji filtering is not done.") + adhoc_df["text"] = [ + urls_mentions_re.sub("", tweet) for tweet in adhoc_df.raw_text.values + ] + adhoc_df["text"] = [ + newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") + for tweet in adhoc_df.text.values + ] + + return adhoc_df diff --git a/trust_and_safety_models/toxicity/data/dataframe_loader.py b/trust_and_safety_models/toxicity/data/dataframe_loader.py index f3855d6b5..8c47175ed 100644 --- a/trust_and_safety_models/toxicity/data/dataframe_loader.py +++ b/trust_and_safety_models/toxicity/data/dataframe_loader.py @@ -1,24 +1,16 @@ +import pickle from abc import ABC, abstractmethod from datetime import date from importlib import import_module -import pickle - -from toxicity_ml_pipeline.settings.default_settings_tox import ( - CLIENT, - EXISTING_TASK_VERSIONS, - GCS_ADDRESS, - TRAINING_DATA_LOCATION, -) -from toxicity_ml_pipeline.utils.helpers import execute_command, execute_query -from toxicity_ml_pipeline.utils.queries import ( - FULL_QUERY, - FULL_QUERY_W_TWEET_TYPES, - PARSER_UDF, - QUERY_SETTINGS, -) import numpy as np import pandas +from toxicity_ml_pipeline.settings.default_settings_tox import ( + CLIENT, EXISTING_TASK_VERSIONS, GCS_ADDRESS, TRAINING_DATA_LOCATION) +from toxicity_ml_pipeline.utils.helpers import execute_command, execute_query +from toxicity_ml_pipeline.utils.queries import (FULL_QUERY, + FULL_QUERY_W_TWEET_TYPES, + PARSER_UDF, QUERY_SETTINGS) class DataframeLoader(ABC): diff --git a/trust_and_safety_models/toxicity/data/mb_generator.py b/trust_and_safety_models/toxicity/data/mb_generator.py index 58a89f8c5..efc6fe43d 100644 --- a/trust_and_safety_models/toxicity/data/mb_generator.py +++ b/trust_and_safety_models/toxicity/data/mb_generator.py @@ -1,284 +1,321 @@ -from importlib import import_module import os - -from toxicity_ml_pipeline.settings.default_settings_tox import ( - INNER_CV, - LOCAL_DIR, - MAX_SEQ_LENGTH, - NUM_PREFETCH, - NUM_WORKERS, - OUTER_CV, - TARGET_POS_PER_EPOCH, -) -from 
toxicity_ml_pipeline.utils.helpers import execute_command
+from importlib import import_module

 import numpy as np
 import pandas
-from sklearn.model_selection import StratifiedKFold
 import tensorflow as tf
-
+from sklearn.model_selection import StratifiedKFold
+from toxicity_ml_pipeline.settings.default_settings_tox import (
+    INNER_CV,
+    LOCAL_DIR,
+    MAX_SEQ_LENGTH,
+    NUM_PREFETCH,
+    NUM_WORKERS,
+    OUTER_CV,
+    TARGET_POS_PER_EPOCH,
+)
+from toxicity_ml_pipeline.utils.helpers import execute_command

 try:
-  from transformers import AutoTokenizer, DataCollatorWithPadding
+    from transformers import AutoTokenizer, DataCollatorWithPadding
 except ModuleNotFoundError:
-  print("...")
+    print("...")
 else:
-  from datasets import Dataset
+    from datasets import Dataset


 class BalancedMiniBatchLoader(object):
+    def __init__(
+        self,
+        fold,
+        mb_size,
+        seed,
+        perc_training_tox,
+        scope="TOX",
+        project=...,
+        dual_head=None,
+        n_outer_splits=None,
+        n_inner_splits=None,
+        sample_weights=None,
+        huggingface=False,
+    ):
+        if 0 >= perc_training_tox or perc_training_tox > 0.5:
+            raise ValueError("Perc_training_tox should be in ]0; 0.5]")
+
+        self.perc_training_tox = perc_training_tox
+        if not n_outer_splits:
+            n_outer_splits = OUTER_CV
+        if isinstance(n_outer_splits, int):
+            self.n_outer_splits = n_outer_splits
+            self.get_outer_fold = self._get_outer_cv_fold
+            if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold:
+                raise ValueError(
+                    f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [."
+                )
+
+        elif n_outer_splits == "time":
+            self.get_outer_fold = self._get_time_fold
+            if fold != "time":
+                raise ValueError(
+                    "To avoid repeating the same run many times, the external fold "
+                    "should be time when test data is split according to dates."
+                )
+            try:
+                setting_file = import_module(
+                    f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings"
+                )
+            except ModuleNotFoundError:
+                raise ValueError(
+                    f"You need to define a setting file for your project {project}."
+                )
+            self.test_begin_date = setting_file.TEST_BEGIN_DATE
+            self.test_end_date = setting_file.TEST_END_DATE
+
+        else:
+            raise ValueError(
+                f"Argument n_outer_splits should be either an integer or 'time'. 
Provided: {n_outer_splits}" + ) + + self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV + + self.seed = seed + self.mb_size = mb_size + self.fold = fold + + self.sample_weights = sample_weights + self.dual_head = dual_head + self.huggingface = huggingface + if self.huggingface: + self._load_tokenizer() + + def _load_tokenizer(self): + print("Making a local copy of Bertweet-base model") + local_model_dir = os.path.join(LOCAL_DIR, "models") + cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... {local_model_dir}" + execute_command(cmd) + + self.tokenizer = AutoTokenizer.from_pretrained( + os.path.join(local_model_dir, "bertweet-base"), normalization=True + ) + + def tokenize_function(self, el): + return self.tokenizer( + el["text"], + max_length=MAX_SEQ_LENGTH, + padding="max_length", + truncation=True, + add_special_tokens=True, + return_token_type_ids=False, + return_attention_mask=False, + ) + + def _get_stratified_kfold(self, n_splits): + return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed) + + def _get_time_fold(self, df): + test_begin_date = pandas.to_datetime(self.test_begin_date).date() + test_end_date = pandas.to_datetime(self.test_end_date).date() + print(f"Test is going from {test_begin_date} to {test_end_date}.") + test_data = df.query("@test_begin_date <= date <= @test_end_date") + + query = "date < @test_begin_date" + other_set = df.query(query) + return other_set, test_data + + def _get_outer_cv_fold(self, df): + labels = df.int_label + stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits) + + k = 0 + for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels): + if k == self.fold: + break + k += 1 + + train_data = df.iloc[train_index].copy() + test_data = df.iloc[test_index].copy() + + return train_data, test_data + + def get_steps_per_epoch(self, nb_pos_examples): + return int( + max(TARGET_POS_PER_EPOCH, nb_pos_examples) + / self.mb_size + / self.perc_training_tox + ) + + def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle=True): + huggingface_ds = Dataset.from_pandas(group).map( + self.tokenize_function, batched=True + ) + data_collator = DataCollatorWithPadding( + tokenizer=self.tokenizer, return_tensors="tf" + ) + tensorflow_ds = huggingface_ds.to_tf_dataset( + columns=["input_ids"], + label_cols=["labels"], + shuffle=shuffle, + batch_size=self.mb_size if mb_size is None else mb_size, + collate_fn=data_collator, ) - try: - setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings") - except ModuleNotFoundError: - raise ValueError(f"You need to define a setting file for your project {project}.") - self.test_begin_date = setting_file.TEST_BEGIN_DATE - self.test_end_date = setting_file.TEST_END_DATE - - else: - raise ValueError( - f"Argument n_outer_splits should either an integer or 'time'. Provided: {n_outer_splits}" - ) - - self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV - - self.seed = seed - self.mb_size = mb_size - self.fold = fold - - self.sample_weights = sample_weights - self.dual_head = dual_head - self.huggingface = huggingface - if self.huggingface: - self._load_tokenizer() - - def _load_tokenizer(self): - print("Making a local copy of Bertweet-base model") - local_model_dir = os.path.join(LOCAL_DIR, "models") - cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... 
{local_model_dir}" - execute_command(cmd) - - self.tokenizer = AutoTokenizer.from_pretrained( - os.path.join(local_model_dir, "bertweet-base"), normalization=True - ) - - def tokenize_function(self, el): - return self.tokenizer( - el["text"], - max_length=MAX_SEQ_LENGTH, - padding="max_length", - truncation=True, - add_special_tokens=True, - return_token_type_ids=False, - return_attention_mask=False, - ) - - def _get_stratified_kfold(self, n_splits): - return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed) - - def _get_time_fold(self, df): - test_begin_date = pandas.to_datetime(self.test_begin_date).date() - test_end_date = pandas.to_datetime(self.test_end_date).date() - print(f"Test is going from {test_begin_date} to {test_end_date}.") - test_data = df.query("@test_begin_date <= date <= @test_end_date") - - query = "date < @test_begin_date" - other_set = df.query(query) - return other_set, test_data - - def _get_outer_cv_fold(self, df): - labels = df.int_label - stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits) - - k = 0 - for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels): - if k == self.fold: - break - k += 1 - - train_data = df.iloc[train_index].copy() - test_data = df.iloc[test_index].copy() - - return train_data, test_data - - def get_steps_per_epoch(self, nb_pos_examples): - return int(max(TARGET_POS_PER_EPOCH, nb_pos_examples) / self.mb_size / self.perc_training_tox) - - def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle=True): - huggingface_ds = Dataset.from_pandas(group).map(self.tokenize_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, return_tensors="tf") - tensorflow_ds = huggingface_ds.to_tf_dataset( - columns=["input_ids"], - label_cols=["labels"], - shuffle=shuffle, - batch_size=self.mb_size if mb_size is None else mb_size, - collate_fn=data_collator, - ) - - if shuffle: - return tensorflow_ds.repeat() - return tensorflow_ds - - def make_pure_tensorflow_ds(self, df, nb_samples): - buffer_size = nb_samples * 2 - - if self.sample_weights is not None: - if self.sample_weights not in df.columns: - raise ValueError - ds = tf.data.Dataset.from_tensor_slices( - (df.text.values, df.label.values, df[self.sample_weights].values) - ) - elif self.dual_head: - label_d = {f'{e}_output': df[f'{e}_label'].values for e in self.dual_head} - label_d['content_output'] = tf.keras.utils.to_categorical(label_d['content_output'], num_classes=3) - ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d)) - - else: - ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values)) - ds = ds.shuffle(buffer_size, seed=self.seed, reshuffle_each_iteration=True).repeat() - return ds - - def get_balanced_dataset(self, training_data, size_limit=None, return_as_batch=True): - training_data = training_data.sample(frac=1, random_state=self.seed) - nb_samples = training_data.shape[0] if not size_limit else size_limit - - num_classes = training_data.int_label.nunique() - toxic_class = training_data.int_label.max() - if size_limit: - training_data = training_data[: size_limit * num_classes] - - print( - ".... {} examples, incl. 
{:.2f}% tox in train, {} classes".format( - nb_samples, - 100 * training_data[training_data.int_label == toxic_class].shape[0] / nb_samples, - num_classes, - ) - ) - label_groups = training_data.groupby("int_label") - if self.huggingface: - label_datasets = { - label: self.make_huggingface_tensorflow_ds(group) for label, group in label_groups - } - - else: - label_datasets = { - label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2) - for label, group in label_groups - } - - datasets = [label_datasets[0], label_datasets[1]] - weights = [1 - self.perc_training_tox, self.perc_training_tox] - if num_classes == 3: - datasets.append(label_datasets[2]) - weights = [1 - self.perc_training_tox, self.perc_training_tox / 2, self.perc_training_tox / 2] - elif num_classes != 2: - raise ValueError("Currently it should not be possible to get other than 2 or 3 classes") - resampled_ds = tf.data.experimental.sample_from_datasets(datasets, weights, seed=self.seed) - - if return_as_batch and not self.huggingface: - return resampled_ds.batch( - self.mb_size, drop_remainder=True, num_parallel_calls=NUM_WORKERS, deterministic=True - ).prefetch(NUM_PREFETCH) - - return resampled_ds - - @staticmethod - def _compute_int_labels(full_df): - if full_df.label.dtype == int: - full_df["int_label"] = full_df.label - - elif "int_label" not in full_df.columns: - if full_df.label.max() > 1: - raise ValueError("Binarizing labels that should not be.") - full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0) - - return full_df - - def __call__(self, full_df, *args, **kwargs): - full_df = self._compute_int_labels(full_df) - - train_data, test_data = self.get_outer_fold(df=full_df) - - stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits) - for train_index, val_index in stratifier.split( - np.zeros(train_data.shape[0]), train_data.int_label + + if shuffle: + return tensorflow_ds.repeat() + return tensorflow_ds + + def make_pure_tensorflow_ds(self, df, nb_samples): + buffer_size = nb_samples * 2 + + if self.sample_weights is not None: + if self.sample_weights not in df.columns: + raise ValueError + ds = tf.data.Dataset.from_tensor_slices( + (df.text.values, df.label.values, df[self.sample_weights].values) + ) + elif self.dual_head: + label_d = {f"{e}_output": df[f"{e}_label"].values for e in self.dual_head} + label_d["content_output"] = tf.keras.utils.to_categorical( + label_d["content_output"], num_classes=3 + ) + ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d)) + + else: + ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values)) + ds = ds.shuffle( + buffer_size, seed=self.seed, reshuffle_each_iteration=True + ).repeat() + return ds + + def get_balanced_dataset( + self, training_data, size_limit=None, return_as_batch=True ): - curr_train_data = train_data.iloc[train_index] + training_data = training_data.sample(frac=1, random_state=self.seed) + nb_samples = training_data.shape[0] if not size_limit else size_limit + + num_classes = training_data.int_label.nunique() + toxic_class = training_data.int_label.max() + if size_limit: + training_data = training_data[: size_limit * num_classes] + + print( + ".... {} examples, incl. 
{:.2f}% tox in train, {} classes".format( + nb_samples, + 100 + * training_data[training_data.int_label == toxic_class].shape[0] + / nb_samples, + num_classes, + ) + ) + label_groups = training_data.groupby("int_label") + if self.huggingface: + label_datasets = { + label: self.make_huggingface_tensorflow_ds(group) + for label, group in label_groups + } + + else: + label_datasets = { + label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2) + for label, group in label_groups + } + + datasets = [label_datasets[0], label_datasets[1]] + weights = [1 - self.perc_training_tox, self.perc_training_tox] + if num_classes == 3: + datasets.append(label_datasets[2]) + weights = [ + 1 - self.perc_training_tox, + self.perc_training_tox / 2, + self.perc_training_tox / 2, + ] + elif num_classes != 2: + raise ValueError( + "Currently it should not be possible to get other than 2 or 3 classes" + ) + resampled_ds = tf.data.experimental.sample_from_datasets( + datasets, weights, seed=self.seed + ) + + if return_as_batch and not self.huggingface: + return resampled_ds.batch( + self.mb_size, + drop_remainder=True, + num_parallel_calls=NUM_WORKERS, + deterministic=True, + ).prefetch(NUM_PREFETCH) + + return resampled_ds + + @staticmethod + def _compute_int_labels(full_df): + if full_df.label.dtype == int: + full_df["int_label"] = full_df.label + + elif "int_label" not in full_df.columns: + if full_df.label.max() > 1: + raise ValueError("Binarizing labels that should not be.") + full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0) - mini_batches = self.get_balanced_dataset(curr_train_data) + return full_df - steps_per_epoch = self.get_steps_per_epoch( - nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0] - ) + def __call__(self, full_df, *args, **kwargs): + full_df = self._compute_int_labels(full_df) - val_data = train_data.iloc[val_index].copy() + train_data, test_data = self.get_outer_fold(df=full_df) - yield mini_batches, steps_per_epoch, val_data, test_data + stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits) + for train_index, val_index in stratifier.split( + np.zeros(train_data.shape[0]), train_data.int_label + ): + curr_train_data = train_data.iloc[train_index] - def simple_cv_load(self, full_df): - full_df = self._compute_int_labels(full_df) + mini_batches = self.get_balanced_dataset(curr_train_data) - train_data, test_data = self.get_outer_fold(df=full_df) - if test_data.shape[0] == 0: - test_data = train_data.iloc[:500] + steps_per_epoch = self.get_steps_per_epoch( + nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0] + ) - mini_batches = self.get_balanced_dataset(train_data) - steps_per_epoch = self.get_steps_per_epoch( - nb_pos_examples=train_data[train_data.int_label != 0].shape[0] - ) + val_data = train_data.iloc[val_index].copy() + + yield mini_batches, steps_per_epoch, val_data, test_data + + def simple_cv_load(self, full_df): + full_df = self._compute_int_labels(full_df) + + train_data, test_data = self.get_outer_fold(df=full_df) + if test_data.shape[0] == 0: + test_data = train_data.iloc[:500] + + mini_batches = self.get_balanced_dataset(train_data) + steps_per_epoch = self.get_steps_per_epoch( + nb_pos_examples=train_data[train_data.int_label != 0].shape[0] + ) - return mini_batches, test_data, steps_per_epoch + return mini_batches, test_data, steps_per_epoch - def no_cv_load(self, full_df): - full_df = self._compute_int_labels(full_df) + def no_cv_load(self, full_df): + full_df = self._compute_int_labels(full_df) 
- val_test = full_df[full_df.origin == "precision"].copy(deep=True) - val_data, test_data = self.get_outer_fold(df=val_test) + val_test = full_df[full_df.origin == "precision"].copy(deep=True) + val_data, test_data = self.get_outer_fold(df=val_test) - train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0) - if test_data.shape[0] == 0: - test_data = train_data.iloc[:500] + train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0) + if test_data.shape[0] == 0: + test_data = train_data.iloc[:500] - mini_batches = self.get_balanced_dataset(train_data) - if train_data.int_label.nunique() == 1: - raise ValueError('Should be at least two labels') + mini_batches = self.get_balanced_dataset(train_data) + if train_data.int_label.nunique() == 1: + raise ValueError("Should be at least two labels") - num_examples = train_data[train_data.int_label == 1].shape[0] - if train_data.int_label.nunique() > 2: - second_most_frequent_label = train_data.loc[train_data.int_label != 0, 'int_label'].mode().values[0] - num_examples = train_data[train_data.int_label == second_most_frequent_label].shape[0] * 2 - steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples) + num_examples = train_data[train_data.int_label == 1].shape[0] + if train_data.int_label.nunique() > 2: + second_most_frequent_label = ( + train_data.loc[train_data.int_label != 0, "int_label"].mode().values[0] + ) + num_examples = ( + train_data[train_data.int_label == second_most_frequent_label].shape[0] + * 2 + ) + steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples) - return mini_batches, steps_per_epoch, val_data, test_data + return mini_batches, steps_per_epoch, val_data, test_data diff --git a/trust_and_safety_models/toxicity/load_model.py b/trust_and_safety_models/toxicity/load_model.py index 7b271066f..8f35d5e9f 100644 --- a/trust_and_safety_models/toxicity/load_model.py +++ b/trust_and_safety_models/toxicity/load_model.py @@ -1,227 +1,255 @@ import os from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR, MAX_SEQ_LENGTH + try: - from toxicity_ml_pipeline.optim.losses import MaskedBCE + from toxicity_ml_pipeline.optim.losses import MaskedBCE except ImportError: - print('No MaskedBCE loss') -from toxicity_ml_pipeline.utils.helpers import execute_command - + print("No MaskedBCE loss") import tensorflow as tf - +from toxicity_ml_pipeline.utils.helpers import execute_command try: - from twitter.cuad.representation.models.text_encoder import TextEncoder + from twitter.cuad.representation.models.text_encoder import TextEncoder except ModuleNotFoundError: - print("No TextEncoder package") + print("No TextEncoder package") try: - from transformers import TFAutoModelForSequenceClassification + from transformers import TFAutoModelForSequenceClassification except ModuleNotFoundError: - print("No HuggingFace package") + print("No HuggingFace package") LOCAL_MODEL_DIR = os.path.join(LOCAL_DIR, "models") def reload_model_weights(weights_dir, language, **kwargs): - optimizer = tf.keras.optimizers.Adam(0.01) - model_type = ( - "twitter_bert_base_en_uncased_mlm" - if language == "en" - else "twitter_multilingual_bert_base_cased_mlm" - ) - model = load(optimizer=optimizer, seed=42, model_type=model_type, **kwargs) - model.load_weights(weights_dir) + optimizer = tf.keras.optimizers.Adam(0.01) + model_type = ( + "twitter_bert_base_en_uncased_mlm" + if language == "en" + else "twitter_multilingual_bert_base_cased_mlm" + ) + model = load(optimizer=optimizer, seed=42, 
model_type=model_type, **kwargs) + model.load_weights(weights_dir) - return model + return model def _locally_copy_models(model_type): - if model_type == "twitter_multilingual_bert_base_cased_mlm": - preprocessor = "bert_multi_cased_preprocess_3" - elif model_type == "twitter_bert_base_en_uncased_mlm": - preprocessor = "bert_en_uncased_preprocess_3" - else: - raise NotImplementedError - - copy_cmd = """mkdir {local_dir} + if model_type == "twitter_multilingual_bert_base_cased_mlm": + preprocessor = "bert_multi_cased_preprocess_3" + elif model_type == "twitter_bert_base_en_uncased_mlm": + preprocessor = "bert_en_uncased_preprocess_3" + else: + raise NotImplementedError + + copy_cmd = """mkdir {local_dir} gsutil cp -r ... gsutil cp -r ...""" - execute_command( - copy_cmd.format(model_type=model_type, preprocessor=preprocessor, local_dir=LOCAL_MODEL_DIR) - ) + execute_command( + copy_cmd.format( + model_type=model_type, preprocessor=preprocessor, local_dir=LOCAL_MODEL_DIR + ) + ) - return preprocessor + return preprocessor def load_encoder(model_type, trainable): - try: - model = TextEncoder( - max_seq_lengths=MAX_SEQ_LENGTH, - model_type=model_type, - cluster="gcp", - trainable=trainable, - enable_dynamic_shapes=True, - ) - except (OSError, tf.errors.AbortedError) as e: - print(e) - preprocessor = _locally_copy_models(model_type) - - model = TextEncoder( - max_seq_lengths=MAX_SEQ_LENGTH, - local_model_path=f"models/{model_type}", - local_preprocessor_path=f"models/{preprocessor}", - cluster="gcp", - trainable=trainable, - enable_dynamic_shapes=True, - ) - - return model + try: + model = TextEncoder( + max_seq_lengths=MAX_SEQ_LENGTH, + model_type=model_type, + cluster="gcp", + trainable=trainable, + enable_dynamic_shapes=True, + ) + except (OSError, tf.errors.AbortedError) as e: + print(e) + preprocessor = _locally_copy_models(model_type) + + model = TextEncoder( + max_seq_lengths=MAX_SEQ_LENGTH, + local_model_path=f"models/{model_type}", + local_preprocessor_path=f"models/{preprocessor}", + cluster="gcp", + trainable=trainable, + enable_dynamic_shapes=True, + ) + + return model def get_loss(loss_name, from_logits, **kwargs): - loss_name = loss_name.lower() - if loss_name == "bce": - print("Binary CE loss") - return tf.keras.losses.BinaryCrossentropy(from_logits=from_logits) - - if loss_name == "cce": - print("Categorical cross-entropy loss") - return tf.keras.losses.CategoricalCrossentropy(from_logits=from_logits) - - if loss_name == "scce": - print("Sparse categorical cross-entropy loss") - return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits) - - if loss_name == "focal_bce": - gamma = kwargs.get("gamma", 2) - print("Focal binary CE loss", gamma) - return tf.keras.losses.BinaryFocalCrossentropy(gamma=gamma, from_logits=from_logits) - - if loss_name == 'masked_bce': - multitask = kwargs.get("multitask", False) - if from_logits or multitask: - raise NotImplementedError - print(f'Masked Binary Cross Entropy') - return MaskedBCE() - - if loss_name == "inv_kl_loss": - raise NotImplementedError - - raise ValueError( - f"This loss name is not valid: {loss_name}. 
Accepted loss names: BCE, masked BCE, CCE, sCCE, " - f"Focal_BCE, inv_KL_loss" - ) + loss_name = loss_name.lower() + if loss_name == "bce": + print("Binary CE loss") + return tf.keras.losses.BinaryCrossentropy(from_logits=from_logits) + + if loss_name == "cce": + print("Categorical cross-entropy loss") + return tf.keras.losses.CategoricalCrossentropy(from_logits=from_logits) + + if loss_name == "scce": + print("Sparse categorical cross-entropy loss") + return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits) + + if loss_name == "focal_bce": + gamma = kwargs.get("gamma", 2) + print("Focal binary CE loss", gamma) + return tf.keras.losses.BinaryFocalCrossentropy( + gamma=gamma, from_logits=from_logits + ) + + if loss_name == "masked_bce": + multitask = kwargs.get("multitask", False) + if from_logits or multitask: + raise NotImplementedError + print(f"Masked Binary Cross Entropy") + return MaskedBCE() + + if loss_name == "inv_kl_loss": + raise NotImplementedError + + raise ValueError( + f"This loss name is not valid: {loss_name}. Accepted loss names: BCE, masked BCE, CCE, sCCE, " + f"Focal_BCE, inv_KL_loss" + ) + def _add_additional_embedding_layer(doc_embedding, glorot, seed): - doc_embedding = tf.keras.layers.Dense(768, activation="tanh", kernel_initializer=glorot)(doc_embedding) - doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) - return doc_embedding + doc_embedding = tf.keras.layers.Dense( + 768, activation="tanh", kernel_initializer=glorot + )(doc_embedding) + doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) + return doc_embedding + def _get_bias(**kwargs): - smart_bias_value = kwargs.get('smart_bias_value', 0) - print('Smart bias init to ', smart_bias_value) - output_bias = tf.keras.initializers.Constant(smart_bias_value) - return output_bias + smart_bias_value = kwargs.get("smart_bias_value", 0) + print("Smart bias init to ", smart_bias_value) + output_bias = tf.keras.initializers.Constant(smart_bias_value) + return output_bias def load_inhouse_bert(model_type, trainable, seed, **kwargs): - inputs = tf.keras.layers.Input(shape=(), dtype=tf.string) - encoder = load_encoder(model_type=model_type, trainable=trainable) - doc_embedding = encoder([inputs])["pooled_output"] - doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) - - glorot = tf.keras.initializers.glorot_uniform(seed=seed) - if kwargs.get("additional_layer", False): - doc_embedding = _add_additional_embedding_layer(doc_embedding, glorot, seed) - - if kwargs.get('content_num_classes', None): - probs = get_last_layer(glorot=glorot, last_layer_name='target_output', **kwargs)(doc_embedding) - second_probs = get_last_layer(num_classes=kwargs['content_num_classes'], - last_layer_name='content_output', - glorot=glorot)(doc_embedding) - probs = [probs, second_probs] - else: - probs = get_last_layer(glorot=glorot, **kwargs)(doc_embedding) - model = tf.keras.models.Model(inputs=inputs, outputs=probs) - - return model, False + inputs = tf.keras.layers.Input(shape=(), dtype=tf.string) + encoder = load_encoder(model_type=model_type, trainable=trainable) + doc_embedding = encoder([inputs])["pooled_output"] + doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) + + glorot = tf.keras.initializers.glorot_uniform(seed=seed) + if kwargs.get("additional_layer", False): + doc_embedding = _add_additional_embedding_layer(doc_embedding, glorot, seed) + + if kwargs.get("content_num_classes", None): + probs = get_last_layer( + 
glorot=glorot, last_layer_name="target_output", **kwargs + )(doc_embedding) + second_probs = get_last_layer( + num_classes=kwargs["content_num_classes"], + last_layer_name="content_output", + glorot=glorot, + )(doc_embedding) + probs = [probs, second_probs] + else: + probs = get_last_layer(glorot=glorot, **kwargs)(doc_embedding) + model = tf.keras.models.Model(inputs=inputs, outputs=probs) + + return model, False -def get_last_layer(**kwargs): - output_bias = _get_bias(**kwargs) - if 'glorot' in kwargs: - glorot = kwargs['glorot'] - else: - glorot = tf.keras.initializers.glorot_uniform(seed=kwargs['seed']) - layer_name = kwargs.get('last_layer_name', 'dense_1') - - if kwargs.get('num_classes', 1) > 1: - last_layer = tf.keras.layers.Dense( - kwargs["num_classes"], activation="softmax", kernel_initializer=glorot, - bias_initializer=output_bias, name=layer_name - ) - elif kwargs.get('num_raters', 1) > 1: - if kwargs.get('multitask', False): - raise NotImplementedError - last_layer = tf.keras.layers.Dense( - kwargs['num_raters'], activation="sigmoid", kernel_initializer=glorot, - bias_initializer=output_bias, name='probs') - - else: - last_layer = tf.keras.layers.Dense( - 1, activation="sigmoid", kernel_initializer=glorot, - bias_initializer=output_bias, name=layer_name - ) +def get_last_layer(**kwargs): + output_bias = _get_bias(**kwargs) + if "glorot" in kwargs: + glorot = kwargs["glorot"] + else: + glorot = tf.keras.initializers.glorot_uniform(seed=kwargs["seed"]) + layer_name = kwargs.get("last_layer_name", "dense_1") + + if kwargs.get("num_classes", 1) > 1: + last_layer = tf.keras.layers.Dense( + kwargs["num_classes"], + activation="softmax", + kernel_initializer=glorot, + bias_initializer=output_bias, + name=layer_name, + ) + + elif kwargs.get("num_raters", 1) > 1: + if kwargs.get("multitask", False): + raise NotImplementedError + last_layer = tf.keras.layers.Dense( + kwargs["num_raters"], + activation="sigmoid", + kernel_initializer=glorot, + bias_initializer=output_bias, + name="probs", + ) + + else: + last_layer = tf.keras.layers.Dense( + 1, + activation="sigmoid", + kernel_initializer=glorot, + bias_initializer=output_bias, + name=layer_name, + ) + + return last_layer - return last_layer def load_bertweet(**kwargs): - bert = TFAutoModelForSequenceClassification.from_pretrained( - os.path.join(LOCAL_MODEL_DIR, "bertweet-base"), - num_labels=1, - classifier_dropout=0.1, - hidden_size=768, - ) - if "num_classes" in kwargs and kwargs["num_classes"] > 2: - raise NotImplementedError + bert = TFAutoModelForSequenceClassification.from_pretrained( + os.path.join(LOCAL_MODEL_DIR, "bertweet-base"), + num_labels=1, + classifier_dropout=0.1, + hidden_size=768, + ) + if "num_classes" in kwargs and kwargs["num_classes"] > 2: + raise NotImplementedError - return bert, True + return bert, True def load( - optimizer, - seed, - model_type="twitter_multilingual_bert_base_cased_mlm", - loss_name="BCE", - trainable=True, - **kwargs, + optimizer, + seed, + model_type="twitter_multilingual_bert_base_cased_mlm", + loss_name="BCE", + trainable=True, + **kwargs, ): - if model_type == "bertweet-base": - model, from_logits = load_bertweet() - else: - model, from_logits = load_inhouse_bert(model_type, trainable, seed, **kwargs) - - pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc", from_logits=from_logits) - roc_auc = tf.keras.metrics.AUC(curve="ROC", name="roc_auc", from_logits=from_logits) - - loss = get_loss(loss_name, from_logits, **kwargs) - if kwargs.get('content_num_classes', None): - second_loss = 
get_loss(loss_name=kwargs['content_loss_name'], from_logits=from_logits)
-    loss_weights = {'content_output': kwargs['content_loss_weight'], 'target_output': 1}
-    model.compile(
-      optimizer=optimizer,
-      loss={'content_output': second_loss, 'target_output': loss},
-      loss_weights=loss_weights,
-      metrics=[pr_auc, roc_auc],
-    )
-
-  else:
-    model.compile(
-      optimizer=optimizer,
-      loss=loss,
-      metrics=[pr_auc, roc_auc],
-    )
-  print(model.summary(), "logits: ", from_logits)
-
-  return model
\ No newline at end of file
+    if model_type == "bertweet-base":
+        model, from_logits = load_bertweet()
+    else:
+        model, from_logits = load_inhouse_bert(model_type, trainable, seed, **kwargs)
+
+    pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc", from_logits=from_logits)
+    roc_auc = tf.keras.metrics.AUC(curve="ROC", name="roc_auc", from_logits=from_logits)
+
+    loss = get_loss(loss_name, from_logits, **kwargs)
+    if kwargs.get("content_num_classes", None):
+        second_loss = get_loss(
+            loss_name=kwargs["content_loss_name"], from_logits=from_logits
+        )
+        loss_weights = {
+            "content_output": kwargs["content_loss_weight"],
+            "target_output": 1,
+        }
+        model.compile(
+            optimizer=optimizer,
+            loss={"content_output": second_loss, "target_output": loss},
+            loss_weights=loss_weights,
+            metrics=[pr_auc, roc_auc],
+        )
+
+    else:
+        model.compile(
+            optimizer=optimizer,
+            loss=loss,
+            metrics=[pr_auc, roc_auc],
+        )
+    print(model.summary(), "logits: ", from_logits)
+
+    return model
diff --git a/trust_and_safety_models/toxicity/optim/callbacks.py b/trust_and_safety_models/toxicity/optim/callbacks.py
index bbf8d7c97..f587bc4eb 100644
--- a/trust_and_safety_models/toxicity/optim/callbacks.py
+++ b/trust_and_safety_models/toxicity/optim/callbacks.py
@@ -1,220 +1,242 @@
-from collections import defaultdict
 import os
+from collections import defaultdict

-from toxicity_ml_pipeline.settings.default_settings_tox import REMOTE_LOGDIR
-from toxicity_ml_pipeline.settings.default_settings_abs import LABEL_NAMES
-from toxicity_ml_pipeline.utils.absv_utils import parse_labeled_data
-from toxicity_ml_pipeline.utils.helpers import compute_precision_fixed_recall, execute_command
-
-from sklearn.metrics import average_precision_score, roc_auc_score
 import tensorflow as tf
 import wandb
+from sklearn.metrics import average_precision_score, roc_auc_score
+from toxicity_ml_pipeline.settings.default_settings_abs import LABEL_NAMES
+from toxicity_ml_pipeline.settings.default_settings_tox import REMOTE_LOGDIR
+from toxicity_ml_pipeline.utils.absv_utils import parse_labeled_data
+from toxicity_ml_pipeline.utils.helpers import (
+    compute_precision_fixed_recall,
+    execute_command,
+)


 class NothingCallback(tf.keras.callbacks.Callback):
-  def on_epoch_begin(self, epoch, logs=None):
-    print("ici, ", epoch)
+    def on_epoch_begin(self, epoch, logs=None):
+        print("here, ", epoch)

-  def on_epoch_end(self, epoch, logs=None):
-    print("fin ", epoch)
+    def on_epoch_end(self, epoch, logs=None):
+        print("end of epoch ", epoch)

-  def on_train_batch_end(self, batch, logs=None):
-    print("fin de batch ", batch)
+    def on_train_batch_end(self, batch, logs=None):
+        print("end of batch ", batch)


 class ControlledStoppingCheckpointCallback(tf.keras.callbacks.ModelCheckpoint):
-  def __init__(self, stopping_epoch, *args, **kwargs):
-    super().__init__(*args, **kwargs)
-    self.stopping_epoch = stopping_epoch
+    def __init__(self, stopping_epoch, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.stopping_epoch = stopping_epoch

-  def on_epoch_end(self, epoch, logs=None):
-
super().on_epoch_end(epoch, logs) - if epoch == self.stopping_epoch: - self.model.stop_training = True + def on_epoch_end(self, epoch, logs=None): + super().on_epoch_end(epoch, logs) + if epoch == self.stopping_epoch: + self.model.stop_training = True class SyncingTensorBoard(tf.keras.callbacks.TensorBoard): - def __init__(self, remote_logdir=None, *args, **kwargs): - super().__init__(*args, **kwargs) - self.remote_logdir = remote_logdir if remote_logdir is not None else REMOTE_LOGDIR + def __init__(self, remote_logdir=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.remote_logdir = ( + remote_logdir if remote_logdir is not None else REMOTE_LOGDIR + ) - def on_epoch_end(self, epoch, logs=None): - super().on_epoch_end(epoch, logs=logs) - self.synchronize() + def on_epoch_end(self, epoch, logs=None): + super().on_epoch_end(epoch, logs=logs) + self.synchronize() - def synchronize(self): - base_dir = os.path.dirname(self.log_dir) - cmd = f"gsutil -m rsync -r {base_dir} {self.remote_logdir}" - execute_command(cmd) + def synchronize(self): + base_dir = os.path.dirname(self.log_dir) + cmd = f"gsutil -m rsync -r {base_dir} {self.remote_logdir}" + execute_command(cmd) class GradientLoggingTensorBoard(SyncingTensorBoard): - def __init__(self, loader, val_data, freq, *args, **kwargs): - super().__init__(*args, **kwargs) - val_dataset = loader.get_balanced_dataset( - training_data=val_data, size_limit=50, return_as_batch=False - ) - data_args = list(val_dataset.batch(32).take(1))[0] - self.x_batch, self.y_batch = data_args[0], data_args[1] - self.freq = freq - self.counter = 0 - - def _log_gradients(self): - writer = self._train_writer - - with writer.as_default(): - with tf.GradientTape() as tape: - y_pred = self.model(self.x_batch) - loss = self.model.compiled_loss(y_true=self.y_batch, y_pred=y_pred) - gradient_norm = tf.linalg.global_norm(tape.gradient(loss, self.model.trainable_weights)) - - tf.summary.scalar("gradient_norm", data=gradient_norm, step=self.counter) - writer.flush() - - def on_train_batch_end(self, batch, logs=None): - super().on_batch_end(batch, logs=logs) - self.counter += 1 - if batch % self.freq == 0: - self._log_gradients() + def __init__(self, loader, val_data, freq, *args, **kwargs): + super().__init__(*args, **kwargs) + val_dataset = loader.get_balanced_dataset( + training_data=val_data, size_limit=50, return_as_batch=False + ) + data_args = list(val_dataset.batch(32).take(1))[0] + self.x_batch, self.y_batch = data_args[0], data_args[1] + self.freq = freq + self.counter = 0 + + def _log_gradients(self): + writer = self._train_writer + + with writer.as_default(): + with tf.GradientTape() as tape: + y_pred = self.model(self.x_batch) + loss = self.model.compiled_loss(y_true=self.y_batch, y_pred=y_pred) + gradient_norm = tf.linalg.global_norm( + tape.gradient(loss, self.model.trainable_weights) + ) + + tf.summary.scalar("gradient_norm", data=gradient_norm, step=self.counter) + writer.flush() + + def on_train_batch_end(self, batch, logs=None): + super().on_batch_end(batch, logs=logs) + self.counter += 1 + if batch % self.freq == 0: + self._log_gradients() class AdditionalResultLogger(tf.keras.callbacks.Callback): - def __init__( - self, - data, - set_, - fixed_recall=0.85, - from_logits=False, - dataset_transform_func=None, - batch_size=64, - dual_head=None, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.set_ = set_ - if data is None: - return None - - self.single_head = True - try: - self.labels = data.int_label.values - except 
AttributeError: - self.labels = data.to_dataframe()[LABEL_NAMES].values.astype('int') - self.data = data.to_tf_dataset().map(parse_labeled_data).batch(batch_size) - self.label_names = LABEL_NAMES - else: - self.label_names = [''] - if dual_head: - self.label_names = [f'{e}_label' for e in dual_head] - self.labels = {f'{e}_output': data[f'{e}_label'].values for e in dual_head} - self.single_head = False - if dataset_transform_func is None: - self.data = data.text.values - else: - self.data = dataset_transform_func(data, mb_size=batch_size, shuffle=False) - - finally: - if len(self.label_names) == 1: - self.metric_kw = {} - else: - self.metric_kw = {'average': None} - - self.counter = 0 - self.best_metrics = defaultdict(float) - self.from_logits = from_logits - print(f"Loaded callback for {set_}, from_logits: {from_logits}, labels {self.label_names}") - - if 1 < fixed_recall <= 100: - fixed_recall = fixed_recall / 100 - elif not (0 < fixed_recall <= 100): - raise ValueError("Threshold should be between 0 and 1, or 0 and 100") - self.fixed_recall = fixed_recall - self.batch_size = batch_size - - def compute_precision_fixed_recall(self, labels, preds): - result, _ = compute_precision_fixed_recall(labels=labels, preds=preds, - fixed_recall=self.fixed_recall) - - return result - - def on_epoch_end(self, epoch, logs=None): - self.additional_evaluations(step=epoch, eval_time="epoch") - - def on_train_batch_end(self, batch, logs=None): - self.counter += 1 - if self.counter % 2000 == 0: - self.additional_evaluations(step=self.counter, eval_time="batch") - - def _binary_evaluations(self, preds, label_name=None, class_index=None): - mask = None - curr_labels = self.labels - if label_name is not None: - curr_labels = self.labels[label_name] - if class_index is not None: - curr_labels = (curr_labels == class_index).astype(int) - - if -1 in curr_labels: - mask = curr_labels != -1 - curr_labels = curr_labels[mask] - preds = preds[mask] - - return { - f"precision_recall{self.fixed_recall}": self.compute_precision_fixed_recall( - labels=curr_labels, preds=preds - ), - "pr_auc": average_precision_score(y_true=curr_labels, y_score=preds), - "roc_auc": roc_auc_score(y_true=curr_labels, y_score=preds), - } - - - def _multiclass_evaluations(self, preds): - pr_auc_l = average_precision_score(y_true=self.labels, y_score=preds, **self.metric_kw) - roc_auc_l = roc_auc_score(y_true=self.labels, y_score=preds, **self.metric_kw) - metrics = {} - for i, label in enumerate(self.label_names): - metrics[f'pr_auc_{label}'] = pr_auc_l[i] - metrics[f'roc_auc_{label}'] = roc_auc_l[i] - - return metrics - - def additional_evaluations(self, step, eval_time): - print("Evaluating ", self.set_, eval_time, step) - - preds = self.model.predict(x=self.data, batch_size=self.batch_size) - if self.from_logits: - preds = tf.keras.activations.sigmoid(preds.logits).numpy() - - if self.single_head: - if len(self.label_names) == 1: - metrics = self._binary_evaluations(preds) - else: - metrics = self._multiclass_evaluations(preds) - else: - if preds[0].shape[1] == 1: - binary_preds = preds[0] - multic_preds = preds[1] - else: - binary_preds = preds[1] - multic_preds = preds[0] - - binary_metrics = self._binary_evaluations(binary_preds, label_name='target_output') - metrics = {f'{k}_target': v for k, v in binary_metrics.items()} - num_classes = multic_preds.shape[1] - for class_ in range(num_classes): - binary_metrics = self._binary_evaluations(multic_preds[:, class_], label_name='content_output', class_index=class_) - 
metrics.update({f'{k}_content_{class_}': v for k, v in binary_metrics.items()}) - - for k, v in metrics.items(): - self.best_metrics[f"max_{k}"] = max(v, self.best_metrics[f"max_{k}"]) - - self.log_metrics(metrics, step=step, eval_time=eval_time) - - def log_metrics(self, metrics_d, step, eval_time): - commit = False if self.set_ == "validation" else True - to_report = {self.set_: {**metrics_d, **self.best_metrics}} - - if eval_time == "epoch": - to_report["epoch"] = step - - wandb.log(to_report, commit=commit) + def __init__( + self, + data, + set_, + fixed_recall=0.85, + from_logits=False, + dataset_transform_func=None, + batch_size=64, + dual_head=None, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.set_ = set_ + if data is None: + return None + + self.single_head = True + try: + self.labels = data.int_label.values + except AttributeError: + self.labels = data.to_dataframe()[LABEL_NAMES].values.astype("int") + self.data = data.to_tf_dataset().map(parse_labeled_data).batch(batch_size) + self.label_names = LABEL_NAMES + else: + self.label_names = [""] + if dual_head: + self.label_names = [f"{e}_label" for e in dual_head] + self.labels = { + f"{e}_output": data[f"{e}_label"].values for e in dual_head + } + self.single_head = False + if dataset_transform_func is None: + self.data = data.text.values + else: + self.data = dataset_transform_func( + data, mb_size=batch_size, shuffle=False + ) + + finally: + if len(self.label_names) == 1: + self.metric_kw = {} + else: + self.metric_kw = {"average": None} + + self.counter = 0 + self.best_metrics = defaultdict(float) + self.from_logits = from_logits + print( + f"Loaded callback for {set_}, from_logits: {from_logits}, labels {self.label_names}" + ) + + if 1 < fixed_recall <= 100: + fixed_recall = fixed_recall / 100 + elif not (0 < fixed_recall <= 100): + raise ValueError("Threshold should be between 0 and 1, or 0 and 100") + self.fixed_recall = fixed_recall + self.batch_size = batch_size + + def compute_precision_fixed_recall(self, labels, preds): + result, _ = compute_precision_fixed_recall( + labels=labels, preds=preds, fixed_recall=self.fixed_recall + ) + + return result + + def on_epoch_end(self, epoch, logs=None): + self.additional_evaluations(step=epoch, eval_time="epoch") + + def on_train_batch_end(self, batch, logs=None): + self.counter += 1 + if self.counter % 2000 == 0: + self.additional_evaluations(step=self.counter, eval_time="batch") + + def _binary_evaluations(self, preds, label_name=None, class_index=None): + mask = None + curr_labels = self.labels + if label_name is not None: + curr_labels = self.labels[label_name] + if class_index is not None: + curr_labels = (curr_labels == class_index).astype(int) + + if -1 in curr_labels: + mask = curr_labels != -1 + curr_labels = curr_labels[mask] + preds = preds[mask] + + return { + f"precision_recall{self.fixed_recall}": self.compute_precision_fixed_recall( + labels=curr_labels, preds=preds + ), + "pr_auc": average_precision_score(y_true=curr_labels, y_score=preds), + "roc_auc": roc_auc_score(y_true=curr_labels, y_score=preds), + } + + def _multiclass_evaluations(self, preds): + pr_auc_l = average_precision_score( + y_true=self.labels, y_score=preds, **self.metric_kw + ) + roc_auc_l = roc_auc_score(y_true=self.labels, y_score=preds, **self.metric_kw) + metrics = {} + for i, label in enumerate(self.label_names): + metrics[f"pr_auc_{label}"] = pr_auc_l[i] + metrics[f"roc_auc_{label}"] = roc_auc_l[i] + + return metrics + + def additional_evaluations(self, step, eval_time): 
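+        # Side-channel evaluation: predict on this callback's dataset, apply a
+        # sigmoid when the model emits logits, then compute binary or per-class
+        # PR-AUC / ROC-AUC (single- vs dual-head) and log running maxima to wandb.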
+ print("Evaluating ", self.set_, eval_time, step) + + preds = self.model.predict(x=self.data, batch_size=self.batch_size) + if self.from_logits: + preds = tf.keras.activations.sigmoid(preds.logits).numpy() + + if self.single_head: + if len(self.label_names) == 1: + metrics = self._binary_evaluations(preds) + else: + metrics = self._multiclass_evaluations(preds) + else: + if preds[0].shape[1] == 1: + binary_preds = preds[0] + multic_preds = preds[1] + else: + binary_preds = preds[1] + multic_preds = preds[0] + + binary_metrics = self._binary_evaluations( + binary_preds, label_name="target_output" + ) + metrics = {f"{k}_target": v for k, v in binary_metrics.items()} + num_classes = multic_preds.shape[1] + for class_ in range(num_classes): + binary_metrics = self._binary_evaluations( + multic_preds[:, class_], + label_name="content_output", + class_index=class_, + ) + metrics.update( + {f"{k}_content_{class_}": v for k, v in binary_metrics.items()} + ) + + for k, v in metrics.items(): + self.best_metrics[f"max_{k}"] = max(v, self.best_metrics[f"max_{k}"]) + + self.log_metrics(metrics, step=step, eval_time=eval_time) + + def log_metrics(self, metrics_d, step, eval_time): + commit = False if self.set_ == "validation" else True + to_report = {self.set_: {**metrics_d, **self.best_metrics}} + + if eval_time == "epoch": + to_report["epoch"] = step + + wandb.log(to_report, commit=commit) diff --git a/trust_and_safety_models/toxicity/optim/losses.py b/trust_and_safety_models/toxicity/optim/losses.py index 273c6676e..3adf13011 100644 --- a/trust_and_safety_models/toxicity/optim/losses.py +++ b/trust_and_safety_models/toxicity/optim/losses.py @@ -1,56 +1,57 @@ import tensorflow as tf -from keras.utils import tf_utils -from keras.utils import losses_utils from keras import backend +from keras.utils import losses_utils, tf_utils + def inv_kl_divergence(y_true, y_pred): - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - y_true = backend.clip(y_true, backend.epsilon(), 1) - y_pred = backend.clip(y_pred, backend.epsilon(), 1) - return tf.reduce_sum(y_pred * tf.math.log(y_pred / y_true), axis=-1) + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + y_true = backend.clip(y_true, backend.epsilon(), 1) + y_pred = backend.clip(y_pred, backend.epsilon(), 1) + return tf.reduce_sum(y_pred * tf.math.log(y_pred / y_true), axis=-1) + def masked_bce(y_true, y_pred): - y_true = tf.cast(y_true, dtype=tf.float32) - mask = y_true != -1 - - return tf.keras.metrics.binary_crossentropy(tf.boolean_mask(y_true, mask), - tf.boolean_mask(y_pred, mask)) + y_true = tf.cast(y_true, dtype=tf.float32) + mask = y_true != -1 + + return tf.keras.metrics.binary_crossentropy( + tf.boolean_mask(y_true, mask), tf.boolean_mask(y_pred, mask) + ) class LossFunctionWrapper(tf.keras.losses.Loss): - def __init__(self, - fn, - reduction=losses_utils.ReductionV2.AUTO, - name=None, - **kwargs): - super().__init__(reduction=reduction, name=name) - self.fn = fn - self._fn_kwargs = kwargs - - def call(self, y_true, y_pred): - if tf.is_tensor(y_pred) and tf.is_tensor(y_true): - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true) - - ag_fn = tf.__internal__.autograph.tf_convert(self.fn, tf.__internal__.autograph.control_status_ctx()) - return ag_fn(y_true, y_pred, **self._fn_kwargs) - - def get_config(self): - config = {} - for k, v in self._fn_kwargs.items(): - config[k] = backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v - base_config = super().get_config() - 
return dict(list(base_config.items()) + list(config.items())) + def __init__( + self, fn, reduction=losses_utils.ReductionV2.AUTO, name=None, **kwargs + ): + super().__init__(reduction=reduction, name=name) + self.fn = fn + self._fn_kwargs = kwargs + + def call(self, y_true, y_pred): + if tf.is_tensor(y_pred) and tf.is_tensor(y_true): + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true) + + ag_fn = tf.__internal__.autograph.tf_convert( + self.fn, tf.__internal__.autograph.control_status_ctx() + ) + return ag_fn(y_true, y_pred, **self._fn_kwargs) + + def get_config(self): + config = {} + for k, v in self._fn_kwargs.items(): + config[k] = backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + class InvKLD(LossFunctionWrapper): - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='inv_kl_divergence'): - super().__init__(inv_kl_divergence, name=name, reduction=reduction) + def __init__( + self, reduction=losses_utils.ReductionV2.AUTO, name="inv_kl_divergence" + ): + super().__init__(inv_kl_divergence, name=name, reduction=reduction) class MaskedBCE(LossFunctionWrapper): - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='masked_bce'): - super().__init__(masked_bce, name=name, reduction=reduction) + def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name="masked_bce"): + super().__init__(masked_bce, name=name, reduction=reduction) diff --git a/trust_and_safety_models/toxicity/optim/schedulers.py b/trust_and_safety_models/toxicity/optim/schedulers.py index 59f6c9afa..4a3d5091e 100644 --- a/trust_and_safety_models/toxicity/optim/schedulers.py +++ b/trust_and_safety_models/toxicity/optim/schedulers.py @@ -4,41 +4,41 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): - def __init__( - self, - initial_learning_rate: float, - decay_schedule_fn: Callable, - warmup_steps: int, - power: float = 1.0, - name: str = "", - ): - super().__init__() - self.initial_learning_rate = initial_learning_rate - self.warmup_steps = warmup_steps - self.power = power - self.decay_schedule_fn = decay_schedule_fn - self.name = name + def __init__( + self, + initial_learning_rate: float, + decay_schedule_fn: Callable, + warmup_steps: int, + power: float = 1.0, + name: str = "", + ): + super().__init__() + self.initial_learning_rate = initial_learning_rate + self.warmup_steps = warmup_steps + self.power = power + self.decay_schedule_fn = decay_schedule_fn + self.name = name - def __call__(self, step): - with tf.name_scope(self.name or "WarmUp") as name: - global_step_float = tf.cast(step, tf.float32) - warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) - warmup_percent_done = global_step_float / warmup_steps_float - warmup_learning_rate = self.initial_learning_rate * tf.math.pow( - warmup_percent_done, self.power - ) - return tf.cond( - global_step_float < warmup_steps_float, - lambda: warmup_learning_rate, - lambda: self.decay_schedule_fn(step - self.warmup_steps), - name=name, - ) + def __call__(self, step): + with tf.name_scope(self.name or "WarmUp") as name: + global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) + warmup_percent_done = global_step_float / warmup_steps_float + warmup_learning_rate = self.initial_learning_rate * tf.math.pow( + warmup_percent_done, self.power + ) + return tf.cond( + global_step_float < warmup_steps_float, + lambda: 
warmup_learning_rate, + lambda: self.decay_schedule_fn(step - self.warmup_steps), + name=name, + ) - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_schedule_fn": self.decay_schedule_fn, - "warmup_steps": self.warmup_steps, - "power": self.power, - "name": self.name, - } + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_schedule_fn": self.decay_schedule_fn, + "warmup_steps": self.warmup_steps, + "power": self.power, + "name": self.name, + } diff --git a/trust_and_safety_models/toxicity/rescoring.py b/trust_and_safety_models/toxicity/rescoring.py index 71d95ed76..392f97c8e 100644 --- a/trust_and_safety_models/toxicity/rescoring.py +++ b/trust_and_safety_models/toxicity/rescoring.py @@ -1,54 +1,59 @@ -from toxicity_ml_pipeline.load_model import reload_model_weights -from toxicity_ml_pipeline.utils.helpers import load_inference_func, upload_model - import numpy as np import tensorflow as tf +from toxicity_ml_pipeline.load_model import reload_model_weights +from toxicity_ml_pipeline.utils.helpers import load_inference_func, upload_model -def score(language, df, gcs_model_path, batch_size=64, text_col="text", kw="", **kwargs): - if language != "en": - raise NotImplementedError( - "Data preprocessing not implemented here, needs to be added for i18n models" - ) - model_folder = upload_model(full_gcs_model_path=gcs_model_path) - try: - inference_func = load_inference_func(model_folder) - except OSError: - model = reload_model_weights(model_folder, language, **kwargs) - preds = model.predict(x=df[text_col], batch_size=batch_size) - if type(preds) != list: - if len(preds.shape)> 1 and preds.shape[1] > 1: - if 'num_classes' in kwargs and kwargs['num_classes'] > 1: - raise NotImplementedError - preds = np.mean(preds, 1) - - df[f"prediction_{kw}"] = preds - else: - if len(preds) > 2: - raise NotImplementedError - for preds_arr in preds: - if preds_arr.shape[1] == 1: - df[f"prediction_{kw}_target"] = preds_arr +def score( + language, df, gcs_model_path, batch_size=64, text_col="text", kw="", **kwargs +): + if language != "en": + raise NotImplementedError( + "Data preprocessing not implemented here, needs to be added for i18n models" + ) + model_folder = upload_model(full_gcs_model_path=gcs_model_path) + try: + inference_func = load_inference_func(model_folder) + except OSError: + model = reload_model_weights(model_folder, language, **kwargs) + preds = model.predict(x=df[text_col], batch_size=batch_size) + if type(preds) != list: + if len(preds.shape) > 1 and preds.shape[1] > 1: + if "num_classes" in kwargs and kwargs["num_classes"] > 1: + raise NotImplementedError + preds = np.mean(preds, 1) + + df[f"prediction_{kw}"] = preds else: - for ind in range(preds_arr.shape[1]): - df[f"prediction_{kw}_content_{ind}"] = preds_arr[:, ind] - - return df - else: - return _get_score(inference_func, df, kw=kw, batch_size=batch_size, text_col=text_col) + if len(preds) > 2: + raise NotImplementedError + for preds_arr in preds: + if preds_arr.shape[1] == 1: + df[f"prediction_{kw}_target"] = preds_arr + else: + for ind in range(preds_arr.shape[1]): + df[f"prediction_{kw}_content_{ind}"] = preds_arr[:, ind] + + return df + else: + return _get_score( + inference_func, df, kw=kw, batch_size=batch_size, text_col=text_col + ) def _get_score(inference_func, df, text_col="text", kw="", batch_size=64): - score_col = f"prediction_{kw}" - beginning = 0 - end = df.shape[0] - predictions = np.zeros(shape=end, dtype=float) - - while 
beginning < end: - mb = df[text_col].values[beginning : beginning + batch_size] - res = inference_func(input_1=tf.constant(mb)) - predictions[beginning : beginning + batch_size] = list(res.values())[0].numpy()[:, 0] - beginning += batch_size - - df[score_col] = predictions - return df + score_col = f"prediction_{kw}" + beginning = 0 + end = df.shape[0] + predictions = np.zeros(shape=end, dtype=float) + + while beginning < end: + mb = df[text_col].values[beginning : beginning + batch_size] + res = inference_func(input_1=tf.constant(mb)) + predictions[beginning : beginning + batch_size] = list(res.values())[0].numpy()[ + :, 0 + ] + beginning += batch_size + + df[score_col] = predictions + return df diff --git a/trust_and_safety_models/toxicity/settings/default_settings_tox.py b/trust_and_safety_models/toxicity/settings/default_settings_tox.py index 0968b9adc..dbae08a50 100644 --- a/trust_and_safety_models/toxicity/settings/default_settings_tox.py +++ b/trust_and_safety_models/toxicity/settings/default_settings_tox.py @@ -1,20 +1,19 @@ import os - TEAM_PROJECT = "twttr-toxicity-prod" try: - from google.cloud import bigquery + from google.cloud import bigquery except (ModuleNotFoundError, ImportError): - print("No Google packages") - CLIENT = None + print("No Google packages") + CLIENT = None else: - from google.auth.exceptions import DefaultCredentialsError + from google.auth.exceptions import DefaultCredentialsError - try: - CLIENT = bigquery.Client(project=TEAM_PROJECT) - except DefaultCredentialsError as e: - CLIENT = None - print("Issue at logging time", e) + try: + CLIENT = bigquery.Client(project=TEAM_PROJECT) + except DefaultCredentialsError as e: + CLIENT = None + print("Issue at logging time", e) TRAINING_DATA_LOCATION = f"..." GCS_ADDRESS = "..." 
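A minimal usage sketch of the rescoring entry point above, for reviewers who want to try it. The dataframe contents, the GCS path, and the "tox" keyword suffix are hypothetical placeholders, and score() currently supports language="en" only:

    import pandas

    from toxicity_ml_pipeline.rescoring import score

    df = pandas.DataFrame({"text": ["first tweet to score", "second tweet to score"]})
    # score() resolves a local model folder via upload_model(), runs batched
    # inference, and writes results back into a "prediction_<kw>" column.
    scored_df = score(
        language="en",
        df=df,
        gcs_model_path="gs://<bucket>/<model-dir>",  # hypothetical path
        kw="tox",
    )
    print(scored_df["prediction_tox"].values)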
diff --git a/trust_and_safety_models/toxicity/train.py b/trust_and_safety_models/toxicity/train.py index de450ee7b..acefc8b6c 100644 --- a/trust_and_safety_models/toxicity/train.py +++ b/trust_and_safety_models/toxicity/train.py @@ -1,401 +1,437 @@ +import os from datetime import datetime from importlib import import_module -import os +import numpy as np +import tensorflow as tf from toxicity_ml_pipeline.data.data_preprocessing import ( - DefaultENNoPreprocessor, - DefaultENPreprocessor, + DefaultENNoPreprocessor, + DefaultENPreprocessor, ) from toxicity_ml_pipeline.data.dataframe_loader import ENLoader, ENLoaderWithSampling from toxicity_ml_pipeline.data.mb_generator import BalancedMiniBatchLoader -from toxicity_ml_pipeline.load_model import load, get_last_layer +from toxicity_ml_pipeline.load_model import get_last_layer, load from toxicity_ml_pipeline.optim.callbacks import ( - AdditionalResultLogger, - ControlledStoppingCheckpointCallback, - GradientLoggingTensorBoard, - SyncingTensorBoard, + AdditionalResultLogger, + ControlledStoppingCheckpointCallback, + GradientLoggingTensorBoard, + SyncingTensorBoard, ) from toxicity_ml_pipeline.optim.schedulers import WarmUp from toxicity_ml_pipeline.settings.default_settings_abs import GCS_ADDRESS as ABS_GCS +from toxicity_ml_pipeline.settings.default_settings_tox import GCS_ADDRESS as TOX_GCS from toxicity_ml_pipeline.settings.default_settings_tox import ( - GCS_ADDRESS as TOX_GCS, - MODEL_DIR, - RANDOM_SEED, - REMOTE_LOGDIR, - WARM_UP_PERC, + MODEL_DIR, + RANDOM_SEED, + REMOTE_LOGDIR, + WARM_UP_PERC, ) from toxicity_ml_pipeline.utils.helpers import check_gpu, set_seeds, upload_model -import numpy as np -import tensorflow as tf - - try: - from tensorflow_addons.optimizers import AdamW + from tensorflow_addons.optimizers import AdamW except ModuleNotFoundError: - print("No TFA") + print("No TFA") class Trainer(object): - OPTIMIZERS = ["Adam", "AdamW"] - - def __init__( - self, - optimizer_name, - weight_decay, - learning_rate, - mb_size, - train_epochs, - content_loss_weight=1, - language="en", - scope='TOX', - project=..., - experiment_id="default", - gradient_clipping=None, - fold="time", - seed=RANDOM_SEED, - log_gradients=False, - kw="", - stopping_epoch=None, - test=False, - ): - self.seed = seed - self.weight_decay = weight_decay - self.learning_rate = learning_rate - self.mb_size = mb_size - self.train_epochs = train_epochs - self.gradient_clipping = gradient_clipping - - if optimizer_name not in self.OPTIMIZERS: - raise ValueError( - f"Optimizer {optimizer_name} not implemented. Accepted values {self.OPTIMIZERS}." 
- ) - self.optimizer_name = optimizer_name - self.log_gradients = log_gradients - self.test = test - self.fold = fold - self.stopping_epoch = stopping_epoch - self.language = language - if scope == 'TOX': - GCS_ADDRESS = TOX_GCS.format(project=project) - elif scope == 'ABS': - GCS_ADDRESS = ABS_GCS - else: - raise ValueError - GCS_ADDRESS = GCS_ADDRESS.format(project=project) - try: - self.setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings") - except ModuleNotFoundError: - raise ValueError(f"You need to define a setting file for your project {project}.") - experiment_settings = self.setting_file.experiment_settings - - self.project = project - self.remote_logdir = REMOTE_LOGDIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) - self.model_dir = MODEL_DIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) - - if experiment_id not in experiment_settings: - raise ValueError("This is not an experiment id as defined in the settings file.") - - for var, default_value in experiment_settings["default"].items(): - override_val = experiment_settings[experiment_id].get(var, default_value) - print("Setting ", var, override_val) - self.__setattr__(var, override_val) - - self.content_loss_weight = content_loss_weight if self.dual_head else None - - self.mb_loader = BalancedMiniBatchLoader( - fold=self.fold, - seed=self.seed, - perc_training_tox=self.perc_training_tox, - mb_size=self.mb_size, - n_outer_splits="time", - scope=scope, - project=project, - dual_head=self.dual_head, - sample_weights=self.sample_weights, - huggingface=("bertweet" in self.model_type), - ) - self._init_dirnames(kw=kw, experiment_id=experiment_id) - print("------- Checking there is a GPU") - check_gpu() - - def _init_dirnames(self, kw, experiment_id): - kw = "test" if self.test else kw - hyper_param_kw = "" - if self.optimizer_name == "AdamW": - hyper_param_kw += f"{self.weight_decay}_" - if self.gradient_clipping: - hyper_param_kw += f"{self.gradient_clipping}_" - if self.content_loss_weight: - hyper_param_kw += f"{self.content_loss_weight}_" - experiment_name = ( - f"{self.language}{str(datetime.now()).replace(' ', '')[:-7]}{kw}_{experiment_id}{self.fold}_" - f"{self.optimizer_name}_" - f"{self.learning_rate}_" - f"{hyper_param_kw}" - f"{self.mb_size}_" - f"{self.perc_training_tox}_" - f"{self.train_epochs}_seed{self.seed}" - ) - print("------- Experiment name: ", experiment_name) - self.logdir = ( - f"..." - if self.test - else f"..." 
- ) - self.checkpoint_path = f"{self.model_dir}/{experiment_name}" - - @staticmethod - def _additional_writers(logdir, metric_name): - return tf.summary.create_file_writer(os.path.join(logdir, metric_name)) - - def get_callbacks(self, fold, val_data, test_data): - fold_logdir = self.logdir + f"_fold{fold}" - fold_checkpoint_path = self.checkpoint_path + f"_fold{fold}/{{epoch:02d}}" - - tb_args = { - "log_dir": fold_logdir, - "histogram_freq": 0, - "update_freq": 500, - "embeddings_freq": 0, - "remote_logdir": f"{self.remote_logdir}_{self.language}" - if not self.test - else f"{self.remote_logdir}_test", - } - tensorboard_callback = ( - GradientLoggingTensorBoard(loader=self.mb_loader, val_data=val_data, freq=10, **tb_args) - if self.log_gradients - else SyncingTensorBoard(**tb_args) - ) - - callbacks = [tensorboard_callback] - if "bertweet" in self.model_type: - from_logits = True - dataset_transform_func = self.mb_loader.make_huggingface_tensorflow_ds - else: - from_logits = False - dataset_transform_func = None - - fixed_recall = 0.85 if not self.dual_head else 0.5 - val_callback = AdditionalResultLogger( - data=val_data, - set_="validation", - from_logits=from_logits, - dataset_transform_func=dataset_transform_func, - dual_head=self.dual_head, - fixed_recall=fixed_recall - ) - if val_callback is not None: - callbacks.append(val_callback) - - test_callback = AdditionalResultLogger( - data=test_data, - set_="test", - from_logits=from_logits, - dataset_transform_func=dataset_transform_func, - dual_head=self.dual_head, - fixed_recall=fixed_recall - ) - callbacks.append(test_callback) - - checkpoint_args = { - "filepath": fold_checkpoint_path, - "verbose": 0, - "monitor": "val_pr_auc", - "save_weights_only": True, - "mode": "max", - "save_freq": "epoch", - } - if self.stopping_epoch: - checkpoint_callback = ControlledStoppingCheckpointCallback( - **checkpoint_args, - stopping_epoch=self.stopping_epoch, - save_best_only=False, - ) - callbacks.append(checkpoint_callback) - - return callbacks - - def get_lr_schedule(self, steps_per_epoch): - total_num_steps = steps_per_epoch * self.train_epochs - - warm_up_perc = WARM_UP_PERC if self.learning_rate >= 1e-3 else 0 - warm_up_steps = int(total_num_steps * warm_up_perc) - if self.linear_lr_decay: - learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( - self.learning_rate, - total_num_steps - warm_up_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False, - ) - else: - print('Constant learning rate') - learning_rate_fn = self.learning_rate - - if warm_up_perc > 0: - print(f".... using warm-up for {warm_up_steps} steps") - warm_up_schedule = WarmUp( - initial_learning_rate=self.learning_rate, - decay_schedule_fn=learning_rate_fn, - warmup_steps=warm_up_steps, - ) - return warm_up_schedule - return learning_rate_fn - - def get_optimizer(self, schedule): - optim_args = { - "learning_rate": schedule, - "beta_1": 0.9, - "beta_2": 0.999, - "epsilon": 1e-6, - "amsgrad": False, - } - if self.gradient_clipping: - optim_args["global_clipnorm"] = self.gradient_clipping - - print(f".... 
{self.optimizer_name} w global clipnorm {self.gradient_clipping}") - if self.optimizer_name == "Adam": - return tf.keras.optimizers.Adam(**optim_args) - - if self.optimizer_name == "AdamW": - optim_args["weight_decay"] = self.weight_decay - return AdamW(**optim_args) - raise NotImplementedError - - def get_training_actors(self, steps_per_epoch, val_data, test_data, fold): - callbacks = self.get_callbacks(fold=fold, val_data=val_data, test_data=test_data) - schedule = self.get_lr_schedule(steps_per_epoch=steps_per_epoch) - - optimizer = self.get_optimizer(schedule) - - return optimizer, callbacks - - def load_data(self): - if self.project == 435 or self.project == 211: - if self.dataset_type is None: - data_loader = ENLoader(project=self.project, setting_file=self.setting_file) - dataset_type_args = {} - else: - data_loader = ENLoaderWithSampling(project=self.project, setting_file=self.setting_file) - dataset_type_args = self.dataset_type - - df = data_loader.load_data( - language=self.language, test=self.test, reload=self.dataset_reload, **dataset_type_args - ) - - return df - - def preprocess(self, df): - if self.project == 435 or self.project == 211: - if self.preprocessing is None: - data_prepro = DefaultENNoPreprocessor() - elif self.preprocessing == "default": - data_prepro = DefaultENPreprocessor() - else: + OPTIMIZERS = ["Adam", "AdamW"] + + def __init__( + self, + optimizer_name, + weight_decay, + learning_rate, + mb_size, + train_epochs, + content_loss_weight=1, + language="en", + scope="TOX", + project=..., + experiment_id="default", + gradient_clipping=None, + fold="time", + seed=RANDOM_SEED, + log_gradients=False, + kw="", + stopping_epoch=None, + test=False, + ): + self.seed = seed + self.weight_decay = weight_decay + self.learning_rate = learning_rate + self.mb_size = mb_size + self.train_epochs = train_epochs + self.gradient_clipping = gradient_clipping + + if optimizer_name not in self.OPTIMIZERS: + raise ValueError( + f"Optimizer {optimizer_name} not implemented. Accepted values {self.OPTIMIZERS}." + ) + self.optimizer_name = optimizer_name + self.log_gradients = log_gradients + self.test = test + self.fold = fold + self.stopping_epoch = stopping_epoch + self.language = language + if scope == "TOX": + GCS_ADDRESS = TOX_GCS.format(project=project) + elif scope == "ABS": + GCS_ADDRESS = ABS_GCS + else: + raise ValueError + GCS_ADDRESS = GCS_ADDRESS.format(project=project) + try: + self.setting_file = import_module( + f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings" + ) + except ModuleNotFoundError: + raise ValueError( + f"You need to define a setting file for your project {project}." + ) + experiment_settings = self.setting_file.experiment_settings + + self.project = project + self.remote_logdir = REMOTE_LOGDIR.format( + GCS_ADDRESS=GCS_ADDRESS, project=project + ) + self.model_dir = MODEL_DIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) + + if experiment_id not in experiment_settings: + raise ValueError( + "This is not an experiment id as defined in the settings file." 
+ ) + + for var, default_value in experiment_settings["default"].items(): + override_val = experiment_settings[experiment_id].get(var, default_value) + print("Setting ", var, override_val) + self.__setattr__(var, override_val) + + self.content_loss_weight = content_loss_weight if self.dual_head else None + + self.mb_loader = BalancedMiniBatchLoader( + fold=self.fold, + seed=self.seed, + perc_training_tox=self.perc_training_tox, + mb_size=self.mb_size, + n_outer_splits="time", + scope=scope, + project=project, + dual_head=self.dual_head, + sample_weights=self.sample_weights, + huggingface=("bertweet" in self.model_type), + ) + self._init_dirnames(kw=kw, experiment_id=experiment_id) + print("------- Checking there is a GPU") + check_gpu() + + def _init_dirnames(self, kw, experiment_id): + kw = "test" if self.test else kw + hyper_param_kw = "" + if self.optimizer_name == "AdamW": + hyper_param_kw += f"{self.weight_decay}_" + if self.gradient_clipping: + hyper_param_kw += f"{self.gradient_clipping}_" + if self.content_loss_weight: + hyper_param_kw += f"{self.content_loss_weight}_" + experiment_name = ( + f"{self.language}{str(datetime.now()).replace(' ', '')[:-7]}{kw}_{experiment_id}{self.fold}_" + f"{self.optimizer_name}_" + f"{self.learning_rate}_" + f"{hyper_param_kw}" + f"{self.mb_size}_" + f"{self.perc_training_tox}_" + f"{self.train_epochs}_seed{self.seed}" + ) + print("------- Experiment name: ", experiment_name) + self.logdir = f"..." if self.test else f"..." + self.checkpoint_path = f"{self.model_dir}/{experiment_name}" + + @staticmethod + def _additional_writers(logdir, metric_name): + return tf.summary.create_file_writer(os.path.join(logdir, metric_name)) + + def get_callbacks(self, fold, val_data, test_data): + fold_logdir = self.logdir + f"_fold{fold}" + fold_checkpoint_path = self.checkpoint_path + f"_fold{fold}/{{epoch:02d}}" + + tb_args = { + "log_dir": fold_logdir, + "histogram_freq": 0, + "update_freq": 500, + "embeddings_freq": 0, + "remote_logdir": f"{self.remote_logdir}_{self.language}" + if not self.test + else f"{self.remote_logdir}_test", + } + tensorboard_callback = ( + GradientLoggingTensorBoard( + loader=self.mb_loader, val_data=val_data, freq=10, **tb_args + ) + if self.log_gradients + else SyncingTensorBoard(**tb_args) + ) + + callbacks = [tensorboard_callback] + if "bertweet" in self.model_type: + from_logits = True + dataset_transform_func = self.mb_loader.make_huggingface_tensorflow_ds + else: + from_logits = False + dataset_transform_func = None + + fixed_recall = 0.85 if not self.dual_head else 0.5 + val_callback = AdditionalResultLogger( + data=val_data, + set_="validation", + from_logits=from_logits, + dataset_transform_func=dataset_transform_func, + dual_head=self.dual_head, + fixed_recall=fixed_recall, + ) + if val_callback is not None: + callbacks.append(val_callback) + + test_callback = AdditionalResultLogger( + data=test_data, + set_="test", + from_logits=from_logits, + dataset_transform_func=dataset_transform_func, + dual_head=self.dual_head, + fixed_recall=fixed_recall, + ) + callbacks.append(test_callback) + + checkpoint_args = { + "filepath": fold_checkpoint_path, + "verbose": 0, + "monitor": "val_pr_auc", + "save_weights_only": True, + "mode": "max", + "save_freq": "epoch", + } + if self.stopping_epoch: + checkpoint_callback = ControlledStoppingCheckpointCallback( + **checkpoint_args, + stopping_epoch=self.stopping_epoch, + save_best_only=False, + ) + callbacks.append(checkpoint_callback) + + return callbacks + + def get_lr_schedule(self, 
steps_per_epoch): + total_num_steps = steps_per_epoch * self.train_epochs + + warm_up_perc = WARM_UP_PERC if self.learning_rate >= 1e-3 else 0 + warm_up_steps = int(total_num_steps * warm_up_perc) + if self.linear_lr_decay: + learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( + self.learning_rate, + total_num_steps - warm_up_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False, + ) + else: + print("Constant learning rate") + learning_rate_fn = self.learning_rate + + if warm_up_perc > 0: + print(f".... using warm-up for {warm_up_steps} steps") + warm_up_schedule = WarmUp( + initial_learning_rate=self.learning_rate, + decay_schedule_fn=learning_rate_fn, + warmup_steps=warm_up_steps, + ) + return warm_up_schedule + return learning_rate_fn + + def get_optimizer(self, schedule): + optim_args = { + "learning_rate": schedule, + "beta_1": 0.9, + "beta_2": 0.999, + "epsilon": 1e-6, + "amsgrad": False, + } + if self.gradient_clipping: + optim_args["global_clipnorm"] = self.gradient_clipping + + print(f".... {self.optimizer_name} w global clipnorm {self.gradient_clipping}") + if self.optimizer_name == "Adam": + return tf.keras.optimizers.Adam(**optim_args) + + if self.optimizer_name == "AdamW": + optim_args["weight_decay"] = self.weight_decay + return AdamW(**optim_args) raise NotImplementedError - return data_prepro( - df=df, - label_column=self.label_column, - class_weight=self.perc_training_tox if self.sample_weights == 'class_weight' else None, - filter_low_agreements=self.filter_low_agreements, - num_classes=self.num_classes, - ) - - def load_model(self, optimizer): - smart_bias_value = ( - np.log(self.perc_training_tox / (1 - self.perc_training_tox)) if self.smart_bias_init else 0 - ) - model = load( - optimizer, - seed=self.seed, - trainable=self.trainable, - model_type=self.model_type, - loss_name=self.loss_name, - num_classes=self.num_classes, - additional_layer=self.additional_layer, - smart_bias_value=smart_bias_value, - content_num_classes=self.content_num_classes, - content_loss_name=self.content_loss_name, - content_loss_weight=self.content_loss_weight - ) - - if self.model_reload is not False: - model_folder = upload_model(full_gcs_model_path=os.path.join(self.model_dir, self.model_reload)) - model.load_weights(model_folder) - if self.scratch_last_layer: - print('Putting the last layer back to scratch') - model.layers[-1] = get_last_layer(seed=self.seed, - num_classes=self.num_classes, - smart_bias_value=smart_bias_value) - - return model - - def _train_single_fold(self, mb_generator, test_data, steps_per_epoch, fold, val_data=None): - steps_per_epoch = 100 if self.test else steps_per_epoch - - optimizer, callbacks = self.get_training_actors( - steps_per_epoch=steps_per_epoch, val_data=val_data, test_data=test_data, fold=fold - ) - print("Loading model") - model = self.load_model(optimizer) - print(f"Nb of steps per epoch: {steps_per_epoch} ---- launching training") - training_args = { - "epochs": self.train_epochs, - "steps_per_epoch": steps_per_epoch, - "batch_size": self.mb_size, - "callbacks": callbacks, - "verbose": 2, - } - - model.fit(mb_generator, **training_args) - return - - def train_full_model(self): - print("Setting up random seed.") - set_seeds(self.seed) - - print(f"Loading {self.language} data") - df = self.load_data() - df = self.preprocess(df=df) - - print("Going to train on everything but the test dataset") - mini_batches, test_data, steps_per_epoch = self.mb_loader.simple_cv_load(df) - - self._train_single_fold( - mb_generator=mini_batches, 
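
For readers of `get_lr_schedule` above: the schedule is a linear warm-up that ramps into a linear (`power=1.0`) `PolynomialDecay`. The pipeline's own `WarmUp` class is not shown in this diff, so the sketch below uses a hand-rolled stand-in that takes the same constructor arguments; treat it as an illustration under that assumption, not the actual implementation:

# Rough sketch of the warm-up-into-linear-decay schedule assembled above.
import tensorflow as tf

learning_rate = 1e-3
train_epochs, steps_per_epoch, warm_up_perc = 5, 1000, 0.1

total_num_steps = steps_per_epoch * train_epochs
warm_up_steps = int(total_num_steps * warm_up_perc)

decay = tf.keras.optimizers.schedules.PolynomialDecay(
    learning_rate,
    total_num_steps - warm_up_steps,  # decay runs over the post-warm-up steps
    end_learning_rate=0.0,
    power=1.0,  # power=1.0 makes the decay linear
    cycle=False,
)

class LinearWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Ramps linearly from 0 to initial_learning_rate, then delegates."""

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps):
        self.initial_learning_rate = initial_learning_rate
        self.decay_schedule_fn = decay_schedule_fn
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_lr = self.initial_learning_rate * step / float(self.warmup_steps)
        return tf.cond(
            step < self.warmup_steps,
            lambda: warmup_lr,
            lambda: self.decay_schedule_fn(step - self.warmup_steps),
        )

schedule = LinearWarmUp(learning_rate, decay, warm_up_steps)
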
test_data=test_data, steps_per_epoch=steps_per_epoch, fold="full" - ) - - def train(self): - print("Setting up random seed.") - set_seeds(self.seed) - - print(f"Loading {self.language} data") - df = self.load_data() - df = self.preprocess(df=df) - - print("Loading MB generator") - i = 0 - if self.project == 435 or self.project == 211: - mb_generator, steps_per_epoch, val_data, test_data = self.mb_loader.no_cv_load(full_df=df) - self._train_single_fold( - mb_generator=mb_generator, - val_data=val_data, - test_data=test_data, - steps_per_epoch=steps_per_epoch, - fold=i, - ) - else: - raise ValueError("Sure you want to do multiple fold training") - for mb_generator, steps_per_epoch, val_data, test_data in self.mb_loader(full_df=df): + def get_training_actors(self, steps_per_epoch, val_data, test_data, fold): + callbacks = self.get_callbacks( + fold=fold, val_data=val_data, test_data=test_data + ) + schedule = self.get_lr_schedule(steps_per_epoch=steps_per_epoch) + + optimizer = self.get_optimizer(schedule) + + return optimizer, callbacks + + def load_data(self): + if self.project == 435 or self.project == 211: + if self.dataset_type is None: + data_loader = ENLoader( + project=self.project, setting_file=self.setting_file + ) + dataset_type_args = {} + else: + data_loader = ENLoaderWithSampling( + project=self.project, setting_file=self.setting_file + ) + dataset_type_args = self.dataset_type + + df = data_loader.load_data( + language=self.language, + test=self.test, + reload=self.dataset_reload, + **dataset_type_args, + ) + + return df + + def preprocess(self, df): + if self.project == 435 or self.project == 211: + if self.preprocessing is None: + data_prepro = DefaultENNoPreprocessor() + elif self.preprocessing == "default": + data_prepro = DefaultENPreprocessor() + else: + raise NotImplementedError + + return data_prepro( + df=df, + label_column=self.label_column, + class_weight=self.perc_training_tox + if self.sample_weights == "class_weight" + else None, + filter_low_agreements=self.filter_low_agreements, + num_classes=self.num_classes, + ) + + def load_model(self, optimizer): + smart_bias_value = ( + np.log(self.perc_training_tox / (1 - self.perc_training_tox)) + if self.smart_bias_init + else 0 + ) + model = load( + optimizer, + seed=self.seed, + trainable=self.trainable, + model_type=self.model_type, + loss_name=self.loss_name, + num_classes=self.num_classes, + additional_layer=self.additional_layer, + smart_bias_value=smart_bias_value, + content_num_classes=self.content_num_classes, + content_loss_name=self.content_loss_name, + content_loss_weight=self.content_loss_weight, + ) + + if self.model_reload is not False: + model_folder = upload_model( + full_gcs_model_path=os.path.join(self.model_dir, self.model_reload) + ) + model.load_weights(model_folder) + if self.scratch_last_layer: + print("Putting the last layer back to scratch") + model.layers[-1] = get_last_layer( + seed=self.seed, + num_classes=self.num_classes, + smart_bias_value=smart_bias_value, + ) + + return model + + def _train_single_fold( + self, mb_generator, test_data, steps_per_epoch, fold, val_data=None + ): + steps_per_epoch = 100 if self.test else steps_per_epoch + + optimizer, callbacks = self.get_training_actors( + steps_per_epoch=steps_per_epoch, + val_data=val_data, + test_data=test_data, + fold=fold, + ) + print("Loading model") + model = self.load_model(optimizer) + print(f"Nb of steps per epoch: {steps_per_epoch} ---- launching training") + training_args = { + "epochs": self.train_epochs, + "steps_per_epoch": 
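
One detail in `load_model` above that deserves a gloss: when `smart_bias_init` is on, the final-layer bias is set to `np.log(p / (1 - p))`, the logit of the positive-class prior. With near-zero initial weights, the untrained model then predicts the base rate instead of 0.5, which stabilizes the first training steps on imbalanced data. A quick worked check:

# Why np.log(p / (1 - p)) is a sensible final-layer bias: sigmoid(bias)
# recovers the positive-class prior p when the weights start near zero.
import numpy as np

def smart_bias(p):
    return np.log(p / (1 - p))  # logit of the prior

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

p = 0.3  # e.g. 30% toxic examples in the training mix
b = smart_bias(p)
print(b)           # approx -0.847
print(sigmoid(b))  # 0.3, the initial model output matches the prior
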
steps_per_epoch, + "batch_size": self.mb_size, + "callbacks": callbacks, + "verbose": 2, + } + + model.fit(mb_generator, **training_args) + return + + def train_full_model(self): + print("Setting up random seed.") + set_seeds(self.seed) + + print(f"Loading {self.language} data") + df = self.load_data() + df = self.preprocess(df=df) + + print("Going to train on everything but the test dataset") + mini_batches, test_data, steps_per_epoch = self.mb_loader.simple_cv_load(df) + self._train_single_fold( - mb_generator=mb_generator, - val_data=val_data, - test_data=test_data, - steps_per_epoch=steps_per_epoch, - fold=i, + mb_generator=mini_batches, + test_data=test_data, + steps_per_epoch=steps_per_epoch, + fold="full", ) - i += 1 - if i == 3: - break + + def train(self): + print("Setting up random seed.") + set_seeds(self.seed) + + print(f"Loading {self.language} data") + df = self.load_data() + df = self.preprocess(df=df) + + print("Loading MB generator") + i = 0 + if self.project == 435 or self.project == 211: + ( + mb_generator, + steps_per_epoch, + val_data, + test_data, + ) = self.mb_loader.no_cv_load(full_df=df) + self._train_single_fold( + mb_generator=mb_generator, + val_data=val_data, + test_data=test_data, + steps_per_epoch=steps_per_epoch, + fold=i, + ) + else: + raise ValueError("Sure you want to do multiple fold training") + for mb_generator, steps_per_epoch, val_data, test_data in self.mb_loader( + full_df=df + ): + self._train_single_fold( + mb_generator=mb_generator, + val_data=val_data, + test_data=test_data, + steps_per_epoch=steps_per_epoch, + fold=i, + ) + i += 1 + if i == 3: + break diff --git a/trust_and_safety_models/toxicity/utils/helpers.py b/trust_and_safety_models/toxicity/utils/helpers.py index c21d7eb1c..2147abc95 100644 --- a/trust_and_safety_models/toxicity/utils/helpers.py +++ b/trust_and_safety_models/toxicity/utils/helpers.py @@ -3,97 +3,101 @@ import random as python_random import subprocess -from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR - import numpy as np from sklearn.metrics import precision_recall_curve - +from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR try: - import tensorflow as tf + import tensorflow as tf except ModuleNotFoundError: - pass + pass def upload_model(full_gcs_model_path): - folder_name = full_gcs_model_path - if folder_name[:5] != "gs://": - folder_name = "gs://" + folder_name - - dirname = os.path.dirname(folder_name) - epoch = os.path.basename(folder_name) + folder_name = full_gcs_model_path + if folder_name[:5] != "gs://": + folder_name = "gs://" + folder_name + + dirname = os.path.dirname(folder_name) + epoch = os.path.basename(folder_name) + + model_dir = os.path.join(LOCAL_DIR, "models") + cmd = f"mkdir {model_dir}" + try: + execute_command(cmd) + except subprocess.CalledProcessError: + pass + model_dir = os.path.join(model_dir, os.path.basename(dirname)) + cmd = f"mkdir {model_dir}" + try: + execute_command(cmd) + except subprocess.CalledProcessError: + pass + + try: + _ = int(epoch) + except ValueError: + cmd = f"gsutil rsync -r '{folder_name}' {model_dir}" + weights_dir = model_dir + + else: + cmd = f"gsutil cp '{dirname}/checkpoint' {model_dir}/" + execute_command(cmd) + cmd = f"gsutil cp '{os.path.join(dirname, epoch)}*' {model_dir}/" + weights_dir = f"{model_dir}/{epoch}" - model_dir = os.path.join(LOCAL_DIR, "models") - cmd = f"mkdir {model_dir}" - try: execute_command(cmd) - except subprocess.CalledProcessError: - pass - model_dir = os.path.join(model_dir, 
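
The `upload_model` helper being reformatted here branches on whether the final path component parses as an integer: an all-digits basename is treated as an epoch checkpoint (individual files fetched with `gsutil cp`), anything else as a whole model directory (fetched with `gsutil rsync`). A small sketch of just that dispatch, with made-up GCS paths:

# Hypothetical GCS paths, illustrating upload_model's dispatch only.
import os

for path in ("gs://bucket/models/run1/03", "gs://bucket/models/run1"):
    epoch = os.path.basename(path)
    try:
        int(epoch)  # all-digits basename: a single epoch checkpoint
        print(path, "-> gsutil cp the checkpoint files for epoch", epoch)
    except ValueError:
        print(path, "-> gsutil rsync the whole directory")
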
os.path.basename(dirname)) - cmd = f"mkdir {model_dir}" - try: - execute_command(cmd) - except subprocess.CalledProcessError: - pass - - try: - _ = int(epoch) - except ValueError: - cmd = f"gsutil rsync -r '{folder_name}' {model_dir}" - weights_dir = model_dir + return weights_dir - else: - cmd = f"gsutil cp '{dirname}/checkpoint' {model_dir}/" - execute_command(cmd) - cmd = f"gsutil cp '{os.path.join(dirname, epoch)}*' {model_dir}/" - weights_dir = f"{model_dir}/{epoch}" - - execute_command(cmd) - return weights_dir def compute_precision_fixed_recall(labels, preds, fixed_recall): - precision_values, recall_values, thresholds = precision_recall_curve(y_true=labels, probas_pred=preds) - index_recall = bisect.bisect_left(-recall_values, -1 * fixed_recall) - result = precision_values[index_recall - 1] - print(f"Precision at {recall_values[index_recall-1]} recall: {result}") + precision_values, recall_values, thresholds = precision_recall_curve( + y_true=labels, probas_pred=preds + ) + index_recall = bisect.bisect_left(-recall_values, -1 * fixed_recall) + result = precision_values[index_recall - 1] + print(f"Precision at {recall_values[index_recall-1]} recall: {result}") + + return result, thresholds[index_recall - 1] - return result, thresholds[index_recall - 1] def load_inference_func(model_folder): - model = tf.saved_model.load(model_folder, ["serve"]) - inference_func = model.signatures["serving_default"] - return inference_func + model = tf.saved_model.load(model_folder, ["serve"]) + inference_func = model.signatures["serving_default"] + return inference_func def execute_query(client, query): - job = client.query(query) - df = job.result().to_dataframe() - return df + job = client.query(query) + df = job.result().to_dataframe() + return df def execute_command(cmd, print_=True): - s = subprocess.run(cmd, shell=True, capture_output=print_, check=True) - if print_: - print(s.stderr.decode("utf-8")) - print(s.stdout.decode("utf-8")) + s = subprocess.run(cmd, shell=True, capture_output=print_, check=True) + if print_: + print(s.stderr.decode("utf-8")) + print(s.stdout.decode("utf-8")) def check_gpu(): - try: - execute_command("nvidia-smi") - except subprocess.CalledProcessError: - print("There is no GPU when there should be one.") - raise AttributeError + try: + execute_command("nvidia-smi") + except subprocess.CalledProcessError: + print("There is no GPU when there should be one.") + raise AttributeError - l = tf.config.list_physical_devices("GPU") - if len(l) == 0: - raise ModuleNotFoundError("Tensorflow has not found the GPU. Check your installation") - print(l) + l = tf.config.list_physical_devices("GPU") + if len(l) == 0: + raise ModuleNotFoundError( + "Tensorflow has not found the GPU. 
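
`compute_precision_fixed_recall` above leans on the fact that `sklearn.metrics.precision_recall_curve` returns recall in non-increasing order: negating the array yields a sorted sequence that `bisect_left` can search, and stepping back one index lands on the last curve point whose recall still meets the target. A runnable illustration with toy scores:

# How the fixed-recall operating point is located.
import bisect

import numpy as np
from sklearn.metrics import precision_recall_curve

labels = np.array([0, 0, 1, 1, 1, 0, 1])
preds = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.2, 0.9])

precision_values, recall_values, thresholds = precision_recall_curve(
    y_true=labels, probas_pred=preds
)
fixed_recall = 0.7
# recall_values is non-increasing, so -recall_values is sorted ascending.
index_recall = bisect.bisect_left(-recall_values, -fixed_recall)
# index_recall - 1 is the last curve point with recall >= fixed_recall.
print(precision_values[index_recall - 1], thresholds[index_recall - 1])
# prints 1.0 0.65: at threshold 0.65 recall is 0.75 (>= 0.7), precision 1.0
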
Check your installation" + ) + print(l) def set_seeds(seed): - np.random.seed(seed) + np.random.seed(seed) - python_random.seed(seed) + python_random.seed(seed) - tf.random.set_seed(seed) + tf.random.set_seed(seed) diff --git a/twml/libtwml/setup.py b/twml/libtwml/setup.py index 2dcfa105d..ebd76e577 100644 --- a/twml/libtwml/setup.py +++ b/twml/libtwml/setup.py @@ -1,12 +1,12 @@ """ libtwml setup.py module """ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( - name='libtwml', - version='2.0', - description="Tensorflow C++ ops for twml", - packages=find_packages(), - data_files=[('', ['libtwml_tf.so'])], + name="libtwml", + version="2.0", + description="Tensorflow C++ ops for twml", + packages=find_packages(), + data_files=[("", ["libtwml_tf.so"])], ) diff --git a/twml/libtwml/src/ops/scripts/get_inc.py b/twml/libtwml/src/ops/scripts/get_inc.py index c50edfa90..df92dea44 100644 --- a/twml/libtwml/src/ops/scripts/get_inc.py +++ b/twml/libtwml/src/ops/scripts/get_inc.py @@ -2,4 +2,4 @@ import tensorflow.compat.v1 as tf -print(tf.sysconfig.get_include(), end='') +print(tf.sysconfig.get_include(), end="") diff --git a/twml/libtwml/src/ops/scripts/get_lib.py b/twml/libtwml/src/ops/scripts/get_lib.py index 7150c48b7..c212e27d0 100644 --- a/twml/libtwml/src/ops/scripts/get_lib.py +++ b/twml/libtwml/src/ops/scripts/get_lib.py @@ -2,4 +2,4 @@ import tensorflow.compat.v1 as tf -print(tf.sysconfig.get_lib(), end='') +print(tf.sysconfig.get_lib(), end="") diff --git a/twml/setup.py b/twml/setup.py index 7e4003bae..bdd548874 100644 --- a/twml/setup.py +++ b/twml/setup.py @@ -2,28 +2,27 @@ from setuptools import find_packages, setup - THIS_DIR = os.path.dirname(os.path.realpath(__file__)) -TWML_TEST_DATA_DIR = os.path.join(THIS_DIR, 'twml/tests/data') +TWML_TEST_DATA_DIR = os.path.join(THIS_DIR, "twml/tests/data") data_files = [] for parent, children, files in os.walk(TWML_TEST_DATA_DIR): - data_files += [os.path.join(parent, f) for f in files] + data_files += [os.path.join(parent, f) for f in files] setup( - name='twml', - version='2.0', - description="Tensorflow wrapper for twml", - packages=find_packages(exclude=["build"]), - install_requires=[ - 'thriftpy2', - 'numpy', - 'pyyaml', - 'future', - 'scikit-learn', - 'scipy' - ], - package_data={ - 'twml': data_files, - }, + name="twml", + version="2.0", + description="Tensorflow wrapper for twml", + packages=find_packages(exclude=["build"]), + install_requires=[ + "thriftpy2", + "numpy", + "pyyaml", + "future", + "scikit-learn", + "scipy", + ], + package_data={ + "twml": data_files, + }, ) diff --git a/twml/twml/__init__.py b/twml/twml/__init__.py index 0c96df68b..229fa0542 100644 --- a/twml/twml/__init__.py +++ b/twml/twml/__init__.py @@ -2,60 +2,67 @@ import os +import tensorflow.compat.v1 as tf # noqa: F402 + # Import from twitter.deepbird from twitter.deepbird.logging.log_level import set_logging_level # noqa: F401 from twitter.deepbird.sparse import SparseTensor # noqa: F401 from twitter.deepbird.sparse import sparse_dense_matmul # noqa: F401 -from .util import dynamic_partition, feature_id, limit_bits, limit_sparse_tensor_size # noqa: F401 -from .util import write_file, fixed_length_tensor, setup_tf_logging_formatter # noqa: F401 -from .array import Array # noqa: F401 +from . import constants # noqa: F401 +from . import errors # noqa: F401 +from . import layers # noqa: F401 +from . import lookup # noqa: F401 +from . import readers # noqa: F401 +from . import summary # noqa: F401 +from . 
import tensorboard # noqa: F401 -# Module to parse feature patterns and match them from data_spec.json -from .feature_config import FeatureConfig, FeatureConfigBuilder # noqa: F401 +# Custom argparser for Trainer +from .argument_parser import * # noqa: T400 +from .array import Array # noqa: F401 +from .block_format_writer import * # noqa: T400 # Data record streaming, reading, writing, and parsing. from .dataset import * # noqa: T400 -from .readers import * # noqa: T400 -from .block_format_writer import * # noqa: T400 # Graph output functions from .export_output_fns import * # noqa: T400 -# Input parsers -from .parsers import * # noqa: T400 - -# Input functions -from .input_fns import * # noqa: T400 +# Module to parse feature patterns and match them from data_spec.json +from .feature_config import FeatureConfig, FeatureConfigBuilder # noqa: F401 # Feature filter functions from .filters import * # noqa: T400 -# Custom argparser for Trainer -from .argument_parser import * # noqa: T400 +# Input functions +from .input_fns import * # noqa: T400 -from . import constants # noqa: F401 -from . import errors # noqa: F401 -from . import layers # noqa: F401 -from . import lookup # noqa: F401 -from . import readers # noqa: F401 -from . import summary # noqa: F401 -from . import tensorboard # noqa: F401 +# Input parsers +from .parsers import * # noqa: T400 +from .readers import * # noqa: T400 +from .util import ( + dynamic_partition, + feature_id, # noqa: F401 + fixed_length_tensor, + limit_bits, + limit_sparse_tensor_size, + setup_tf_logging_formatter, + write_file, +) -import tensorflow.compat.v1 as tf # noqa: F402 tf.disable_eager_execution() # TODO: Figure out a better way to deal with this. -if 'OMP_NUM_THREADS' not in os.environ and 'MKL_NUM_THREADS' not in os.environ: - os.environ["OMP_NUM_THREADS"] = '1' +if "OMP_NUM_THREADS" not in os.environ and "MKL_NUM_THREADS" not in os.environ: + os.environ["OMP_NUM_THREADS"] = "1" # Import all custom C++ ops -from libtwml import add1, partition_sparse_tensor, CLIB # noqa: F401 +from libtwml import CLIB, add1, partition_sparse_tensor # noqa: F401 # Configure logging levels to info for various frameworks -set_logging_level('INFO') +set_logging_level("INFO") from . import contrib # noqa: F401 from . import hooks # noqa: F401 -from . import trainers # noqa: F401 from . import metrics # noqa: F401 +from . import trainers # noqa: F401 diff --git a/twml/twml/argument_parser.py b/twml/twml/argument_parser.py index c771eebdf..abd51d590 100644 --- a/twml/twml/argument_parser.py +++ b/twml/twml/argument_parser.py @@ -3,296 +3,447 @@ Command-line argument parsing for the Trainer. """ import argparse +import tempfile from argparse import ArgumentError from operator import attrgetter -import tempfile -import twml import tensorflow.compat.v1 as tf +import twml SERIAL = "serial" TREE = "tree" LOG_LEVELS = { - "debug": tf.logging.DEBUG, - "info": tf.logging.INFO, - "warn": tf.logging.WARN, - "error": tf.logging.ERROR} + "debug": tf.logging.DEBUG, + "info": tf.logging.INFO, + "warn": tf.logging.WARN, + "error": tf.logging.ERROR, +} class SortingHelpFormatter(argparse.HelpFormatter): - """ - Used to sort args alphabetically in the help message. - """ + """ + Used to sort args alphabetically in the help message. 
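
Since `SortingHelpFormatter` is easy to verify in isolation, here is a minimal standalone reproduction of the class above and what it changes: argparse normally lists options in registration order, and sorting `actions` by `option_strings` makes `--help` output alphabetical instead:

# Standalone demo of the SortingHelpFormatter recipe.
import argparse
from operator import attrgetter

class SortingHelpFormatter(argparse.HelpFormatter):
    def add_arguments(self, actions):
        actions = sorted(actions, key=attrgetter("option_strings"))
        super(SortingHelpFormatter, self).add_arguments(actions)

parser = argparse.ArgumentParser(formatter_class=SortingHelpFormatter)
parser.add_argument("--zeta")
parser.add_argument("--alpha")
parser.print_help()  # lists --alpha before --zeta
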
+ """ - def add_arguments(self, actions): - actions = sorted(actions, key=attrgetter('option_strings')) - super(SortingHelpFormatter, self).add_arguments(actions) + def add_arguments(self, actions): + actions = sorted(actions, key=attrgetter("option_strings")) + super(SortingHelpFormatter, self).add_arguments(actions) def _set_log_level(level=None): - """Sets the tensorflow log level to the input level.""" - if level is None: - return None - level = level.lower() - if level not in LOG_LEVELS.keys(): - raise ValueError(f"Unexpected log level {level} was given but expected one of {LOG_LEVELS.keys()}.") - tf.logging.set_verbosity(LOG_LEVELS[level]) - tf.logging.info(f"Setting tensorflow logging level to {level} or {LOG_LEVELS[level]}") - return level + """Sets the tensorflow log level to the input level.""" + if level is None: + return None + level = level.lower() + if level not in LOG_LEVELS.keys(): + raise ValueError( + f"Unexpected log level {level} was given but expected one of {LOG_LEVELS.keys()}." + ) + tf.logging.set_verbosity(LOG_LEVELS[level]) + tf.logging.info( + f"Setting tensorflow logging level to {level} or {LOG_LEVELS[level]}" + ) + return level def get_trainer_parser(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) - - parser.add_argument( - "--save_dir", type=str, default=tempfile.mkdtemp(), - help="Path to the training result directory." - "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR ") - parser.add_argument( - "--export_dir", type=str, default=None, - help="Path to the directory to export a SavedModel for prediction servers.") - parser.add_argument( - "--log_aggregation_app_id", type=str, default=None, - help="specify app_id for log aggregation. disabled by default.") - parser.add_argument( - "--train.batch_size", "--train_batch_size", type=int, default=32, - dest='train_batch_size', - help="number of samples per training batch") - parser.add_argument( - "--eval.batch_size", "--eval_batch_size", type=int, default=32, - dest='eval_batch_size', - help="number of samples per cross-validation batch. Defaults to train_batch_size") - parser.add_argument( - "--train.learning_rate", "--learning_rate", type=float, default=0.002, - dest='learning_rate', - help="learning rate. Scales the gradient update.") - parser.add_argument( - "--train.steps", "--train_steps", type=int, default=-1, - dest='train_steps', - help="number of training batches before running evaluation." - "Defaults to -1 (runs through entire dataset). " - "Only used for Trainer.[train,learn]. " - "For Trainer.train_and_evaluate, use train.max_steps instead. ") - parser.add_argument( - "--eval.steps", "--eval_steps", type=int, default=-1, - dest="eval_steps", - help="number of steps per evaluation. Each batch is a step." - "Defaults to -1 (runs through entire dataset). 
") - parser.add_argument( - "--eval.period", "--eval_period", type=int, default=600, - dest="eval_period", - help="Trainer.train_and_evaluate waits for this long after each evaluation. " - "Defaults to 600 seconds (evaluate every ten minutes). " - "Note that anything lower than 10*60seconds is probably a bad idea because TF saves " - "checkpoints every 10mins by default. eval.delay is time to wait before doing first eval. " - "eval.period is time between successive evals.") - parser.add_argument( - "--eval.delay", "--eval_delay", type=int, default=120, - dest="eval_delay", - help="Trainer.train_and_evaluate waits for this long before performing the first evaluation" - "Defaults to 120 seconds (evaluate after first 2 minutes of training). " - "eval.delay is time to wait before doing first eval. " - "eval.period is time between successive evals.") - parser.add_argument( - "--train.max_steps", "--train_max_steps", type=int, default=None, - dest="train_max_steps", - help="Stop training after this many global steps. Each training batch is its own step." - "If set to None, step after one train()/evaluate() call. Useful when train.steps=-1." - "If set to a non-positive value, loop forever. Usually useful with early stopping.") - parser.add_argument( - "--train.log_metrics", dest="train_log_metrics", action="store_true", default=False, - help="Set this to true to see metrics during training. " - "WARNING: metrics during training does not represent model performance. " - "WARNING: use for debugging only as this slows down training.") - parser.add_argument( - "--train.early_stop_patience", "--early_stop_patience", type=int, default=-1, - dest="early_stop_patience", - help="max number of evaluations (epochs) to wait for an improvement in the early_stop_metric." - "Defaults to -1 (no early-stopping)." - "NOTE: This can not be enabled when --distributed is also set.") - parser.add_argument( - "--train.early_stop_tolerance", "--early_stop_tolerance", type=float, default=0, - dest="early_stop_tolerance", - help="a non-negative tolerance for comparing early_stop_metric." - "e.g. when maximizing the condition is current_metric > best_metric + tolerance." - "Defaults to 0.") - parser.add_argument( - "--train.dataset_shards", "--train_dataset_shards", - dest="train_dataset_shards", - type=int, default=None, - help="An int value that indicates the number of partitions (shards) for the dataset. This is" - " useful for codistillation and other techniques that require each worker to train on disjoint" - " partitions of the dataset.") - parser.add_argument( - "--train.dataset_shard_index", "--train_dataset_shard_index", - dest="train_dataset_shard_index", - type=int, default=None, - help="An int value (starting at zero) that indicates which partition (shard) of the dataset" - " to use if --train.dataset_shards is set.") - parser.add_argument( - "--continue_from_checkpoint", dest="continue_from_checkpoint", action="store_true", - help="DEPRECATED. This option is currently a no-op." - " Continuing from the provided checkpoint is now the default." - " Use --overwrite_save_dir if you would like to override it instead" - " and restart training from scratch.") - parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") - parser.add_argument( - "--data_threads", "--num_threads", type=int, default=2, - dest="num_threads", - help="Number of threads to use for loading the dataset. 
" - "num_threads is deprecated and to be removed in future versions. Use data_threads.") - parser.add_argument( - "--max_duration", "--max_duration", type=float, default=None, - dest="max_duration", - help="Maximum duration (in secs) that training/validation will be allowed to run for before being automatically terminated.") - parser.add_argument( - "--num_workers", type=int, default=None, - help="Number of workers to use when training in hogwild manner on a single node.") - parser.add_argument( - "--distributed", dest="distributed", action="store_true", - help="Pass this flag to use train_and_evaluate to train in a distributed fashion" - "NOTE: You can not use early stopping when --distributed is enabled" - ) - parser.add_argument( - "--distributed_training_cleanup", - dest="distributed_training_cleanup", - action="store_true", - help="Set if using distributed training on GKE to stop TwitterSetDeployment" - "from continuing training upon restarts (will be deprecated once we migrate off" - "TwitterSetDeployment for distributed training on GKE)." - ) - parser.add_argument( - "--disable_auto_ps_shutdown", default=False, action="store_true", - help="Disable the functionality of automatically shutting down parameter server after " - "distributed training complete (either succeed or failed)." - ) - parser.add_argument( - "--disable_tensorboard", default=False, action="store_true", - help="Do not start the TensorBoard server." - ) - parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on. Ignored if --disable_tensorboard is set.") - parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." - "Not user-facing as it is set automatically by the twml_cli." - ) - parser.add_argument( - "--stats_port", type=int, default=None, - help="Port to listen on for stats endpoints" - ) - parser.add_argument( - "--experiment_tracking_path", - dest="experiment_tracking_path", - type=str, default=None, - help="The tracking path of this experiment. Format: \ + """ + Add common commandline args to parse for the Trainer class. + Typically, the user calls this function and then parses cmd-line arguments + into an argparse.Namespace object which is then passed to the Trainer constructor + via the params argument. + + See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ + for a list and description of all cmd-line arguments. + + Args: + learning_rate_decay: + Defaults to False. When True, parses learning rate decay arguments. + + Returns: + argparse.ArgumentParser instance with some useful args already added. + """ + parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) + + parser.add_argument( + "--save_dir", + type=str, + default=tempfile.mkdtemp(), + help="Path to the training result directory." + "supports local filesystem path and hdfs://default/ which requires " + "setting HDFS configuration via env variable HADOOP_CONF_DIR ", + ) + parser.add_argument( + "--export_dir", + type=str, + default=None, + help="Path to the directory to export a SavedModel for prediction servers.", + ) + parser.add_argument( + "--log_aggregation_app_id", + type=str, + default=None, + help="specify app_id for log aggregation. 
disabled by default.", + ) + parser.add_argument( + "--train.batch_size", + "--train_batch_size", + type=int, + default=32, + dest="train_batch_size", + help="number of samples per training batch", + ) + parser.add_argument( + "--eval.batch_size", + "--eval_batch_size", + type=int, + default=32, + dest="eval_batch_size", + help="number of samples per cross-validation batch. Defaults to train_batch_size", + ) + parser.add_argument( + "--train.learning_rate", + "--learning_rate", + type=float, + default=0.002, + dest="learning_rate", + help="learning rate. Scales the gradient update.", + ) + parser.add_argument( + "--train.steps", + "--train_steps", + type=int, + default=-1, + dest="train_steps", + help="number of training batches before running evaluation." + "Defaults to -1 (runs through entire dataset). " + "Only used for Trainer.[train,learn]. " + "For Trainer.train_and_evaluate, use train.max_steps instead. ", + ) + parser.add_argument( + "--eval.steps", + "--eval_steps", + type=int, + default=-1, + dest="eval_steps", + help="number of steps per evaluation. Each batch is a step." + "Defaults to -1 (runs through entire dataset). ", + ) + parser.add_argument( + "--eval.period", + "--eval_period", + type=int, + default=600, + dest="eval_period", + help="Trainer.train_and_evaluate waits for this long after each evaluation. " + "Defaults to 600 seconds (evaluate every ten minutes). " + "Note that anything lower than 10*60seconds is probably a bad idea because TF saves " + "checkpoints every 10mins by default. eval.delay is time to wait before doing first eval. " + "eval.period is time between successive evals.", + ) + parser.add_argument( + "--eval.delay", + "--eval_delay", + type=int, + default=120, + dest="eval_delay", + help="Trainer.train_and_evaluate waits for this long before performing the first evaluation" + "Defaults to 120 seconds (evaluate after first 2 minutes of training). " + "eval.delay is time to wait before doing first eval. " + "eval.period is time between successive evals.", + ) + parser.add_argument( + "--train.max_steps", + "--train_max_steps", + type=int, + default=None, + dest="train_max_steps", + help="Stop training after this many global steps. Each training batch is its own step." + "If set to None, step after one train()/evaluate() call. Useful when train.steps=-1." + "If set to a non-positive value, loop forever. Usually useful with early stopping.", + ) + parser.add_argument( + "--train.log_metrics", + dest="train_log_metrics", + action="store_true", + default=False, + help="Set this to true to see metrics during training. " + "WARNING: metrics during training does not represent model performance. " + "WARNING: use for debugging only as this slows down training.", + ) + parser.add_argument( + "--train.early_stop_patience", + "--early_stop_patience", + type=int, + default=-1, + dest="early_stop_patience", + help="max number of evaluations (epochs) to wait for an improvement in the early_stop_metric." + "Defaults to -1 (no early-stopping)." + "NOTE: This can not be enabled when --distributed is also set.", + ) + parser.add_argument( + "--train.early_stop_tolerance", + "--early_stop_tolerance", + type=float, + default=0, + dest="early_stop_tolerance", + help="a non-negative tolerance for comparing early_stop_metric." + "e.g. when maximizing the condition is current_metric > best_metric + tolerance." 
+ "Defaults to 0.", + ) + parser.add_argument( + "--train.dataset_shards", + "--train_dataset_shards", + dest="train_dataset_shards", + type=int, + default=None, + help="An int value that indicates the number of partitions (shards) for the dataset. This is" + " useful for codistillation and other techniques that require each worker to train on disjoint" + " partitions of the dataset.", + ) + parser.add_argument( + "--train.dataset_shard_index", + "--train_dataset_shard_index", + dest="train_dataset_shard_index", + type=int, + default=None, + help="An int value (starting at zero) that indicates which partition (shard) of the dataset" + " to use if --train.dataset_shards is set.", + ) + parser.add_argument( + "--continue_from_checkpoint", + dest="continue_from_checkpoint", + action="store_true", + help="DEPRECATED. This option is currently a no-op." + " Continuing from the provided checkpoint is now the default." + " Use --overwrite_save_dir if you would like to override it instead" + " and restart training from scratch.", + ) + parser.add_argument( + "--overwrite_save_dir", + dest="overwrite_save_dir", + action="store_true", + help="Delete the contents of the current save_dir if it exists", + ) + parser.add_argument( + "--data_threads", + "--num_threads", + type=int, + default=2, + dest="num_threads", + help="Number of threads to use for loading the dataset. " + "num_threads is deprecated and to be removed in future versions. Use data_threads.", + ) + parser.add_argument( + "--max_duration", + "--max_duration", + type=float, + default=None, + dest="max_duration", + help="Maximum duration (in secs) that training/validation will be allowed to run for before being automatically terminated.", + ) + parser.add_argument( + "--num_workers", + type=int, + default=None, + help="Number of workers to use when training in hogwild manner on a single node.", + ) + parser.add_argument( + "--distributed", + dest="distributed", + action="store_true", + help="Pass this flag to use train_and_evaluate to train in a distributed fashion" + "NOTE: You can not use early stopping when --distributed is enabled", + ) + parser.add_argument( + "--distributed_training_cleanup", + dest="distributed_training_cleanup", + action="store_true", + help="Set if using distributed training on GKE to stop TwitterSetDeployment" + "from continuing training upon restarts (will be deprecated once we migrate off" + "TwitterSetDeployment for distributed training on GKE).", + ) + parser.add_argument( + "--disable_auto_ps_shutdown", + default=False, + action="store_true", + help="Disable the functionality of automatically shutting down parameter server after " + "distributed training complete (either succeed or failed).", + ) + parser.add_argument( + "--disable_tensorboard", + default=False, + action="store_true", + help="Do not start the TensorBoard server.", + ) + parser.add_argument( + "--tensorboard_port", + type=int, + default=None, + help="Port for tensorboard to run on. Ignored if --disable_tensorboard is set.", + ) + parser.add_argument( + "--health_port", + type=int, + default=None, + help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." + "Not user-facing as it is set automatically by the twml_cli.", + ) + parser.add_argument( + "--stats_port", + type=int, + default=None, + help="Port to listen on for stats endpoints", + ) + parser.add_argument( + "--experiment_tracking_path", + dest="experiment_tracking_path", + type=str, + default=None, + help="The tracking path of this experiment. 
Format: \ user_name:project_name:experiment_name:run_name. The path is used to track and display \ a record of this experiment on ML Dashboard. Note: this embedded experiment tracking is \ - disabled when the deprecated Model Repo TrackRun is used in your model config. ") - parser.add_argument( - "--disable_experiment_tracking", - dest="disable_experiment_tracking", - action="store_true", - help="Whether experiment tracking should be disabled.") - parser.add_argument( - "--config.save_checkpoints_secs", "--save_checkpoints_secs", type=int, default=600, - dest='save_checkpoints_secs', - help="Configures the tf.estimator.RunConfig.save_checkpoints_secs attribute. " - "Specifies how often checkpoints are saved in seconds. Defaults to 10*60 seconds.") - parser.add_argument( - "--config.keep_checkpoint_max", "--keep_checkpoint_max", type=int, default=20, - dest='keep_checkpoint_max', - help="Configures the tf.estimator.RunConfig.keep_checkpoint_max attribute. " - "Specifies how many checkpoints to keep. Defaults to 20.") - parser.add_argument( - "--config.tf_random_seed", "--tf_random_seed", type=int, default=None, - dest='tf_random_seed', - help="Configures the tf.estimator.RunConfig.tf_random_seed attribute. " - "Specifies the seed to use. Defaults to None.") - parser.add_argument( - "--optimizer", type=str, default='SGD', - help="Optimizer to use: SGD (Default), Adagrad, Adam, Ftrl, Momentum, RMSProp, LazyAdam, DGC.") - parser.add_argument( - "--gradient_noise_scale", type=float, default=None, - help="adds 0-mean normal noise scaled by this value. Defaults to None.") - parser.add_argument( - "--clip_gradients", type=float, default=None, - help="If specified, a global clipping is applied to prevent " - "the norm of the gradient to exceed this value. Defaults to None.") - parser.add_argument( - "--dgc.density", "--dgc_density", type=float, default=0.1, - dest="dgc_density", - help="Specifies gradient density level when using deep gradient compression optimizer." - "E.g., default value being 0.1 means that only top 10%% most significant rows " - "(based on absolute value sums) are kept." - ) - parser.add_argument( - "--dgc.density_decay", "--dgc_density_decay", type=bool, default=True, - dest="dgc_density_decay", - help="Specifies whether to (exponentially) decay the gradient density level when" - " doing gradient compression. If set 'False', the 'density_decay_steps', " - "'density_decay_rate' and 'min_density' arguments will be ignored." - ) - parser.add_argument( - "--dgc.density_decay_steps", "--dgc_density_decay_steps", type=int, default=10000, - dest="dgc_density_decay_steps", - help="Specifies the step interval to perform density decay." - ) - parser.add_argument( - "--dgc.density_decay_rate", "--dgc_density_decay_rate", type=float, default=0.5, - dest="dgc_density_decay_rate", - help="Specifies the decay rate when perfoming density decay." - ) - parser.add_argument( - "--dgc.min_density", "--dgc_min_density", type=float, default=0.1, - dest="dgc_min_density", - help="Specifies the minimum density level when perfoming density decay." - ) - parser.add_argument( - "--dgc.accumulation", "--dgc_accumulation", type=bool, default=False, - dest="dgc_accumulation", - help="Specifies whether to accumulate small gradients when using deep gradient compression " - "optimizer." - ) - parser.add_argument( - "--show_optimizer_summaries", dest="show_optimizer_summaries", action="store_true", - help="When specified, displays gradients and learning rate in tensorboard." 
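
A caveat about the `--dgc.density_decay` and `--dgc.accumulation` definitions in this parser, which keep `type=bool` through the reformatting: argparse applies `bool()` to the raw string, and every non-empty string is truthy, so passing `False` on the command line still yields `True`. A common workaround is an explicit converter, sketched below with a hypothetical `str2bool` helper:

# The type=bool pitfall, and the usual explicit-converter workaround.
import argparse

def str2bool(value):
    if value.lower() in ("yes", "true", "1"):
        return True
    if value.lower() in ("no", "false", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--naive", type=bool, default=True)
parser.add_argument("--safe", type=str2bool, default=True)

args = parser.parse_args(["--naive", "False", "--safe", "False"])
print(args.naive)  # True, because bool("False") is truthy
print(args.safe)   # False
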
- "Turning it on has 10-20%% performance hit. Enable for debugging only") - - parser.add_argument( - "--num_mkl_threads", dest="num_mkl_threads", default=1, type=int, - help="Specifies how many threads to use for MKL" - "inter_op_ parallelism_threds is set to TWML_NUM_CPUS / num_mkl_threads." - "intra_op_parallelism_threads is set to num_mkl_threads.") - - parser.add_argument("--verbosity", type=_set_log_level, choices=LOG_LEVELS.keys(), default=None, - help="Sets log level to a given verbosity.") - - parser.add_argument( - "--feature_importance.algorithm", dest="feature_importance_algorithm", - type=str, default=TREE, choices=[SERIAL, TREE], - help=""" + disabled when the deprecated Model Repo TrackRun is used in your model config. ", + ) + parser.add_argument( + "--disable_experiment_tracking", + dest="disable_experiment_tracking", + action="store_true", + help="Whether experiment tracking should be disabled.", + ) + parser.add_argument( + "--config.save_checkpoints_secs", + "--save_checkpoints_secs", + type=int, + default=600, + dest="save_checkpoints_secs", + help="Configures the tf.estimator.RunConfig.save_checkpoints_secs attribute. " + "Specifies how often checkpoints are saved in seconds. Defaults to 10*60 seconds.", + ) + parser.add_argument( + "--config.keep_checkpoint_max", + "--keep_checkpoint_max", + type=int, + default=20, + dest="keep_checkpoint_max", + help="Configures the tf.estimator.RunConfig.keep_checkpoint_max attribute. " + "Specifies how many checkpoints to keep. Defaults to 20.", + ) + parser.add_argument( + "--config.tf_random_seed", + "--tf_random_seed", + type=int, + default=None, + dest="tf_random_seed", + help="Configures the tf.estimator.RunConfig.tf_random_seed attribute. " + "Specifies the seed to use. Defaults to None.", + ) + parser.add_argument( + "--optimizer", + type=str, + default="SGD", + help="Optimizer to use: SGD (Default), Adagrad, Adam, Ftrl, Momentum, RMSProp, LazyAdam, DGC.", + ) + parser.add_argument( + "--gradient_noise_scale", + type=float, + default=None, + help="adds 0-mean normal noise scaled by this value. Defaults to None.", + ) + parser.add_argument( + "--clip_gradients", + type=float, + default=None, + help="If specified, a global clipping is applied to prevent " + "the norm of the gradient to exceed this value. Defaults to None.", + ) + parser.add_argument( + "--dgc.density", + "--dgc_density", + type=float, + default=0.1, + dest="dgc_density", + help="Specifies gradient density level when using deep gradient compression optimizer." + "E.g., default value being 0.1 means that only top 10%% most significant rows " + "(based on absolute value sums) are kept.", + ) + parser.add_argument( + "--dgc.density_decay", + "--dgc_density_decay", + type=bool, + default=True, + dest="dgc_density_decay", + help="Specifies whether to (exponentially) decay the gradient density level when" + " doing gradient compression. 
If set 'False', the 'density_decay_steps', "
+        "'density_decay_rate' and 'min_density' arguments will be ignored.",
+    )
+    parser.add_argument(
+        "--dgc.density_decay_steps",
+        "--dgc_density_decay_steps",
+        type=int,
+        default=10000,
+        dest="dgc_density_decay_steps",
+        help="Specifies the step interval to perform density decay.",
+    )
+    parser.add_argument(
+        "--dgc.density_decay_rate",
+        "--dgc_density_decay_rate",
+        type=float,
+        default=0.5,
+        dest="dgc_density_decay_rate",
+        help="Specifies the decay rate when performing density decay.",
+    )
+    parser.add_argument(
+        "--dgc.min_density",
+        "--dgc_min_density",
+        type=float,
+        default=0.1,
+        dest="dgc_min_density",
+        help="Specifies the minimum density level when performing density decay.",
+    )
+    parser.add_argument(
+        "--dgc.accumulation",
+        "--dgc_accumulation",
+        type=bool,
+        default=False,
+        dest="dgc_accumulation",
+        help="Specifies whether to accumulate small gradients when using deep gradient compression "
+        "optimizer.",
+    )
+    parser.add_argument(
+        "--show_optimizer_summaries",
+        dest="show_optimizer_summaries",
+        action="store_true",
+        help="When specified, displays gradients and learning rate in tensorboard. "
+        "Turning it on has 10-20%% performance hit. Enable for debugging only.",
+    )
+
+    parser.add_argument(
+        "--num_mkl_threads",
+        dest="num_mkl_threads",
+        default=1,
+        type=int,
+        help="Specifies how many threads to use for MKL. "
+        "inter_op_parallelism_threads is set to TWML_NUM_CPUS / num_mkl_threads. "
+        "intra_op_parallelism_threads is set to num_mkl_threads.",
+    )
+
+    parser.add_argument(
+        "--verbosity",
+        type=_set_log_level,
+        choices=LOG_LEVELS.keys(),
+        default=None,
+        help="Sets log level to a given verbosity.",
+    )
+
+    parser.add_argument(
+        "--feature_importance.algorithm",
+        dest="feature_importance_algorithm",
+        type=str,
+        default=TREE,
+        choices=[SERIAL, TREE],
+        help="""
 There are two algorithms that the module supports, `serial` and `tree`. The
 `serial` algorithm computes feature importances for each feature, and the
 `tree` algorithm groups features by feature name prefix, computes feature
@@ -302,260 +453,345 @@ def get_trainer_parser():
 upper bound rather than an exact importance value. We suggest that users
 generally stick to the `tree` algorithm, unless if they have a very small
 number of features or near-random model performance.
-  """)
+    """,
+    )
 
-  parser.add_argument(
-    "--feature_importance.sensitivity", dest="feature_importance_sensitivity", type=float, default=0.03,
-    help="""
+    parser.add_argument(
+        "--feature_importance.sensitivity",
+        dest="feature_importance_sensitivity",
+        type=float,
+        default=0.03,
+        help="""
 The maximum amount that permuting a feature group can cause the model
 performance (determined by `feature_importance.metric`) to drop before the
 algorithm decides to not expand the feature group. This is only used for the
 `tree` algorithm.
-  """)
+    """,
+    )
 
-  parser.add_argument(
-    "--feature_importance.dont_build_tree", dest="dont_build_tree", action="store_true", default=False,
-    help="""
+    parser.add_argument(
+        "--feature_importance.dont_build_tree",
+        dest="dont_build_tree",
+        action="store_true",
+        default=False,
+        help="""
 If True, don't build the feature trie for the tree algorithm and only use the
 extra_groups
-  """)
+    """,
+    )
 
-  parser.add_argument(
-    "--feature_importance.split_feature_group_on_period", dest="split_feature_group_on_period", action="store_true", default=False,
-    help="If true, split feature groups by the period rather than the optimal prefix. 
Only used for the TREE algorithm") + parser.add_argument( + "--feature_importance.split_feature_group_on_period", + dest="split_feature_group_on_period", + action="store_true", + default=False, + help="If true, split feature groups by the period rather than the optimal prefix. Only used for the TREE algorithm", + ) - parser.add_argument( - "--feature_importance.example_count", dest="feature_importance_example_count", type=int, default=10000, - help=""" + parser.add_argument( + "--feature_importance.example_count", + dest="feature_importance_example_count", + type=int, + default=10000, + help=""" The number of examples used to compute feature importance. Larger values yield more reliable results, but also take longer to compute. These records are loaded into memory. This number is agnostic to batch size. - """) - - parser.add_argument( - "--feature_importance.data_dir", dest="feature_importance_data_dir", type=str, default=None, - help="Path to the dataset used to compute feature importance." - "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR " - "Defaults to eval_data_dir") - - parser.add_argument( - "--feature_importance.metric", dest="feature_importance_metric", type=str, default="roc_auc", - help="The metric used to determine when to stop expanding the feature importance tree. This is only used for the `tree` algorithm.") - - parser.add_argument( - "--feature_importance.is_metric_larger_the_better", dest="feature_importance_is_metric_larger_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where larger values are better (e.g. ROC_AUC)") - - parser.add_argument( - "--feature_importance.is_metric_smaller_the_better", dest="feature_importance_is_metric_smaller_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where smaller values are better (e.g. LOSS)") - - subparsers = parser.add_subparsers(help='Learning Rate Decay Functions. Can only pass 1.' - 'Should be specified after all the optional arguments' - 'and followed by its specific args' - 'e.g. --learning_rate 0.01 inverse_learning_rate_decay_fn' - ' --decay_rate 0.0004 --min_learning_rate 0.001', - dest='learning_rate_decay') - - # Create the parser for the "exponential_learning_rate_decay_fn" - parser_exponential = subparsers.add_parser('exponential_learning_rate_decay', - help='Exponential learning rate decay. ' - 'Exponential decay implements:' - 'decayed_learning_rate = learning_rate * ' - 'exponential_decay_rate ^ ' - '(global_step / decay_steps') - parser_exponential.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'exponential' learning_rate_decay.") - parser_exponential.add_argument( - "--exponential_decay_rate", type=float, default=None, - help="Required for 'exponential' learning_rate_decay. Must be positive. ") - - # Create the parser for the "polynomial_learning_rate_decay_fn" - parser_polynomial = subparsers.add_parser('polynomial_learning_rate_decay', - help='Polynomial learning rate decay. 
' - 'Polynomial decay implements: ' - 'global_step = min(global_step, decay_steps)' - 'decayed_learning_rate = ' - '(learning_rate - end_learning_rate) * ' - '(1 - global_step / decay_steps) ^ ' - '(polynomial_power) + end_learning_rate' - 'So for linear decay you can use a ' - 'polynomial_power=1 (the default)') - parser_polynomial.add_argument( - "--end_learning_rate", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay (ignored otherwise).") - parser_polynomial.add_argument( - "--polynomial_power", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay." - "The power of the polynomial. Defaults to linear, 1.0.") - parser_polynomial.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'polynomial' learning_rate_decay. ") - - # Create the parser for the "piecewise_constant_learning_rate_decay_fn" - parser_piecewise_constant = subparsers.add_parser('piecewise_constant_learning_rate_decay', - help='Piecewise Constant ' - 'learning rate decay. ' - 'For piecewise_constant, ' - 'consider this example: ' - 'We want to use a learning rate ' - 'that is 1.0 for' - 'the first 100000 steps,' - '0.5 for steps 100001 to 110000, ' - 'and 0.1 for any additional steps. ' - 'To do so, specify ' - '--piecewise_constant_boundaries=100000,110000' - '--piecewise_constant_values=1.0,0.5,0.1') - parser_piecewise_constant.add_argument( - "--piecewise_constant_values", - action=parse_comma_separated_list(element_type=float), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated floats or ints that specifies the values " - "for the intervals defined by boundaries. It should have one more " - "element than boundaries.") - parser_piecewise_constant.add_argument( - "--piecewise_constant_boundaries", - action=parse_comma_separated_list(element_type=int), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated integers, with strictly increasing entries.") - - # Create the parser for the "inverse_learning_rate_decay_fn" - parser_inverse = subparsers.add_parser('inverse_learning_rate_decay', - help='Inverse Leaning rate decay. ' - 'Inverse implements:' - 'decayed_lr = max(lr /(1 + decay_rate * ' - 'floor(global_step /decay_step)),' - ' min_learning_rate)' - 'When decay_step=1 this mimics the behaviour' - 'of the default learning rate decay' - 'of DeepBird v1.') - - parser_inverse.add_argument( - "--decay_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay. Rate in which we decay the learning rate.") - parser_inverse.add_argument( - "--min_learning_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay.Minimum possible learning_rate.") - parser_inverse.add_argument( - "--decay_steps", type=float, default=1, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_learning_rate_decay_fn" - parser_cosine = subparsers.add_parser('cosine_learning_rate_decay', - help='Cosine Leaning rate decay. 
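# The inverse_learning_rate_decay formula from the help text, written out;
# with decay_steps=1 it reproduces the DeepBird v1 default decay mentioned
# in the help string:
import math

def inverse_decay(lr, global_step, decay_rate, decay_steps=1, min_learning_rate=0.0):
    decayed = lr / (1.0 + decay_rate * math.floor(global_step / decay_steps))
    return max(decayed, min_learning_rate)

# e.g. the CLI example above: lr=0.01, decay_rate=0.0004, min lr 0.001
assert abs(inverse_decay(0.01, 10000, 0.0004, 1, 0.001) - 0.002) < 1e-12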
' - 'Cosine implements:' - 'decayed_lr = 0.5 * (1 + cos(pi *\ - global_step / decay_steps)) * lr' - ) - - parser_cosine.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number.\ - Minimum learning rate value as a fraction of learning_rate.") - parser_cosine.add_argument( - "--decay_steps", type=float, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_restart_learning_rate_decay_fn" - parser_cosine_restart = subparsers.add_parser('cosine_restarts_learning_rate_decay', - help='Applies cosine decay with restarts \ - to the learning rate' - 'See [Loshchilov & Hutter, ICLR2016],\ - SGDR: Stochastic' - 'Gradient Descent with Warm Restarts.' - 'https://arxiv.org/abs/1608.03983' - ) - parser_cosine_restart.add_argument( - "--first_decay_steps", type=float, - help="Required for 'cosine_restart' learning_rate_decay.") - parser_cosine_restart.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number. \ - Minimum learning rate value as a fraction of learning_rate.") - parser_cosine_restart.add_argument( - "--t_mul", type=float, default=2, - help="A scalar float32 or float64 Tensor or a Python number. \ - Used to derive the number of iterations in the i-th period") - parser_cosine_restart.add_argument( - "--m_mul", type=float, default=1, - help="A scalar float32 or float64 Tensor or a Python number. \ - Used to derive the initial learning rate of the i-th period.") - - # Create dummy parser for None, which is the default. - parser_default = subparsers.add_parser( - 'no_learning_rate_decay', - help='No learning rate decay') # noqa: F841 - - parser.set_default_subparser('no_learning_rate_decay') - - return parser + """, + ) + parser.add_argument( + "--feature_importance.data_dir", + dest="feature_importance_data_dir", + type=str, + default=None, + help="Path to the dataset used to compute feature importance." + "supports local filesystem path and hdfs://default/ which requires " + "setting HDFS configuration via env variable HADOOP_CONF_DIR " + "Defaults to eval_data_dir", + ) -class DefaultSubcommandArgParse(argparse.ArgumentParser): - """ - Subclass of argparse.ArgumentParser that sets default parser - """ - _DEFAULT_SUBPARSER = None + parser.add_argument( + "--feature_importance.metric", + dest="feature_importance_metric", + type=str, + default="roc_auc", + help="The metric used to determine when to stop expanding the feature importance tree. This is only used for the `tree` algorithm.", + ) - def set_default_subparser(self, name): - """ - sets the default subparser - """ - self._DEFAULT_SUBPARSER = name + parser.add_argument( + "--feature_importance.is_metric_larger_the_better", + dest="feature_importance_is_metric_larger_the_better", + action="store_true", + default=False, + help="If true, interpret `--feature_importance.metric` to be a metric where larger values are better (e.g. ROC_AUC)", + ) - def _parse_known_args(self, arg_strings, *args, **kwargs): + parser.add_argument( + "--feature_importance.is_metric_smaller_the_better", + dest="feature_importance_is_metric_smaller_the_better", + action="store_true", + default=False, + help="If true, interpret `--feature_importance.metric` to be a metric where smaller values are better (e.g. LOSS)", + ) + + subparsers = parser.add_subparsers( + help="Learning Rate Decay Functions. Can only pass 1." + "Should be specified after all the optional arguments" + "and followed by its specific args" + "e.g. 
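# The cosine_learning_rate_decay formula from the help text, with the alpha
# floor folded in the way tf.train.cosine_decay documents it (alpha is the
# minimum learning rate as a fraction of lr):
import math

def cosine_decay(lr, global_step, decay_steps, alpha=0.0):
    step = min(global_step, decay_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
    return lr * ((1.0 - alpha) * cosine + alpha)

assert cosine_decay(0.1, 0, 1000) == 0.1            # starts at lr
assert abs(cosine_decay(0.1, 1000, 1000)) < 1e-12   # ends at alpha * lr == 0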
--learning_rate 0.01 inverse_learning_rate_decay_fn" + " --decay_rate 0.0004 --min_learning_rate 0.001", + dest="learning_rate_decay", + ) + + # Create the parser for the "exponential_learning_rate_decay_fn" + parser_exponential = subparsers.add_parser( + "exponential_learning_rate_decay", + help="Exponential learning rate decay. " + "Exponential decay implements:" + "decayed_learning_rate = learning_rate * " + "exponential_decay_rate ^ " + "(global_step / decay_steps", + ) + parser_exponential.add_argument( + "--decay_steps", + type=float, + default=None, + help="Required for 'exponential' learning_rate_decay.", + ) + parser_exponential.add_argument( + "--exponential_decay_rate", + type=float, + default=None, + help="Required for 'exponential' learning_rate_decay. Must be positive. ", + ) + + # Create the parser for the "polynomial_learning_rate_decay_fn" + parser_polynomial = subparsers.add_parser( + "polynomial_learning_rate_decay", + help="Polynomial learning rate decay. " + "Polynomial decay implements: " + "global_step = min(global_step, decay_steps)" + "decayed_learning_rate = " + "(learning_rate - end_learning_rate) * " + "(1 - global_step / decay_steps) ^ " + "(polynomial_power) + end_learning_rate" + "So for linear decay you can use a " + "polynomial_power=1 (the default)", + ) + parser_polynomial.add_argument( + "--end_learning_rate", + type=float, + default=0.0001, + help="Required for 'polynomial' learning_rate_decay (ignored otherwise).", + ) + parser_polynomial.add_argument( + "--polynomial_power", + type=float, + default=0.0001, + help="Required for 'polynomial' learning_rate_decay." + "The power of the polynomial. Defaults to linear, 1.0.", + ) + parser_polynomial.add_argument( + "--decay_steps", + type=float, + default=None, + help="Required for 'polynomial' learning_rate_decay. ", + ) + + # Create the parser for the "piecewise_constant_learning_rate_decay_fn" + parser_piecewise_constant = subparsers.add_parser( + "piecewise_constant_learning_rate_decay", + help="Piecewise Constant " + "learning rate decay. " + "For piecewise_constant, " + "consider this example: " + "We want to use a learning rate " + "that is 1.0 for" + "the first 100000 steps," + "0.5 for steps 100001 to 110000, " + "and 0.1 for any additional steps. " + "To do so, specify " + "--piecewise_constant_boundaries=100000,110000" + "--piecewise_constant_values=1.0,0.5,0.1", + ) + parser_piecewise_constant.add_argument( + "--piecewise_constant_values", + action=parse_comma_separated_list(element_type=float), + default=None, + help="Required for 'piecewise_constant_values' learning_rate_decay. " + "A list of comma seperated floats or ints that specifies the values " + "for the intervals defined by boundaries. It should have one more " + "element than boundaries.", + ) + parser_piecewise_constant.add_argument( + "--piecewise_constant_boundaries", + action=parse_comma_separated_list(element_type=int), + default=None, + help="Required for 'piecewise_constant_values' learning_rate_decay. " + "A list of comma seperated integers, with strictly increasing entries.", + ) + + # Create the parser for the "inverse_learning_rate_decay_fn" + parser_inverse = subparsers.add_parser( + "inverse_learning_rate_decay", + help="Inverse Leaning rate decay. 
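# What the piecewise_constant flags mean at lookup time: `values` has one
# more element than `boundaries`, and the learning rate is the value of the
# interval the current step falls into (mirroring the example in the help
# text above):
import bisect

def piecewise_constant(global_step, boundaries, values):
    assert len(values) == len(boundaries) + 1
    return values[bisect.bisect_left(boundaries, global_step)]

boundaries, values = [100000, 110000], [1.0, 0.5, 0.1]
assert piecewise_constant(100000, boundaries, values) == 1.0
assert piecewise_constant(110000, boundaries, values) == 0.5
assert piecewise_constant(200000, boundaries, values) == 0.1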
" + "Inverse implements:" + "decayed_lr = max(lr /(1 + decay_rate * " + "floor(global_step /decay_step))," + " min_learning_rate)" + "When decay_step=1 this mimics the behaviour" + "of the default learning rate decay" + "of DeepBird v1.", + ) + + parser_inverse.add_argument( + "--decay_rate", + type=float, + default=None, + help="Required for 'inverse' learning_rate_decay. Rate in which we decay the learning rate.", + ) + parser_inverse.add_argument( + "--min_learning_rate", + type=float, + default=None, + help="Required for 'inverse' learning_rate_decay.Minimum possible learning_rate.", + ) + parser_inverse.add_argument( + "--decay_steps", + type=float, + default=1, + help="Required for 'inverse' learning_rate_decay.", + ) + + # Create the parser for the "cosine_learning_rate_decay_fn" + parser_cosine = subparsers.add_parser( + "cosine_learning_rate_decay", + help="Cosine Leaning rate decay. " + "Cosine implements:" + "decayed_lr = 0.5 * (1 + cos(pi *\ + global_step / decay_steps)) * lr", + ) + + parser_cosine.add_argument( + "--alpha", + type=float, + default=0, + help="A scalar float32 or float64 Tensor or a Python number.\ + Minimum learning rate value as a fraction of learning_rate.", + ) + parser_cosine.add_argument( + "--decay_steps", type=float, help="Required for 'inverse' learning_rate_decay." + ) + + # Create the parser for the "cosine_restart_learning_rate_decay_fn" + parser_cosine_restart = subparsers.add_parser( + "cosine_restarts_learning_rate_decay", + help="Applies cosine decay with restarts \ + to the learning rate" + "See [Loshchilov & Hutter, ICLR2016],\ + SGDR: Stochastic" + "Gradient Descent with Warm Restarts." + "https://arxiv.org/abs/1608.03983", + ) + parser_cosine_restart.add_argument( + "--first_decay_steps", + type=float, + help="Required for 'cosine_restart' learning_rate_decay.", + ) + parser_cosine_restart.add_argument( + "--alpha", + type=float, + default=0, + help="A scalar float32 or float64 Tensor or a Python number. \ + Minimum learning rate value as a fraction of learning_rate.", + ) + parser_cosine_restart.add_argument( + "--t_mul", + type=float, + default=2, + help="A scalar float32 or float64 Tensor or a Python number. \ + Used to derive the number of iterations in the i-th period", + ) + parser_cosine_restart.add_argument( + "--m_mul", + type=float, + default=1, + help="A scalar float32 or float64 Tensor or a Python number. \ + Used to derive the initial learning rate of the i-th period.", + ) + + # Create dummy parser for None, which is the default. 
+ parser_default = subparsers.add_parser( + "no_learning_rate_decay", help="No learning rate decay" + ) # noqa: F841 + + parser.set_default_subparser("no_learning_rate_decay") + + return parser + + +class DefaultSubcommandArgParse(argparse.ArgumentParser): """ - Overwrites _parse_known_args + Subclass of argparse.ArgumentParser that sets default parser """ - in_args = set(arg_strings) - d_sp = self._DEFAULT_SUBPARSER - if d_sp is not None and not {'-h', '--help'}.intersection(in_args): - for x_val in self._subparsers._actions: - subparser_found = ( - isinstance(x_val, argparse._SubParsersAction) and - in_args.intersection(x_val._name_parser_map.keys()) + + _DEFAULT_SUBPARSER = None + + def set_default_subparser(self, name): + """ + sets the default subparser + """ + self._DEFAULT_SUBPARSER = name + + def _parse_known_args(self, arg_strings, *args, **kwargs): + """ + Overwrites _parse_known_args + """ + in_args = set(arg_strings) + d_sp = self._DEFAULT_SUBPARSER + if d_sp is not None and not {"-h", "--help"}.intersection(in_args): + for x_val in self._subparsers._actions: + subparser_found = isinstance( + x_val, argparse._SubParsersAction + ) and in_args.intersection(x_val._name_parser_map.keys()) + if subparser_found: + break + else: + # insert default in first position, this implies no + # global options without a sub_parsers specified + arg_strings = arg_strings + [d_sp] + return super(DefaultSubcommandArgParse, self)._parse_known_args( + arg_strings, *args, **kwargs ) - if subparser_found: - break - else: - # insert default in first position, this implies no - # global options without a sub_parsers specified - arg_strings = arg_strings + [d_sp] - return super(DefaultSubcommandArgParse, self)._parse_known_args( - arg_strings, *args, **kwargs - ) - - def _check_value(self, action, value): - try: - super(DefaultSubcommandArgParse, self)._check_value( - action, value - ) - except ArgumentError as error: - error.message += ("\nERROR: Deepbird is trying to interpret \"{}\" as a value of {}. If this is not what you expected, " - "then most likely one of the following two things are happening: Either one of your cli arguments are not recognized, " - "probably {} or whichever argument you are passing {} as a value to OR you are passing in an argument after " - "the `learning_rate_decay` argument.\n").format(value, action.dest, value, value) - raise error + def _check_value(self, action, value): + try: + super(DefaultSubcommandArgParse, self)._check_value(action, value) + except ArgumentError as error: + error.message += ( + '\nERROR: Deepbird is trying to interpret "{}" as a value of {}. If this is not what you expected, ' + "then most likely one of the following two things are happening: Either one of your cli arguments are not recognized, " + "probably {} or whichever argument you are passing {} as a value to OR you are passing in an argument after " + "the `learning_rate_decay` argument.\n" + ).format(value, action.dest, value, value) + raise error -def parse_comma_separated_list(element_type=str): - """ - Generates an argparse.Action that converts a string representing a comma separated list to a - list and converts each element to a specified type. - """ - # pylint: disable-msg=too-few-public-methods - class _ParseCommaSeparatedList(argparse.Action): +def parse_comma_separated_list(element_type=str): """ - Converts a string representing a comma separated list to a list and converts each element to a - specified type. 
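# Usage sketch for DefaultSubcommandArgParse (defined above): when the
# command line names no decay subcommand, the default is appended, so
# args.learning_rate_decay is always set. The mini-parser here is
# hypothetical:
demo = DefaultSubcommandArgParse()
demo.add_argument("--learning_rate", type=float, default=0.01)
sub = demo.add_subparsers(dest="learning_rate_decay")
sub.add_parser("no_learning_rate_decay")
inverse = sub.add_parser("inverse_learning_rate_decay")
inverse.add_argument("--decay_rate", type=float)
demo.set_default_subparser("no_learning_rate_decay")

assert demo.parse_args([]).learning_rate_decay == "no_learning_rate_decay"
args = demo.parse_args(
    ["--learning_rate", "0.01", "inverse_learning_rate_decay", "--decay_rate", "0.0004"]
)
assert args.learning_rate_decay == "inverse_learning_rate_decay"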
+ Generates an argparse.Action that converts a string representing a comma separated list to a + list and converts each element to a specified type. """ - def __call__(self, parser, namespace, values, option_string=None): - if values is not None: - values = [element_type(v) for v in values.split(',')] - setattr(namespace, self.dest, values) + # pylint: disable-msg=too-few-public-methods + class _ParseCommaSeparatedList(argparse.Action): + """ + Converts a string representing a comma separated list to a list and converts each element to a + specified type. + """ + + def __call__(self, parser, namespace, values, option_string=None): + if values is not None: + values = [element_type(v) for v in values.split(",")] + setattr(namespace, self.dest, values) - return _ParseCommaSeparatedList + return _ParseCommaSeparatedList diff --git a/twml/twml/array.py b/twml/twml/array.py index a8524a06d..bc8377e14 100644 --- a/twml/twml/array.py +++ b/twml/twml/array.py @@ -2,100 +2,104 @@ import ctypes as ct +import numpy as np from absl import logging from libtwml import CLIB -import numpy as np - _NP_TO_TWML_TYPE = { - 'float32': ct.c_int(1), - 'float64': ct.c_int(2), - 'int32': ct.c_int(3), - 'int64': ct.c_int(4), - 'int8': ct.c_int(5), - 'uint8': ct.c_int(6), + "float32": ct.c_int(1), + "float64": ct.c_int(2), + "int32": ct.c_int(3), + "int64": ct.c_int(4), + "int8": ct.c_int(5), + "uint8": ct.c_int(6), } class Array(object): - """ - Wrapper class to allow numpy arrays to work with twml functions. - """ - - def __init__(self, array): - """ - Wraps numpy array and creates a handle that can be passed to C functions from libtwml. - - array: Numpy array - """ - if not isinstance(array, np.ndarray): - raise TypeError("Input must be a numpy array") - - try: - ttype = _NP_TO_TWML_TYPE[array.dtype.name] - except KeyError as err: - logging.error("Unsupported numpy type") - raise err - - handle = ct.c_void_p(0) - ndim = ct.c_int(array.ndim) - dims = array.ctypes.get_shape() - isize = array.dtype.itemsize - - strides_t = ct.c_size_t * array.ndim - strides = strides_t(*[n // isize for n in array.strides]) - - err = CLIB.twml_tensor_create(ct.pointer(handle), - array.ctypes.get_as_parameter(), - ndim, dims, strides, ttype) - - if err != 1000: - raise RuntimeError("Error from libtwml") - - # Store the numpy array to ensure it isn't deleted before self - self._array = array - - self._handle = handle - - self._type = ttype - - @property - def handle(self): - """ - Return the twml handle - """ - return self._handle - - @property - def shape(self): """ - Return the shape + Wrapper class to allow numpy arrays to work with twml functions. """ - return self._array.shape - @property - def ndim(self): - """ - Return the shape - """ - return self._array.ndim - - @property - def array(self): - """ - Return the numpy array - """ - return self._array - - @property - def dtype(self): - """ - Return numpy dtype - """ - return self._array.dtype - - def __del__(self): - """ - Delete the handle - """ - CLIB.twml_tensor_delete(self._handle) + def __init__(self, array): + """ + Wraps numpy array and creates a handle that can be passed to C functions from libtwml. 
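# Usage sketch for parse_comma_separated_list: argparse instantiates the
# returned Action class and calls it with the raw string value:
import argparse

demo = argparse.ArgumentParser()
demo.add_argument(
    "--piecewise_constant_boundaries",
    action=parse_comma_separated_list(element_type=int),
)
ns = demo.parse_args(["--piecewise_constant_boundaries", "100000,110000"])
assert ns.piecewise_constant_boundaries == [100000, 110000]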
+ + array: Numpy array + """ + if not isinstance(array, np.ndarray): + raise TypeError("Input must be a numpy array") + + try: + ttype = _NP_TO_TWML_TYPE[array.dtype.name] + except KeyError as err: + logging.error("Unsupported numpy type") + raise err + + handle = ct.c_void_p(0) + ndim = ct.c_int(array.ndim) + dims = array.ctypes.get_shape() + isize = array.dtype.itemsize + + strides_t = ct.c_size_t * array.ndim + strides = strides_t(*[n // isize for n in array.strides]) + + err = CLIB.twml_tensor_create( + ct.pointer(handle), + array.ctypes.get_as_parameter(), + ndim, + dims, + strides, + ttype, + ) + + if err != 1000: + raise RuntimeError("Error from libtwml") + + # Store the numpy array to ensure it isn't deleted before self + self._array = array + + self._handle = handle + + self._type = ttype + + @property + def handle(self): + """ + Return the twml handle + """ + return self._handle + + @property + def shape(self): + """ + Return the shape + """ + return self._array.shape + + @property + def ndim(self): + """ + Return the shape + """ + return self._array.ndim + + @property + def array(self): + """ + Return the numpy array + """ + return self._array + + @property + def dtype(self): + """ + Return numpy dtype + """ + return self._array.dtype + + def __del__(self): + """ + Delete the handle + """ + CLIB.twml_tensor_delete(self._handle) diff --git a/twml/twml/block_format_writer.py b/twml/twml/block_format_writer.py index 9c4a9b6a8..a33779b1e 100644 --- a/twml/twml/block_format_writer.py +++ b/twml/twml/block_format_writer.py @@ -5,61 +5,61 @@ class BlockFormatWriter(object): - """ - Class to write block format file. - """ + """ + Class to write block format file. + """ - def __init__(self, file_name, records_per_block=100): - file_name = file_name - if not isinstance(file_name, str): - raise ValueError("file_name has to be of type str") + def __init__(self, file_name, records_per_block=100): + file_name = file_name + if not isinstance(file_name, str): + raise ValueError("file_name has to be of type str") - self.file_name = ct.c_char_p(file_name.encode()) - self.records_per_block = ct.c_int(int(records_per_block)) - handle = ct.c_void_p(0) - err = CLIB.block_format_writer_create(ct.pointer(handle), - self.file_name, - self.records_per_block) - self._handle = None - # 1000 means TWML_ERR_NONE - if err != 1000: - raise RuntimeError("Error from libtwml") - self._handle = handle + self.file_name = ct.c_char_p(file_name.encode()) + self.records_per_block = ct.c_int(int(records_per_block)) + handle = ct.c_void_p(0) + err = CLIB.block_format_writer_create( + ct.pointer(handle), self.file_name, self.records_per_block + ) + self._handle = None + # 1000 means TWML_ERR_NONE + if err != 1000: + raise RuntimeError("Error from libtwml") + self._handle = handle - @property - def handle(self): - """ - Return the handle - """ - return self._handle + @property + def handle(self): + """ + Return the handle + """ + return self._handle - def write(self, class_name, record): - """ - Write a record. + def write(self, class_name, record): + """ + Write a record. - Note: `record` needs to be in a format that can be converted to ctypes.c_char_p. - """ - if not isinstance(class_name, str): - raise ValueError("class_name has to be of type str") + Note: `record` needs to be in a format that can be converted to ctypes.c_char_p. 
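# Why Array.__init__ divides strides by itemsize: numpy reports strides in
# bytes, while the element-stride form is what the code above hands to
# twml_tensor_create:
import numpy as np

a = np.zeros((3, 4), dtype=np.float32)  # itemsize == 4 bytes
assert a.strides == (16, 4)  # byte strides (C order)
assert [n // a.dtype.itemsize for n in a.strides] == [4, 1]  # element strides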
+ """ + if not isinstance(class_name, str): + raise ValueError("class_name has to be of type str") - record_len = len(record) - class_name = ct.c_char_p(class_name.encode()) - record = ct.c_char_p(record) - err = CLIB.block_format_write(self._handle, class_name, record, record_len) - if err != 1000: - raise RuntimeError("Error from libtwml") + record_len = len(record) + class_name = ct.c_char_p(class_name.encode()) + record = ct.c_char_p(record) + err = CLIB.block_format_write(self._handle, class_name, record, record_len) + if err != 1000: + raise RuntimeError("Error from libtwml") - def flush(self): - """ - Flush records in buffer to outputfile. - """ - err = CLIB.block_format_flush(self._handle) - if err != 1000: - raise RuntimeError("Error from libtwml") + def flush(self): + """ + Flush records in buffer to outputfile. + """ + err = CLIB.block_format_flush(self._handle) + if err != 1000: + raise RuntimeError("Error from libtwml") - def __del__(self): - """ - Delete the handle - """ - if self._handle: - CLIB.block_format_writer_delete(self._handle) + def __del__(self): + """ + Delete the handle + """ + if self._handle: + CLIB.block_format_writer_delete(self._handle) diff --git a/twml/twml/constants.py b/twml/twml/constants.py index c6c726eed..2888280c3 100644 --- a/twml/twml/constants.py +++ b/twml/twml/constants.py @@ -1,11 +1,11 @@ # These should coincide with 'enum class DecodeMode' values in HashedDataRecordReader.h +from twitter.deepbird.io.legacy.constants import DECODE_MODES # noqa: F401 +from twitter.deepbird.io.legacy.constants import DEFAULT_DECODE_MODE # noqa: F401 from twitter.deepbird.io.legacy.constants import ( - DECODE_MODES, # noqa: F401 - DEFAULT_DECODE_MODE, # noqa: F401 - HASH_FNAME_AND_VALNAME, # noqa: F401 - HASH_VALNAME, # noqa: F401 - HashingDiscretizerOptions, # noqa: F401 - DEFAULT_ZOOKEEPER_BASE_ZNODE, # noqa: F401 - DEFAULT_ZOOKEEPER_HOST, # noqa: F401 -) + DEFAULT_ZOOKEEPER_BASE_ZNODE, +) # noqa: F401 +from twitter.deepbird.io.legacy.constants import DEFAULT_ZOOKEEPER_HOST # noqa: F401 +from twitter.deepbird.io.legacy.constants import HASH_FNAME_AND_VALNAME # noqa: F401 +from twitter.deepbird.io.legacy.constants import HASH_VALNAME # noqa: F401 +from twitter.deepbird.io.legacy.constants import HashingDiscretizerOptions # noqa: F401 diff --git a/twml/twml/contrib/__init__.py b/twml/twml/contrib/__init__.py index 1a5e8efe4..2860971b6 100644 --- a/twml/twml/contrib/__init__.py +++ b/twml/twml/contrib/__init__.py @@ -1,21 +1,21 @@ # pylint: disable=wildcard-import """ experimental and contributed modules """ -from . import layers # noqa: F401 -from . import feature_importances # noqa: F401 -from . import calibrators # noqa: F401 -from . import readers # noqa: F401 -from . import utils # noqa: F401 -from . import build_graphs_fns # noqa: F401 -from . import feature_config # noqa: F401 -from . import parsers # noqa: F401 -from . import initializers # noqa: F401 -from . import export # noqa: F401 -from . import feature_config_parsers # noqa: F401 - # These imports do not work with TF 2.x and are not needed either. # If you are using TF 2.x, use the modular targets under src/python/twitter/deepbird. import tensorflow -from . import trainers # noqa: F401 -from . import metrics # noqa: F401 + +from . import build_graphs_fns # noqa: F401 +from . import calibrators # noqa: F401 +from . import export # noqa: F401 +from . import feature_config # noqa: F401 +from . import feature_config_parsers # noqa: F401 +from . import feature_importances # noqa: F401 from . 
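# The repeated `if err != 1000: raise RuntimeError(...)` checks encode the
# libtwml convention that status code 1000 is TWML_ERR_NONE. A refactoring
# sketch (not something this patch does) that would centralize the check:
TWML_ERR_NONE = 1000

def check_twml_err(err, what="libtwml call"):
    """Raise if a libtwml status code signals an error."""
    if err != TWML_ERR_NONE:
        raise RuntimeError("Error from libtwml during %s (code %d)" % (what, err))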
import hooks # noqa: F401 +from . import initializers # noqa: F401 +from . import layers # noqa: F401 +from . import metrics # noqa: F401 +from . import parsers # noqa: F401 +from . import readers # noqa: F401 +from . import trainers # noqa: F401 +from . import utils # noqa: F401 diff --git a/twml/twml/contrib/build_graphs_fns.py b/twml/twml/contrib/build_graphs_fns.py index 829f61512..1ffa7f4d1 100644 --- a/twml/twml/contrib/build_graphs_fns.py +++ b/twml/twml/contrib/build_graphs_fns.py @@ -1,32 +1,34 @@ # pylint: disable=unused-argument, missing-docstring -''' +""" Common build graphs that can be reused -''' +""" import tensorflow.compat.v1 as tf def get_saved_modules_graph(input_graph_fn): - """ - Get common graph for stitching different saved modules for export. - This graph is used to save checkpoints; and then export the modules - as a unity. - Args: - features: - model features - params: - model params - input_graph_fn: - main logic for the stitching - Returns: - build_graph - """ - def build_graph(features, label, mode, params, config=None): - output = input_graph_fn(features, params) - # If mode is train, we just need to assign a dummy loss - # and update the train op. This is done to save the graph to save_dir. - if mode == 'train': - loss = tf.constant(1) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss} - return output - return build_graph + """ + Get common graph for stitching different saved modules for export. + This graph is used to save checkpoints; and then export the modules + as a unity. + Args: + features: + model features + params: + model params + input_graph_fn: + main logic for the stitching + Returns: + build_graph + """ + + def build_graph(features, label, mode, params, config=None): + output = input_graph_fn(features, params) + # If mode is train, we just need to assign a dummy loss + # and update the train op. This is done to save the graph to save_dir. + if mode == "train": + loss = tf.constant(1) + train_op = tf.assign_add(tf.train.get_global_step(), 1) + return {"train_op": train_op, "loss": loss} + return output + + return build_graph diff --git a/twml/twml/contrib/calibrators/__init__.py b/twml/twml/contrib/calibrators/__init__.py index 02181ed12..925c193e2 100644 --- a/twml/twml/contrib/calibrators/__init__.py +++ b/twml/twml/contrib/calibrators/__init__.py @@ -9,10 +9,15 @@ Ultimately, the ``Calibrator`` should produce an initialized layer via its ``to_layer()`` method. 
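# Usage sketch for get_saved_modules_graph: input_graph_fn carries the real
# stitching logic, and train mode only exists so a checkpoint can be
# written. my_stitch_fn below is hypothetical:
def my_stitch_fn(features, params):
    # combine the outputs of previously saved modules here
    return {"output": features["input"]}

build_graph = get_saved_modules_graph(my_stitch_fn)
# In 'train' mode build_graph returns {'train_op': ..., 'loss': tf.constant(1)};
# in any other mode it returns my_stitch_fn's output unchanged.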
""" -from .common_calibrators import calibrate_discretizer_and_export, add_discretizer_arguments # noqa: F401 from .calibrator import Calibrator # noqa: F401 -from .mdl import MDLCalibrator # noqa: F401 +from .common_calibrators import ( + add_discretizer_arguments, # noqa: F401 + calibrate_discretizer_and_export, +) +from .hashed_percentile_discretizer import ( + HashedPercentileDiscretizerCalibrator, +) # noqa: F401 +from .hashing_discretizer import HashingDiscretizerCalibrator # noqa: F401 from .isotonic import IsotonicCalibrator # noqa: F401 +from .mdl import MDLCalibrator # noqa: F401 from .percentile_discretizer import PercentileDiscretizerCalibrator # noqa: F401 -from .hashed_percentile_discretizer import HashedPercentileDiscretizerCalibrator # noqa: F401 -from .hashing_discretizer import HashingDiscretizerCalibrator # noqa: F401 \ No newline at end of file diff --git a/twml/twml/contrib/calibrators/calibrator.py b/twml/twml/contrib/calibrators/calibrator.py index 7408412e0..2bb7824c4 100644 --- a/twml/twml/contrib/calibrators/calibrator.py +++ b/twml/twml/contrib/calibrators/calibrator.py @@ -1,5 +1,5 @@ # pylint: disable=missing-docstring, unused-argument -''' Contains the base classes for CalibrationFeature and Calibrator ''' +""" Contains the base classes for CalibrationFeature and Calibrator """ from collections import defaultdict @@ -7,151 +7,156 @@ import numpy as np import tensorflow.compat.v1 as tf import tensorflow_hub as hub + import twml import twml.util class CalibrationFeature(object): - ''' - Accumulates values and weights for individual features. - Typically, each unique feature defined in the accumulated SparseTensor or Tensor - would have its own CalibrationFeature instance. - ''' - - def __init__(self, feature_id): - ''' Constructs a CalibrationFeature - - Arguments: - feature_id: - number identifying the feature. - ''' - self.feature_id = feature_id - self._calibrated = False - self._features_dict = defaultdict(list) - - def add_values(self, new_features): - ''' - Extends lists to contain the values in this batch - ''' - for key in new_features: - self._features_dict[key].append(new_features[key]) - - def _concat_arrays(self): - ''' - This class calls this function after you have added all the values. - It creates a dictionary with the concatanated arrays - ''' - self._features_dict.update((k, np.concatenate(v)) for k, v in self._features_dict.items()) - - def calibrate(self, *args, **kwargs): - raise NotImplementedError + """ + Accumulates values and weights for individual features. + Typically, each unique feature defined in the accumulated SparseTensor or Tensor + would have its own CalibrationFeature instance. + """ + + def __init__(self, feature_id): + """Constructs a CalibrationFeature + + Arguments: + feature_id: + number identifying the feature. + """ + self.feature_id = feature_id + self._calibrated = False + self._features_dict = defaultdict(list) + + def add_values(self, new_features): + """ + Extends lists to contain the values in this batch + """ + for key in new_features: + self._features_dict[key].append(new_features[key]) + + def _concat_arrays(self): + """ + This class calls this function after you have added all the values. 
+ It creates a dictionary with the concatanated arrays + """ + self._features_dict.update( + (k, np.concatenate(v)) for k, v in self._features_dict.items() + ) + + def calibrate(self, *args, **kwargs): + raise NotImplementedError class Calibrator(object): - ''' - Accumulates features and their respective values for Calibration - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()`` and; - 2. calibrate by calling ``calibrate()``; - 3. convert to a twml.layers layer by calling ``to_layer()``. - - Note you can only use one calibrator per Trainer. - ''' - - def __init__(self, calibrator_name=None, **kwargs): - ''' - Arguments: - calibrator_name. - Default: if set to None it will be the same as the class name. - Please be reminded that if in the model there are many calibrators - of the same type the calibrator_name should be changed to avoid confusion. - ''' - self._calibrated = False - if calibrator_name is None: - calibrator_name = twml.util.to_snake_case(self.__class__.__name__) - self._calibrator_name = calibrator_name - self._kwargs = kwargs - - @property - def is_calibrated(self): - return self._calibrated - - @property - def name(self): - return self._calibrator_name - - def accumulate(self, *args, **kwargs): - '''Accumulates features and their respective values for Calibration.''' - raise NotImplementedError - - def calibrate(self): - '''Calibrates after the accumulation has ended.''' - self._calibrated = True - - def to_layer(self, name=None): - ''' - Returns a twml.layers.Layer instance with the result of calibrator. - - Arguments: - name: - name-scope of the layer - ''' - raise NotImplementedError - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - - Returns: - dictionary of Layer constructor arguments to initialize the - layer Variables. Typically, this should contain enough information - to initialize empty layer Variables of the correct size, which will then - be filled with the right data using init_map. - ''' - raise NotImplementedError - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory. Default (string): "default". - name: - name for the calibrator. - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - output = calibrator_layer(inputs) - # creates the signature to the calibrator module - hub.add_signature(inputs=inputs, outputs=output, name=name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - def write_summary(self, writer, sess=None): """ - This method is called by save() to write tensorboard summaries to disk. - See MDLCalibrator.write_summary for an example. - By default, the method does nothing. It can be overloaded by child-classes. - - Arguments: - writer: - `tf.summary.FilteWriter - `_ - instance. 
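# The Calibrator lifecycle (accumulate -> calibrate -> to_layer) as a
# minimal hypothetical subclass; real implementations such as
# IsotonicCalibrator and MDLCalibrator follow this shape:
class MeanCalibrator(Calibrator):
    def __init__(self, **kwargs):
        super(MeanCalibrator, self).__init__(**kwargs)
        self._values = []

    def accumulate(self, values):
        self._values.extend(values)

    def calibrate(self):
        self._mean = sum(self._values) / len(self._values)
        super(MeanCalibrator, self).calibrate()  # flips is_calibrated

# c = MeanCalibrator(); c.accumulate([1.0, 2.0]); c.calibrate()
# -> c.is_calibrated is True and c._mean == 1.5; to_layer() would then be
#    implemented to emit a twml.layers layer built from the statistics.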
- The ``writer`` is used to add summaries to event files for inclusion in tensorboard. - sess (optional): - `tf.Session `_ - instance. The ``sess`` is used to produces summaries for the writer. + Accumulates features and their respective values for Calibration + The steps for calibration are typically as follows: + + 1. accumulate feature values from batches by calling ``accumulate()`` and; + 2. calibrate by calling ``calibrate()``; + 3. convert to a twml.layers layer by calling ``to_layer()``. + + Note you can only use one calibrator per Trainer. """ + + def __init__(self, calibrator_name=None, **kwargs): + """ + Arguments: + calibrator_name. + Default: if set to None it will be the same as the class name. + Please be reminded that if in the model there are many calibrators + of the same type the calibrator_name should be changed to avoid confusion. + """ + self._calibrated = False + if calibrator_name is None: + calibrator_name = twml.util.to_snake_case(self.__class__.__name__) + self._calibrator_name = calibrator_name + self._kwargs = kwargs + + @property + def is_calibrated(self): + return self._calibrated + + @property + def name(self): + return self._calibrator_name + + def accumulate(self, *args, **kwargs): + """Accumulates features and their respective values for Calibration.""" + raise NotImplementedError + + def calibrate(self): + """Calibrates after the accumulation has ended.""" + self._calibrated = True + + def to_layer(self, name=None): + """ + Returns a twml.layers.Layer instance with the result of calibrator. + + Arguments: + name: + name-scope of the layer + """ + raise NotImplementedError + + def get_layer_args(self): + """ + Returns layer arguments required to implement multi-phase training. + + Returns: + dictionary of Layer constructor arguments to initialize the + layer Variables. Typically, this should contain enough information + to initialize empty layer Variables of the correct size, which will then + be filled with the right data using init_map. + """ + raise NotImplementedError + + def save(self, save_dir, name="default", verbose=False): + """Save the calibrator into the given save_directory. + Arguments: + save_dir: + name of the saving directory. Default (string): "default". + name: + name for the calibrator. + """ + if not self._calibrated: + raise RuntimeError( + "Expecting prior call to calibrate().Cannot save() prior to calibrate()" + ) + + # This module allows for the calibrator to save be saved as part of + # Tensorflow Hub (this will allow it to be used in further steps) + def calibrator_module(): + # Note that this is usually expecting a sparse_placeholder + inputs = tf.sparse_placeholder(tf.float32) + calibrator_layer = self.to_layer() + output = calibrator_layer(inputs) + # creates the signature to the calibrator module + hub.add_signature(inputs=inputs, outputs=output, name=name) + + # exports the module to the save_dir + spec = hub.create_module_spec(calibrator_module) + with tf.Graph().as_default(): + module = hub.Module(spec) + with tf.Session() as session: + module.export(save_dir, session) + + def write_summary(self, writer, sess=None): + """ + This method is called by save() to write tensorboard summaries to disk. + See MDLCalibrator.write_summary for an example. + By default, the method does nothing. It can be overloaded by child-classes. + + Arguments: + writer: + `tf.summary.FilteWriter + `_ + instance. + The ``writer`` is used to add summaries to event files for inclusion in tensorboard. + sess (optional): + `tf.Session `_ + instance. 
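# The save() method above uses the standard TF1 Hub export recipe; stripped
# to its skeleton (a sketch, assuming module_fn adds exactly one signature):
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

def export_module(module_fn, save_dir):
    spec = hub.create_module_spec(module_fn)
    with tf.Graph().as_default():
        module = hub.Module(spec)
        with tf.Session() as session:
            module.export(save_dir, session)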
The ``sess`` is used to produce summaries for the writer.
+        """
diff --git a/twml/twml/contrib/calibrators/common_calibrators.py b/twml/twml/contrib/calibrators/common_calibrators.py
index 5301901e4..bf967a22e 100644
--- a/twml/twml/contrib/calibrators/common_calibrators.py
+++ b/twml/twml/contrib/calibrators/common_calibrators.py
@@ -13,695 +13,915 @@
 import os
 import time

-from absl import logging
 import tensorflow.compat.v1 as tf
 import tensorflow_hub as hub
+from absl import logging
+
 import twml
 from twml.argument_parser import SortingHelpFormatter
+from twml.contrib.calibrators.isotonic import IsotonicCalibrator
 from twml.input_fns import data_record_input_fn
 from twml.util import list_files_by_datetime, sanitize_hdfs_path
-from twml.contrib.calibrators.isotonic import IsotonicCalibrator


 def calibrator_arguments(parser):
-  """
-  Calibrator Parameters to add to relevant parameters to the DataRecordTrainerParser.
-  Otherwise, if alone in a file, it just creates its own default parser.
-  Arguments:
-    parser:
-      Parser with the options to the model
-  """
-  parser.add_argument("--calibrator.save_dir", type=str,
-                      dest="calibrator_save_dir",
-                      help="Path to save or load calibrator calibration")
-  parser.add_argument("--calibrator_batch_size", type=int, default=128,
-                      dest="calibrator_batch_size",
-                      help="calibrator batch size")
-  parser.add_argument("--calibrator_parts_downsampling_rate", type=float, default=1,
-                      dest="calibrator_parts_downsampling_rate",
-                      help="Parts downsampling rate")
-  parser.add_argument("--calibrator_max_steps", type=int, default=None,
-                      dest="calibrator_max_steps",
-                      help="Max Steps taken by calibrator to accumulate samples")
-  parser.add_argument("--calibrator_num_bins", type=int, default=22,
-                      dest="calibrator_num_bins",
-                      help="Num bins of calibrator")
-  parser.add_argument("--isotonic_calibrator", dest='isotonic_calibrator', action='store_true',
-                      help="Isotonic Calibrator present")
-  parser.add_argument("--calibrator_keep_rate", type=float, default=1.0,
-                      dest="calibrator_keep_rate",
-                      help="Keep rate")
-  return parser
+    """
+    Calibrator parameters to add to the DataRecordTrainerParser.
+    Otherwise, if alone in a file, it just creates its own default parser.
+ Arguments: + parser: + Parser with the options to the model + """ + parser.add_argument( + "--calibrator.save_dir", + type=str, + dest="calibrator_save_dir", + help="Path to save or load calibrator calibration", + ) + parser.add_argument( + "--calibrator_batch_size", + type=int, + default=128, + dest="calibrator_batch_size", + help="calibrator batch size", + ) + parser.add_argument( + "--calibrator_parts_downsampling_rate", + type=float, + default=1, + dest="calibrator_parts_downsampling_rate", + help="Parts downsampling rate", + ) + parser.add_argument( + "--calibrator_max_steps", + type=int, + default=None, + dest="calibrator_max_steps", + help="Max Steps taken by calibrator to accumulate samples", + ) + parser.add_argument( + "--calibrator_num_bins", + type=int, + default=22, + dest="calibrator_num_bins", + help="Num bins of calibrator", + ) + parser.add_argument( + "--isotonic_calibrator", + dest="isotonic_calibrator", + action="store_true", + help="Isotonic Calibrator present", + ) + parser.add_argument( + "--calibrator_keep_rate", + type=float, + default=1.0, + dest="calibrator_keep_rate", + help="Keep rate", + ) + return parser def _generate_files_by_datetime(params): + files = list_files_by_datetime( + base_path=sanitize_hdfs_path(params.train_data_dir), + start_datetime=params.train_start_datetime, + end_datetime=params.train_end_datetime, + datetime_prefix_format=params.datetime_format, + extension="lzo", + parallelism=1, + hour_resolution=params.hour_resolution, + sort=True, + ) - files = list_files_by_datetime( - base_path=sanitize_hdfs_path(params.train_data_dir), - start_datetime=params.train_start_datetime, - end_datetime=params.train_end_datetime, - datetime_prefix_format=params.datetime_format, - extension="lzo", - parallelism=1, - hour_resolution=params.hour_resolution, - sort=True) - - return files + return files def get_calibrate_input_fn(parse_fn, params): - """ - Default input function used for the calibrator. - Arguments: - parse_fn: - Parse_fn - params: - Parameters - Returns: - input_fn - """ - - return lambda: data_record_input_fn( - files=_generate_files_by_datetime(params), - batch_size=params.calibrator_batch_size, - parse_fn=parse_fn, - num_threads=1, - repeat=False, - keep_rate=params.calibrator_keep_rate, - parts_downsampling_rate=params.calibrator_parts_downsampling_rate, - shards=None, - shard_index=None, - shuffle=True, - shuffle_files=True, - interleave=True) + """ + Default input function used for the calibrator. + Arguments: + parse_fn: + Parse_fn + params: + Parameters + Returns: + input_fn + """ + + return lambda: data_record_input_fn( + files=_generate_files_by_datetime(params), + batch_size=params.calibrator_batch_size, + parse_fn=parse_fn, + num_threads=1, + repeat=False, + keep_rate=params.calibrator_keep_rate, + parts_downsampling_rate=params.calibrator_parts_downsampling_rate, + shards=None, + shard_index=None, + shuffle=True, + shuffle_files=True, + interleave=True, + ) def get_discretize_input_fn(parse_fn, params): - """ - Default input function used for the calibrator. 
- Arguments: - parse_fn: - Parse_fn - params: - Parameters - Returns: - input_fn - """ - - return lambda: data_record_input_fn( - files=_generate_files_by_datetime(params), - batch_size=params.discretizer_batch_size, - parse_fn=parse_fn, - num_threads=1, - repeat=False, - keep_rate=params.discretizer_keep_rate, - parts_downsampling_rate=params.discretizer_parts_downsampling_rate, - shards=None, - shard_index=None, - shuffle=True, - shuffle_files=True, - interleave=True) + """ + Default input function used for the calibrator. + Arguments: + parse_fn: + Parse_fn + params: + Parameters + Returns: + input_fn + """ + + return lambda: data_record_input_fn( + files=_generate_files_by_datetime(params), + batch_size=params.discretizer_batch_size, + parse_fn=parse_fn, + num_threads=1, + repeat=False, + keep_rate=params.discretizer_keep_rate, + parts_downsampling_rate=params.discretizer_parts_downsampling_rate, + shards=None, + shard_index=None, + shuffle=True, + shuffle_files=True, + interleave=True, + ) def discretizer_arguments(parser=None): - """ - Discretizer Parameters to add to relevant parameters to the DataRecordTrainerParser. - Otherwise, if alone in a file, it just creates its own default parser. - Arguments: - parser: - Parser with the options to the model. Defaults to None - """ - - if parser is None: - parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) - parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") - parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %Y/%m/%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") + """ + Discretizer Parameters to add to relevant parameters to the DataRecordTrainerParser. + Otherwise, if alone in a file, it just creates its own default parser. + Arguments: + parser: + Parser with the options to the model. Defaults to None + """ + + if parser is None: + parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) + parser.add_argument( + "--overwrite_save_dir", + dest="overwrite_save_dir", + action="store_true", + help="Delete the contents of the current save_dir if it exists", + ) + parser.add_argument( + "--train.data_dir", + "--train_data_dir", + type=str, + default=None, + dest="train_data_dir", + help="Path to the training data directory." + "Supports local and HDFS (hdfs://default/ ) paths.", + ) + parser.add_argument( + "--train.start_date", + "--train_start_datetime", + type=str, + default=None, + dest="train_start_datetime", + help="Starting date for training inside the train data dir." + "The start datetime is inclusive." 
+ "e.g. 2019/01/15", + ) + parser.add_argument( + "--train.end_date", + "--train_end_datetime", + type=str, + default=None, + dest="train_end_datetime", + help="Ending date for training inside the train data dir." + "The end datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--datetime_format", + type=str, + default="%Y/%m/%d", + help="Date format for training and evaluation datasets." + "Has to be a format that is understood by python datetime." + "e.g. %Y/%m/%d for 2019/01/15." + "Used only if {train/eval}.{start/end}_date are provided.", + ) + parser.add_argument( + "--hour_resolution", + type=int, + default=None, + help="Specify the hourly resolution of the stored data.", + ) + parser.add_argument( + "--tensorboard_port", + type=int, + default=None, + help="Port for tensorboard to run on.", + ) + parser.add_argument( + "--stats_port", + type=int, + default=None, + help="Port for stats server to run on.", + ) + parser.add_argument( + "--health_port", + type=int, + default=None, + help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." + "Not user-facing as it is set automatically by the twml_cli.", + ) + parser.add_argument( + "--data_spec", + type=str, + default=None, + help="Path to data specification JSON file. This file is used to decode DataRecords", + ) parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") + "--discretizer.save_dir", + type=str, + dest="discretizer_save_dir", + help="Path to save or load discretizer calibration", + ) parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on.") + "--discretizer_batch_size", + type=int, + default=128, + dest="discretizer_batch_size", + help="Discretizer batch size", + ) parser.add_argument( - "--stats_port", type=int, default=None, - help="Port for stats server to run on.") + "--discretizer_keep_rate", + type=float, + default=0.0008, + dest="discretizer_keep_rate", + help="Keep rate", + ) parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." - "Not user-facing as it is set automatically by the twml_cli." + "--discretizer_parts_downsampling_rate", + type=float, + default=0.2, + dest="discretizer_parts_downsampling_rate", + help="Parts downsampling rate", ) parser.add_argument( - "--data_spec", type=str, default=None, - help="Path to data specification JSON file. 
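# How --datetime_format is used when listing training data: it is a
# strftime/strptime pattern applied to each date between train.start_date
# and train.end_date to build per-day path prefixes under train_data_dir
# (the exact layout below is an assumption based on list_files_by_datetime):
import datetime

fmt = "%Y/%m/%d"  # the default above
day = datetime.datetime.strptime("2019/01/15", fmt)
assert day.strftime(fmt) == "2019/01/15"
# e.g. files would then be expected at <train_data_dir>/2019/01/15/*.lzo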
This file is used to decode DataRecords") - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer_batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer_keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer_parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer_max_steps", type=int, default=None, - dest="discretizer_max_steps", - help="Max Steps taken by discretizer to accumulate samples") - return parser + "--discretizer_max_steps", + type=int, + default=None, + dest="discretizer_max_steps", + help="Max Steps taken by discretizer to accumulate samples", + ) + return parser def calibrate(trainer, params, build_graph, input_fn, debug=False): - """ - Calibrate Isotonic Calibration - Arguments: - trainer: - Trainer - params: - Parameters - build_graph: - Build Graph used to be the input to the calibrator - input_fn: - Input Function specified by the user - debug: - Defaults to False. Returns the calibrator - """ - - if trainer._estimator.config.is_chief: - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.calibrator_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.calibrator_save_dir) - tf.io.gfile.rmtree(params.calibrator_save_dir) - - calibrator = IsotonicCalibrator(params.calibrator_num_bins) - - # chief trains discretizer - logging.info("Chief training calibrator") - - # Accumulate the features for each calibrator - features, labels = input_fn() - if 'weights' not in features: - raise ValueError("Weights need to be returned as part of the parse_fn") - weights = features.pop('weights') - - preds = build_graph(features=features, label=None, mode='infer', params=params, config=None) - init = tf.global_variables_initializer() - table_init = tf.tables_initializer() - with tf.Session() as sess: - sess.run(init) - sess.run(table_init) - count = 0 - max_steps = params.calibrator_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - weights_vals, labels_vals, preds_vals = sess.run([weights, labels, preds['output']]) - calibrator.accumulate(preds_vals, labels_vals, weights_vals.flatten()) - except tf.errors.OutOfRangeError: - break - count += 1 - - calibrator.calibrate() - calibrator.save(params.calibrator_save_dir) - trainer.estimator._params.isotonic_calibrator = True - - if debug: - return calibrator - - else: - calibrator_save_dir = twml.util.sanitize_hdfs_path(params.calibrator_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(calibrator_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % calibrator_save_dir) - time.sleep(60) + """ + Calibrate Isotonic Calibration + Arguments: + trainer: + Trainer + params: + Parameters + build_graph: + Build Graph used to be the input to the calibrator + input_fn: + Input Function specified by the user + debug: + Defaults to False. 
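# The chief/worker handshake used by calibrate() (and by discretize() below),
# in isolation: the chief exports a TF-Hub module, and every worker polls for
# its tfhub_module.pb marker before proceeding:
import os
import time

import tensorflow.compat.v1 as tf

def wait_for_module(save_dir, poll_secs=60):
    marker = save_dir + os.path.sep + "tfhub_module.pb"
    while not tf.io.gfile.exists(marker):
        time.sleep(poll_secs)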
Returns the calibrator + """ + + if trainer._estimator.config.is_chief: + # overwrite the current save_dir + if params.overwrite_save_dir and tf.io.gfile.exists(params.calibrator_save_dir): + logging.info( + "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" + % params.calibrator_save_dir + ) + tf.io.gfile.rmtree(params.calibrator_save_dir) + + calibrator = IsotonicCalibrator(params.calibrator_num_bins) + + # chief trains discretizer + logging.info("Chief training calibrator") + + # Accumulate the features for each calibrator + features, labels = input_fn() + if "weights" not in features: + raise ValueError("Weights need to be returned as part of the parse_fn") + weights = features.pop("weights") + + preds = build_graph( + features=features, label=None, mode="infer", params=params, config=None + ) + init = tf.global_variables_initializer() + table_init = tf.tables_initializer() + with tf.Session() as sess: + sess.run(init) + sess.run(table_init) + count = 0 + max_steps = params.calibrator_max_steps or -1 + while max_steps <= 0 or count <= max_steps: + try: + weights_vals, labels_vals, preds_vals = sess.run( + [weights, labels, preds["output"]] + ) + calibrator.accumulate( + preds_vals, labels_vals, weights_vals.flatten() + ) + except tf.errors.OutOfRangeError: + break + count += 1 + + calibrator.calibrate() + calibrator.save(params.calibrator_save_dir) + trainer.estimator._params.isotonic_calibrator = True + + if debug: + return calibrator + + else: + calibrator_save_dir = twml.util.sanitize_hdfs_path(params.calibrator_save_dir) + # workers wait for calibration to be ready + while not tf.io.gfile.exists( + calibrator_save_dir + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % calibrator_save_dir) + time.sleep(60) def discretize(params, feature_config, input_fn, debug=False): - """ - Discretizes continuous features - Arguments: - params: - Parameters - input_fn: - Input Function specified by the user - debug: - Defaults to False. 


def discretize(params, feature_config, input_fn, debug=False):
-  """
-  Discretizes continuous features
-  Arguments:
-    params:
-      Parameters
-    input_fn:
-      Input Function specified by the user
-    debug:
-      Defaults to False. Returns the calibrator
-  """
-
-  if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or
-      params.num_workers is None):
-
-    # overwrite the current save_dir
-    if params.overwrite_save_dir and tf.io.gfile.exists(params.discretizer_save_dir):
-      logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)"
-                   % params.discretizer_save_dir)
-      tf.io.gfile.rmtree(params.discretizer_save_dir)
-
-    config_map = feature_config()
-    discretize_dict = config_map['discretize_config']
-
-    # chief trains discretizer
-    logging.info("Chief training discretizer")
-
-    batch = input_fn()
-    # Accumulate the features for each calibrator
-    with tf.Session() as sess:
-      count = 0
-      max_steps = params.discretizer_max_steps or -1
-      while max_steps <= 0 or count <= max_steps:
-        try:
-          inputs = sess.run(batch)
-          for name, clbrt in discretize_dict.items():
-            clbrt.accumulate_features(inputs[0], name)
-        except tf.errors.OutOfRangeError:
-          break
-        count += 1
-
-    # This module allows for the calibrator to save be saved as part of
-    # Tensorflow Hub (this will allow it to be used in further steps)
-    def calibrator_module():
-      # Note that this is usually expecting a sparse_placeholder
-      for name, clbrt in discretize_dict.items():
-        clbrt.calibrate()
-        clbrt.add_hub_signatures(name)
-
-    # exports the module to the save_dir
-    spec = hub.create_module_spec(calibrator_module)
-    with tf.Graph().as_default():
-      module = hub.Module(spec)
-      with tf.Session() as session:
-        module.export(params.discretizer_save_dir, session)
-
-    for name, clbrt in discretize_dict.items():
-      clbrt.write_summary_json(params.discretizer_save_dir, name)
-
-    if debug:
-      return discretize_dict
-
-  else:
-    # wait for the file to be removed (if necessary)
-    # should be removed after an actual fix applied
-    time.sleep(60)
-    discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir)
-    # workers wait for calibration to be ready
-    while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"):
-      logging.info("Worker waiting for calibration at %s" % discretizer_save_dir)
-      time.sleep(60)
+    """
+    Discretizes continuous features
+    Arguments:
+        params:
+            Parameters
+        feature_config:
+            Callable returning the feature config map; its 'discretize_config'
+            entry maps feature names to discretizer calibrators
+        input_fn:
+            Input function specified by the user
+        debug:
+            Defaults to False. If True, returns the dict of discretizer calibrators.
+    """
+
+    if (
+        os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief"
+        or "num_workers" not in params
+        or params.num_workers is None
+    ):
+        # overwrite the current save_dir
+        if params.overwrite_save_dir and tf.io.gfile.exists(
+            params.discretizer_save_dir
+        ):
+            logging.info(
+                "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)"
+                % params.discretizer_save_dir
+            )
+            tf.io.gfile.rmtree(params.discretizer_save_dir)
+
+        config_map = feature_config()
+        discretize_dict = config_map["discretize_config"]
+
+        # chief trains discretizer
+        logging.info("Chief training discretizer")
+
+        batch = input_fn()
+        # Accumulate the features for each calibrator
+        with tf.Session() as sess:
+            count = 0
+            max_steps = params.discretizer_max_steps or -1
+            while max_steps <= 0 or count <= max_steps:
+                try:
+                    inputs = sess.run(batch)
+                    for name, clbrt in discretize_dict.items():
+                        clbrt.accumulate_features(inputs[0], name)
+                except tf.errors.OutOfRangeError:
+                    break
+                count += 1
+
+        # This module allows for the calibrator to be saved as part of
+        # Tensorflow Hub (this will allow it to be used in further steps)
+        def calibrator_module():
+            # Note that this is usually expecting a sparse_placeholder
+            for name, clbrt in discretize_dict.items():
+                clbrt.calibrate()
+                clbrt.add_hub_signatures(name)
+
+        # exports the module to the save_dir
+        spec = hub.create_module_spec(calibrator_module)
+        with tf.Graph().as_default():
+            module = hub.Module(spec)
+            with tf.Session() as session:
+                module.export(params.discretizer_save_dir, session)
+
+        for name, clbrt in discretize_dict.items():
+            clbrt.write_summary_json(params.discretizer_save_dir, name)
+
+        if debug:
+            return discretize_dict
+
+    else:
+        # wait for any stale file to be removed (if necessary);
+        # this sleep should be removed once a proper fix is applied
+        time.sleep(60)
+        discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir)
+        # workers wait for calibration to be ready
+        while not tf.io.gfile.exists(
+            discretizer_save_dir + os.path.sep + "tfhub_module.pb"
+        ):
+            logging.info("Worker waiting for calibration at %s" % discretizer_save_dir)
+            time.sleep(60)


def add_discretizer_arguments(parser):
-  """
-  Add discretizer-specific command-line arguments to a Trainer parser.
-
-  Arguments:
-    parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser
-
-  Returns:
-    argparse.ArgumentParser instance with discretizer-specific arguments added
-  """
-
-  parser.add_argument("--discretizer.save_dir", type=str,
-                      dest="discretizer_save_dir",
-                      help="Path to save or load discretizer calibration")
-  parser.add_argument("--discretizer.batch_size", type=int, default=128,
-                      dest="discretizer_batch_size",
-                      help="Discretizer batch size")
-  parser.add_argument("--discretizer.keep_rate", type=float, default=0.0008,
-                      dest="discretizer_keep_rate",
-                      help="Keep rate")
-  parser.add_argument("--discretizer.parts_downsampling_rate", type=float, default=0.2,
-                      dest="discretizer_parts_downsampling_rate",
-                      help="Parts downsampling rate")
-  parser.add_argument("--discretizer.num_bins", type=int, default=20,
-                      dest="discretizer_num_bins",
-                      help="Number of bins per feature")
-  parser.add_argument("--discretizer.output_size_bits", type=int, default=22,
-                      dest="discretizer_output_size_bits",
-                      help="Number of bits allocated to the output size")
-  return parser
+    """
+    Add discretizer-specific command-line arguments to a Trainer parser.
+ + Arguments: + parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser + + Returns: + argparse.ArgumentParser instance with discretizer-specific arguments added + """ + + parser.add_argument( + "--discretizer.save_dir", + type=str, + dest="discretizer_save_dir", + help="Path to save or load discretizer calibration", + ) + parser.add_argument( + "--discretizer.batch_size", + type=int, + default=128, + dest="discretizer_batch_size", + help="Discretizer batch size", + ) + parser.add_argument( + "--discretizer.keep_rate", + type=float, + default=0.0008, + dest="discretizer_keep_rate", + help="Keep rate", + ) + parser.add_argument( + "--discretizer.parts_downsampling_rate", + type=float, + default=0.2, + dest="discretizer_parts_downsampling_rate", + help="Parts downsampling rate", + ) + parser.add_argument( + "--discretizer.num_bins", + type=int, + default=20, + dest="discretizer_num_bins", + help="Number of bins per feature", + ) + parser.add_argument( + "--discretizer.output_size_bits", + type=int, + default=22, + dest="discretizer_output_size_bits", + help="Number of bits allocated to the output size", + ) + return parser def add_isotonic_calibrator_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - parser.add_argument("--calibrator.num_bins", type=int, - default=25000, dest="calibrator_num_bins", - help="number of bins for isotonic calibration") - parser.add_argument("--calibrator.parts_downsampling_rate", type=float, default=0.1, - dest="calibrator_parts_downsampling_rate", help="Parts downsampling rate") - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", help="Path to save or load calibrator output") - parser.add_argument("--calibrator.load_tensorflow_module", type=str, default=None, - dest="calibrator_load_tensorflow_module", - help="Location from where to load a pretrained graph from. \ - Typically, this is where the MLP graph is saved") - parser.add_argument("--calibrator.export_mlp_module_name", type=str, default='tf_hub_mlp', - help="Name for loaded hub signature", - dest="export_mlp_module_name") - parser.add_argument("--calibrator.export_isotonic_module_name", - type=str, default="tf_hub_isotonic", - dest="calibrator_export_module_name", - help="export module name") - parser.add_argument("--calibrator.final_evaluation_steps", type=int, - dest="calibrator_final_evaluation_steps", default=None, - help="number of steps for final evaluation") - parser.add_argument("--calibrator.train_steps", type=int, default=-1, - dest="calibrator_train_steps", - help="number of steps for calibration") - parser.add_argument("--calibrator.batch_size", type=int, default=1024, - dest="calibrator_batch_size", - help="Calibrator batch size") - parser.add_argument("--calibrator.is_calibrating", action='store_true', - dest="is_calibrating", - help="Dummy argument to allow running in chief worker") - return parser - - -def calibrate_calibrator_and_export(name, calibrator, build_graph_fn, params, feature_config, - run_eval=True, input_fn=None, metric_fn=None, - export_task_type_overrider=None): - """ - Pre-set `isotonic calibrator` calibrator. - Args: - name: - scope name used for the calibrator - calibrator: - calibrator that will be calibrated and exported. 
- build_graph_fn: - build graph function for the calibrator - params: - params passed to the calibrator - feature_config: - feature config which will be passed to the trainer - export_task_type_overrider: - the task type for exporting the calibrator - if specified, this will override the default export task type in trainer.hub_export(..) - """ - - # create calibrator params - params_c = copy.deepcopy(params) - params_c.data_threads = 1 - params_c.num_workers = 1 - params_c.continue_from_checkpoint = True - params_c.overwrite_save_dir = False - params_c.stats_port = None - - # Automatically load from the saved Tensorflow Hub module if not specified. - if params_c.calibrator_load_tensorflow_module is None: - path_saved_tensorflow_model = os.path.join(params.save_dir, params.export_mlp_module_name) - params_c.calibrator_load_tensorflow_module = path_saved_tensorflow_model - - if "calibrator_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.calibrator_parts_downsampling_rate - if "calibrator_save_dir" in params_c: - params_c.save_dir = params_c.calibrator_save_dir - if "calibrator_batch_size" in params_c: - params_c.train_batch_size = params_c.calibrator_batch_size - params_c.eval_batch_size = params_c.calibrator_batch_size - # TODO: Deprecate this option. It is not actually used. Calibrator - # simply iterates until the end of input_fn. - if "calibrator_train_steps" in params_c: - params_c.train_steps = params_c.calibrator_train_steps - - if metric_fn is None: - metric_fn = twml.metrics.get_multi_binary_class_metric_fn(None) - - # Common Trainer which will also be used by all workers - trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - feature_config=feature_config, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, - metric_fn=metric_fn - ) - - if trainer._estimator.config.is_chief: - - # Chief trains calibrator - logging.info("Chief training calibrator") - - # Disregard hogwild config - os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") - os.environ["TWML_HOGWILD_PORTS"] = "" - - hooks = None - if params_c.calibrator_train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.calibrator_train_steps)] - - def parse_fn(input_x): - fc_parse_fn = feature_config.get_parse_fn() - features, labels = fc_parse_fn(input_x) - features['labels'] = labels - return features, labels - - if input_fn is None: - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - - # Calibrate stage - trainer.estimator._params.mode = 'calibrate' - trainer.calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=params_c.calibrator_train_steps, - hooks=hooks) - - # Save Checkpoint - # We need to train for 1 step, to save the graph to checkpoint. - # This is done just by the chief. 
-    # We need to set the mode to evaluate to save the graph that will be consumed
-    # In the final evaluation
-    trainer.estimator._params.mode = 'evaluate'
-    trainer.train(input_fn=input_fn, steps=1)
-
-    # Restore hogwild setup
-    if os_twml_hogwild_ports is not None:
-      os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports
-  else:
-    # Workers wait for calibration to be ready
-    final_calibrator_path = os.path.join(params_c.calibrator_save_dir,
-                                         params_c.calibrator_export_module_name)
-
-    final_calibrator_path = twml.util.sanitize_hdfs_path(final_calibrator_path)
-
-    while not tf.io.gfile.exists(final_calibrator_path + os.path.sep + "tfhub_module.pb"):
-      logging.info("Worker waiting for calibration at %s" % final_calibrator_path)
-      time.sleep(60)
-
-  # Evaluate stage
-  if run_eval:
-    trainer.estimator._params.mode = 'evaluate'
-    # This will allow the Evaluate method to be run in Hogwild
-    # trainer.estimator._params.continue_from_checkpoint = True
-    trainer.evaluate(name='test', input_fn=input_fn, steps=params_c.calibrator_final_evaluation_steps)
-
-  trainer.hub_export(name=params_c.calibrator_export_module_name,
-                     export_task_type_overrider=export_task_type_overrider,
-                     serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn())
-
-  return trainer
-
-
-def calibrate_discretizer_and_export(name, calibrator, build_graph_fn, params, feature_config):
-  """
-  Pre-set percentile discretizer calibrator.
-  Args:
-    name:
-      scope name used for the calibrator
-    calibrator:
-      calibrator that will be calibrated and exported.
-    build_graph_fn:
-      build graph function for the calibrator
-    params:
-      params passed to the calibrator
-    feature_config:
-      feature config or input_fn which will be passed to the trainer.
-  """
-
-  if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or
-      params.num_workers is None):
-
-    # chief trains discretizer
-    logging.info("Chief training discretizer")
-
-    # disregard hogwild config
-    os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS")
-    os.environ["TWML_HOGWILD_PORTS"] = ""
-
-    # create discretizer params
+    """
+    Add isotonic calibrator-specific command-line arguments to a Trainer parser.
+
+    Arguments:
+        parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser
+
+    Returns:
+        argparse.ArgumentParser instance with isotonic calibrator-specific arguments added
+    """
+    parser.add_argument(
+        "--calibrator.num_bins",
+        type=int,
+        default=25000,
+        dest="calibrator_num_bins",
+        help="number of bins for isotonic calibration",
+    )
+    parser.add_argument(
+        "--calibrator.parts_downsampling_rate",
+        type=float,
+        default=0.1,
+        dest="calibrator_parts_downsampling_rate",
+        help="Parts downsampling rate",
+    )
+    parser.add_argument(
+        "--calibrator.save_dir",
+        type=str,
+        dest="calibrator_save_dir",
+        help="Path to save or load calibrator output",
+    )
+    parser.add_argument(
+        "--calibrator.load_tensorflow_module",
+        type=str,
+        default=None,
+        dest="calibrator_load_tensorflow_module",
+        help="Location from which to load a pretrained graph. 
\ + Typically, this is where the MLP graph is saved", + ) + parser.add_argument( + "--calibrator.export_mlp_module_name", + type=str, + default="tf_hub_mlp", + help="Name for loaded hub signature", + dest="export_mlp_module_name", + ) + parser.add_argument( + "--calibrator.export_isotonic_module_name", + type=str, + default="tf_hub_isotonic", + dest="calibrator_export_module_name", + help="export module name", + ) + parser.add_argument( + "--calibrator.final_evaluation_steps", + type=int, + dest="calibrator_final_evaluation_steps", + default=None, + help="number of steps for final evaluation", + ) + parser.add_argument( + "--calibrator.train_steps", + type=int, + default=-1, + dest="calibrator_train_steps", + help="number of steps for calibration", + ) + parser.add_argument( + "--calibrator.batch_size", + type=int, + default=1024, + dest="calibrator_batch_size", + help="Calibrator batch size", + ) + parser.add_argument( + "--calibrator.is_calibrating", + action="store_true", + dest="is_calibrating", + help="Dummy argument to allow running in chief worker", + ) + return parser + + +def calibrate_calibrator_and_export( + name, + calibrator, + build_graph_fn, + params, + feature_config, + run_eval=True, + input_fn=None, + metric_fn=None, + export_task_type_overrider=None, +): + """ + Pre-set `isotonic calibrator` calibrator. + Args: + name: + scope name used for the calibrator + calibrator: + calibrator that will be calibrated and exported. + build_graph_fn: + build graph function for the calibrator + params: + params passed to the calibrator + feature_config: + feature config which will be passed to the trainer + export_task_type_overrider: + the task type for exporting the calibrator + if specified, this will override the default export task type in trainer.hub_export(..) + """ + + # create calibrator params params_c = copy.deepcopy(params) params_c.data_threads = 1 - params_c.train_steps = -1 - params_c.train_max_steps = None - params_c.eval_steps = -1 params_c.num_workers = 1 - params_c.tensorboard_port = None + params_c.continue_from_checkpoint = True + params_c.overwrite_save_dir = False params_c.stats_port = None - if "discretizer_batch_size" in params_c: - params_c.train_batch_size = params_c.discretizer_batch_size - params_c.eval_batch_size = params_c.discretizer_batch_size - if "discretizer_keep_rate" in params_c: - params_c.train_keep_rate = params_c.discretizer_keep_rate - if "discretizer_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.discretizer_parts_downsampling_rate - if "discretizer_save_dir" in params_c: - params_c.save_dir = params_c.discretizer_save_dir - - # train discretizer + # Automatically load from the saved Tensorflow Hub module if not specified. + if params_c.calibrator_load_tensorflow_module is None: + path_saved_tensorflow_model = os.path.join( + params.save_dir, params.export_mlp_module_name + ) + params_c.calibrator_load_tensorflow_module = path_saved_tensorflow_model + + if "calibrator_parts_downsampling_rate" in params_c: + params_c.train_parts_downsampling_rate = ( + params_c.calibrator_parts_downsampling_rate + ) + if "calibrator_save_dir" in params_c: + params_c.save_dir = params_c.calibrator_save_dir + if "calibrator_batch_size" in params_c: + params_c.train_batch_size = params_c.calibrator_batch_size + params_c.eval_batch_size = params_c.calibrator_batch_size + # TODO: Deprecate this option. It is not actually used. Calibrator + # simply iterates until the end of input_fn. 
+ if "calibrator_train_steps" in params_c: + params_c.train_steps = params_c.calibrator_train_steps + + if metric_fn is None: + metric_fn = twml.metrics.get_multi_binary_class_metric_fn(None) + + # Common Trainer which will also be used by all workers trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, + name=name, + params=params_c, + feature_config=feature_config, + build_graph_fn=build_graph_fn, + save_dir=params_c.save_dir, + metric_fn=metric_fn, ) - if isinstance(feature_config, twml.feature_config.FeatureConfig): - parse_fn = twml.parsers.get_continuous_parse_fn(feature_config) - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - elif callable(feature_config): - input_fn = feature_config + if trainer._estimator.config.is_chief: + # Chief trains calibrator + logging.info("Chief training calibrator") + + # Disregard hogwild config + os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") + os.environ["TWML_HOGWILD_PORTS"] = "" + + hooks = None + if params_c.calibrator_train_steps > 0: + hooks = [twml.hooks.StepProgressHook(params_c.calibrator_train_steps)] + + def parse_fn(input_x): + fc_parse_fn = feature_config.get_parse_fn() + features, labels = fc_parse_fn(input_x) + features["labels"] = labels + return features, labels + + if input_fn is None: + input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) + + # Calibrate stage + trainer.estimator._params.mode = "calibrate" + trainer.calibrate( + calibrator=calibrator, + input_fn=input_fn, + steps=params_c.calibrator_train_steps, + hooks=hooks, + ) + + # Save Checkpoint + # We need to train for 1 step, to save the graph to checkpoint. + # This is done just by the chief. 
+ # We need to set the mode to evaluate to save the graph that will be consumed + # In the final evaluation + trainer.estimator._params.mode = "evaluate" + trainer.train(input_fn=input_fn, steps=1) + + # Restore hogwild setup + if os_twml_hogwild_ports is not None: + os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports else: - got_type = type(feature_config).__name__ - raise ValueError( - "Expecting feature_config to be FeatureConfig or function got %s" % got_type) - - hooks = None - if params_c.train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.train_steps)] - - trainer.calibrate(calibrator=calibrator, input_fn=input_fn, - steps=params_c.train_steps, hooks=hooks) - # restore hogwild setup - if os_twml_hogwild_ports is not None: - os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports - else: - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) + # Workers wait for calibration to be ready + final_calibrator_path = os.path.join( + params_c.calibrator_save_dir, params_c.calibrator_export_module_name + ) + + final_calibrator_path = twml.util.sanitize_hdfs_path(final_calibrator_path) + + while not tf.io.gfile.exists( + final_calibrator_path + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % final_calibrator_path) + time.sleep(60) + + # Evaluate stage + if run_eval: + trainer.estimator._params.mode = "evaluate" + # This will allow the Evaluate method to be run in Hogwild + # trainer.estimator._params.continue_from_checkpoint = True + trainer.evaluate( + name="test", + input_fn=input_fn, + steps=params_c.calibrator_final_evaluation_steps, + ) + + trainer.hub_export( + name=params_c.calibrator_export_module_name, + export_task_type_overrider=export_task_type_overrider, + serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn(), + ) + + return trainer + + +def calibrate_discretizer_and_export( + name, calibrator, build_graph_fn, params, feature_config +): + """ + Pre-set percentile discretizer calibrator. + Args: + name: + scope name used for the calibrator + calibrator: + calibrator that will be calibrated and exported. + build_graph_fn: + build graph function for the calibrator + params: + params passed to the calibrator + feature_config: + feature config or input_fn which will be passed to the trainer. 
+ """ + + if ( + os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" + or "num_workers" not in params + or params.num_workers is None + ): + # chief trains discretizer + logging.info("Chief training discretizer") + + # disregard hogwild config + os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") + os.environ["TWML_HOGWILD_PORTS"] = "" + + # create discretizer params + params_c = copy.deepcopy(params) + params_c.data_threads = 1 + params_c.train_steps = -1 + params_c.train_max_steps = None + params_c.eval_steps = -1 + params_c.num_workers = 1 + params_c.tensorboard_port = None + params_c.stats_port = None + + if "discretizer_batch_size" in params_c: + params_c.train_batch_size = params_c.discretizer_batch_size + params_c.eval_batch_size = params_c.discretizer_batch_size + if "discretizer_keep_rate" in params_c: + params_c.train_keep_rate = params_c.discretizer_keep_rate + if "discretizer_parts_downsampling_rate" in params_c: + params_c.train_parts_downsampling_rate = ( + params_c.discretizer_parts_downsampling_rate + ) + if "discretizer_save_dir" in params_c: + params_c.save_dir = params_c.discretizer_save_dir + + # train discretizer + trainer = twml.trainers.DataRecordTrainer( + name=name, + params=params_c, + build_graph_fn=build_graph_fn, + save_dir=params_c.save_dir, + ) + + if isinstance(feature_config, twml.feature_config.FeatureConfig): + parse_fn = twml.parsers.get_continuous_parse_fn(feature_config) + input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) + elif callable(feature_config): + input_fn = feature_config + else: + got_type = type(feature_config).__name__ + raise ValueError( + "Expecting feature_config to be FeatureConfig or function got %s" + % got_type + ) + + hooks = None + if params_c.train_steps > 0: + hooks = [twml.hooks.StepProgressHook(params_c.train_steps)] + + trainer.calibrate( + calibrator=calibrator, + input_fn=input_fn, + steps=params_c.train_steps, + hooks=hooks, + ) + # restore hogwild setup + if os_twml_hogwild_ports is not None: + os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports + else: + discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) + # workers wait for calibration to be ready + while not tf.io.gfile.exists( + discretizer_save_dir + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) + time.sleep(60) def build_percentile_discretizer_graph(features, label, mode, params, config=None): - """ - Pre-set Percentile Discretizer Build Graph - Follows the same signature as build_graph - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], [1])) - if isinstance(sparse_tf, tf.SparseTensor): - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - elif isinstance(sparse_tf, twml.SparseTensor): - indices = sparse_tf.indices - ids = sparse_tf.ids - - # Return weights, feature_ids, feature_values - weights = tf.gather(params=weights, indices=ids) - feature_ids = indices - feature_values = sparse_tf.values - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'feature_ids': feature_ids, 'feature_values': feature_values, 'weights': weights} + """ + Pre-set Percentile Discretizer Build Graph + Follows the same signature as build_graph + """ + sparse_tf = 
twml.util.convert_to_sparse(features, params.input_size_bits) + weights = tf.reshape(features["weights"], tf.reshape(features["batch_size"], [1])) + if isinstance(sparse_tf, tf.SparseTensor): + indices = sparse_tf.indices[:, 1] + ids = sparse_tf.indices[:, 0] + elif isinstance(sparse_tf, twml.SparseTensor): + indices = sparse_tf.indices + ids = sparse_tf.ids + + # Return weights, feature_ids, feature_values + weights = tf.gather(params=weights, indices=ids) + feature_ids = indices + feature_values = sparse_tf.values + # Update train_op and assign dummy_loss + train_op = tf.assign_add(tf.train.get_global_step(), 1) + loss = tf.constant(1) + if mode == "train": + return {"train_op": train_op, "loss": loss} + return { + "feature_ids": feature_ids, + "feature_values": feature_values, + "weights": weights, + } def isotonic_module(mode, params): - """ - Common Isotonic Calibrator module for Hub Export - """ - inputs = tf.sparse_placeholder(tf.float32, name="sparse_input") - mlp = hub.Module(params.calibrator_load_tensorflow_module) - logits = mlp(inputs, signature=params.export_mlp_module_name) - isotonic_calibrator = hub.Module(params.save_dir) - output = isotonic_calibrator(logits, signature="isotonic_calibrator") - hub.add_signature(inputs={"sparse_input": inputs}, - outputs={"default": output}, - name=params.calibrator_export_module_name) - - -def build_isotonic_graph_from_inputs(inputs, features, label, mode, params, config=None, isotonic_fn=None): - """ - Helper function to build_isotonic_graph - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - """ - if params.mode == 'calibrate': + """ + Common Isotonic Calibrator module for Hub Export + """ + inputs = tf.sparse_placeholder(tf.float32, name="sparse_input") mlp = hub.Module(params.calibrator_load_tensorflow_module) logits = mlp(inputs, signature=params.export_mlp_module_name) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], [1])) - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'predictions': logits, 'targets': features['labels'], 'weights': weights} - else: - if isotonic_fn is None: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_module, mode=mode, params=params) + isotonic_calibrator = hub.Module(params.save_dir) + output = isotonic_calibrator(logits, signature="isotonic_calibrator") + hub.add_signature( + inputs={"sparse_input": inputs}, + outputs={"default": output}, + name=params.calibrator_export_module_name, + ) + + +def build_isotonic_graph_from_inputs( + inputs, features, label, mode, params, config=None, isotonic_fn=None +): + """ + Helper function to build_isotonic_graph + Pre-set Isotonic Calibrator Build Graph + Follows the same signature as build_graph + """ + if params.mode == "calibrate": + mlp = hub.Module(params.calibrator_load_tensorflow_module) + logits = mlp(inputs, signature=params.export_mlp_module_name) + weights = tf.reshape( + features["weights"], tf.reshape(features["batch_size"], [1]) + ) + # Update train_op and assign dummy_loss + train_op = tf.assign_add(tf.train.get_global_step(), 1) + loss = tf.constant(1) + if mode == "train": + return {"train_op": train_op, "loss": loss} + return { + "predictions": logits, + "targets": features["labels"], + "weights": weights, + } else: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_fn, mode=mode, params=params) - output_hub = 
hub.Module(isotonic_spec, - name=params.calibrator_export_module_name) - hub.register_module_for_export(output_hub, params.calibrator_export_module_name) - output = output_hub(inputs, signature=params.calibrator_export_module_name) - output = tf.clip_by_value(output, 0, 1) - loss = tf.reduce_sum(tf.stop_gradient(output)) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss, 'output': output} - - -def build_isotonic_graph(features, label, mode, params, config=None, export_discretizer=True): - """ - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - This assumes that MLP already contains all modules (include percentile - discretizer); if export_discretizer is set - then it does not export the MDL phase. - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - if export_discretizer: - return build_isotonic_graph_from_inputs(sparse_tf, features, label, mode, params, config) - discretizer = hub.Module(params.discretizer_path) - - if params.discretizer_signature is None: - discretizer_signature = "percentile_discretizer_calibrator" - else: - discretizer_signature = params.discretizer_signature - input_sparse = discretizer(sparse_tf, signature=discretizer_signature) - return build_isotonic_graph_from_inputs(input_sparse, features, label, mode, params, config) + if isotonic_fn is None: + isotonic_spec = twml.util.create_module_spec( + mlp_fn=isotonic_module, mode=mode, params=params + ) + else: + isotonic_spec = twml.util.create_module_spec( + mlp_fn=isotonic_fn, mode=mode, params=params + ) + output_hub = hub.Module( + isotonic_spec, name=params.calibrator_export_module_name + ) + hub.register_module_for_export(output_hub, params.calibrator_export_module_name) + output = output_hub(inputs, signature=params.calibrator_export_module_name) + output = tf.clip_by_value(output, 0, 1) + loss = tf.reduce_sum(tf.stop_gradient(output)) + train_op = tf.assign_add(tf.train.get_global_step(), 1) + return {"train_op": train_op, "loss": loss, "output": output} + + +def build_isotonic_graph( + features, label, mode, params, config=None, export_discretizer=True +): + """ + Pre-set Isotonic Calibrator Build Graph + Follows the same signature as build_graph + This assumes that MLP already contains all modules (include percentile + discretizer); if export_discretizer is set + then it does not export the MDL phase. 
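+
+    Example (illustrative sketch, not from the original docs; assumes ``params``
+    carries the ``calibrator_*`` fields defined in this module and
+    ``feature_config`` is a user-supplied feature config)::
+
+        trainer = calibrate_calibrator_and_export(
+            "isotonic",
+            IsotonicCalibrator(params.calibrator_num_bins),
+            build_graph_fn=build_isotonic_graph,
+            params=params,
+            feature_config=feature_config,
+        )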
+ """ + sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) + if export_discretizer: + return build_isotonic_graph_from_inputs( + sparse_tf, features, label, mode, params, config + ) + discretizer = hub.Module(params.discretizer_path) + + if params.discretizer_signature is None: + discretizer_signature = "percentile_discretizer_calibrator" + else: + discretizer_signature = params.discretizer_signature + input_sparse = discretizer(sparse_tf, signature=discretizer_signature) + return build_isotonic_graph_from_inputs( + input_sparse, features, label, mode, params, config + ) diff --git a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py index e14f62303..888dd7170 100644 --- a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py +++ b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py @@ -1,22 +1,27 @@ # pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains HashedPercentileDiscretizerCalibrator used for calibration ''' -from .percentile_discretizer import PercentileDiscretizerCalibrator - +""" Contains HashedPercentileDiscretizerCalibrator used for calibration """ import twml +from .percentile_discretizer import PercentileDiscretizerCalibrator + class HashedPercentileDiscretizerCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for HashedPercentileDiscretizer calibration. - This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's - `to_layer` method returns a HashedPercentileDiscretizer instead. - ''' + """Accumulates features and their respective values for HashedPercentileDiscretizer calibration. + This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's + `to_layer` method returns a HashedPercentileDiscretizer instead. 
+ """ - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - return twml.contrib.layers.HashedPercentileDiscretizer( - n_feature=n_feature, n_bin=self._n_bin, - name=name, out_bits=self._out_bits, - hash_keys=hash_map_keys, hash_values=hash_map_values, - bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(), - feature_offsets=feature_offsets - ) + def _create_discretizer_layer( + self, n_feature, hash_map_keys, hash_map_values, feature_offsets, name + ): + return twml.contrib.layers.HashedPercentileDiscretizer( + n_feature=n_feature, + n_bin=self._n_bin, + name=name, + out_bits=self._out_bits, + hash_keys=hash_map_keys, + hash_values=hash_map_values, + bin_ids=self._bin_ids.flatten(), + bin_values=self._bin_vals.flatten(), + feature_offsets=feature_offsets, + ) diff --git a/twml/twml/contrib/calibrators/hashing_discretizer.py b/twml/twml/contrib/calibrators/hashing_discretizer.py index 965ced934..7a25ec0f9 100644 --- a/twml/twml/contrib/calibrators/hashing_discretizer.py +++ b/twml/twml/contrib/calibrators/hashing_discretizer.py @@ -1,35 +1,37 @@ # pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains HashedPercentileDiscretizerCalibrator used for calibration ''' -from .percentile_discretizer import PercentileDiscretizerCalibrator - +""" Contains HashedPercentileDiscretizerCalibrator used for calibration """ import numpy as np + import twml +from .percentile_discretizer import PercentileDiscretizerCalibrator + class HashingDiscretizerCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for HashingDiscretizer calibration. - This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's - `to_layer` method returns a HashingDiscretizer instead. - ''' + """Accumulates features and their respective values for HashingDiscretizer calibration. + This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's + `to_layer` method returns a HashingDiscretizer instead. + """ - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - # Need to sort hash_map_keys according to hash_map_values - # just in case they're not in order of being put in the dict - # hash_map_values is already 0 through len(hash_map_values)-1 - hash_map_keys = hash_map_keys.flatten() - # why is this float32 in PercentileDiscretizerCalibrator.to_layer ???? - # need int for indexing - hash_map_values = hash_map_values.flatten().astype(np.int32) - feature_ids = np.zeros((len(hash_map_keys),), dtype=np.int64) - for idx in range(len(hash_map_keys)): - feature_ids[hash_map_values[idx]] = hash_map_keys[idx] + def _create_discretizer_layer( + self, n_feature, hash_map_keys, hash_map_values, feature_offsets, name + ): + # Need to sort hash_map_keys according to hash_map_values + # just in case they're not in order of being put in the dict + # hash_map_values is already 0 through len(hash_map_values)-1 + hash_map_keys = hash_map_keys.flatten() + # why is this float32 in PercentileDiscretizerCalibrator.to_layer ???? 
+        # need int for indexing
+        hash_map_values = hash_map_values.flatten().astype(np.int32)
+        feature_ids = np.zeros((len(hash_map_keys),), dtype=np.int64)
+        for idx in range(len(hash_map_keys)):
+            feature_ids[hash_map_values[idx]] = hash_map_keys[idx]

-    return twml.contrib.layers.HashingDiscretizer(
-      feature_ids=feature_ids,
-      bin_vals=self._bin_vals.flatten(),
-      n_bin=self._n_bin + 1,  # (self._n_bin + 1) bin_vals for each feature_id
-      out_bits=self._out_bits,
-      cost_per_unit=500,
-      name=name
-    )
+        return twml.contrib.layers.HashingDiscretizer(
+            feature_ids=feature_ids,
+            bin_vals=self._bin_vals.flatten(),
+            n_bin=self._n_bin + 1,  # (self._n_bin + 1) bin_vals for each feature_id
+            out_bits=self._out_bits,
+            cost_per_unit=500,
+            name=name,
+        )
diff --git a/twml/twml/contrib/calibrators/isotonic.py b/twml/twml/contrib/calibrators/isotonic.py
index d03a75ff8..aaeb422e6 100644
--- a/twml/twml/contrib/calibrators/isotonic.py
+++ b/twml/twml/contrib/calibrators/isotonic.py
@@ -1,317 +1,334 @@
 # pylint: disable=arguments-differ, unused-argument
-''' Contains Isotonic Calibration'''
+""" Contains Isotonic Calibration"""

-from .calibrator import CalibrationFeature, Calibrator
-
-from absl import logging
 import numpy as np
-from sklearn.isotonic import isotonic_regression
 import tensorflow.compat.v1 as tf
 import tensorflow_hub as hub
+from absl import logging
+from sklearn.isotonic import isotonic_regression
+
 import twml
 import twml.layers

+from .calibrator import CalibrationFeature, Calibrator

 DEFAULT_SAMPLE_WEIGHT = 1


def sort_values(inputs, target, weight, ascending=True):
-  '''
-  Sorts arrays based on the first array.
-
-  Arguments:
-    inputs:
-      1D array which will dictate the order which the remainder 2 arrays will be sorted
-    target:
-      1D array
-    weight:
-      1D array
-    ascending:
-      Boolean. If set to True (the default), sorts values in ascending order.
-
-  Returns:
-    sorted inputs:
-      1D array sorted by the order of `ascending`
-    sorted targets:
-      1D array
-    sorted weight:
-      1D array
-  '''
-  # assert that the length of inputs and target are the same
-  if len(inputs) != len(target):
-    raise ValueError('Expecting inputs and target sizes to match')
-  # assert that the length of inputs and weight are the same
-  if len(inputs) != len(weight):
-    raise ValueError('Expecting inputs and weight sizes to match')
-  inds = inputs.argsort()
-  if not ascending:
-    inds = inds[::-1]
-  return inputs[inds], target[inds], weight[inds]
-
-
-class IsotonicFeature(CalibrationFeature):
-  '''
-  IsotonicFeature adds values, weights and targets to each feature and then runs
-  isotonic regression by calling `sklearn.isotonic.isotonic_regression
-  `_
-  '''
-
-  def _get_bin_boundaries(self, n_samples, bins, similar_bins):
    """
-    Calculates the sample indices that define bin boundaries
+    Sorts arrays based on the first array.

    Arguments:
-      n_samples:
-        (int) number of samples
-      bins:
-        (int) number of bins. Needs to be smaller or equal than n_samples.
-      similar_bins:
-        (bool) If True, samples will be distributed in bins of equal size (up to one sample).
-        If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples.
-        Note that equal_bins=False can create a last bins with a very large number of samples.
+        inputs:
+            1D array that dictates the order in which the remaining two arrays will be sorted
+        target:
+            1D array
+        weight:
+            1D array
+        ascending:
+            Boolean. If set to True (the default), sorts values in ascending order.
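+
+    Example (illustrative)::
+
+        >>> sort_values(np.array([3, 1, 2]), np.array([0.0, 1.0, 1.0]), np.ones(3))
+        (array([1, 2, 3]), array([1., 1., 0.]), array([1., 1., 1.]))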
Returns: - (list[int]) List of sample indices defining bin boundaries + sorted inputs: + 1D array sorted by the order of `ascending` + sorted targets: + 1D array + sorted weight: + 1D array """ + # assert that the length of inputs and target are the same + if len(inputs) != len(target): + raise ValueError("Expecting inputs and target sizes to match") + # assert that the length of inputs and weight are the same + if len(inputs) != len(weight): + raise ValueError("Expecting inputs and weight sizes to match") + inds = inputs.argsort() + if not ascending: + inds = inds[::-1] + return inputs[inds], target[inds], weight[inds] - if bins > n_samples: - raise ValueError( - "The number of bins needs to be less than or equal to the number of samples. " - "Currently bins={0} and n_samples={1}.".format(bins, n_samples) - ) - - step = n_samples // bins - - if similar_bins: - # dtype=int will floor the linspace - bin_boundaries = np.linspace(0, n_samples - step, num=bins, dtype=int) - else: - bin_boundaries = range(0, step * bins, step) - - bin_boundaries = np.append(bin_boundaries, n_samples) - - return bin_boundaries - - def calibrate(self, bins, similar_bins=False, debug=False): - '''Calibrates the IsotonicFeature into calibrated weights and bias. - - 1. Sorts the values of the feature class, based on the order of values - 2. Performs isotonic regression using sklearn.isotonic.isotonic_regression - 3. Performs the binning of the samples, in order to obtain the final weight and bias - which will be used for inference - - Note that this method can only be called once. - Arguments: - bins: - number of bins. - similar_bins: - If True, samples will be distributed in bins of equal size (up to one sample). - If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. - Note that equal_bins=False can create a last bins with a very large number of samples. - debug: - Defaults to False. If debug is set to true, output other parameters useful for debugging. 
+class IsotonicFeature(CalibrationFeature):
+    """
+    IsotonicFeature adds values, weights and targets to each feature and then runs
+    isotonic regression by calling `sklearn.isotonic.isotonic_regression
+    `_
+    """

+    def _get_bin_boundaries(self, n_samples, bins, similar_bins):
+        """
+        Calculates the sample indices that define bin boundaries
+
+        Arguments:
+            n_samples:
+                (int) number of samples
+            bins:
+                (int) number of bins. Needs to be smaller than or equal to n_samples.
+            similar_bins:
+                (bool) If True, samples will be distributed in bins of equal size (up to one sample).
+                If False, bins will be filled with step = N_samples//bins, and the last bin will contain all remaining samples.
+                Note that similar_bins=False can create a last bin with a very large number of samples.
+
+        Returns:
+            (list[int]) List of sample indices defining bin boundaries
+        """
+
+        if bins > n_samples:
+            raise ValueError(
+                "The number of bins needs to be less than or equal to the number of samples. "
+                "Currently bins={0} and n_samples={1}.".format(bins, n_samples)
+            )
+
+        step = n_samples // bins
+
+        if similar_bins:
+            # dtype=int will floor the linspace
+            bin_boundaries = np.linspace(0, n_samples - step, num=bins, dtype=int)
+        else:
+            bin_boundaries = range(0, step * bins, step)
+
+        bin_boundaries = np.append(bin_boundaries, n_samples)
+
+        return bin_boundaries
+
+    def calibrate(self, bins, similar_bins=False, debug=False):
+        """Calibrates the IsotonicFeature into calibrated weights and bias.
+
+        1. Sorts the values of the feature class, based on the order of values
+        2. Performs isotonic regression using sklearn.isotonic.isotonic_regression
+        3. Performs the binning of the samples, in order to obtain the final weight and bias
+           which will be used for inference
+
+        Note that this method can only be called once.
+
+        Arguments:
+            bins:
+                number of bins.
+            similar_bins:
+                If True, samples will be distributed in bins of equal size (up to one sample).
+                If False, bins will be filled with step = N_samples//bins, and the last bin will contain all remaining samples.
+                Note that similar_bins=False can create a last bin with a very large number of samples.
+            debug:
+                Defaults to False. If debug is set to true, outputs other parameters useful for debugging.
+
+        Returns:
+            [calibrated weight, calibrated bias]
+        """
+        if self._calibrated:
+            raise RuntimeError("Can only calibrate once")
+        # parse through the dict to obtain the targets, weights and values
+        self._concat_arrays()
+        feature_targets = self._features_dict["targets"]
+        feature_values = self._features_dict["values"]
+        feature_weights = self._features_dict["weights"]
+        srtd_feature_values, srtd_feature_targets, srtd_feature_weights = sort_values(
+            inputs=feature_values, target=feature_targets, weight=feature_weights
+        )
+        calibrated_feature_values = isotonic_regression(
+            srtd_feature_targets, sample_weight=srtd_feature_weights
+        )
+        # create the final outputs for the prediction of each class
+        bpreds = []
+        btargets = []
+        bweights = []
+        rpreds = []
+
+        # Create bin boundaries
+        bin_boundaries = self._get_bin_boundaries(
+            len(calibrated_feature_values), bins, similar_bins=similar_bins
+        )
+
+        for sidx, eidx in zip(bin_boundaries, bin_boundaries[1:]):
+            # separate each one of the arrays based on their respective bins
+            lpreds = srtd_feature_values[int(sidx) : int(eidx)]
+            lrpreds = calibrated_feature_values[int(sidx) : int(eidx)]
+            ltargets = srtd_feature_targets[int(sidx) : int(eidx)]
+            lweights = srtd_feature_weights[int(sidx) : int(eidx)]
+
+            # calculate the outputs (including the bpreds and rpreds)
+            bpreds.append(np.sum(lpreds * lweights) / (np.squeeze(np.sum(lweights))))
+            rpreds.append(np.sum(lrpreds * lweights) / (np.squeeze(np.sum(lweights))))
+            btargets.append(
+                np.sum(ltargets * lweights) / (np.squeeze(np.sum(lweights)))
+            )
+            bweights.append(np.squeeze(np.sum(lweights)))
+        # transposing the bpreds and rpreds which will be used as input to the inference step
+        bpreds = np.asarray(bpreds).T
+        rpreds = np.asarray(rpreds).T
+        btargets = np.asarray(btargets).T
+        bweights = np.asarray(bweights).T
+        # set _calibrated to True, which is necessary to prevent re-calibration
+        self._calibrated = True
+        if debug:
+            return bpreds, rpreds, btargets, bweights
+        return bpreds, rpreds


class IsotonicCalibrator(Calibrator):
-  ''' Accumulates features and their respective values for isotonic calibration.
-  Internally, each feature's values is accumulated via its own isotonicFeature object.
-  The steps for calibration are typically as follows:
+    """Accumulates features and their respective values for isotonic calibration.
+    Internally, each feature's values are accumulated via its own IsotonicFeature object.
+    The steps for calibration are typically as follows:

-  1. accumulate feature values from batches by calling ``accumulate()``;
-  2. calibrate all feature into Isotonic ``bpreds``, ``rpreds`` by calling ``calibrate()``; and
-  3. convert to a ``twml.layers.Isotonic`` layer by calling ``to_layer()``.
- - ''' - - def __init__(self, n_bin, similar_bins=False, **kwargs): - ''' Constructs an isotonicCalibrator instance. - - Arguments: - n_bin: - the number of bins per feature to use for isotonic. - Note that each feature actually maps to ``n_bin+1`` output IDs. - ''' - super(IsotonicCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._similar_bins = similar_bins - self._ys_input = [] - self._xs_input = [] - self._isotonic_feature_dict = {} - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: output of prediction of build_graph for calibrator - ''' - weights = output['weights'] if 'weights' in output else None - return self.accumulate(output['predictions'], output['targets'], weights) + 1. accumulate feature values from batches by calling ``accumulate()``; + 2. calibrate all feature into Isotonic ``bpreds``, ``rpreds`` by calling ``calibrate()``; and + 3. convert to a ``twml.layers.Isotonic`` layer by calling ``to_layer()``. - def accumulate(self, predictions, targets, weights=None): - ''' - Accumulate a single batch of class predictions, class targets and class weights. - These are accumulated until calibrate() is called. - - Arguments: - predictions: - float matrix of class values. Each dimension corresponds to a different class. - Shape is ``[n, d]``, where d is the number of classes. - targets: - float matrix of class targets. Each dimension corresponds to a different class. - Shape ``[n, d]``, where d is the number of classes. - weights: - Defaults to weights of 1. - 1D array containing the weights of each prediction. - ''' - if predictions.shape != targets.shape: - raise ValueError( - 'Expecting predictions.shape == targets.shape, got %s and %s instead' % - (str(predictions.shape), str(targets.shape))) - if weights is not None: - if weights.ndim != 1: - raise ValueError('Expecting 1D weight, got %dD instead' % weights.ndim) - elif weights.size != predictions.shape[0]: - raise ValueError( - 'Expecting predictions.shape[0] == weights.size, got %d != %d instead' % - (predictions.shape[0], weights.size)) - # iterate through the rows of predictions and sets one class to each row - if weights is None: - weights = np.full(predictions.shape[0], fill_value=DEFAULT_SAMPLE_WEIGHT) - for class_key in range(predictions.shape[1]): - # gets the predictions and targets for that class - class_predictions = predictions[:, class_key] - class_targets = targets[:, class_key] - if class_key not in self._isotonic_feature_dict: - isotonic_feature = IsotonicFeature(class_key) - self._isotonic_feature_dict[class_key] = isotonic_feature - else: - isotonic_feature = self._isotonic_feature_dict[class_key] - isotonic_feature.add_values({'values': class_predictions, 'weights': weights, - 'targets': class_targets}) - - def calibrate(self, debug=False): - ''' - Calibrates each IsotonicFeature after accumulation is complete. - Results are stored in ``self._ys_input`` and ``self._xs_input`` - - Arguments: - debug: - Defaults to False. If set to true, returns the ``xs_input`` and ``ys_input``. 
- ''' - super(IsotonicCalibrator, self).calibrate() - bias_temp = [] - weight_temp = [] - logging.info("Beginning isotonic calibration.") - isotonic_features_dict = self._isotonic_feature_dict - for class_id in isotonic_features_dict: - bpreds, rpreds = isotonic_features_dict[class_id].calibrate(bins=self._n_bin, similar_bins=self._similar_bins) - weight_temp.append(bpreds) - bias_temp.append(rpreds) - # save isotonic results onto a matrix - self._xs_input = np.array(weight_temp, dtype=np.float32) - self._ys_input = np.array(bias_temp, dtype=np.float32) - logging.info("Isotonic calibration finished.") - if debug: - return np.array(weight_temp), np.array(bias_temp) - return None - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory. Default (string): "default". - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - logging.info("You probably do not need to save the isotonic layer. \ - So feel free to set save to False in the Trainer. \ - Additionally this only saves the layer not the whole graph.") - - def calibrator_module(): - ''' - Way to save Isotonic layer - ''' - # The input to isotonic is a dense layer - inputs = tf.placeholder(tf.float32) - calibrator_layer = self.to_layer() - output = calibrator_layer(inputs) - # creates the signature to the calibrator module - hub.add_signature(inputs=inputs, outputs=output, name=name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - def to_layer(self): - """ Returns a twml.layers.Isotonic Layer that can be used for feature discretization. """ - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - isotonic_layer = twml.layers.Isotonic( - n_unit=self._xs_input.shape[0], n_bin=self._xs_input.shape[1], - xs_input=self._xs_input, ys_input=self._ys_input, - **self._kwargs) - return isotonic_layer - - def get_layer_args(self, name=None): - """ Returns layer args. See ``Calibrator.get_layer_args`` for more detailed documentation """ - return {'n_unit': self._xs_input.shape[0], 'n_bin': self._xs_input.shape[1]} + def __init__(self, n_bin, similar_bins=False, **kwargs): + """Constructs an isotonicCalibrator instance. + + Arguments: + n_bin: + the number of bins per feature to use for isotonic. + Note that each feature actually maps to ``n_bin+1`` output IDs. + """ + super(IsotonicCalibrator, self).__init__(**kwargs) + self._n_bin = n_bin + self._similar_bins = similar_bins + self._ys_input = [] + self._xs_input = [] + self._isotonic_feature_dict = {} + + def accumulate_feature(self, output): + """ + Wrapper around accumulate for trainer API. + Arguments: + output: output of prediction of build_graph for calibrator + """ + weights = output["weights"] if "weights" in output else None + return self.accumulate(output["predictions"], output["targets"], weights) + + def accumulate(self, predictions, targets, weights=None): + """ + Accumulate a single batch of class predictions, class targets and class weights. + These are accumulated until calibrate() is called. + + Arguments: + predictions: + float matrix of class values. 
Each dimension corresponds to a different class.
+                Shape is ``[n, d]``, where d is the number of classes.
+            targets:
+                float matrix of class targets. Each dimension corresponds to a different class.
+                Shape ``[n, d]``, where d is the number of classes.
+            weights:
+                Defaults to weights of 1.
+                1D array containing the weights of each prediction.
+        """
+        if predictions.shape != targets.shape:
+            raise ValueError(
+                "Expecting predictions.shape == targets.shape, got %s and %s instead"
+                % (str(predictions.shape), str(targets.shape))
+            )
+        if weights is not None:
+            if weights.ndim != 1:
+                raise ValueError("Expecting 1D weight, got %dD instead" % weights.ndim)
+            elif weights.size != predictions.shape[0]:
+                raise ValueError(
+                    "Expecting predictions.shape[0] == weights.size, got %d != %d instead"
+                    % (predictions.shape[0], weights.size)
+                )
+        # iterate over the classes (columns of predictions), one IsotonicFeature per class
+        if weights is None:
+            weights = np.full(predictions.shape[0], fill_value=DEFAULT_SAMPLE_WEIGHT)
+        for class_key in range(predictions.shape[1]):
+            # gets the predictions and targets for that class
+            class_predictions = predictions[:, class_key]
+            class_targets = targets[:, class_key]
+            if class_key not in self._isotonic_feature_dict:
+                isotonic_feature = IsotonicFeature(class_key)
+                self._isotonic_feature_dict[class_key] = isotonic_feature
+            else:
+                isotonic_feature = self._isotonic_feature_dict[class_key]
+            isotonic_feature.add_values(
+                {
+                    "values": class_predictions,
+                    "weights": weights,
+                    "targets": class_targets,
+                }
+            )
+
+    def calibrate(self, debug=False):
+        """
+        Calibrates each IsotonicFeature after accumulation is complete.
+        Results are stored in ``self._ys_input`` and ``self._xs_input``
+
+        Arguments:
+            debug:
+                Defaults to False. If set to true, returns the ``xs_input`` and ``ys_input``.
+        """
+        super(IsotonicCalibrator, self).calibrate()
+        bias_temp = []
+        weight_temp = []
+        logging.info("Beginning isotonic calibration.")
+        isotonic_features_dict = self._isotonic_feature_dict
+        for class_id in isotonic_features_dict:
+            bpreds, rpreds = isotonic_features_dict[class_id].calibrate(
+                bins=self._n_bin, similar_bins=self._similar_bins
+            )
+            weight_temp.append(bpreds)
+            bias_temp.append(rpreds)
+        # save isotonic results onto a matrix
+        self._xs_input = np.array(weight_temp, dtype=np.float32)
+        self._ys_input = np.array(bias_temp, dtype=np.float32)
+        logging.info("Isotonic calibration finished.")
+        if debug:
+            return np.array(weight_temp), np.array(bias_temp)
+        return None
+
+    def save(self, save_dir, name="default", verbose=False):
+        """Save the calibrator into the given save_directory.
+        Arguments:
+            save_dir:
+                name of the saving directory. Default (string): "default".
+        """
+        if not self._calibrated:
+            raise RuntimeError(
+                "Expecting prior call to calibrate(). Cannot save() prior to calibrate()"
+            )
+
+        # This module allows for the calibrator to be saved as part of
+        # Tensorflow Hub (this will allow it to be used in further steps)
+        logging.info(
+            "You probably do not need to save the isotonic layer. \
+            So feel free to set save to False in the Trainer. \
+            Additionally, this only saves the layer, not the whole graph."
+ ) + + def calibrator_module(): + """ + Way to save Isotonic layer + """ + # The input to isotonic is a dense layer + inputs = tf.placeholder(tf.float32) + calibrator_layer = self.to_layer() + output = calibrator_layer(inputs) + # creates the signature to the calibrator module + hub.add_signature(inputs=inputs, outputs=output, name=name) + + # exports the module to the save_dir + spec = hub.create_module_spec(calibrator_module) + with tf.Graph().as_default(): + module = hub.Module(spec) + with tf.Session() as session: + module.export(save_dir, session) + + def to_layer(self): + """Returns a twml.layers.Isotonic Layer that can be used for feature discretization.""" + if not self._calibrated: + raise RuntimeError("Expecting prior call to calibrate()") + + isotonic_layer = twml.layers.Isotonic( + n_unit=self._xs_input.shape[0], + n_bin=self._xs_input.shape[1], + xs_input=self._xs_input, + ys_input=self._ys_input, + **self._kwargs + ) + + return isotonic_layer + + def get_layer_args(self, name=None): + """Returns layer args. See ``Calibrator.get_layer_args`` for more detailed documentation""" + return {"n_unit": self._xs_input.shape[0], "n_bin": self._xs_input.shape[1]} diff --git a/twml/twml/contrib/calibrators/mdl.py b/twml/twml/contrib/calibrators/mdl.py index 0fe3265a4..114d5f3a3 100644 --- a/twml/twml/contrib/calibrators/mdl.py +++ b/twml/twml/contrib/calibrators/mdl.py @@ -1,118 +1,136 @@ # pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains MDLFeature and MDLCalibrator used for MDL calibration ''' +""" Contains MDLFeature and MDLCalibrator used for MDL calibration """ import os -from .percentile_discretizer import PercentileDiscretizerCalibrator, PercentileDiscretizerFeature - -from absl import logging import numpy as np import tensorflow.compat.v1 as tf +from absl import logging + import twml import twml.layers +from .percentile_discretizer import ( + PercentileDiscretizerCalibrator, + PercentileDiscretizerFeature, +) DEFAULT_SAMPLE_WEIGHT = 1 class MDLFeature(PercentileDiscretizerFeature): - ''' Accumulates and calibrates a single sparse MDL feature. ''' + """Accumulates and calibrates a single sparse MDL feature.""" class MDLCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for MDL calibration. - Internally, each feature's values is accumulated via its own ``MDLFeature`` object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into MDL bin_vals by calling ``calibrate()``; and - 3. convert to a twml.layers.MDL layer by calling ``to_layer()``. + """Accumulates features and their respective values for MDL calibration. + Internally, each feature's values is accumulated via its own ``MDLFeature`` object. + The steps for calibration are typically as follows: - ''' + 1. accumulate feature values from batches by calling ``accumulate()``; + 2. calibrate all feature into MDL bin_vals by calling ``calibrate()``; and + 3. convert to a twml.layers.MDL layer by calling ``to_layer()``. - def to_layer(self, name=None): """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. 
- Arguments: - name: - name-scope of the PercentileDiscretizer layer - """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is + def to_layer(self, name=None): + """ + Returns a twml.layers.PercentileDiscretizer Layer + that can be used for feature discretization. + + Arguments: + name: + name-scope of the PercentileDiscretizer layer + """ + n_feature = len(self._discretizer_feature_dict) + max_discretizer_feature = n_feature * (self._n_bin + 1) + + if not self._calibrated: + raise RuntimeError("Expecting prior call to calibrate()") + + if self._bin_ids.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_ids.shape[0] \ + != len(self._discretizer_feature_dict)" + ) + if self._bin_vals.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_vals.shape[0] \ + != len(self._discretizer_feature_dict)" + ) + + # can add at most #features * (n_bin+1) new feature ids + if 2**self._out_bits <= max_discretizer_feature: + raise ValueError( + """Maximum number of features created by discretizer is %d but requested that the output be limited to %d values (%d bits), which is smaller than that. Please ensure the output has enough bits to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = twml.layers.MDL( - n_feature=n_feature, n_bin=self._n_bin, - name=name, out_bits=self._out_bits, - hash_keys=hash_map_keys, hash_values=hash_map_values, - bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(), - feature_offsets=feature_offsets, - **self._kwargs - ) - - return discretizer - - def save(self, save_dir, name='calibrator', verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory - name: - name for the graph scope. Passed to to_layer(name=name) to set - scope of layer. 
- ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - layer_args = self.get_layer_args() - - calibrator_filename = os.path.join(save_dir, name + '.json.tf') - calibrator_dict = { - 'layer_args': layer_args, - 'saved_layer_scope': name + '/', - } - twml.write_file(calibrator_filename, calibrator_dict, encode='json') - - if verbose: - logging.info("The layer graph and other information necessary ") - logging.info("for multi-phase training is saved in directory:") - logging.info(save_dir) - logging.info("This directory can be specified as --init_from_dir argument.") - logging.info("") - logging.info("Other information is available in: %s.json.tf", name) - logging.info("This file can be loaded with twml.read_file(decode='json) to obtain ") - logging.info("layer_args, saved_layer_scope and variable_names") - - graph = tf.Graph() - # save graph for tensorboard as well - writer = tf.summary.FileWriter(logdir=save_dir, graph=graph) - - with tf.Session(graph=graph) as sess: - self.write_summary(writer, sess) - writer.flush() + % (max_discretizer_feature, 2**self._out_bits, self._out_bits) + ) + + # build feature_offsets, hash_map_keys, hash_map_values + feature_offsets = np.arange( + 0, max_discretizer_feature, self._n_bin + 1, dtype="int64" + ) + hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) + hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) + + discretizer = twml.layers.MDL( + n_feature=n_feature, + n_bin=self._n_bin, + name=name, + out_bits=self._out_bits, + hash_keys=hash_map_keys, + hash_values=hash_map_values, + bin_ids=self._bin_ids.flatten(), + bin_values=self._bin_vals.flatten(), + feature_offsets=feature_offsets, + **self._kwargs + ) + + return discretizer + + def save(self, save_dir, name="calibrator", verbose=False): + """Save the calibrator into the given save_directory. + Arguments: + save_dir: + name of the saving directory + name: + name for the graph scope. Passed to to_layer(name=name) to set + scope of layer. 
+ """ + if not self._calibrated: + raise RuntimeError( + "Expecting prior call to calibrate().Cannot save() prior to calibrate()" + ) + + layer_args = self.get_layer_args() + + calibrator_filename = os.path.join(save_dir, name + ".json.tf") + calibrator_dict = { + "layer_args": layer_args, + "saved_layer_scope": name + "/", + } + twml.write_file(calibrator_filename, calibrator_dict, encode="json") + + if verbose: + logging.info("The layer graph and other information necessary ") + logging.info("for multi-phase training is saved in directory:") + logging.info(save_dir) + logging.info("This directory can be specified as --init_from_dir argument.") + logging.info("") + logging.info("Other information is available in: %s.json.tf", name) + logging.info( + "This file can be loaded with twml.read_file(decode='json) to obtain " + ) + logging.info("layer_args, saved_layer_scope and variable_names") + + graph = tf.Graph() + # save graph for tensorboard as well + writer = tf.summary.FileWriter(logdir=save_dir, graph=graph) + + with tf.Session(graph=graph) as sess: + self.write_summary(writer, sess) + writer.flush() diff --git a/twml/twml/contrib/calibrators/percentile_discretizer.py b/twml/twml/contrib/calibrators/percentile_discretizer.py index eefce62c2..f9bc420c5 100644 --- a/twml/twml/contrib/calibrators/percentile_discretizer.py +++ b/twml/twml/contrib/calibrators/percentile_discretizer.py @@ -1,577 +1,623 @@ # pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains PercentileDiscretizerFeature and PercentileDiscretizerCalibrator used \ - for PercentileDiscretizer calibration ''' +""" Contains PercentileDiscretizerFeature and PercentileDiscretizerCalibrator used \ + for PercentileDiscretizer calibration """ - -from .calibrator import CalibrationFeature, Calibrator - import os + import numpy as np import tensorflow.compat.v1 as tf import tensorflow_hub as hub + import twml import twml.layers +from .calibrator import CalibrationFeature, Calibrator DEFAULT_SAMPLE_WEIGHT = 1 class PercentileDiscretizerFeature(CalibrationFeature): - ''' Accumulates and calibrates a single sparse PercentileDiscretizer feature. ''' - - @staticmethod - def _gather_debug_info(values, indices, bin_vals, bin_counts_buffer): - ''' - Determine how many training values fell into a given bin during calibration. - This is calculated by finding the index of the first appearance of each bin - boundary in values (values may repeat, so that isn't trivially in indices.) - Subtracting each bin boundary index from the next tells you how many values fall in - that bin. - To get this to calculate the last bin correctly, len(values) is appended to the - list of bound indices. - - This assumes that ``bin_vals`` excludes np.inf bin boundaries when - PercentileDiscretizer was calibrated - with fewer values than bins. - - Arguments: - values: - 1D ndarray of the PercentileDiscretizerFeature's accumulated values, sorted ascending - indices: - 1D int32 ndarray of the indices (in values) of the bin boundaries - bin_vals: - 1D ndarray containing the bin boundaries - bin_counts_buffer: - ndarray buffer for returning the PercentileDiscretizer histogram - ''' - # np.flatnonzero(np.diff(x)) gives you the indices i in x s.t. 
x[i] != x[i+1] - # append index of the last bin since that cannot be empty with how - # PercentileDiscretizer is implemented - nonempty_bins = np.append(np.flatnonzero(np.diff(bin_vals)), len(bin_vals) - 1) - bin_start_indices = indices.take(nonempty_bins) - - # if multiples of a bin's lower bound value exist, find the first one - for (i, idx) in enumerate(bin_start_indices): - cur_idx = idx - while cur_idx > 0 and values[cur_idx] == values[cur_idx - 1]: - bin_start_indices[i] = cur_idx = cur_idx - 1 - - # the end of each bin is the start of the next bin, - # until the last, which is the end of the array - # broadcast the counts to the nonempty bins, 0 otherwise - bin_counts_buffer[:] = 0 - bin_counts_buffer[nonempty_bins] = np.diff(np.append(bin_start_indices, values.size)) - - def calibrate( - self, - bin_vals, percentiles, percentile_indices, - bin_counts_buffer=None): - '''Calibrates the PercentileDiscretizerFeature into bin values for - use in PercentileDiscretizerCalibrator. - Note that this method can only be called once. - - Arguments: - bin_vals: - Row in the PercentileDiscretizerCalibrator.bin_vals matrix corresponding to this feature. - Will be updated with the results of the calibration. - A 1D ndarray. - percentiles: - 1D array of size n_bin with values ranging from 0 to 1. - For example, ``percentiles = np.linspace(0, 1, num=self._n_bin+1, dtype=np.float32)`` - percentile_indices: - Empty 1D array of size n_bin used to store intermediate results when - calling twml.twml_optim_nearest_interpolation(). - For example, np.empty(self._n_bin + 1, dtype=np.float32). - bin_counts_buffer: - optional ndarray buffer used for retaining count of values per PercentileDiscretizer - bucket (for debug and feature exploration purposes) - - Returns: - calibrated bin_vals for use by ``PercentileDiscretizerCalibrator`` - ''' - if self._calibrated: - raise RuntimeError("Can only calibrate once") - if bin_vals.ndim != 1: - raise RuntimeError("Expecting bin_vals row") - - # # concatenate values and weights buffers - self._concat_arrays() - feature_values = self._features_dict['values'] - feature_weights = self._features_dict['weights'] - - # get features ready for the bins, order array indices by feature values. 
- indices = np.argsort(feature_values) - - # get ordered values and weights using array indices - values = feature_values.take(indices) - weights = feature_weights.take(indices) - - # Normalizes the sum of weights to be between 0 and 1 - weights = np.cumsum(weights, out=feature_weights) - weights -= weights[0] - if weights[-1] > 0: # prevent zero-division - weights /= weights[-1] - - # Check if we have less values than bin_vals - if values.size < bin_vals.size: - # Fills all the bins with a value that won't ever be reached - bin_vals.fill(np.inf) - # Forces the first to be -inf - bin_vals[0] = -np.inf - # Copies the values as boundaries - bin_vals[1:values.size + 1] = values - - if bin_counts_buffer is not None: - # slice out bins with +/-np.inf boundary -- their count will be zero anyway - # we can't just assume all other bins will have 1 value since there can be dups - short_indices = np.arange(values.size, dtype=np.int32) - bin_counts_buffer.fill(0) - self._gather_debug_info( - values, short_indices, bin_vals[1:values.size + 1], - bin_counts_buffer[1:values.size + 1]) - - else: - # Gets the indices for the values that define the boundary for the bins - indices_float = np.arange(0, weights.size, dtype=np.float32) - - # Gets things in the correct shape for the linear interpolation - weights = weights.reshape(1, weights.size) - indices_float = indices_float.reshape(1, weights.size) - - # wrap ndarrays into twml.Array - percentiles_tarray = twml.Array(percentiles.reshape(percentiles.size, 1)) - weights_tarray = twml.Array(weights) - indices_float_tarray = twml.Array(indices_float) - percentile_indices_tarray = twml.Array(percentile_indices.reshape(percentiles.size, 1)) - - # Performs the binary search to find the indices corresponding to the percentiles - err = twml.CLIB.twml_optim_nearest_interpolation( - percentile_indices_tarray.handle, percentiles_tarray.handle, # output, input - weights_tarray.handle, indices_float_tarray.handle # xs, ys - ) - if err != 1000: - raise ValueError("""twml.CLIB.twml_optim_nearest_interpolation - caught an error (see previous stdout). Error code: """ % err) - - indices = indices[:bin_vals.size] - indices[:] = percentile_indices - indices[0] = 0 - indices[-1] = weights.size - 1 - - # Gets the values at those indices and copies them into bin_vals - values.take(indices, out=bin_vals) - - # get # of values per bucket - if bin_counts_buffer is not None: - self._gather_debug_info(values, indices, bin_vals, bin_counts_buffer) - - self._calibrated = True + """Accumulates and calibrates a single sparse PercentileDiscretizer feature.""" + + @staticmethod + def _gather_debug_info(values, indices, bin_vals, bin_counts_buffer): + """ + Determine how many training values fell into a given bin during calibration. + This is calculated by finding the index of the first appearance of each bin + boundary in values (values may repeat, so that isn't trivially in indices.) + Subtracting each bin boundary index from the next tells you how many values fall in + that bin. + To get this to calculate the last bin correctly, len(values) is appended to the + list of bound indices. + + This assumes that ``bin_vals`` excludes np.inf bin boundaries when + PercentileDiscretizer was calibrated + with fewer values than bins. 
+ + Arguments: + values: + 1D ndarray of the PercentileDiscretizerFeature's accumulated values, sorted ascending + indices: + 1D int32 ndarray of the indices (in values) of the bin boundaries + bin_vals: + 1D ndarray containing the bin boundaries + bin_counts_buffer: + ndarray buffer for returning the PercentileDiscretizer histogram + """ + # np.flatnonzero(np.diff(x)) gives you the indices i in x s.t. x[i] != x[i+1] + # append index of the last bin since that cannot be empty with how + # PercentileDiscretizer is implemented + nonempty_bins = np.append(np.flatnonzero(np.diff(bin_vals)), len(bin_vals) - 1) + bin_start_indices = indices.take(nonempty_bins) + + # if multiples of a bin's lower bound value exist, find the first one + for i, idx in enumerate(bin_start_indices): + cur_idx = idx + while cur_idx > 0 and values[cur_idx] == values[cur_idx - 1]: + bin_start_indices[i] = cur_idx = cur_idx - 1 + + # the end of each bin is the start of the next bin, + # until the last, which is the end of the array + # broadcast the counts to the nonempty bins, 0 otherwise + bin_counts_buffer[:] = 0 + bin_counts_buffer[nonempty_bins] = np.diff( + np.append(bin_start_indices, values.size) + ) + + def calibrate( + self, bin_vals, percentiles, percentile_indices, bin_counts_buffer=None + ): + """Calibrates the PercentileDiscretizerFeature into bin values for + use in PercentileDiscretizerCalibrator. + Note that this method can only be called once. + + Arguments: + bin_vals: + Row in the PercentileDiscretizerCalibrator.bin_vals matrix corresponding to this feature. + Will be updated with the results of the calibration. + A 1D ndarray. + percentiles: + 1D array of size n_bin with values ranging from 0 to 1. + For example, ``percentiles = np.linspace(0, 1, num=self._n_bin+1, dtype=np.float32)`` + percentile_indices: + Empty 1D array of size n_bin used to store intermediate results when + calling twml.twml_optim_nearest_interpolation(). + For example, np.empty(self._n_bin + 1, dtype=np.float32). + bin_counts_buffer: + optional ndarray buffer used for retaining count of values per PercentileDiscretizer + bucket (for debug and feature exploration purposes) + + Returns: + calibrated bin_vals for use by ``PercentileDiscretizerCalibrator`` + """ + if self._calibrated: + raise RuntimeError("Can only calibrate once") + if bin_vals.ndim != 1: + raise RuntimeError("Expecting bin_vals row") + + # # concatenate values and weights buffers + self._concat_arrays() + feature_values = self._features_dict["values"] + feature_weights = self._features_dict["weights"] + + # get features ready for the bins, order array indices by feature values. 
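# (Illustration, not part of the diff: a minimal NumPy sketch of the
# weighted-percentile binning that the lines below implement. The toy inputs
# and the np.searchsorted lookup are assumptions for clarity; the real code
# resolves percentile indices via twml.CLIB.twml_optim_nearest_interpolation.)
import numpy as np

toy_values = np.array([0.5, 0.1, 0.9, 0.3], dtype=np.float32)
toy_weights = np.array([1.0, 1.0, 2.0, 1.0], dtype=np.float32)
n_bin = 2

order = np.argsort(toy_values)               # sort indices by feature value
sorted_vals = toy_values.take(order)
cdf = np.cumsum(toy_weights.take(order))
cdf = (cdf - cdf[0]) / (cdf[-1] - cdf[0])    # normalized weight CDF in [0, 1]

percentiles = np.linspace(0, 1, num=n_bin + 1, dtype=np.float32)
idx = np.searchsorted(cdf, percentiles).clip(max=sorted_vals.size - 1)
bin_boundaries = sorted_vals.take(idx)       # one value boundary per percentile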
+ indices = np.argsort(feature_values) + + # get ordered values and weights using array indices + values = feature_values.take(indices) + weights = feature_weights.take(indices) + + # Normalizes the sum of weights to be between 0 and 1 + weights = np.cumsum(weights, out=feature_weights) + weights -= weights[0] + if weights[-1] > 0: # prevent zero-division + weights /= weights[-1] + + # Check if we have less values than bin_vals + if values.size < bin_vals.size: + # Fills all the bins with a value that won't ever be reached + bin_vals.fill(np.inf) + # Forces the first to be -inf + bin_vals[0] = -np.inf + # Copies the values as boundaries + bin_vals[1 : values.size + 1] = values + + if bin_counts_buffer is not None: + # slice out bins with +/-np.inf boundary -- their count will be zero anyway + # we can't just assume all other bins will have 1 value since there can be dups + short_indices = np.arange(values.size, dtype=np.int32) + bin_counts_buffer.fill(0) + self._gather_debug_info( + values, + short_indices, + bin_vals[1 : values.size + 1], + bin_counts_buffer[1 : values.size + 1], + ) + + else: + # Gets the indices for the values that define the boundary for the bins + indices_float = np.arange(0, weights.size, dtype=np.float32) + + # Gets things in the correct shape for the linear interpolation + weights = weights.reshape(1, weights.size) + indices_float = indices_float.reshape(1, weights.size) + + # wrap ndarrays into twml.Array + percentiles_tarray = twml.Array(percentiles.reshape(percentiles.size, 1)) + weights_tarray = twml.Array(weights) + indices_float_tarray = twml.Array(indices_float) + percentile_indices_tarray = twml.Array( + percentile_indices.reshape(percentiles.size, 1) + ) + + # Performs the binary search to find the indices corresponding to the percentiles + err = twml.CLIB.twml_optim_nearest_interpolation( + percentile_indices_tarray.handle, + percentiles_tarray.handle, # output, input + weights_tarray.handle, + indices_float_tarray.handle, # xs, ys + ) + if err != 1000: + raise ValueError( + """twml.CLIB.twml_optim_nearest_interpolation + caught an error (see previous stdout). Error code: """ + % err + ) + + indices = indices[: bin_vals.size] + indices[:] = percentile_indices + indices[0] = 0 + indices[-1] = weights.size - 1 + + # Gets the values at those indices and copies them into bin_vals + values.take(indices, out=bin_vals) + + # get # of values per bucket + if bin_counts_buffer is not None: + self._gather_debug_info(values, indices, bin_vals, bin_counts_buffer) + + self._calibrated = True class PercentileDiscretizerCalibrator(Calibrator): - ''' Accumulates features and their respective values for PercentileDiscretizer calibration. - Internally, each feature's values is accumulated via its own - ``PercentileDiscretizerFeature`` object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into PercentileDiscretizer bin_vals by calling ``calibrate()``; and - 3. convert to a twml.layers.PercentileDiscretizer layer by calling ``to_layer()``. - - ''' - - def __init__(self, n_bin, out_bits, bin_histogram=True, - allow_empty_calibration=False, **kwargs): - ''' Constructs an PercentileDiscretizerCalibrator instance. - - Arguments: - n_bin: - the number of bins per feature to use for PercentileDiscretizer. - Note that each feature actually maps to n_bin+1 output IDs. - out_bits: - The maximum number of bits to use for the output IDs. 
- 2**out_bits must be greater than bin_ids.size or an error is raised. - bin_histogram: - When True (the default), gathers information during calibration - to build a bin_histogram. - allow_empty_calibration: - allows operation where we might not calibrate any features. - Default False to error out if no features were calibrated. - Typically, values of uncalibrated features pass through discretizers - untouched (though the feature ids will be truncated to obey out_bits). - ''' - super(PercentileDiscretizerCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._out_bits = out_bits - - self._bin_ids = None - self._bin_vals = np.empty(0, dtype=np.float32) # Note changed from 64 (v1) to 32 (v2) - - self._bin_histogram = bin_histogram - self._bin_histogram_dict = None - - self._hash_map_counter = 0 - self._hash_map = {} - - self._discretizer_feature_dict = {} - self._allow_empty_calibration = allow_empty_calibration - - @property - def bin_ids(self): - ''' - Gets bin_ids - ''' - return self._bin_ids - - @property - def bin_vals(self): - ''' - Gets bin_vals - ''' - return self._bin_vals - - @property - def hash_map(self): - ''' - Gets hash_map - ''' - return self._hash_map - - @property - def discretizer_feature_dict(self): - ''' - Gets feature_dict - ''' - return self._discretizer_feature_dict - - def accumulate_features(self, inputs, name): - ''' - Wrapper around accumulate for PercentileDiscretizer. - Arguments: - inputs: - batch that will be accumulated - name: - name of the tensor that will be accumulated - - ''' - sparse_tf = inputs[name] - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - weights = np.take(inputs["weights"], ids) - return self.accumulate(indices, sparse_tf.values, weights) - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: - output of prediction of build_graph for calibrator - ''' - return self.accumulate(output['feature_ids'], output['feature_values'], output['weights']) - - def accumulate(self, feature_keys, feature_vals, weights=None): - '''Accumulate a single batch of feature keys, values and weights. - - These are accumulate until ``calibrate()`` is called. - - Arguments: - feature_keys: - 1D int64 array of feature keys. - feature_vals: - 1D float array of feature values. Each element of this array - maps to the commensurate element in ``feature_keys``. - weights: - Defaults to weights of 1. - 1D array containing the weights of each feature key, value pair. - Typically, this is the weight of each sample (but you still need - to provide one weight per key,value pair). - Each element of this array maps to the commensurate element in feature_keys. 
- ''' - if feature_keys.ndim != 1: - raise ValueError('Expecting 1D feature_keys, got %dD' % feature_keys.ndim) - if feature_vals.ndim != 1: - raise ValueError('Expecting 1D feature_values, got %dD' % feature_vals.ndim) - if feature_vals.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == feature_values.size, got %d != %d' % - (feature_keys.size, feature_vals.size)) - if weights is not None: - weights = np.squeeze(weights) - if weights.ndim != 1: - raise ValueError('Expecting 1D weights, got %dD' % weights.ndim) - elif weights.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == weights.size, got %d != %d' % - (feature_keys.size, weights.size)) - if weights is None: - weights = np.full(feature_vals.size, fill_value=DEFAULT_SAMPLE_WEIGHT) - unique_keys = np.unique(feature_keys) - for feature_id in unique_keys: - idx = np.where(feature_keys == feature_id) - if feature_id not in self._discretizer_feature_dict: - self._hash_map[feature_id] = self._hash_map_counter - # unlike v1, the hash_map_counter is incremented AFTER assignment. - # This makes the hash_map features zero-indexed: 0, 1, 2 instead of 1, 2, 3 - self._hash_map_counter += 1 - # creates a new cache if we never saw the feature before - discretizer_feature = PercentileDiscretizerFeature(feature_id) - self._discretizer_feature_dict[feature_id] = discretizer_feature - else: - discretizer_feature = self._discretizer_feature_dict[feature_id] - discretizer_feature.add_values({'values': feature_vals[idx], 'weights': weights[idx]}) - - def calibrate(self, debug=False): - ''' - Calibrates each PercentileDiscretizer feature after accumulation is complete. - - Arguments: - debug: - Boolean to request debug info be returned by the method. - (see Returns section below) - - The calibration results are stored in two matrices: - bin_ids: - 2D array of size number of accumulate ``features x n_bin+1``. - Contains the new IDs generated by PercentileDiscretizer. Each row maps to a feature. - Each row maps to different value bins. The IDs - are in the range ``1 -> bin_ids.size+1`` - bin_vals: - 2D array of the same size as bin_ids. - Each row maps to a feature. Each row contains the bin boundaries. - These boundaries represent feature values. - - Returns: - if debug is True, the method returns - - - 1D int64 array of feature_ids - - 2D float32 array copy of bin_vals (the bin boundaries) for each feature - - 2D int64 array of bin counts corresponding to the bin boundaries - - ''' - n_feature = len(self._discretizer_feature_dict) - if n_feature == 0 and not self._allow_empty_calibration: - raise RuntimeError("Need to accumulate some features for calibration\n" - "Likely, the calibration data is empty. 
This can\n" - "happen if the dataset is small, or if the following\n" - "cli args are set too low:\n" - " --discretizer_keep_rate (default=0.0008)\n" - " --discretizer_parts_downsampling_rate (default=0.2)\n" - "Consider increasing the values of these args.\n" - "To allow empty calibration data (and degenerate discretizer),\n" - "use the allow_empty_calibration input of the constructor.") - - self._bin_ids = np.arange(1, n_feature * (self._n_bin + 1) + 1) - self._bin_ids = self._bin_ids.reshape(n_feature, self._n_bin + 1) - - self._bin_vals.resize(n_feature, self._n_bin + 1) - - # buffers shared by PercentileDiscretizerFeature.calibrate() - percentile_indices = np.empty(self._n_bin + 1, dtype=np.float32) - - # Tensor from 0 to 1 in the number of steps provided - percentiles = np.linspace(0, 1, num=self._n_bin + 1, dtype=np.float32) - - if debug or self._bin_histogram: - debug_feature_ids = np.empty(n_feature, dtype=np.int64) - bin_counts = np.empty((n_feature, self._n_bin + 1), dtype=np.int64) - - # progress bar for calibration phase - progress_bar = tf.keras.utils.Progbar(n_feature) - - discretizer_features_dict = self._discretizer_feature_dict - for i, feature_id in enumerate(discretizer_features_dict): - if debug or self._bin_histogram: - debug_feature_ids[self._hash_map[feature_id]] = feature_id - bin_counts_buffer = bin_counts[self._hash_map[feature_id]] - else: - bin_counts_buffer = None - - # calibrate each PercentileDiscretizer feature (puts results in bin_vals) - discretizer_features_dict[feature_id].calibrate( - self._bin_vals[self._hash_map[feature_id]], # Gets feature-values - percentiles, percentile_indices, - bin_counts_buffer=bin_counts_buffer - ) - - # update progress bar 20 times - if (i % max(1.0, round(n_feature / 20)) == 0) or (i == n_feature - 1): - progress_bar.update(i + 1) - - super(PercentileDiscretizerCalibrator, self).calibrate() - - if self._bin_histogram: - # save bin histogram data for later - self._bin_histogram_dict = { - 'feature_ids': debug_feature_ids, - 'bin_counts': bin_counts, - 'bin_vals': self._bin_vals, - 'out_bits': self._out_bits, - } - - if debug: - return debug_feature_ids, self._bin_vals.copy(), bin_counts - - return None - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - return twml.layers.PercentileDiscretizer( - n_feature=n_feature, - n_bin=self._n_bin, - out_bits=self._out_bits, - bin_values=self._bin_vals.flatten(), - hash_keys=hash_map_keys, - hash_values=hash_map_values.astype(np.int64), - bin_ids=self._bin_ids.flatten().astype(np.int64), - feature_offsets=feature_offsets, - name=name, - **self._kwargs - ) - - def to_layer(self, name=None): - """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. + """Accumulates features and their respective values for PercentileDiscretizer calibration. + Internally, each feature's values is accumulated via its own + ``PercentileDiscretizerFeature`` object. + The steps for calibration are typically as follows: + + 1. accumulate feature values from batches by calling ``accumulate()``; + 2. calibrate all feature into PercentileDiscretizer bin_vals by calling ``calibrate()``; and + 3. convert to a twml.layers.PercentileDiscretizer layer by calling ``to_layer()``. 
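# (Illustration, not part of the diff: a hedged end-to-end sketch of the three
# steps listed above, using toy keys/values and illustrative n_bin/out_bits.)
import numpy as np

calib = PercentileDiscretizerCalibrator(n_bin=4, out_bits=22)
for _ in range(10):
    keys = np.array([1001, 1001, 1002], dtype=np.int64)  # step 1: accumulate batches
    vals = np.random.rand(3).astype(np.float32)
    calib.accumulate(keys, vals)       # weights default to 1 per (key, value) pair
calib.calibrate()                      # step 2: fills bin_ids / bin_vals
discretizer = calib.to_layer(name="percentile_discretizer")  # step 3: twml layer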
- Arguments: - name: - name-scope of the PercentileDiscretizer layer """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is + + def __init__( + self, + n_bin, + out_bits, + bin_histogram=True, + allow_empty_calibration=False, + **kwargs + ): + """Constructs an PercentileDiscretizerCalibrator instance. + + Arguments: + n_bin: + the number of bins per feature to use for PercentileDiscretizer. + Note that each feature actually maps to n_bin+1 output IDs. + out_bits: + The maximum number of bits to use for the output IDs. + 2**out_bits must be greater than bin_ids.size or an error is raised. + bin_histogram: + When True (the default), gathers information during calibration + to build a bin_histogram. + allow_empty_calibration: + allows operation where we might not calibrate any features. + Default False to error out if no features were calibrated. + Typically, values of uncalibrated features pass through discretizers + untouched (though the feature ids will be truncated to obey out_bits). + """ + super(PercentileDiscretizerCalibrator, self).__init__(**kwargs) + self._n_bin = n_bin + self._out_bits = out_bits + + self._bin_ids = None + self._bin_vals = np.empty( + 0, dtype=np.float32 + ) # Note changed from 64 (v1) to 32 (v2) + + self._bin_histogram = bin_histogram + self._bin_histogram_dict = None + + self._hash_map_counter = 0 + self._hash_map = {} + + self._discretizer_feature_dict = {} + self._allow_empty_calibration = allow_empty_calibration + + @property + def bin_ids(self): + """ + Gets bin_ids + """ + return self._bin_ids + + @property + def bin_vals(self): + """ + Gets bin_vals + """ + return self._bin_vals + + @property + def hash_map(self): + """ + Gets hash_map + """ + return self._hash_map + + @property + def discretizer_feature_dict(self): + """ + Gets feature_dict + """ + return self._discretizer_feature_dict + + def accumulate_features(self, inputs, name): + """ + Wrapper around accumulate for PercentileDiscretizer. + Arguments: + inputs: + batch that will be accumulated + name: + name of the tensor that will be accumulated + + """ + sparse_tf = inputs[name] + indices = sparse_tf.indices[:, 1] + ids = sparse_tf.indices[:, 0] + weights = np.take(inputs["weights"], ids) + return self.accumulate(indices, sparse_tf.values, weights) + + def accumulate_feature(self, output): + """ + Wrapper around accumulate for trainer API. + Arguments: + output: + output of prediction of build_graph for calibrator + """ + return self.accumulate( + output["feature_ids"], output["feature_values"], output["weights"] + ) + + def accumulate(self, feature_keys, feature_vals, weights=None): + """Accumulate a single batch of feature keys, values and weights. + + These are accumulate until ``calibrate()`` is called. + + Arguments: + feature_keys: + 1D int64 array of feature keys. + feature_vals: + 1D float array of feature values. 
Each element of this array + maps to the commensurate element in ``feature_keys``. + weights: + Defaults to weights of 1. + 1D array containing the weights of each feature key, value pair. + Typically, this is the weight of each sample (but you still need + to provide one weight per key,value pair). + Each element of this array maps to the commensurate element in feature_keys. + """ + if feature_keys.ndim != 1: + raise ValueError("Expecting 1D feature_keys, got %dD" % feature_keys.ndim) + if feature_vals.ndim != 1: + raise ValueError("Expecting 1D feature_values, got %dD" % feature_vals.ndim) + if feature_vals.size != feature_keys.size: + raise ValueError( + "Expecting feature_keys.size == feature_values.size, got %d != %d" + % (feature_keys.size, feature_vals.size) + ) + if weights is not None: + weights = np.squeeze(weights) + if weights.ndim != 1: + raise ValueError("Expecting 1D weights, got %dD" % weights.ndim) + elif weights.size != feature_keys.size: + raise ValueError( + "Expecting feature_keys.size == weights.size, got %d != %d" + % (feature_keys.size, weights.size) + ) + if weights is None: + weights = np.full(feature_vals.size, fill_value=DEFAULT_SAMPLE_WEIGHT) + unique_keys = np.unique(feature_keys) + for feature_id in unique_keys: + idx = np.where(feature_keys == feature_id) + if feature_id not in self._discretizer_feature_dict: + self._hash_map[feature_id] = self._hash_map_counter + # unlike v1, the hash_map_counter is incremented AFTER assignment. + # This makes the hash_map features zero-indexed: 0, 1, 2 instead of 1, 2, 3 + self._hash_map_counter += 1 + # creates a new cache if we never saw the feature before + discretizer_feature = PercentileDiscretizerFeature(feature_id) + self._discretizer_feature_dict[feature_id] = discretizer_feature + else: + discretizer_feature = self._discretizer_feature_dict[feature_id] + discretizer_feature.add_values( + {"values": feature_vals[idx], "weights": weights[idx]} + ) + + def calibrate(self, debug=False): + """ + Calibrates each PercentileDiscretizer feature after accumulation is complete. + + Arguments: + debug: + Boolean to request debug info be returned by the method. + (see Returns section below) + + The calibration results are stored in two matrices: + bin_ids: + 2D array of size number of accumulate ``features x n_bin+1``. + Contains the new IDs generated by PercentileDiscretizer. Each row maps to a feature. + Each row maps to different value bins. The IDs + are in the range ``1 -> bin_ids.size+1`` + bin_vals: + 2D array of the same size as bin_ids. + Each row maps to a feature. Each row contains the bin boundaries. + These boundaries represent feature values. + + Returns: + if debug is True, the method returns + + - 1D int64 array of feature_ids + - 2D float32 array copy of bin_vals (the bin boundaries) for each feature + - 2D int64 array of bin counts corresponding to the bin boundaries + + """ + n_feature = len(self._discretizer_feature_dict) + if n_feature == 0 and not self._allow_empty_calibration: + raise RuntimeError( + "Need to accumulate some features for calibration\n" + "Likely, the calibration data is empty. This can\n" + "happen if the dataset is small, or if the following\n" + "cli args are set too low:\n" + " --discretizer_keep_rate (default=0.0008)\n" + " --discretizer_parts_downsampling_rate (default=0.2)\n" + "Consider increasing the values of these args.\n" + "To allow empty calibration data (and degenerate discretizer),\n" + "use the allow_empty_calibration input of the constructor." 
+ ) + + self._bin_ids = np.arange(1, n_feature * (self._n_bin + 1) + 1) + self._bin_ids = self._bin_ids.reshape(n_feature, self._n_bin + 1) + + self._bin_vals.resize(n_feature, self._n_bin + 1) + + # buffers shared by PercentileDiscretizerFeature.calibrate() + percentile_indices = np.empty(self._n_bin + 1, dtype=np.float32) + + # Tensor from 0 to 1 in the number of steps provided + percentiles = np.linspace(0, 1, num=self._n_bin + 1, dtype=np.float32) + + if debug or self._bin_histogram: + debug_feature_ids = np.empty(n_feature, dtype=np.int64) + bin_counts = np.empty((n_feature, self._n_bin + 1), dtype=np.int64) + + # progress bar for calibration phase + progress_bar = tf.keras.utils.Progbar(n_feature) + + discretizer_features_dict = self._discretizer_feature_dict + for i, feature_id in enumerate(discretizer_features_dict): + if debug or self._bin_histogram: + debug_feature_ids[self._hash_map[feature_id]] = feature_id + bin_counts_buffer = bin_counts[self._hash_map[feature_id]] + else: + bin_counts_buffer = None + + # calibrate each PercentileDiscretizer feature (puts results in bin_vals) + discretizer_features_dict[feature_id].calibrate( + self._bin_vals[self._hash_map[feature_id]], # Gets feature-values + percentiles, + percentile_indices, + bin_counts_buffer=bin_counts_buffer, + ) + + # update progress bar 20 times + if (i % max(1.0, round(n_feature / 20)) == 0) or (i == n_feature - 1): + progress_bar.update(i + 1) + + super(PercentileDiscretizerCalibrator, self).calibrate() + + if self._bin_histogram: + # save bin histogram data for later + self._bin_histogram_dict = { + "feature_ids": debug_feature_ids, + "bin_counts": bin_counts, + "bin_vals": self._bin_vals, + "out_bits": self._out_bits, + } + + if debug: + return debug_feature_ids, self._bin_vals.copy(), bin_counts + + return None + + def _create_discretizer_layer( + self, n_feature, hash_map_keys, hash_map_values, feature_offsets, name + ): + return twml.layers.PercentileDiscretizer( + n_feature=n_feature, + n_bin=self._n_bin, + out_bits=self._out_bits, + bin_values=self._bin_vals.flatten(), + hash_keys=hash_map_keys, + hash_values=hash_map_values.astype(np.int64), + bin_ids=self._bin_ids.flatten().astype(np.int64), + feature_offsets=feature_offsets, + name=name, + **self._kwargs + ) + + def to_layer(self, name=None): + """ + Returns a twml.layers.PercentileDiscretizer Layer + that can be used for feature discretization. + + Arguments: + name: + name-scope of the PercentileDiscretizer layer + """ + n_feature = len(self._discretizer_feature_dict) + max_discretizer_feature = n_feature * (self._n_bin + 1) + + if not self._calibrated: + raise RuntimeError("Expecting prior call to calibrate()") + + if self._bin_ids.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_ids.shape[0] \ + != len(self._discretizer_feature_dict)" + ) + if self._bin_vals.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_vals.shape[0] \ + != len(self._discretizer_feature_dict)" + ) + + # can add at most #features * (n_bin+1) new feature ids + if 2**self._out_bits <= max_discretizer_feature: + raise ValueError( + """Maximum number of features created by discretizer is %d but requested that the output be limited to %d values (%d bits), which is smaller than that. 
Please ensure the output has enough bits to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = self._create_discretizer_layer(n_feature, hash_map_keys, - hash_map_values, feature_offsets, name) - - return discretizer - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - See twml.calibrator.Calibrator.get_layer_args for more detailed documentation. - ''' - layer_args = { - 'n_feature': len(self._discretizer_feature_dict), - 'n_bin': self._n_bin, - 'out_bits': self._out_bits, - } - - return layer_args - - def add_hub_signatures(self, name): - """ - Add Hub Signatures for each calibrator - - Arguments: - name: - Calibrator name - """ - sparse_tf = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - hub.add_signature( - inputs=sparse_tf, - outputs=calibrator_layer(sparse_tf, keep_inputs=False), - name=name) - - def write_summary(self, writer, sess=None): - """ - This method is called by save() to write a histogram of - PercentileDiscretizer feature bins to disk. A histogram is included for each - feature. - - Arguments: - writer: - tf.summary.FilteWriter instance. - used to add summaries to event files for inclusion in tensorboard. - sess: - tf.Session instance. Used to produces summaries for the writer. - """ - bin_counts_ph = tf.placeholder(tf.int64) - bin_counts = self._bin_histogram_dict['bin_counts'] - - # Record that distribution into a histogram summary - histo = tf.summary.histogram("discretizer_feature_bin_counts", bin_counts_ph) - for i in range(bin_counts.shape[0]): - bin_counts_summary = sess.run(histo, feed_dict={bin_counts_ph: bin_counts[i]}) - writer.add_summary(bin_counts_summary, global_step=i) - - def write_summary_json(self, save_dir, name="default"): - """ - Export bin information to HDFS. - - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". - """ - # Since the size is small: (# of bins) * (# of features), we always dump the file. - discretizer_export_bin_filename = os.path.join(save_dir, name + '_bin.json') - discretizer_export_bin_dict = { - 'feature_ids': self._bin_histogram_dict['feature_ids'].tolist(), - 'bin_boundaries': self._bin_histogram_dict['bin_vals'].tolist(), - 'output_bits': self._bin_histogram_dict['out_bits'] - } - twml.write_file(discretizer_export_bin_filename, discretizer_export_bin_dict, encode='json') - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory using TF Hub. - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". 
- ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - # creates the signature to the calibrator module - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=False), - name=name) - # and another signature for keep_inputs mode - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=True), - name=name + '_keep_inputs') - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - self.write_summary_json(save_dir, name) + % (max_discretizer_feature, 2**self._out_bits, self._out_bits) + ) + + # build feature_offsets, hash_map_keys, hash_map_values + feature_offsets = np.arange( + 0, max_discretizer_feature, self._n_bin + 1, dtype="int64" + ) + hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) + hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) + + discretizer = self._create_discretizer_layer( + n_feature, hash_map_keys, hash_map_values, feature_offsets, name + ) + + return discretizer + + def get_layer_args(self): + """ + Returns layer arguments required to implement multi-phase training. + See twml.calibrator.Calibrator.get_layer_args for more detailed documentation. + """ + layer_args = { + "n_feature": len(self._discretizer_feature_dict), + "n_bin": self._n_bin, + "out_bits": self._out_bits, + } + + return layer_args + + def add_hub_signatures(self, name): + """ + Add Hub Signatures for each calibrator + + Arguments: + name: + Calibrator name + """ + sparse_tf = tf.sparse_placeholder(tf.float32) + calibrator_layer = self.to_layer() + hub.add_signature( + inputs=sparse_tf, + outputs=calibrator_layer(sparse_tf, keep_inputs=False), + name=name, + ) + + def write_summary(self, writer, sess=None): + """ + This method is called by save() to write a histogram of + PercentileDiscretizer feature bins to disk. A histogram is included for each + feature. + + Arguments: + writer: + tf.summary.FilteWriter instance. + used to add summaries to event files for inclusion in tensorboard. + sess: + tf.Session instance. Used to produces summaries for the writer. + """ + bin_counts_ph = tf.placeholder(tf.int64) + bin_counts = self._bin_histogram_dict["bin_counts"] + + # Record that distribution into a histogram summary + histo = tf.summary.histogram("discretizer_feature_bin_counts", bin_counts_ph) + for i in range(bin_counts.shape[0]): + bin_counts_summary = sess.run( + histo, feed_dict={bin_counts_ph: bin_counts[i]} + ) + writer.add_summary(bin_counts_summary, global_step=i) + + def write_summary_json(self, save_dir, name="default"): + """ + Export bin information to HDFS. + + Arguments: + save_dir: + name of the saving directory. + name: + prefix of the saved hub signature. Default (string): "default". + """ + # Since the size is small: (# of bins) * (# of features), we always dump the file. 
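# (Illustration, not part of the diff: the <name>_bin.json written below has
# roughly this shape -- toy numbers, with the stdlib json module standing in
# for twml.write_file(..., encode="json").)
import json

example = {
    "feature_ids": [1001, 1002],                 # one id per calibrated feature
    "bin_boundaries": [
        [0.0, 0.1, 0.5, 0.9, 1.0],               # n_bin + 1 boundaries per feature
        [0.0, 0.2, 0.4, 0.6, 0.8],
    ],
    "output_bits": 22,
}
with open("/tmp/default_bin.json", "w") as f:
    json.dump(example, f)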
+ discretizer_export_bin_filename = os.path.join(save_dir, name + "_bin.json") + discretizer_export_bin_dict = { + "feature_ids": self._bin_histogram_dict["feature_ids"].tolist(), + "bin_boundaries": self._bin_histogram_dict["bin_vals"].tolist(), + "output_bits": self._bin_histogram_dict["out_bits"], + } + twml.write_file( + discretizer_export_bin_filename, discretizer_export_bin_dict, encode="json" + ) + + def save(self, save_dir, name="default", verbose=False): + """Save the calibrator into the given save_directory using TF Hub. + Arguments: + save_dir: + name of the saving directory. + name: + prefix of the saved hub signature. Default (string): "default". + """ + if not self._calibrated: + raise RuntimeError( + "Expecting prior call to calibrate().Cannot save() prior to calibrate()" + ) + + # This module allows for the calibrator to save be saved as part of + # Tensorflow Hub (this will allow it to be used in further steps) + def calibrator_module(): + # Note that this is usually expecting a sparse_placeholder + inputs = tf.sparse_placeholder(tf.float32) + calibrator_layer = self.to_layer() + # creates the signature to the calibrator module + hub.add_signature( + inputs=inputs, + outputs=calibrator_layer(inputs, keep_inputs=False), + name=name, + ) + # and another signature for keep_inputs mode + hub.add_signature( + inputs=inputs, + outputs=calibrator_layer(inputs, keep_inputs=True), + name=name + "_keep_inputs", + ) + + # exports the module to the save_dir + spec = hub.create_module_spec(calibrator_module) + with tf.Graph().as_default(): + module = hub.Module(spec) + with tf.Session() as session: + module.export(save_dir, session) + + self.write_summary_json(save_dir, name) diff --git a/twml/twml/contrib/eventbus/input_fn.py b/twml/twml/contrib/eventbus/input_fn.py index c184d9434..957a8f10c 100644 --- a/twml/twml/contrib/eventbus/input_fn.py +++ b/twml/twml/contrib/eventbus/input_fn.py @@ -1,7 +1,7 @@ -from reader import EventBusPipedBinaryRecordReader import tensorflow.compat.v1 as tf -import twml +from reader import EventBusPipedBinaryRecordReader +import twml """ This module provides input function for DeepBird v2 training. @@ -10,50 +10,64 @@ def get_eventbus_data_record_generator(eventbus_reader): - """ - This module provides a data record generater from EventBus reader. - - Args: - eventbus_reader: EventBus reader - - Returns: - gen: Data record generater - """ - eventbus_reader.initialize() - counter = [0] - - def gen(): - while True: - record = eventbus_reader.read() - if eventbus_reader.debug: - tf.logging.warn("counter: {}".format(counter[0])) - with open('tmp_record_{}.bin'.format(counter[0]), 'wb') as f: - f.write(record) - counter[0] = counter[0] + 1 - yield record - return gen + """ + This module provides a data record generater from EventBus reader. + + Args: + eventbus_reader: EventBus reader + + Returns: + gen: Data record generater + """ + eventbus_reader.initialize() + counter = [0] + + def gen(): + while True: + record = eventbus_reader.read() + if eventbus_reader.debug: + tf.logging.warn("counter: {}".format(counter[0])) + with open("tmp_record_{}.bin".format(counter[0]), "wb") as f: + f.write(record) + counter[0] = counter[0] + 1 + yield record + + return gen def get_eventbus_data_record_dataset(eventbus_reader, parse_fn, batch_size): - """ - This module generates batch data for training from a data record generator. 
- """ - dataset = tf.data.Dataset.from_generator( - get_eventbus_data_record_generator(eventbus_reader), tf.string, tf.TensorShape([])) - return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=4).prefetch(buffer_size=10) + """ + This module generates batch data for training from a data record generator. + """ + dataset = tf.data.Dataset.from_generator( + get_eventbus_data_record_generator(eventbus_reader), + tf.string, + tf.TensorShape([]), + ) + return ( + dataset.batch(batch_size) + .map(parse_fn, num_parallel_calls=4) + .prefetch(buffer_size=10) + ) def get_train_input_fn(feature_config, params, parse_fn=None): - """ - This module provides input function for DeepBird v2 training. - It gets batched training data from data record generator. - """ - eventbus_reader = EventBusPipedBinaryRecordReader( - params.jar_file, params.num_eb_threads, params.subscriber_id, - filter_str=params.filter_str, debug=params.debug) - - train_parse_fn = parse_fn or twml.parsers.get_sparse_parse_fn( - feature_config, ["ids", "keys", "values", "batch_size", "weights"]) - - return lambda: get_eventbus_data_record_dataset( - eventbus_reader, train_parse_fn, params.train_batch_size) + """ + This module provides input function for DeepBird v2 training. + It gets batched training data from data record generator. + """ + eventbus_reader = EventBusPipedBinaryRecordReader( + params.jar_file, + params.num_eb_threads, + params.subscriber_id, + filter_str=params.filter_str, + debug=params.debug, + ) + + train_parse_fn = parse_fn or twml.parsers.get_sparse_parse_fn( + feature_config, ["ids", "keys", "values", "batch_size", "weights"] + ) + + return lambda: get_eventbus_data_record_dataset( + eventbus_reader, train_parse_fn, params.train_batch_size + ) diff --git a/twml/twml/contrib/eventbus/reader.py b/twml/twml/contrib/eventbus/reader.py index 2f8e2749e..88b635417 100644 --- a/twml/twml/contrib/eventbus/reader.py +++ b/twml/twml/contrib/eventbus/reader.py @@ -12,108 +12,133 @@ class BinaryRecordReader(object): - def initialize(self): - pass + def initialize(self): + pass - def read(self): - """Read raw bytes for one record - """ - raise NotImplementedError + def read(self): + """Read raw bytes for one record""" + raise NotImplementedError - def close(self): - pass + def close(self): + pass class ReadableWrapper(object): - def __init__(self, internal): - self.internal = internal + def __init__(self, internal): + self.internal = internal - def __getattr__(self, name): - return getattr(self.internal, name) + def __getattr__(self, name): + return getattr(self.internal, name) - def readable(self): - return True + def readable(self): + return True class EventBusPipedBinaryRecordReader(BinaryRecordReader): - - JAVA = '/usr/lib/jvm/java-11-twitter/bin/java' - RECORD_SEPARATOR_HEX = [ - 0x29, 0xd8, 0xd5, 0x06, 0x58, 0xcd, 0x4c, 0x29, - 0xb2, 0xbc, 0x57, 0x99, 0x21, 0x71, 0xbd, 0xff - ] - RECORD_SEPARATOR = ''.join([chr(i) for i in RECORD_SEPARATOR_HEX]) - RECORD_SEPARATOR_LENGTH = len(RECORD_SEPARATOR) - CHUNK_SIZE = 8192 - - def __init__(self, jar_file, num_eb_threads, subscriber_id, - filter_str=None, buffer_size=32768, debug=False): - self.jar_file = jar_file - self.num_eb_threads = num_eb_threads - self.subscriber_id = subscriber_id - self.filter_str = filter_str if filter_str else '""' - self.buffer_size = buffer_size - self.lock = Lock() - self._pipe = None - self._buffered_reader = None - self._bytes_buffer = None - - self.debug = debug - - def initialize(self): - if not self._pipe: - self._pipe = 
subprocess.Popen( - [ - self.JAVA, '-jar', self.jar_file, - '-subscriberId', self.subscriber_id, - '-numThreads', str(self.num_eb_threads), - '-dataFilter', self.filter_str, - '-debug' if self.debug else '' - ], - stdout=subprocess.PIPE - ) - self._buffered_reader = io.BufferedReader( - ReadableWrapper(self._pipe.stdout), self.buffer_size) - self._bytes_buffer = io.BytesIO() - else: - logging.warning('Already initialized') - - def _find_next_record(self): - tail = [''] - while True: - chunk = tail[0] + self._buffered_reader.read(self.CHUNK_SIZE) - index = chunk.find(self.RECORD_SEPARATOR) - if index < 0: - self._bytes_buffer.write(chunk[:-self.RECORD_SEPARATOR_LENGTH]) - tail[0] = chunk[-self.RECORD_SEPARATOR_LENGTH:] - else: - self._bytes_buffer.write(chunk[:index]) - return chunk[(index + self.RECORD_SEPARATOR_LENGTH):] - - def _read(self): - with self.lock: - remaining = self._find_next_record() - record = self._bytes_buffer.getvalue() - # clean up buffer - self._bytes_buffer.close() - self._bytes_buffer = io.BytesIO() - self._bytes_buffer.write(remaining) - - return record - - def read(self): - while True: - try: - return self._read() - except Exception as e: - logging.error("Error reading bytes for next record: {}".format(e)) - if self.debug: - raise - - def close(self): - try: - self._bytes_buffer.close() - self._buffered_reader.close() - self._pipe.terminate() - except Exception as e: - logging.error("Error closing reader: {}".format(e)) + JAVA = "/usr/lib/jvm/java-11-twitter/bin/java" + RECORD_SEPARATOR_HEX = [ + 0x29, + 0xD8, + 0xD5, + 0x06, + 0x58, + 0xCD, + 0x4C, + 0x29, + 0xB2, + 0xBC, + 0x57, + 0x99, + 0x21, + 0x71, + 0xBD, + 0xFF, + ] + RECORD_SEPARATOR = "".join([chr(i) for i in RECORD_SEPARATOR_HEX]) + RECORD_SEPARATOR_LENGTH = len(RECORD_SEPARATOR) + CHUNK_SIZE = 8192 + + def __init__( + self, + jar_file, + num_eb_threads, + subscriber_id, + filter_str=None, + buffer_size=32768, + debug=False, + ): + self.jar_file = jar_file + self.num_eb_threads = num_eb_threads + self.subscriber_id = subscriber_id + self.filter_str = filter_str if filter_str else '""' + self.buffer_size = buffer_size + self.lock = Lock() + self._pipe = None + self._buffered_reader = None + self._bytes_buffer = None + + self.debug = debug + + def initialize(self): + if not self._pipe: + self._pipe = subprocess.Popen( + [ + self.JAVA, + "-jar", + self.jar_file, + "-subscriberId", + self.subscriber_id, + "-numThreads", + str(self.num_eb_threads), + "-dataFilter", + self.filter_str, + "-debug" if self.debug else "", + ], + stdout=subprocess.PIPE, + ) + self._buffered_reader = io.BufferedReader( + ReadableWrapper(self._pipe.stdout), self.buffer_size + ) + self._bytes_buffer = io.BytesIO() + else: + logging.warning("Already initialized") + + def _find_next_record(self): + tail = [""] + while True: + chunk = tail[0] + self._buffered_reader.read(self.CHUNK_SIZE) + index = chunk.find(self.RECORD_SEPARATOR) + if index < 0: + self._bytes_buffer.write(chunk[: -self.RECORD_SEPARATOR_LENGTH]) + tail[0] = chunk[-self.RECORD_SEPARATOR_LENGTH :] + else: + self._bytes_buffer.write(chunk[:index]) + return chunk[(index + self.RECORD_SEPARATOR_LENGTH) :] + + def _read(self): + with self.lock: + remaining = self._find_next_record() + record = self._bytes_buffer.getvalue() + # clean up buffer + self._bytes_buffer.close() + self._bytes_buffer = io.BytesIO() + self._bytes_buffer.write(remaining) + + return record + + def read(self): + while True: + try: + return self._read() + except Exception as e: + logging.error("Error 
reading bytes for next record: {}".format(e)) + if self.debug: + raise + + def close(self): + try: + self._bytes_buffer.close() + self._buffered_reader.close() + self._pipe.terminate() + except Exception as e: + logging.error("Error closing reader: {}".format(e)) diff --git a/twml/twml/contrib/export/__init__.py b/twml/twml/contrib/export/__init__.py index 99892dcfa..2a6e0f86d 100644 --- a/twml/twml/contrib/export/__init__.py +++ b/twml/twml/contrib/export/__init__.py @@ -1,2 +1,2 @@ -from . import export_fn # noqa: F401 -from . import exporters # noqa: F401 +from . import export_fn # noqa: F401 +from . import exporters # noqa: F401 diff --git a/twml/twml/contrib/export/export_fn.py b/twml/twml/contrib/export/export_fn.py index 6e59fff07..10be4a2aa 100644 --- a/twml/twml/contrib/export/export_fn.py +++ b/twml/twml/contrib/export/export_fn.py @@ -1,263 +1,296 @@ """ Functions for exporting models for different modes. """ -from collections import OrderedDict import os +from collections import OrderedDict import tensorflow.compat.v1 as tf +import yaml from tensorflow.python.estimator.export import export + import twml -import yaml def get_sparse_batch_supervised_input_receiver_fn(feature_config, keep_fields=None): - """Gets supervised_input_receiver_fn that decodes a BatchPredictionRequest as sparse tensors - with labels and weights as defined in feature_config. - This input_receiver_fn is required for exporting models with 'train' mode to be trained with - Java API + """Gets supervised_input_receiver_fn that decodes a BatchPredictionRequest as sparse tensors + with labels and weights as defined in feature_config. + This input_receiver_fn is required for exporting models with 'train' mode to be trained with + Java API + + Args: + feature_config (FeatureConfig): deepbird v2 feature config object + keep_fields (list): list of fields to keep - Args: - feature_config (FeatureConfig): deepbird v2 feature config object - keep_fields (list): list of fields to keep + Returns: + supervised_input_receiver_fn: input_receiver_fn used for train mode + """ - Returns: - supervised_input_receiver_fn: input_receiver_fn used for train mode - """ - def supervised_input_receiver_fn(): - serialized_request = tf.placeholder(dtype=tf.uint8, name='request') - receiver_tensors = {'request': serialized_request} + def supervised_input_receiver_fn(): + serialized_request = tf.placeholder(dtype=tf.uint8, name="request") + receiver_tensors = {"request": serialized_request} - bpr = twml.contrib.readers.HashedBatchPredictionRequest(serialized_request, feature_config) - features = bpr.get_sparse_features() if keep_fields is None else bpr.get_features(keep_fields) - features['weights'] = bpr.weights - labels = bpr.labels - features, labels = bpr.apply_filter(features, labels) + bpr = twml.contrib.readers.HashedBatchPredictionRequest( + serialized_request, feature_config + ) + features = ( + bpr.get_sparse_features() + if keep_fields is None + else bpr.get_features(keep_fields) + ) + features["weights"] = bpr.weights + labels = bpr.labels + features, labels = bpr.apply_filter(features, labels) - return export.SupervisedInputReceiver(features, labels, receiver_tensors) + return export.SupervisedInputReceiver(features, labels, receiver_tensors) - return supervised_input_receiver_fn + return supervised_input_receiver_fn def update_build_graph_fn_for_train(build_graph_fn): - """Updates a build_graph_fn by inserting in graph output a serialized BatchPredictionResponse - similar to the export_output_fns for serving. 
- The key difference here is that - 1. We insert serialized BatchPredictionResponse in graph output with key 'prediction' instead of - creating an export_output object. This is because of the way estimators export model in 'train' - mode doesn't take custom export_output - 2. We only do it when `mode == 'train'` to avoid altering the graph when exporting - for 'infer' mode - - Args: - build_graph_fn (Callable): deepbird v2 build graph function - - Returns: - new_build_graph_fn: An updated build_graph_fn that inserts serialized BatchPredictResponse - to graph output when in 'train' mode - """ - def new_build_graph_fn(features, label, mode, params, config=None): - output = build_graph_fn(features, label, mode, params, config) - if mode == tf.estimator.ModeKeys.TRAIN: - output.update( - twml.export_output_fns.batch_prediction_continuous_output_fn(output)[ - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs - ) - return output - return new_build_graph_fn + """Updates a build_graph_fn by inserting in graph output a serialized BatchPredictionResponse + similar to the export_output_fns for serving. + The key difference here is that + 1. We insert serialized BatchPredictionResponse in graph output with key 'prediction' instead of + creating an export_output object. This is because of the way estimators export model in 'train' + mode doesn't take custom export_output + 2. We only do it when `mode == 'train'` to avoid altering the graph when exporting + for 'infer' mode + + Args: + build_graph_fn (Callable): deepbird v2 build graph function + + Returns: + new_build_graph_fn: An updated build_graph_fn that inserts serialized BatchPredictResponse + to graph output when in 'train' mode + """ + + def new_build_graph_fn(features, label, mode, params, config=None): + output = build_graph_fn(features, label, mode, params, config) + if mode == tf.estimator.ModeKeys.TRAIN: + output.update( + twml.export_output_fns.batch_prediction_continuous_output_fn(output)[ + tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + ].outputs + ) + return output + + return new_build_graph_fn def export_model_for_train_and_infer( - trainer, feature_config, keep_fields, export_dir, as_text=False): - """Function for exporting model with both 'train' and 'infer' mode. - - This means the exported saved_model.pb will contain two meta graphs, one with tag 'train' - and the other with tag 'serve', and it can be loaded in Java API with either tag depending on - the use case - - Args: - trainer (DataRecordTrainer): deepbird v2 DataRecordTrainer - feature_config (FeatureConfig): deepbird v2 feature config - keep_fields (list of string): list of field keys, e.g. - ('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes') - export_dir (str): a directory (local or hdfs) to export model to - as_text (bool): if True, write 'saved_model.pb' as binary file, else write - 'saved_model.pbtxt' as human readable text file. 
Default False - """ - train_input_receiver_fn = get_sparse_batch_supervised_input_receiver_fn( - feature_config, keep_fields) - predict_input_receiver_fn = twml.parsers.get_sparse_serving_input_receiver_fn( - feature_config, keep_fields) - trainer._export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - trainer._build_graph_fn = update_build_graph_fn_for_train(trainer._build_graph_fn) - trainer._estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map={ - tf.estimator.ModeKeys.TRAIN: train_input_receiver_fn, - tf.estimator.ModeKeys.PREDICT: predict_input_receiver_fn - }, - as_text=as_text, - ) - - trainer.export_model_effects(export_dir) - - -def export_all_models_with_receivers(estimator, export_dir, - train_input_receiver_fn, - eval_input_receiver_fn, - predict_input_receiver_fn, - export_output_fn, - export_modes=('train', 'eval', 'predict'), - register_model_fn=None, - feature_spec=None, - checkpoint_path=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - estimator: - Should be of type tf.estimator.Estimator. - You can get this from trainer using trainer.estimator - export_dir: - Directory to export the model. - train_input_receiver_fn: - Input receiver for train interface. - eval_input_receiver_fn: - Input receiver for eval interface. - predict_input_receiver_fn: - Input receiver for predict interface. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". - Defaults to ["train", "eval", "predict"] - register_model_fn: - An optional function which is called with export_dir after models are exported. - Defaults to None. - Returns: - The timestamped directory the models are exported to. - """ - # TODO: Fix for hogwild / distributed training. - - if export_dir is None: - raise ValueError("export_dir can not be None") - export_dir = twml.util.sanitize_hdfs_path(export_dir) - input_receiver_fn_map = {} - - if "train" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.TRAIN] = train_input_receiver_fn - - if "eval" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.EVAL] = eval_input_receiver_fn - - if "predict" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.PREDICT] = predict_input_receiver_fn - - export_dir = estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map=input_receiver_fn_map, - checkpoint_path=checkpoint_path, - ) - - if register_model_fn is not None: - register_model_fn(export_dir, feature_spec, log_features) - - return export_dir - - -def export_all_models(trainer, - export_dir, - parse_fn, - serving_input_receiver_fn, - export_output_fn=None, - export_modes=('train', 'eval', 'predict'), - feature_spec=None, - checkpoint=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - trainer: - An object of type twml.trainers.Trainer. - export_dir: - Directory to export the model. - parse_fn: - The parse function used parse the inputs for train and eval. - serving_input_receiver_fn: - The input receiver function used during serving. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". 
- Defaults to ["train", "eval", "predict"] - feature_spec: - A dictionary obtained from FeatureConfig.get_feature_spec() to serialize - as feature_spec.yaml in export_dir. - Defaults to None - Returns: - The timestamped directory the models are exported to. - """ - # Only export from chief in hogwild or distributed modes. - if trainer.params.get('distributed', False) and not trainer.estimator.config.is_chief: - tf.logging.info("Trainer.export_model ignored due to instance not being chief.") - return - - if feature_spec is None: - if getattr(trainer, '_feature_config') is None: - raise ValueError("feature_spec is set to None." - "Please pass feature_spec=feature_config.get_feature_spec() to the export_all_model function") - else: - feature_spec = trainer._feature_config.get_feature_spec() - - export_dir = twml.util.sanitize_hdfs_path(export_dir) - old_export_output_fn = trainer._export_output_fn - trainer._export_output_fn = export_output_fn - supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn(parse_fn) - if not checkpoint: - checkpoint = trainer.best_or_latest_checkpoint - - export_dir = export_all_models_with_receivers(estimator=trainer.estimator, - export_dir=export_dir, - train_input_receiver_fn=supervised_input_receiver_fn, - eval_input_receiver_fn=supervised_input_receiver_fn, - predict_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=export_output_fn, - export_modes=export_modes, - register_model_fn=trainer.export_model_effects, - feature_spec=feature_spec, - checkpoint_path=checkpoint, - log_features=log_features) - trainer._export_output_fn = old_export_output_fn - return export_dir + trainer, feature_config, keep_fields, export_dir, as_text=False +): + """Function for exporting model with both 'train' and 'infer' mode. + + This means the exported saved_model.pb will contain two meta graphs, one with tag 'train' + and the other with tag 'serve', and it can be loaded in Java API with either tag depending on + the use case + + Args: + trainer (DataRecordTrainer): deepbird v2 DataRecordTrainer + feature_config (FeatureConfig): deepbird v2 feature config + keep_fields (list of string): list of field keys, e.g. + ('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes') + export_dir (str): a directory (local or hdfs) to export model to + as_text (bool): if True, write 'saved_model.pb' as binary file, else write + 'saved_model.pbtxt' as human readable text file. 
Default False + """ + train_input_receiver_fn = get_sparse_batch_supervised_input_receiver_fn( + feature_config, keep_fields + ) + predict_input_receiver_fn = twml.parsers.get_sparse_serving_input_receiver_fn( + feature_config, keep_fields + ) + trainer._export_output_fn = ( + twml.export_output_fns.batch_prediction_continuous_output_fn + ) + trainer._build_graph_fn = update_build_graph_fn_for_train(trainer._build_graph_fn) + trainer._estimator._export_all_saved_models( + export_dir_base=export_dir, + input_receiver_fn_map={ + tf.estimator.ModeKeys.TRAIN: train_input_receiver_fn, + tf.estimator.ModeKeys.PREDICT: predict_input_receiver_fn, + }, + as_text=as_text, + ) + + trainer.export_model_effects(export_dir) + + +def export_all_models_with_receivers( + estimator, + export_dir, + train_input_receiver_fn, + eval_input_receiver_fn, + predict_input_receiver_fn, + export_output_fn, + export_modes=("train", "eval", "predict"), + register_model_fn=None, + feature_spec=None, + checkpoint_path=None, + log_features=True, +): + """ + Function for exporting a model with train, eval, and infer modes. + + Args: + estimator: + Should be of type tf.estimator.Estimator. + You can get this from trainer using trainer.estimator + export_dir: + Directory to export the model. + train_input_receiver_fn: + Input receiver for train interface. + eval_input_receiver_fn: + Input receiver for eval interface. + predict_input_receiver_fn: + Input receiver for predict interface. + export_output_fn: + export_output_fn to be used for serving. + export_modes: + A list to Specify what modes to export. Can be "train", "eval", "predict". + Defaults to ["train", "eval", "predict"] + register_model_fn: + An optional function which is called with export_dir after models are exported. + Defaults to None. + Returns: + The timestamped directory the models are exported to. + """ + # TODO: Fix for hogwild / distributed training. + + if export_dir is None: + raise ValueError("export_dir can not be None") + export_dir = twml.util.sanitize_hdfs_path(export_dir) + input_receiver_fn_map = {} + + if "train" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.TRAIN] = train_input_receiver_fn + + if "eval" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.EVAL] = eval_input_receiver_fn + + if "predict" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.PREDICT] = predict_input_receiver_fn + + export_dir = estimator._export_all_saved_models( + export_dir_base=export_dir, + input_receiver_fn_map=input_receiver_fn_map, + checkpoint_path=checkpoint_path, + ) + + if register_model_fn is not None: + register_model_fn(export_dir, feature_spec, log_features) + + return export_dir + + +def export_all_models( + trainer, + export_dir, + parse_fn, + serving_input_receiver_fn, + export_output_fn=None, + export_modes=("train", "eval", "predict"), + feature_spec=None, + checkpoint=None, + log_features=True, +): + """ + Function for exporting a model with train, eval, and infer modes. + + Args: + trainer: + An object of type twml.trainers.Trainer. + export_dir: + Directory to export the model. + parse_fn: + The parse function used parse the inputs for train and eval. + serving_input_receiver_fn: + The input receiver function used during serving. + export_output_fn: + export_output_fn to be used for serving. + export_modes: + A list to Specify what modes to export. Can be "train", "eval", "predict". 
+ Defaults to ["train", "eval", "predict"] + feature_spec: + A dictionary obtained from FeatureConfig.get_feature_spec() to serialize + as feature_spec.yaml in export_dir. + Defaults to None + Returns: + The timestamped directory the models are exported to. + """ + # Only export from chief in hogwild or distributed modes. + if ( + trainer.params.get("distributed", False) + and not trainer.estimator.config.is_chief + ): + tf.logging.info("Trainer.export_model ignored due to instance not being chief.") + return + + if feature_spec is None: + if getattr(trainer, "_feature_config") is None: + raise ValueError( + "feature_spec is set to None." + "Please pass feature_spec=feature_config.get_feature_spec() to the export_all_model function" + ) + else: + feature_spec = trainer._feature_config.get_feature_spec() + + export_dir = twml.util.sanitize_hdfs_path(export_dir) + old_export_output_fn = trainer._export_output_fn + trainer._export_output_fn = export_output_fn + supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn( + parse_fn + ) + if not checkpoint: + checkpoint = trainer.best_or_latest_checkpoint + + export_dir = export_all_models_with_receivers( + estimator=trainer.estimator, + export_dir=export_dir, + train_input_receiver_fn=supervised_input_receiver_fn, + eval_input_receiver_fn=supervised_input_receiver_fn, + predict_input_receiver_fn=serving_input_receiver_fn, + export_output_fn=export_output_fn, + export_modes=export_modes, + register_model_fn=trainer.export_model_effects, + feature_spec=feature_spec, + checkpoint_path=checkpoint, + log_features=log_features, + ) + trainer._export_output_fn = old_export_output_fn + return export_dir def export_feature_spec(dir_path, feature_spec_dict): - """ - Exports a FeatureConfig.get_feature_spec() dict to /feature_spec.yaml. - """ - def ordered_dict_representer(dumper, data): - return dumper.represent_mapping('tag:yaml.org,2002:map', data.items()) - - try: - # needed for Python 2 - yaml.add_representer(str, yaml.representer.SafeRepresenter.represent_str) - yaml.add_representer(unicode, yaml.representer.SafeRepresenter.represent_unicode) - except NameError: - # 'unicode' type doesn't exist on Python 3 - # PyYAML handles unicode correctly in Python 3 - pass - - yaml.add_representer(OrderedDict, ordered_dict_representer) - - fbase = "feature_spec.yaml" - fname = fbase.encode('utf-8') if type(dir_path) != str else fbase - file_path = os.path.join(dir_path, fname) - with tf.io.gfile.GFile(file_path, mode='w') as f: - yaml.dump(feature_spec_dict, f, default_flow_style=False, allow_unicode=True) - tf.logging.info("Exported feature spec to %s" % file_path) - - return file_path + """ + Exports a FeatureConfig.get_feature_spec() dict to /feature_spec.yaml. 
+ """ + + def ordered_dict_representer(dumper, data): + return dumper.represent_mapping("tag:yaml.org,2002:map", data.items()) + + try: + # needed for Python 2 + yaml.add_representer(str, yaml.representer.SafeRepresenter.represent_str) + yaml.add_representer( + unicode, yaml.representer.SafeRepresenter.represent_unicode + ) + except NameError: + # 'unicode' type doesn't exist on Python 3 + # PyYAML handles unicode correctly in Python 3 + pass + + yaml.add_representer(OrderedDict, ordered_dict_representer) + + fbase = "feature_spec.yaml" + fname = fbase.encode("utf-8") if type(dir_path) != str else fbase + file_path = os.path.join(dir_path, fname) + with tf.io.gfile.GFile(file_path, mode="w") as f: + yaml.dump(feature_spec_dict, f, default_flow_style=False, allow_unicode=True) + tf.logging.info("Exported feature spec to %s" % file_path) + + return file_path # Keep the alias for compatibility. diff --git a/twml/twml/contrib/export/exporters.py b/twml/twml/contrib/export/exporters.py index 122955cbc..cc0602b39 100644 --- a/twml/twml/contrib/export/exporters.py +++ b/twml/twml/contrib/export/exporters.py @@ -5,141 +5,167 @@ import tensorflow.compat.v1 as tf from tensorflow.python.estimator import exporter + import twml class _AllSavedModelsExporter(tf.estimator.Exporter): - """Internal exporter class to be used for exporting models for different modes.""" - - def __init__(self, - name, - input_receiver_fn_map, - backup_checkpoints, - assets_extra=None, - as_text=False): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - assets_extra: Additional assets to be included in the exported model. - as_text: Specifies if the exported model should be in a human readable text format. - """ - self._name = name - self._input_receiver_fn_map = input_receiver_fn_map - self._backup_checkpoints = backup_checkpoints - self._assets_extra = assets_extra - self._as_text = as_text - - @property - def name(self): - return self._name - - def export(self, estimator, export_path, checkpoint_path, eval_result, - is_the_final_export): - del is_the_final_export - - export_path = twml.util.sanitize_hdfs_path(export_path) - checkpoint_path = twml.util.sanitize_hdfs_path(checkpoint_path) - - if self._backup_checkpoints: - backup_path = os.path.join(export_path, "checkpoints") - # Ensure backup_path is created. makedirs passes if dir already exists. - tf.io.gfile.makedirs(backup_path) - twml.util.backup_checkpoint(checkpoint_path, backup_path, empty_backup=False) - - export_result = estimator.experimental_export_all_saved_models( - export_path, - self._input_receiver_fn_map, - assets_extra=self._assets_extra, - as_text=self._as_text, - checkpoint_path=checkpoint_path) - - return export_result + """Internal exporter class to be used for exporting models for different modes.""" + + def __init__( + self, + name, + input_receiver_fn_map, + backup_checkpoints, + assets_extra=None, + as_text=False, + ): + """ + Args: + name: A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints: A flag to specify if backups of checkpoints need to be made. + assets_extra: Additional assets to be included in the exported model. + as_text: Specifies if the exported model should be in a human readable text format. 
+ """ + self._name = name + self._input_receiver_fn_map = input_receiver_fn_map + self._backup_checkpoints = backup_checkpoints + self._assets_extra = assets_extra + self._as_text = as_text + + @property + def name(self): + return self._name + + def export( + self, estimator, export_path, checkpoint_path, eval_result, is_the_final_export + ): + del is_the_final_export + + export_path = twml.util.sanitize_hdfs_path(export_path) + checkpoint_path = twml.util.sanitize_hdfs_path(checkpoint_path) + + if self._backup_checkpoints: + backup_path = os.path.join(export_path, "checkpoints") + # Ensure backup_path is created. makedirs passes if dir already exists. + tf.io.gfile.makedirs(backup_path) + twml.util.backup_checkpoint( + checkpoint_path, backup_path, empty_backup=False + ) + + export_result = estimator.experimental_export_all_saved_models( + export_path, + self._input_receiver_fn_map, + assets_extra=self._assets_extra, + as_text=self._as_text, + checkpoint_path=checkpoint_path, + ) + + return export_result class BestExporter(tf.estimator.BestExporter): - """ - This class inherits from tf.estimator.BestExporter with the following differences: - - It also creates a backup of the best checkpoint. - - It can export the model for multiple modes. - - A backup / export is performed everytime the evaluated metric is better - than previous models. - """ - - def __init__(self, - name='best_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - event_file_pattern='eval/*.tfevents.*', - compare_fn=exporter._loss_smaller, - assets_extra=None, - as_text=False, - exports_to_keep=5): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/BestExporter """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) + This class inherits from tf.estimator.BestExporter with the following differences: + - It also creates a backup of the best checkpoint. + - It can export the model for multiple modes. - super(BestExporter, self).__init__( - name, serving_input_receiver_fn, event_file_pattern, compare_fn, - assets_extra, as_text, exports_to_keep) - - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") + A backup / export is performed everytime the evaluated metric is better + than previous models. + """ - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) + def __init__( + self, + name="best_exporter", + input_receiver_fn_map=None, + backup_checkpoints=True, + event_file_pattern="eval/*.tfevents.*", + compare_fn=exporter._loss_smaller, + assets_extra=None, + as_text=False, + exports_to_keep=5, + ): + """ + Args: + name: A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints: A flag to specify if backups of checkpoints need to be made. 
+ + Note: + Check the following documentation for more information about the remaining args: + https://www.tensorflow.org/api_docs/python/tf/estimator/BestExporter + """ + serving_input_receiver_fn = input_receiver_fn_map.get( + tf.estimator.ModeKeys.PREDICT + ) + + super(BestExporter, self).__init__( + name, + serving_input_receiver_fn, + event_file_pattern, + compare_fn, + assets_extra, + as_text, + exports_to_keep, + ) + + if not hasattr(self, "_saved_model_exporter"): + raise AttributeError( + "_saved_model_exporter needs to exist for this exporter to work." + " This is potentially broken because of an internal change in Tensorflow" + ) + + # Override the saved_model_exporter with SaveAllmodelsexporter + self._saved_model_exporter = _AllSavedModelsExporter( + name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text + ) class LatestExporter(tf.estimator.LatestExporter): - """ - This class inherits from tf.estimator.LatestExporter with the following differences: - - It also creates a backup of the latest checkpoint. - - It can export the model for multiple modes. - - A backup / export is performed everytime the evaluated metric is better - than previous models. - """ - - def __init__(self, - name='latest_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - assets_extra=None, - as_text=False, - exports_to_keep=5): """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter - """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) - - super(LatestExporter, self).__init__( - name, serving_input_receiver_fn, assets_extra, as_text, exports_to_keep) + This class inherits from tf.estimator.LatestExporter with the following differences: + - It also creates a backup of the latest checkpoint. + - It can export the model for multiple modes. - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") + A backup / export is performed everytime the evaluated metric is better + than previous models. + """ - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) + def __init__( + self, + name="latest_exporter", + input_receiver_fn_map=None, + backup_checkpoints=True, + assets_extra=None, + as_text=False, + exports_to_keep=5, + ): + """ + Args: + name: A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints: A flag to specify if backups of checkpoints need to be made. 
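+
+        Example (illustrative; mirrors the BestExporter usage above but keeps
+        the most recent exports rather than the best-scoring ones):
+
+            exporter = LatestExporter(
+                input_receiver_fn_map={
+                    tf.estimator.ModeKeys.PREDICT: serving_input_receiver_fn,
+                },
+                exports_to_keep=5,
+            )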
+ + Note: + Check the following documentation for more information about the remaining args: + https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter + """ + serving_input_receiver_fn = input_receiver_fn_map.get( + tf.estimator.ModeKeys.PREDICT + ) + + super(LatestExporter, self).__init__( + name, serving_input_receiver_fn, assets_extra, as_text, exports_to_keep + ) + + if not hasattr(self, "_saved_model_exporter"): + raise AttributeError( + "_saved_model_exporter needs to exist for this exporter to work." + " This is potentially broken because of an internal change in Tensorflow" + ) + + # Override the saved_model_exporter with SaveAllmodelsexporter + self._saved_model_exporter = _AllSavedModelsExporter( + name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text + ) diff --git a/twml/twml/contrib/feature_config.py b/twml/twml/contrib/feature_config.py index 833695751..9f1e19abd 100644 --- a/twml/twml/contrib/feature_config.py +++ b/twml/twml/contrib/feature_config.py @@ -2,78 +2,85 @@ Feature configuration for DeepBird jobs returns dictionary of sparse and dense Features """ from twitter.deepbird.io.legacy.contrib import feature_config + import twml class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() + def get_feature_spec(self): + """ + Generates a serialization-friendly dict representing this FeatureConfig. + """ + doc = super(FeatureConfig, self).get_feature_spec() - # Override the class in the spec. - doc["class"] = "twml.contrib.FeatureConfig" + # Override the class in the spec. + doc["class"] = "twml.contrib.FeatureConfig" - return doc + return doc class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - # Overwrite self.build() to return twml.FeatureConfig instead - def build(self): - """ - Returns an instance of FeatureConfig with the features passed to the FeatureConfigBuilder. - """ + # Overwrite self.build() to return twml.FeatureConfig instead + def build(self): + """ + Returns an instance of FeatureConfig with the features passed to the FeatureConfigBuilder. 
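+
+        A minimal sketch (assumes a valid data_spec_path and that features
+        were registered through the builder's extract_* methods beforehand):
+
+            fc_builder = FeatureConfigBuilder(data_spec_path)
+            fc_builder.add_labels(["label"])
+            feature_config = fc_builder.build()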
+ """ - ( - keep_tensors, - keep_sparse_tensors, - feature_map, - features_add, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() + ( + keep_tensors, + keep_sparse_tensors, + feature_map, + features_add, + feature_name_to_feature_parser, + feature_in_bq_name, + ) = self._build() - discretize_dict = {} - for config in self._sparse_extraction_configs: - if config.discretize_num_bins and config.discretize_output_size_bits: - if config.discretize_type == "percentile": - calibrator = twml.contrib.calibrators.PercentileDiscretizerCalibrator - elif config.discretize_type == "hashed_percentile": - calibrator = twml.contrib.calibrators.HashedPercentileDiscretizerCalibrator - elif config.discretize_type == "hashing": - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator - else: - raise ValueError("Unsupported discretizer type: " + config.discretize_type) - discretize_dict[config.output_name] = calibrator( - config.discretize_num_bins, - config.discretize_output_size_bits, - allow_empty_calibration=config.allow_empty_calibration, - ) - elif config.discretize_num_bins or config.discretize_output_size_bits: - raise ValueError( - "Discretize_num_bins AND discretize_output_size_bits need to be in the FeatureConfig" - ) + discretize_dict = {} + for config in self._sparse_extraction_configs: + if config.discretize_num_bins and config.discretize_output_size_bits: + if config.discretize_type == "percentile": + calibrator = ( + twml.contrib.calibrators.PercentileDiscretizerCalibrator + ) + elif config.discretize_type == "hashed_percentile": + calibrator = ( + twml.contrib.calibrators.HashedPercentileDiscretizerCalibrator + ) + elif config.discretize_type == "hashing": + calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator + else: + raise ValueError( + "Unsupported discretizer type: " + config.discretize_type + ) + discretize_dict[config.output_name] = calibrator( + config.discretize_num_bins, + config.discretize_output_size_bits, + allow_empty_calibration=config.allow_empty_calibration, + ) + elif config.discretize_num_bins or config.discretize_output_size_bits: + raise ValueError( + "Discretize_num_bins AND discretize_output_size_bits need to be in the FeatureConfig" + ) - return FeatureConfig( - features={}, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=keep_tensors, - sparse_tensor_types=keep_sparse_tensors, - feature_types=feature_map, - sparse_extraction_configs=self._sparse_extraction_configs, - feature_extraction_configs=self._feature_extraction_configs, - feature_group_extraction_configs=self._feature_group_extraction_configs, - image_configs=self._image_configs, - discretize_config=discretize_dict, - feature_ids=features_add, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=feature_name_to_feature_parser, - feature_in_bq_name=feature_in_bq_name, - ) + return FeatureConfig( + features={}, + labels=self._labels, + weight=self._weight, + filters=self._filter_features, + tensor_types=keep_tensors, + sparse_tensor_types=keep_sparse_tensors, + feature_types=feature_map, + sparse_extraction_configs=self._sparse_extraction_configs, + feature_extraction_configs=self._feature_extraction_configs, + feature_group_extraction_configs=self._feature_group_extraction_configs, + image_configs=self._image_configs, + discretize_config=discretize_dict, + feature_ids=features_add, + decode_mode=self._decode_mode, + legacy_sparse=self._legacy_sparse, + 
feature_name_to_feature_parser=feature_name_to_feature_parser, + feature_in_bq_name=feature_in_bq_name, + ) TensorExtractionConfig = feature_config.TensorExtractionConfig diff --git a/twml/twml/contrib/feature_config_parsers.py b/twml/twml/contrib/feature_config_parsers.py index 83c402e2e..c6486f463 100644 --- a/twml/twml/contrib/feature_config_parsers.py +++ b/twml/twml/contrib/feature_config_parsers.py @@ -4,221 +4,240 @@ import tensorflow.compat.v1 as tf import yaml -from twml.feature_config import FeatureConfigBuilder + from twml.contrib.feature_config import FeatureConfigBuilder as FeatureConfigBuilderV2 +from twml.feature_config import FeatureConfigBuilder def _get_config_version(config_dict): - doc = config_dict - supported_classes = { - "twml.FeatureConfig": "v1", - "twml.contrib.FeatureConfig": "v2" - } - if "class" not in doc: - raise ValueError("'class' key not found") - if doc["class"] not in supported_classes.keys(): - raise ValueError("Class %s not supported. Supported clases are %s" - % (doc["class"], supported_classes.keys())) - return supported_classes[doc["class"]] + doc = config_dict + supported_classes = {"twml.FeatureConfig": "v1", "twml.contrib.FeatureConfig": "v2"} + if "class" not in doc: + raise ValueError("'class' key not found") + if doc["class"] not in supported_classes.keys(): + raise ValueError( + "Class %s not supported. Supported clases are %s" + % (doc["class"], supported_classes.keys()) + ) + return supported_classes[doc["class"]] def _validate_config_dict_v1(config_dict): - """ - Validate spec exported by twml.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.FeatureConfig: Malformed feature_spec. %s" % msg) - - if doc["class"] != "twml.FeatureConfig": - malformed_error("'class' is not twml.FeatureConfig") - if "format" not in doc: - malformed_error("'format' key not found") - - # validate spec exported by twml.FeatureConfig - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparse_tensors"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - if "filters" not in doc: - malformed_error("'filters' key not found") - elif type(doc["filters"]) != list: - malformed_error("'filters' is not a list") - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") + """ + Validate spec exported by twml.FeatureConfig + """ + doc = config_dict + + def malformed_error(msg): + raise ValueError("twml.FeatureConfig: Malformed feature_spec. 
%s" % msg) + + if doc["class"] != "twml.FeatureConfig": + malformed_error("'class' is not twml.FeatureConfig") + if "format" not in doc: + malformed_error("'format' key not found") + + # validate spec exported by twml.FeatureConfig + if doc["format"] == "exported": + dict_keys = ["features", "labels", "weight", "tensors", "sparse_tensors"] + for key in dict_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + if type(doc[key]) != dict: + malformed_error("'%s' is not a dict" % key) + if "filters" not in doc: + malformed_error("'filters' key not found") + elif type(doc["filters"]) != list: + malformed_error("'filters' is not a list") + + # validate spec provided by modeler + elif doc["format"] == "manual": + raise NotImplementedError("Manual config support not yet implemented") + else: + malformed_error("'format' must be 'exported' or 'manual'") def _validate_config_dict_v2(config_dict): - """ - Validate spec exported by twml.contrib.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.contrib.FeatureConfig: Malformed feature_spec. %s" % msg) - - if doc["class"] != "twml.contrib.FeatureConfig": - malformed_error("'class' is not twml.contrib.FeatureConfig") - if "format" not in doc: - malformed_error("'format key not found'") - - # validate spec exported by twml.contrib.FeatureConfig (basic validation only) - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparseTensors", "discretizeConfig"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - list_keys = ["sparseFeatureGroups", "denseFeatureGroups", "denseFeatures", "images", "filters"] - for key in list_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != list: - malformed_error("'%s' is not a list" % key) - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") + """ + Validate spec exported by twml.contrib.FeatureConfig + """ + doc = config_dict + + def malformed_error(msg): + raise ValueError("twml.contrib.FeatureConfig: Malformed feature_spec. 
%s" % msg) + + if doc["class"] != "twml.contrib.FeatureConfig": + malformed_error("'class' is not twml.contrib.FeatureConfig") + if "format" not in doc: + malformed_error("'format key not found'") + + # validate spec exported by twml.contrib.FeatureConfig (basic validation only) + if doc["format"] == "exported": + dict_keys = [ + "features", + "labels", + "weight", + "tensors", + "sparseTensors", + "discretizeConfig", + ] + for key in dict_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + if type(doc[key]) != dict: + malformed_error("'%s' is not a dict" % key) + list_keys = [ + "sparseFeatureGroups", + "denseFeatureGroups", + "denseFeatures", + "images", + "filters", + ] + for key in list_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + if type(doc[key]) != list: + malformed_error("'%s' is not a list" % key) + + # validate spec provided by modeler + elif doc["format"] == "manual": + raise NotImplementedError("Manual config support not yet implemented") + else: + malformed_error("'format' must be 'exported' or 'manual'") def _create_feature_config_v1(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilder(data_spec_path) - - if config_dict["format"] == "exported": - # add features - for feature_info in config_dict["features"].values(): - feature_name = re.escape(feature_info["featureName"]) - feature_group = feature_info["featureGroup"] - fc_builder.add_feature(feature_name, feature_group) - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - # feature filters - for feature_name in config_dict["filters"]: - fc_builder.add_filter(feature_name) - # weight - if config_dict["weight"]: - weight_feature = list(config_dict["weight"].values())[0]["featureName"] - fc_builder.define_weight(weight_feature) - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() + fc_builder = FeatureConfigBuilder(data_spec_path) + + if config_dict["format"] == "exported": + # add features + for feature_info in config_dict["features"].values(): + feature_name = re.escape(feature_info["featureName"]) + feature_group = feature_info["featureGroup"] + fc_builder.add_feature(feature_name, feature_group) + # add labels + labels = [] + for label_info in config_dict["labels"].values(): + labels.append(label_info["featureName"]) + fc_builder.add_labels(labels) + # feature filters + for feature_name in config_dict["filters"]: + fc_builder.add_filter(feature_name) + # weight + if config_dict["weight"]: + weight_feature = list(config_dict["weight"].values())[0]["featureName"] + fc_builder.define_weight(weight_feature) + else: + raise ValueError("Format '%s' not implemented" % config_dict["format"]) + + return fc_builder.build() def _create_feature_config_v2(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilderV2(data_spec_path) - - if config_dict["format"] == "exported": - # add sparse group extraction configs - for sparse_group in config_dict["sparseFeatureGroups"]: - fids = sparse_group["features"].keys() - fnames = [sparse_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_features_as_hashed_sparse( - feature_regexes=[re.escape(fname) for fname in fnames], - output_tensor_name=sparse_group["outputName"], - hash_space_size_bits=sparse_group["hashSpaceBits"], - discretize_num_bins=sparse_group["discretize"]["numBins"], - 
discretize_output_size_bits=sparse_group["discretize"]["outputSizeBits"], - discretize_type=sparse_group["discretize"]["type"], - type_filter=sparse_group["filterType"]) - - # add dense group extraction configs - for dense_group in config_dict["denseFeatureGroups"]: - fids = dense_group["features"].keys() - fnames = [dense_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_feature_group( - feature_regexes=[re.escape(fname) for fname in fnames], - group_name=dense_group["outputName"], - type_filter=dense_group["filterType"], - default_value=dense_group["defaultValue"]) - - # add dense feature configs - for dense_features in config_dict["denseFeatures"]: - fids = dense_features["features"].keys() - fnames = [dense_features["features"][fid]["featureName"] for fid in fids] - default_value = dense_features["defaultValue"] - if len(fnames) == 1 and type(default_value) != dict: - fc_builder.extract_feature( - feature_name=re.escape(fnames[0]), - expected_shape=dense_features["expectedShape"], - default_value=dense_features["defaultValue"]) - else: - fc_builder.extract_features( - feature_regexes=[re.escape(fname) for fname in fnames], - default_value_map=dense_features["defaultValue"]) - - # add image feature configs - for image in config_dict["images"]: - fc_builder.extract_image( - feature_name=image["featureName"], - preprocess=image["preprocess"], - out_type=tf.as_dtype(image["outType"].lower()), - channels=image["channels"], - default_image=image["defaultImage"], - ) - - # add other tensor features (non-image) - tensor_fnames = [] - image_fnames = [img["featureName"] for img in config_dict["images"]] - for tensor_fname in config_dict["tensors"]: - if tensor_fname not in image_fnames: - tensor_fnames.append(tensor_fname) - for sparse_tensor_fname in config_dict["sparseTensors"]: - tensor_fnames.append(sparse_tensor_fname) - fc_builder.extract_tensors(tensor_fnames) - - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() + fc_builder = FeatureConfigBuilderV2(data_spec_path) + + if config_dict["format"] == "exported": + # add sparse group extraction configs + for sparse_group in config_dict["sparseFeatureGroups"]: + fids = sparse_group["features"].keys() + fnames = [sparse_group["features"][fid]["featureName"] for fid in fids] + fc_builder.extract_features_as_hashed_sparse( + feature_regexes=[re.escape(fname) for fname in fnames], + output_tensor_name=sparse_group["outputName"], + hash_space_size_bits=sparse_group["hashSpaceBits"], + discretize_num_bins=sparse_group["discretize"]["numBins"], + discretize_output_size_bits=sparse_group["discretize"][ + "outputSizeBits" + ], + discretize_type=sparse_group["discretize"]["type"], + type_filter=sparse_group["filterType"], + ) + + # add dense group extraction configs + for dense_group in config_dict["denseFeatureGroups"]: + fids = dense_group["features"].keys() + fnames = [dense_group["features"][fid]["featureName"] for fid in fids] + fc_builder.extract_feature_group( + feature_regexes=[re.escape(fname) for fname in fnames], + group_name=dense_group["outputName"], + type_filter=dense_group["filterType"], + default_value=dense_group["defaultValue"], + ) + + # add dense feature configs + for dense_features in config_dict["denseFeatures"]: + fids = dense_features["features"].keys() + fnames = 
[dense_features["features"][fid]["featureName"] for fid in fids] + default_value = dense_features["defaultValue"] + if len(fnames) == 1 and type(default_value) != dict: + fc_builder.extract_feature( + feature_name=re.escape(fnames[0]), + expected_shape=dense_features["expectedShape"], + default_value=dense_features["defaultValue"], + ) + else: + fc_builder.extract_features( + feature_regexes=[re.escape(fname) for fname in fnames], + default_value_map=dense_features["defaultValue"], + ) + + # add image feature configs + for image in config_dict["images"]: + fc_builder.extract_image( + feature_name=image["featureName"], + preprocess=image["preprocess"], + out_type=tf.as_dtype(image["outType"].lower()), + channels=image["channels"], + default_image=image["defaultImage"], + ) + + # add other tensor features (non-image) + tensor_fnames = [] + image_fnames = [img["featureName"] for img in config_dict["images"]] + for tensor_fname in config_dict["tensors"]: + if tensor_fname not in image_fnames: + tensor_fnames.append(tensor_fname) + for sparse_tensor_fname in config_dict["sparseTensors"]: + tensor_fnames.append(sparse_tensor_fname) + fc_builder.extract_tensors(tensor_fnames) + + # add labels + labels = [] + for label_info in config_dict["labels"].values(): + labels.append(label_info["featureName"]) + fc_builder.add_labels(labels) + + else: + raise ValueError("Format '%s' not implemented" % config_dict["format"]) + + return fc_builder.build() def create_feature_config_from_dict(config_dict, data_spec_path): - """ - Create a FeatureConfig object from a feature spec dict. - """ - config_version = _get_config_version(config_dict) - if config_version == "v1": - _validate_config_dict_v1(config_dict) - feature_config = _create_feature_config_v1(config_dict, data_spec_path) - elif config_version == "v2": - _validate_config_dict_v2(config_dict) - feature_config = _create_feature_config_v2(config_dict, data_spec_path) - else: - raise ValueError("version not supported") - - return feature_config + """ + Create a FeatureConfig object from a feature spec dict. + """ + config_version = _get_config_version(config_dict) + if config_version == "v1": + _validate_config_dict_v1(config_dict) + feature_config = _create_feature_config_v1(config_dict, data_spec_path) + elif config_version == "v2": + _validate_config_dict_v2(config_dict) + feature_config = _create_feature_config_v2(config_dict, data_spec_path) + else: + raise ValueError("version not supported") + + return feature_config def create_feature_config(config_path, data_spec_path): - """ - Create a FeatureConfig object from a feature_spec.yaml file. - """ - _, ext = os.path.splitext(config_path) - if ext not in ['.yaml', '.yml']: - raise ValueError("create_feature_config_from_yaml: Only .yaml/.yml supported") + """ + Create a FeatureConfig object from a feature_spec.yaml file. 
+ """ + _, ext = os.path.splitext(config_path) + if ext not in [".yaml", ".yml"]: + raise ValueError("create_feature_config_from_yaml: Only .yaml/.yml supported") - with tf.io.gfile.GFile(config_path, mode='r') as fs: - config_dict = yaml.safe_load(fs) + with tf.io.gfile.GFile(config_path, mode="r") as fs: + config_dict = yaml.safe_load(fs) - return create_feature_config_from_dict(config_dict, data_spec_path) + return create_feature_config_from_dict(config_dict, data_spec_path) diff --git a/twml/twml/contrib/feature_importances/feature_importances.py b/twml/twml/contrib/feature_importances/feature_importances.py index a8bfcc129..0063b47a8 100644 --- a/twml/twml/contrib/feature_importances/feature_importances.py +++ b/twml/twml/contrib/feature_importances/feature_importances.py @@ -2,26 +2,25 @@ import time from collections import defaultdict +from queue import Queue from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient from com.twitter.mlmetastore.modelrepo.core import FeatureImportance, FeatureNames +from requests.exceptions import HTTPError, RetryError +from tensorflow.compat.v1 import logging from twitter.deepbird.io.util import match_feature_regex_list -from twml.contrib.feature_importances.helpers import ( - _get_feature_name_from_config, - _get_feature_types_from_records, - _get_metrics_hook, - _expand_prefix, - longest_common_prefix, - write_list_to_hdfs_gfile) from twml.contrib.feature_importances.feature_permutation import PermutedInputFnFactory +from twml.contrib.feature_importances.helpers import ( + _expand_prefix, + _get_feature_name_from_config, + _get_feature_types_from_records, + _get_metrics_hook, + longest_common_prefix, + write_list_to_hdfs_gfile, +) from twml.tracking import ExperimentTracker -from tensorflow.compat.v1 import logging -from requests.exceptions import HTTPError, RetryError -from queue import Queue - - SERIAL = "serial" TREE = "tree" INDIVIDUAL = "Individual" @@ -32,383 +31,561 @@ def _repartition(feature_list_queue, fnames_ftypes, split_feature_group_on_period): - """ - Iterate through letters to partition each feature by prefix, and then put each tuple - (prefix, feature_partition) into the feature_list_queue - Args: - prefix (str): The prefix shared by each feature in list_of_feature_types - feature_list_queue (Queue<(str, list<(str, str)>)>): The queue of feature groups - fnames_ftypes (list<(str, str)>): List of (fname, ftype) pairs. Each fname begins with prefix - split_feature_group_on_period (str): If true, require that feature groups end in a period - Returns: - Updated queue with each group in fnames_ftypes - """ - assert len(fnames_ftypes) > 1 - - split_character = "." 
if split_feature_group_on_period else None
-  # Compute the longest prefix of the words
-  prefix = longest_common_prefix(
-      strings=[fname for fname, _ in fnames_ftypes], split_character=split_character)
-
-  # Separate the features by prefix
-  prefix_to_features = defaultdict(list)
-  for fname, ftype in fnames_ftypes:
-    assert fname.startswith(prefix)
-    new_prefix = _expand_prefix(fname=fname, prefix=prefix, split_character=split_character)
-    prefix_to_features[new_prefix].append((fname, ftype))
-
-  # Add all of the new partitions to the queue
-  for new_prefix, fname_ftype_list in prefix_to_features.items():
-    extended_new_prefix = longest_common_prefix(
-        strings=[fname for fname, _ in fname_ftype_list], split_character=split_character)
-    assert extended_new_prefix.startswith(new_prefix)
-    feature_list_queue.put((extended_new_prefix, fname_ftype_list))
-  return feature_list_queue
+    """
+    Iterate through letters to partition each feature by prefix, and then put each tuple
+    (prefix, feature_partition) into the feature_list_queue
+    Args:
+        feature_list_queue (Queue<(str, list<(str, str)>)>): The queue of feature groups
+        fnames_ftypes (list<(str, str)>): List of (fname, ftype) pairs. Each fname begins
+            with the longest common prefix of the group
+        split_feature_group_on_period (bool): If True, require that feature groups end in a period
+    Returns:
+        Updated queue with each group in fnames_ftypes
+    """
+    assert len(fnames_ftypes) > 1
+
+    split_character = "." if split_feature_group_on_period else None
+    # Compute the longest prefix of the words
+    prefix = longest_common_prefix(
+        strings=[fname for fname, _ in fnames_ftypes], split_character=split_character
+    )
+
+    # Separate the features by prefix
+    prefix_to_features = defaultdict(list)
+    for fname, ftype in fnames_ftypes:
+        assert fname.startswith(prefix)
+        new_prefix = _expand_prefix(
+            fname=fname, prefix=prefix, split_character=split_character
+        )
+        prefix_to_features[new_prefix].append((fname, ftype))
+
+    # Add all of the new partitions to the queue
+    for new_prefix, fname_ftype_list in prefix_to_features.items():
+        extended_new_prefix = longest_common_prefix(
+            strings=[fname for fname, _ in fname_ftype_list],
+            split_character=split_character,
+        )
+        assert extended_new_prefix.startswith(new_prefix)
+        feature_list_queue.put((extended_new_prefix, fname_ftype_list))
+    return feature_list_queue


 def _infer_if_is_metric_larger_the_better(stopping_metric):
-  # Infers whether a metric should be interpreted such that larger numbers are better (e.g. ROC_AUC), as opposed to
-  # larger numbers being worse (e.g. LOSS)
-  if stopping_metric is None:
-    raise ValueError("Error: Stopping Metric cannot be None")
-  elif stopping_metric.startswith(LOSS):
-    logging.info("Interpreting {} to be a metric where larger numbers are worse".format(stopping_metric))
-    is_metric_larger_the_better = False
-  else:
-    logging.info("Interpreting {} to be a metric where larger numbers are better".format(stopping_metric))
-    is_metric_larger_the_better = True
-  return is_metric_larger_the_better
-
-
-def _check_whether_tree_should_expand(baseline_performance, computed_performance, sensitivity, stopping_metric, is_metric_larger_the_better):
-  """
-  Returns True if
-    - the metric is positive (e.g. ROC_AUC) and computed_performance is nontrivially smaller than the baseline_performance
-    - the metric is negative (e.g.
LOSS) and computed_performance is nontrivially larger than the baseline_performance - """ - difference = ((baseline_performance[stopping_metric] - computed_performance[stopping_metric]) / - baseline_performance[stopping_metric]) - - if not is_metric_larger_the_better: - difference = -difference - - logging.info( - "Found a {} difference of {}. Sensitivity is {}.".format("positive" if is_metric_larger_the_better else "negative", difference, sensitivity)) - return difference > sensitivity + # Infers whether a metric should be interpreted such that larger numbers are better (e.g. ROC_AUC), as opposed to + # larger numbers being worse (e.g. LOSS) + if stopping_metric is None: + raise ValueError("Error: Stopping Metric cannot be None") + elif stopping_metric.startswith(LOSS): + logging.info( + "Interpreting {} to be a metric where larger numbers are worse".format( + stopping_metric + ) + ) + is_metric_larger_the_better = False + else: + logging.info( + "Interpreting {} to be a metric where larger numbers are better".format( + stopping_metric + ) + ) + is_metric_larger_the_better = True + return is_metric_larger_the_better + + +def _check_whether_tree_should_expand( + baseline_performance, + computed_performance, + sensitivity, + stopping_metric, + is_metric_larger_the_better, +): + """ + Returns True if + - the metric is positive (e.g. ROC_AUC) and computed_performance is nontrivially smaller than the baseline_performance + - the metric is negative (e.g. LOSS) and computed_performance is nontrivially larger than the baseline_performance + """ + difference = ( + baseline_performance[stopping_metric] - computed_performance[stopping_metric] + ) / baseline_performance[stopping_metric] + + if not is_metric_larger_the_better: + difference = -difference + + logging.info( + "Found a {} difference of {}. 
Sensitivity is {}.".format( + "positive" if is_metric_larger_the_better else "negative", + difference, + sensitivity, + ) + ) + return difference > sensitivity def _compute_multiple_permuted_performances_from_trainer( - factory, fname_ftypes, trainer, parse_fn, record_count): - """Compute performances with fname and fype permuted - """ - metrics_hook = _get_metrics_hook(trainer) - trainer._estimator.evaluate( - input_fn=factory.get_permuted_input_fn( - batch_size=trainer._params.eval_batch_size, parse_fn=parse_fn, fname_ftypes=fname_ftypes), - steps=(record_count + trainer._params.eval_batch_size) // trainer._params.eval_batch_size, - hooks=[metrics_hook], - checkpoint_path=trainer.best_or_latest_checkpoint) - return metrics_hook.metric_values - - -def _get_extra_feature_group_performances(factory, trainer, parse_fn, extra_groups, feature_to_type, record_count): - """Compute performance differences for the extra feature groups - """ - extra_group_feature_performance_results = {} - for group_name, raw_feature_regex_list in extra_groups.items(): - start = time.time() - fnames = match_feature_regex_list( - features=feature_to_type.keys(), - feature_regex_list=[regex for regex in raw_feature_regex_list], - preprocess=False, - as_dict=False) - - fnames_ftypes = [(fname, feature_to_type[fname]) for fname in fnames] - - logging.info("Extracted extra group {} with features {}".format(group_name, fnames_ftypes)) - extra_group_feature_performance_results[group_name] = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fnames_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - group_name, int(time.time() - start))) - return extra_group_feature_performance_results + factory, fname_ftypes, trainer, parse_fn, record_count +): + """Compute performances with fname and fype permuted""" + metrics_hook = _get_metrics_hook(trainer) + trainer._estimator.evaluate( + input_fn=factory.get_permuted_input_fn( + batch_size=trainer._params.eval_batch_size, + parse_fn=parse_fn, + fname_ftypes=fname_ftypes, + ), + steps=(record_count + trainer._params.eval_batch_size) + // trainer._params.eval_batch_size, + hooks=[metrics_hook], + checkpoint_path=trainer.best_or_latest_checkpoint, + ) + return metrics_hook.metric_values + + +def _get_extra_feature_group_performances( + factory, trainer, parse_fn, extra_groups, feature_to_type, record_count +): + """Compute performance differences for the extra feature groups""" + extra_group_feature_performance_results = {} + for group_name, raw_feature_regex_list in extra_groups.items(): + start = time.time() + fnames = match_feature_regex_list( + features=feature_to_type.keys(), + feature_regex_list=[regex for regex in raw_feature_regex_list], + preprocess=False, + as_dict=False, + ) + + fnames_ftypes = [(fname, feature_to_type[fname]) for fname in fnames] + + logging.info( + "Extracted extra group {} with features {}".format( + group_name, fnames_ftypes + ) + ) + extra_group_feature_performance_results[ + group_name + ] = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fnames_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + "\n\nImportances computed for {} in {} seconds \n\n".format( + group_name, int(time.time() - start) + ) + ) + return extra_group_feature_performance_results def _feature_importances_tree_algorithm( - data_dir, trainer, parse_fn, fnames, 
stopping_metric, file_list=None, datarecord_filter_fn=None, split_feature_group_on_period=True, - record_count=99999, is_metric_larger_the_better=None, sensitivity=0.025, extra_groups=None, dont_build_tree=False): - """Tree algorithm for feature and feature group importances. This algorithm build a prefix tree of - the feature names and then traverses the tree with a BFS. At each node (aka group of features with - a shared prefix) the algorithm computes the performance of the model when we permute all features - in the group. The algorithm only zooms-in on groups that impact the performance by more than - sensitivity. As a result, features that affect the model performance by less than sensitivity will - not have an exact importance. - Args: - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - trainer: (DataRecordTrainer): A DataRecordTrainer object - parse_fn: (function): The parse_fn used by eval_input_fn - fnames (list): The list of feature names - stopping_metric (str): The metric to use to determine when to stop expanding trees - file_list (list): The list of filenames. Exactly one of file_list and data_dir should be - provided - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - split_feature_group_on_period (boolean): If true, split feature groups by period rather than on - optimal prefix - record_count (int): The number of records to compute importances over - is_metric_larger_the_better (boolean): If true, assume that stopping_metric is a metric where larger - values are better (e.g. ROC-AUC) - sensitivity (float): The smallest change in performance to continue to expand the tree - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group. You should only supply a value for this argument if you have a set - of features that you want to evaluate as a group but don't share a prefix - dont_build_tree (boolean): If True, don't build the tree and only compute the extra_groups importances - Returns: - A dictionary that contains the individual and group feature importances - """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - baseline_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=[], - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - out = {"None": baseline_performance} - - if stopping_metric not in baseline_performance: - raise ValueError("The stopping metric '{}' not found in baseline_performance. Metrics are {}".format( - stopping_metric, list(baseline_performance.keys()))) - - is_metric_larger_the_better = ( - is_metric_larger_the_better if is_metric_larger_the_better is not None else _infer_if_is_metric_larger_the_better(stopping_metric)) - logging.info("Using {} as the stopping metric for the tree algorithm".format(stopping_metric)) - - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - all_feature_types = list(feature_to_type.items()) - - individual_feature_performances = {} - feature_group_performances = {} - if dont_build_tree: - logging.info("Not building feature importance trie. 
Will only compute importances for the extra_groups") - else: - logging.info("Building feature importance trie") - # Each element in the Queue will be a tuple of (prefix, list_of_feature_type_pairs) where - # each feature in list_of_feature_type_pairs will have have the prefix "prefix" - feature_list_queue = _repartition( - feature_list_queue=Queue(), fnames_ftypes=all_feature_types, split_feature_group_on_period=split_feature_group_on_period) - - while not feature_list_queue.empty(): - # Pop the queue. We should never have an empty list in the queue - prefix, fnames_ftypes = feature_list_queue.get() - assert len(fnames_ftypes) > 0 - - # Compute performance from permuting all features in fname_ftypes - logging.info( - "\n\nComputing importances for {} ({}...). {} elements left in the queue \n\n".format( - prefix, fnames_ftypes[:5], feature_list_queue.qsize())) - start = time.time() - computed_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fnames_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - prefix, int(time.time() - start))) - if len(fnames_ftypes) == 1: - individual_feature_performances[fnames_ftypes[0][0]] = computed_performance - else: - feature_group_performances[prefix] = computed_performance - # Dig deeper into the features in fname_ftypes only if there is more than one feature in the - # list and the performance drop is nontrivial - logging.info("Checking performance for {} ({}...)".format(prefix, fnames_ftypes[:5])) - check = _check_whether_tree_should_expand( - baseline_performance=baseline_performance, computed_performance=computed_performance, - sensitivity=sensitivity, stopping_metric=stopping_metric, is_metric_larger_the_better=is_metric_larger_the_better) - if len(fnames_ftypes) > 1 and check: - logging.info("Expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - feature_list_queue = _repartition( - feature_list_queue=feature_list_queue, fnames_ftypes=fnames_ftypes, split_feature_group_on_period=split_feature_group_on_period) - else: - logging.info("Not expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - - # Baseline performance is grouped in with individual_feature_importance_results - individual_feature_performance_results = dict( - out, **{k: v for k, v in individual_feature_performances.items()}) - group_feature_performance_results = {k: v for k, v in feature_group_performances.items()} - - if extra_groups is not None: - logging.info("Computing performances for extra groups {}".format(extra_groups.keys())) - for group_name, performances in _get_extra_feature_group_performances( + data_dir, + trainer, + parse_fn, + fnames, + stopping_metric, + file_list=None, + datarecord_filter_fn=None, + split_feature_group_on_period=True, + record_count=99999, + is_metric_larger_the_better=None, + sensitivity=0.025, + extra_groups=None, + dont_build_tree=False, +): + """Tree algorithm for feature and feature group importances. This algorithm build a prefix tree of + the feature names and then traverses the tree with a BFS. At each node (aka group of features with + a shared prefix) the algorithm computes the performance of the model when we permute all features + in the group. The algorithm only zooms-in on groups that impact the performance by more than + sensitivity. As a result, features that affect the model performance by less than sensitivity will + not have an exact importance. 
+    Args:
+        data_dir: (str): The location of the training or testing data to compute importances over.
+            If None, the trainer._eval_files are used
+        trainer: (DataRecordTrainer): A DataRecordTrainer object
+        parse_fn: (function): The parse_fn used by eval_input_fn
+        fnames (list): The list of feature names
+        stopping_metric (str): The metric to use to determine when to stop expanding trees
+        file_list (list): The list of filenames. Exactly one of file_list and data_dir should be
+            provided
+        datarecord_filter_fn (function): a function that takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format
+            and returns a boolean indicating whether this data record should be kept in the feature importance module or not.
+        split_feature_group_on_period (boolean): If true, split feature groups by period rather than on
+            optimal prefix
+        record_count (int): The number of records to compute importances over
+        is_metric_larger_the_better (boolean): If true, assume that stopping_metric is a metric where larger
+            values are better (e.g. ROC-AUC)
+        sensitivity (float): The smallest change in performance to continue to expand the tree
+        extra_groups (dict<str, list<str>>): A dictionary mapping the name of extra feature groups to the list of
+            the names of the features in the group. You should only supply a value for this argument if you have a set
+            of features that you want to evaluate as a group but don't share a prefix
+        dont_build_tree (boolean): If True, don't build the tree and only compute the extra_groups importances
+    Returns:
+        A dictionary that contains the individual and group feature importances
+    """
+    factory = PermutedInputFnFactory(
+        data_dir=data_dir,
+        record_count=record_count,
+        file_list=file_list,
+        datarecord_filter_fn=datarecord_filter_fn,
+    )
+    baseline_performance = _compute_multiple_permuted_performances_from_trainer(
        factory=factory,
+        fname_ftypes=[],
        trainer=trainer,
        parse_fn=parse_fn,
-      extra_groups=extra_groups,
-      feature_to_type=feature_to_type,
-      record_count=record_count).items():
-      group_feature_performance_results[group_name] = performances
-  else:
-    logging.info("Not computing performances for extra groups")
-
-  return {INDIVIDUAL: individual_feature_performance_results,
-          GROUP: group_feature_performance_results}
+        record_count=record_count,
+    )
+    out = {"None": baseline_performance}
+
+    if stopping_metric not in baseline_performance:
+        raise ValueError(
+            "The stopping metric '{}' not found in baseline_performance. Metrics are {}".format(
+                stopping_metric, list(baseline_performance.keys())
+            )
+        )
+
+    is_metric_larger_the_better = (
+        is_metric_larger_the_better
+        if is_metric_larger_the_better is not None
+        else _infer_if_is_metric_larger_the_better(stopping_metric)
+    )
+    logging.info(
+        "Using {} as the stopping metric for the tree algorithm".format(stopping_metric)
+    )
+
+    feature_to_type = _get_feature_types_from_records(
+        records=factory.records, fnames=fnames
+    )
+    all_feature_types = list(feature_to_type.items())
+
+    individual_feature_performances = {}
+    feature_group_performances = {}
+    if dont_build_tree:
+        logging.info(
Will only compute importances for the extra_groups" + ) + else: + logging.info("Building feature importance trie") + # Each element in the Queue will be a tuple of (prefix, list_of_feature_type_pairs) where + # each feature in list_of_feature_type_pairs will have have the prefix "prefix" + feature_list_queue = _repartition( + feature_list_queue=Queue(), + fnames_ftypes=all_feature_types, + split_feature_group_on_period=split_feature_group_on_period, + ) + + while not feature_list_queue.empty(): + # Pop the queue. We should never have an empty list in the queue + prefix, fnames_ftypes = feature_list_queue.get() + assert len(fnames_ftypes) > 0 + + # Compute performance from permuting all features in fname_ftypes + logging.info( + "\n\nComputing importances for {} ({}...). {} elements left in the queue \n\n".format( + prefix, fnames_ftypes[:5], feature_list_queue.qsize() + ) + ) + start = time.time() + computed_performance = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fnames_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + "\n\nImportances computed for {} in {} seconds \n\n".format( + prefix, int(time.time() - start) + ) + ) + if len(fnames_ftypes) == 1: + individual_feature_performances[ + fnames_ftypes[0][0] + ] = computed_performance + else: + feature_group_performances[prefix] = computed_performance + # Dig deeper into the features in fname_ftypes only if there is more than one feature in the + # list and the performance drop is nontrivial + logging.info( + "Checking performance for {} ({}...)".format(prefix, fnames_ftypes[:5]) + ) + check = _check_whether_tree_should_expand( + baseline_performance=baseline_performance, + computed_performance=computed_performance, + sensitivity=sensitivity, + stopping_metric=stopping_metric, + is_metric_larger_the_better=is_metric_larger_the_better, + ) + if len(fnames_ftypes) > 1 and check: + logging.info("Expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) + feature_list_queue = _repartition( + feature_list_queue=feature_list_queue, + fnames_ftypes=fnames_ftypes, + split_feature_group_on_period=split_feature_group_on_period, + ) + else: + logging.info( + "Not expanding {} ({}...)".format(prefix, fnames_ftypes[:5]) + ) + + # Baseline performance is grouped in with individual_feature_importance_results + individual_feature_performance_results = dict( + out, **{k: v for k, v in individual_feature_performances.items()} + ) + group_feature_performance_results = { + k: v for k, v in feature_group_performances.items() + } + + if extra_groups is not None: + logging.info( + "Computing performances for extra groups {}".format(extra_groups.keys()) + ) + for group_name, performances in _get_extra_feature_group_performances( + factory=factory, + trainer=trainer, + parse_fn=parse_fn, + extra_groups=extra_groups, + feature_to_type=feature_to_type, + record_count=record_count, + ).items(): + group_feature_performance_results[group_name] = performances + else: + logging.info("Not computing performances for extra groups") + + return { + INDIVIDUAL: individual_feature_performance_results, + GROUP: group_feature_performance_results, + } def _feature_importances_serial_algorithm( - data_dir, trainer, parse_fn, fnames, file_list=None, datarecord_filter_fn=None, factory=None, record_count=99999): - """Serial algorithm for feature importances. This algorithm computes the - importance of each feature. 
- """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - - out = {} - for fname, ftype in list(feature_to_type.items()) + [(None, None)]: - logging.info("\n\nComputing importances for {}\n\n".format(fname)) - start = time.time() - fname_ftypes = [(fname, ftype)] if fname is not None else [] - out[str(fname)] = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fname_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - fname, int(time.time() - start))) - # The serial algorithm does not compute group feature results. - return {INDIVIDUAL: out, GROUP: {}} + data_dir, + trainer, + parse_fn, + fnames, + file_list=None, + datarecord_filter_fn=None, + factory=None, + record_count=99999, +): + """Serial algorithm for feature importances. This algorithm computes the + importance of each feature. + """ + factory = PermutedInputFnFactory( + data_dir=data_dir, + record_count=record_count, + file_list=file_list, + datarecord_filter_fn=datarecord_filter_fn, + ) + feature_to_type = _get_feature_types_from_records( + records=factory.records, fnames=fnames + ) + + out = {} + for fname, ftype in list(feature_to_type.items()) + [(None, None)]: + logging.info("\n\nComputing importances for {}\n\n".format(fname)) + start = time.time() + fname_ftypes = [(fname, ftype)] if fname is not None else [] + out[str(fname)] = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fname_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + "\n\nImportances computed for {} in {} seconds \n\n".format( + fname, int(time.time() - start) + ) + ) + # The serial algorithm does not compute group feature results. + return {INDIVIDUAL: out, GROUP: {}} def _process_feature_name_for_mldash(feature_name): - # Using a forward slash in the name causes feature importance writing to fail because strato interprets it as - # part of a url - return feature_name.replace("/", "__") + # Using a forward slash in the name causes feature importance writing to fail because strato interprets it as + # part of a url + return feature_name.replace("/", "__") def compute_feature_importances( - trainer, data_dir=None, feature_config=None, algorithm=TREE, parse_fn=None, datarecord_filter_fn=None, **kwargs): - """Perform a feature importance analysis on a trained model - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - feature_config (contrib.FeatureConfig): The feature config object. If this is not provided, it - is taken from the trainer - algorithm (str): The algorithm to use - parse_fn: (function): The parse_fn used by eval_input_fn. By default this is - feature_config.get_parse_fn() - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. 
- """ - - # We only use the trainer's eval files if an override data_dir is not provided - if data_dir is None: - logging.info("Using trainer._eval_files (found {} as files)".format(trainer._eval_files)) - file_list = trainer._eval_files - else: - logging.info("data_dir provided. Looking at {} for data.".format(data_dir)) - file_list = None - - feature_config = feature_config or trainer._feature_config - out = {} - if not feature_config: - logging.warn("WARN: Not computing feature importance because trainer._feature_config is None") - out = None - else: - parse_fn = parse_fn if parse_fn is not None else feature_config.get_parse_fn() - fnames = _get_feature_name_from_config(feature_config) - logging.info("Computing importances for {}".format(fnames)) - logging.info("Using the {} feature importance computation algorithm".format(algorithm)) - algorithm = { - SERIAL: _feature_importances_serial_algorithm, - TREE: _feature_importances_tree_algorithm}[algorithm] - out = algorithm(data_dir=data_dir, trainer=trainer, parse_fn=parse_fn, fnames=fnames, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn, **kwargs) - return out + trainer, + data_dir=None, + feature_config=None, + algorithm=TREE, + parse_fn=None, + datarecord_filter_fn=None, + **kwargs +): + """Perform a feature importance analysis on a trained model + Args: + trainer: (DataRecordTrainer): A DataRecordTrainer object + data_dir: (str): The location of the training or testing data to compute importances over. + If None, the trainer._eval_files are used + feature_config (contrib.FeatureConfig): The feature config object. If this is not provided, it + is taken from the trainer + algorithm (str): The algorithm to use + parse_fn: (function): The parse_fn used by eval_input_fn. By default this is + feature_config.get_parse_fn() + datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format + and return a boolean value, to indicate if this data record should be kept in feature importance module or not. + """ + + # We only use the trainer's eval files if an override data_dir is not provided + if data_dir is None: + logging.info( + "Using trainer._eval_files (found {} as files)".format(trainer._eval_files) + ) + file_list = trainer._eval_files + else: + logging.info("data_dir provided. 
+
+    # We only use the trainer's eval files if an override data_dir is not provided
+    if data_dir is None:
+        logging.info(
+            "Using trainer._eval_files (found {} as files)".format(trainer._eval_files)
+        )
+        file_list = trainer._eval_files
+    else:
+        logging.info("data_dir provided. Looking at {} for data.".format(data_dir))
+        file_list = None
+
+    feature_config = feature_config or trainer._feature_config
+    out = {}
+    if not feature_config:
+        logging.warning(
+            "WARN: Not computing feature importance because trainer._feature_config is None"
+        )
+        out = None
+    else:
+        parse_fn = parse_fn if parse_fn is not None else feature_config.get_parse_fn()
+        fnames = _get_feature_name_from_config(feature_config)
+        logging.info("Computing importances for {}".format(fnames))
+        logging.info(
+            "Using the {} feature importance computation algorithm".format(algorithm)
+        )
+        algorithm = {
+            SERIAL: _feature_importances_serial_algorithm,
+            TREE: _feature_importances_tree_algorithm,
+        }[algorithm]
+        out = algorithm(
+            data_dir=data_dir,
+            trainer=trainer,
+            parse_fn=parse_fn,
+            fnames=fnames,
+            file_list=file_list,
+            datarecord_filter_fn=datarecord_filter_fn,
+            **kwargs
+        )
+    return out
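Putting the pieces together, a call site might look roughly like the sketch below. The trainer construction is elided and the keyword values are illustrative rather than taken from this diff; record_count and sensitivity ride along through **kwargs to the tree algorithm:

    # Sketch only: assumes an already-trained DataRecordTrainer bound to `trainer`.
    importances = compute_feature_importances(
        trainer=trainer,
        data_dir=None,  # None falls back to trainer._eval_files
        algorithm=TREE,  # or SERIAL for one evaluation per feature
        record_count=10000,  # forwarded to the tree algorithm via **kwargs
        sensitivity=0.025,  # likewise forwarded
    )
    write_feature_importances_to_hdfs(
        trainer, importances, output_path="run-1", metric="roc_auc"
    )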


def write_feature_importances_to_hdfs(
-    trainer, feature_importances, output_path=None, metric="roc_auc"):
-  """Publish a feature importance analysis to hdfs as a tsv
-  Args:
-    (see compute_feature_importances for other args)
-    trainer (Trainer)
-    feature_importances (dict): Dictionary of feature importances
-    output_path (str): The remote or local file to write the feature importances to. If not
-      provided, this is inferred to be the trainer save dir
-    metric (str): The metric to write to tsv
-  """
-  # String formatting appends (Individual) or (Group) to feature name depending on type
-  perfs = {"{} ({})".format(k, importance_key) if k != "None" else k: v[metric]
-           for importance_key, importance_value in feature_importances.items()
-           for k, v in importance_value.items()}
-
-  output_path = ("{}/feature_importances-{}".format(
-    trainer._save_dir[:-1] if trainer._save_dir.endswith('/') else trainer._save_dir,
-    output_path if output_path is not None else str(time.time())))
-
-  if len(perfs) > 0:
-    logging.info("Writing feature_importances for {} to hdfs".format(perfs.keys()))
-    entries = [
-      {
-        "name": name,
-        "drop": perfs["None"] - perfs[name],
-        "pdrop": 100 * (perfs["None"] - perfs[name]) / (perfs["None"] + 1e-8),
-        "perf": perfs[name]
-      } for name in perfs.keys()]
-    out = ["Name\tPerformance Drop\tPercent Performance Drop\tPerformance"]
-    for entry in sorted(entries, key=lambda d: d["drop"]):
-      out.append("{name}\t{drop}\t{pdrop}%\t{perf}".format(**entry))
-    logging.info("\n".join(out))
-    write_list_to_hdfs_gfile(out, output_path)
-    logging.info("Wrote feature feature_importances to {}".format(output_path))
-  else:
-    logging.info("Not writing feature_importances to hdfs")
-  return output_path
-
-
-def write_feature_importances_to_ml_dash(trainer, feature_importances, feature_config=None):
-  # type: (DataRecordTrainer, FeatureConfig, dict) -> None
-  """Publish feature importances + all feature names to ML Metastore
-  Args:
-    trainer: (DataRecordTrainer): A DataRecordTrainer object
-    feature_config (contrib.FeatureConfig): The feature config object. If this is not provided, it
-      is taken from the trainer
-    feature_importances (dict, default=None): Dictionary of precomputed feature importances
-    feature_importance_metric (str, default=None): The metric to write to ML Dashboard
-  """
-  experiment_tracking_path = trainer.experiment_tracker.tracking_path\
-    if trainer.experiment_tracker.tracking_path\
-    else ExperimentTracker.guess_path(trainer._save_dir)
-
-  logging.info('Computing feature importances for run: {}'.format(experiment_tracking_path))
-
-  feature_importance_list = []
-  for key in feature_importances:
-    for feature, imps in feature_importances[key].items():
-      logging.info('FEATURE NAME: {}'.format(feature))
-      feature_name = feature.split(' (').pop(0)
-      for metric_name, value in imps.items():
-        try:
-          imps[metric_name] = float(value)
-          logging.info('Wrote feature importance value {} for metric: {}'.format(str(value), metric_name))
-        except Exception as ex:
-          logging.error("Skipping writing metric:{} to ML Metastore due to invalid metric value: {} or value type: {}. Exception: {}".format(metric_name, str(value), type(value), str(ex)))
-          pass
-
-      feature_importance_list.append(FeatureImportance(
-        run_id=experiment_tracking_path,
-        feature_name=_process_feature_name_for_mldash(feature_name),
-        feature_importance_metrics=imps,
-        is_group=key == GROUP
-      ))
-
-# setting feature config to match the one used in compute_feature_importances
-  feature_config = feature_config or trainer._feature_config
-  feature_names = FeatureNames(
-    run_id=experiment_tracking_path,
-    names=list(feature_config.features.keys())
-  )
-
-  try:
-    client = ModelRepoClient()
-    logging.info('Writing feature importances to ML Metastore')
-    client.add_feature_importances(feature_importance_list)
-    logging.info('Writing feature names to ML Metastore')
-    client.add_feature_names(feature_names)
-  except (HTTPError, RetryError) as err:
-    logging.error('Feature importance is not being written due to: '
-                  'HTTPError when attempting to write to ML Metastore: \n{}.'.format(err))
+    trainer, feature_importances, output_path=None, metric="roc_auc"
+):
+    """Publish a feature importance analysis to hdfs as a tsv
+    Args:
+        (see compute_feature_importances for other args)
+        trainer (Trainer)
+        feature_importances (dict): Dictionary of feature importances
+        output_path (str): The remote or local file to write the feature importances to. If not
+            provided, this is inferred to be the trainer save dir
+        metric (str): The metric to write to tsv
+    """
+    # String formatting appends (Individual) or (Group) to feature name depending on type
+    perfs = {
+        "{} ({})".format(k, importance_key) if k != "None" else k: v[metric]
+        for importance_key, importance_value in feature_importances.items()
+        for k, v in importance_value.items()
+    }
+
+    output_path = "{}/feature_importances-{}".format(
+        trainer._save_dir[:-1]
+        if trainer._save_dir.endswith("/")
+        else trainer._save_dir,
+        output_path if output_path is not None else str(time.time()),
+    )
+
+    if len(perfs) > 0:
+        logging.info("Writing feature_importances for {} to hdfs".format(perfs.keys()))
+        entries = [
+            {
+                "name": name,
+                "drop": perfs["None"] - perfs[name],
+                "pdrop": 100 * (perfs["None"] - perfs[name]) / (perfs["None"] + 1e-8),
+                "perf": perfs[name],
+            }
+            for name in perfs.keys()
+        ]
+        out = ["Name\tPerformance Drop\tPercent Performance Drop\tPerformance"]
+        for entry in sorted(entries, key=lambda d: d["drop"]):
+            out.append("{name}\t{drop}\t{pdrop}%\t{perf}".format(**entry))
+        logging.info("\n".join(out))
+        write_list_to_hdfs_gfile(out, output_path)
+        logging.info("Wrote feature_importances to {}".format(output_path))
+    else:
+        logging.info("Not writing feature_importances to hdfs")
+    return output_path
+
+
+def write_feature_importances_to_ml_dash(
+    trainer, feature_importances, feature_config=None
+):
+    # type: (DataRecordTrainer, dict, FeatureConfig) -> None
+    """Publish feature importances + all feature names to ML Metastore
+    Args:
+        trainer: (DataRecordTrainer): A DataRecordTrainer object
+        feature_importances (dict): Dictionary of precomputed feature importances
+        feature_config (contrib.FeatureConfig): The feature config object. If this is not provided, it
+            is taken from the trainer
+    """
+    experiment_tracking_path = (
+        trainer.experiment_tracker.tracking_path
+        if trainer.experiment_tracker.tracking_path
+        else ExperimentTracker.guess_path(trainer._save_dir)
+    )
+
+    logging.info(
+        "Computing feature importances for run: {}".format(experiment_tracking_path)
+    )
+
+    feature_importance_list = []
+    for key in feature_importances:
+        for feature, imps in feature_importances[key].items():
+            logging.info("FEATURE NAME: {}".format(feature))
+            feature_name = feature.split(" (").pop(0)
+            for metric_name, value in imps.items():
+                try:
+                    imps[metric_name] = float(value)
+                    logging.info(
+                        "Wrote feature importance value {} for metric: {}".format(
+                            str(value), metric_name
+                        )
+                    )
+                except Exception as ex:
+                    logging.error(
+                        "Skipping writing metric:{} to ML Metastore due to invalid metric value: {} or value type: {}. 
Exception: {}".format( + metric_name, str(value), type(value), str(ex) + ) + ) + pass + + feature_importance_list.append( + FeatureImportance( + run_id=experiment_tracking_path, + feature_name=_process_feature_name_for_mldash(feature_name), + feature_importance_metrics=imps, + is_group=key == GROUP, + ) + ) + + # setting feature config to match the one used in compute_feature_importances + feature_config = feature_config or trainer._feature_config + feature_names = FeatureNames( + run_id=experiment_tracking_path, names=list(feature_config.features.keys()) + ) + + try: + client = ModelRepoClient() + logging.info("Writing feature importances to ML Metastore") + client.add_feature_importances(feature_importance_list) + logging.info("Writing feature names to ML Metastore") + client.add_feature_names(feature_names) + except (HTTPError, RetryError) as err: + logging.error( + "Feature importance is not being written due to: " + "HTTPError when attempting to write to ML Metastore: \n{}.".format(err) + ) diff --git a/twml/twml/contrib/feature_importances/feature_permutation.py b/twml/twml/contrib/feature_importances/feature_permutation.py index 809f5fde0..55a0d40ac 100644 --- a/twml/twml/contrib/feature_importances/feature_permutation.py +++ b/twml/twml/contrib/feature_importances/feature_permutation.py @@ -1,129 +1,167 @@ -from copy import deepcopy import random import types +from copy import deepcopy +import tensorflow.compat.v1 as tf +from com.twitter.ml.api.ttypes import DataRecord # pylint: disable=import-error +from tensorflow.compat.v1 import logging from twitter.deepbird.util.thrift.simple_converters import ( - bytes_to_thrift_object, thrift_object_to_bytes) + bytes_to_thrift_object, + thrift_object_to_bytes, +) -from tensorflow.compat.v1 import logging -from com.twitter.ml.api.ttypes import DataRecord # pylint: disable=import-error -import tensorflow.compat.v1 as tf import twml class PermutedInputFnFactory(object): + def __init__( + self, data_dir, record_count, file_list=None, datarecord_filter_fn=None + ): + """ + Args: + data_dir (str): The location of the records on hdfs + record_count (int): The number of records to process + file_list (list, default=None): The list of data files on HDFS. If provided, use this instead + of data_dir + datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format + and return a boolean value, to indicate if this data record should be kept in feature importance module or not. + """ + if not (data_dir is None) ^ (file_list is None): + raise ValueError( + "Exactly one of data_dir and file_list can be provided. Got {} for data_dir and {} for file_list".format( + data_dir, file_list + ) + ) - def __init__(self, data_dir, record_count, file_list=None, datarecord_filter_fn=None): - """ - Args: - data_dir (str): The location of the records on hdfs - record_count (int): The number of records to process - file_list (list, default=None): The list of data files on HDFS. If provided, use this instead - of data_dir - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - if not (data_dir is None) ^ (file_list is None): - raise ValueError("Exactly one of data_dir and file_list can be provided. 
Got {} for data_dir and {} for file_list".format( - data_dir, file_list)) + file_list = ( + file_list + if file_list is not None + else twml.util.list_files(twml.util.preprocess_path(data_dir)) + ) + _next_batch = twml.input_fns.default_input_fn( + file_list, 1, lambda x: x, num_threads=2, shuffle=True, shuffle_files=True + ) + self.records = [] + # Validate datarecord_filter_fn + if datarecord_filter_fn is not None and not isinstance( + datarecord_filter_fn, types.FunctionType + ): + raise TypeError("datarecord_filter_fn is not function type") + with tf.Session() as sess: + for i in range(record_count): + try: + record = bytes_to_thrift_object( + sess.run(_next_batch)[0], DataRecord + ) + if datarecord_filter_fn is None or datarecord_filter_fn(record): + self.records.append(record) + except tf.errors.OutOfRangeError: + logging.info( + "Stopping after reading {} records out of {}".format( + i, record_count + ) + ) + break + if datarecord_filter_fn: + logging.info( + "datarecord_filter_fn has been applied; keeping {} records out of {}".format( + len(self.records), record_count + ) + ) - file_list = file_list if file_list is not None else twml.util.list_files(twml.util.preprocess_path(data_dir)) - _next_batch = twml.input_fns.default_input_fn(file_list, 1, lambda x: x, - num_threads=2, shuffle=True, shuffle_files=True) - self.records = [] - # Validate datarecord_filter_fn - if datarecord_filter_fn is not None and not isinstance(datarecord_filter_fn, types.FunctionType): - raise TypeError("datarecord_filter_fn is not function type") - with tf.Session() as sess: - for i in range(record_count): - try: - record = bytes_to_thrift_object(sess.run(_next_batch)[0], DataRecord) - if datarecord_filter_fn is None or datarecord_filter_fn(record): - self.records.append(record) - except tf.errors.OutOfRangeError: - logging.info("Stopping after reading {} records out of {}".format(i, record_count)) - break - if datarecord_filter_fn: - logging.info("datarecord_filter_fn has been applied; keeping {} records out of {}".format(len(self.records), record_count)) + def _get_record_generator(self): + return (thrift_object_to_bytes(r) for r in self.records) - def _get_record_generator(self): - return (thrift_object_to_bytes(r) for r in self.records) + def get_permuted_input_fn(self, batch_size, parse_fn, fname_ftypes): + """Get an input function that passes in a preset number of records that have been feature permuted + Args: + parse_fn (function): The function to parse inputs + fname_ftypes: (list<(str, str)>): The names and types of the features to permute + """ - def get_permuted_input_fn(self, batch_size, parse_fn, fname_ftypes): - """Get an input function that passes in a preset number of records that have been feature permuted - Args: - parse_fn (function): The function to parse inputs - fname_ftypes: (list<(str, str)>): The names and types of the features to permute - """ - def permuted_parse_pyfn(bytes_array): - out = [] - for b in bytes_array: - rec = bytes_to_thrift_object(b, DataRecord) - if fname_ftypes: - rec = _permutate_features(rec, fname_ftypes=fname_ftypes, records=self.records) - out.append(thrift_object_to_bytes(rec)) - return [out] + def permuted_parse_pyfn(bytes_array): + out = [] + for b in bytes_array: + rec = bytes_to_thrift_object(b, DataRecord) + if fname_ftypes: + rec = _permutate_features( + rec, fname_ftypes=fname_ftypes, records=self.records + ) + out.append(thrift_object_to_bytes(rec)) + return [out] - def permuted_parse_fn(bytes_tensor): - parsed_bytes_tensor = 
parse_fn(tf.py_func(permuted_parse_pyfn, [bytes_tensor], tf.string))
-      return parsed_bytes_tensor
+        def permuted_parse_fn(bytes_tensor):
+            parsed_bytes_tensor = parse_fn(
+                tf.py_func(permuted_parse_pyfn, [bytes_tensor], tf.string)
+            )
+            return parsed_bytes_tensor

-    def input_fn(batch_size=batch_size, parse_fn=parse_fn, factory=self):
-      return (tf.data.Dataset
-              .from_generator(self._get_record_generator, tf.string)
-              .batch(batch_size)
-              .map(permuted_parse_fn, 4)
-              .make_one_shot_iterator()
-              .get_next())
-    return input_fn
+        def input_fn(batch_size=batch_size, parse_fn=parse_fn, factory=self):
+            return (
+                tf.data.Dataset.from_generator(self._get_record_generator, tf.string)
+                .batch(batch_size)
+                .map(permuted_parse_fn, 4)
+                .make_one_shot_iterator()
+                .get_next()
+            )
+
+        return input_fn


def _permutate_features(rec, fname_ftypes, records):
-  """Replace a feature value with a value from random selected record
-  Args:
-    rec: (datarecord): A datarecord returned from DataRecordGenerator
-    fname_ftypes: (list<(str, str)>): The names and types of the features to permute
-    records: (list): The records to sample from
-  Returns:
-    The record with the feature permuted
-  """
-  rec_new = deepcopy(rec)
-  rec_replace = random.choice(records)
+    """Replace a feature value with a value from a randomly selected record
+    Args:
+        rec: (datarecord): A datarecord returned from DataRecordGenerator
+        fname_ftypes: (list<(str, str)>): The names and types of the features to permute
+        records: (list): The records to sample from
+    Returns:
+        The record with the feature permuted
+    """
+    rec_new = deepcopy(rec)
+    rec_replace = random.choice(records)

-  # If the replacement datarecord does not have the feature type entirely, add it in
-  # to make the logic a bit simpler
-  for fname, feature_type in fname_ftypes:
-    fid = twml.feature_id(fname)[0]
-    if rec_replace.__dict__.get(feature_type, None) is None:
-      rec_replace.__dict__[feature_type] = (
-        dict() if feature_type != 'binaryFeatures' else set())
-    if rec_new.__dict__.get(feature_type, None) is None:
-      rec_new.__dict__[feature_type] = (
-        dict() if feature_type != 'binaryFeatures' else set())
-
-    if feature_type != 'binaryFeatures':
-      if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, dict()):
-        # If the replacement datarecord does not contain the feature but the original does
-        del rec_new.__dict__[feature_type][fid]
-      elif fid in 
rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = set() - rec_new.__dict__[feature_type].add(fid) - # If neither datarecord contains this feature - else: - # If neither datarecord contains this feature - pass - return rec_new + if feature_type != "binaryFeatures": + if fid not in rec_replace.__dict__[ + feature_type + ] and fid in rec_new.__dict__.get(feature_type, dict()): + # If the replacement datarecord does not contain the feature but the original does + del rec_new.__dict__[feature_type][fid] + elif fid in rec_replace.__dict__[feature_type]: + # If the replacement datarecord does contain the feature + if rec_new.__dict__[feature_type] is None: + rec_new.__dict__[feature_type] = dict() + rec_new.__dict__[feature_type][fid] = rec_replace.__dict__[ + feature_type + ][fid] + else: + # If neither datarecord contains this feature + pass + else: + if fid not in rec_replace.__dict__[ + feature_type + ] and fid in rec_new.__dict__.get(feature_type, set()): + # If the replacement datarecord does not contain the feature but the original does + rec_new.__dict__[feature_type].remove(fid) + elif fid in rec_replace.__dict__[feature_type]: + # If the replacement datarecord does contain the feature + if rec_new.__dict__[feature_type] is None: + rec_new.__dict__[feature_type] = set() + rec_new.__dict__[feature_type].add(fid) + # If neither datarecord contains this feature + else: + # If neither datarecord contains this feature + pass + return rec_new diff --git a/twml/twml/contrib/feature_importances/helpers.py b/twml/twml/contrib/feature_importances/helpers.py index f3f600e8b..e29e1bb12 100644 --- a/twml/twml/contrib/feature_importances/helpers.py +++ b/twml/twml/contrib/feature_importances/helpers.py @@ -1,96 +1,103 @@ import uuid +import tensorflow.compat.v1 as tf from tensorflow.compat.v1 import logging + import twml -import tensorflow.compat.v1 as tf def write_list_to_hdfs_gfile(list_to_write, output_path): - """Use tensorflow gfile to write a list to a location on hdfs""" - locname = "/tmp/{}".format(str(uuid.uuid4())) - with open(locname, "w") as f: - for row in list_to_write: - f.write("%s\n" % row) - tf.io.gfile.copy(locname, output_path, overwrite=False) + """Use tensorflow gfile to write a list to a location on hdfs""" + locname = "/tmp/{}".format(str(uuid.uuid4())) + with open(locname, "w") as f: + for row in list_to_write: + f.write("%s\n" % row) + tf.io.gfile.copy(locname, output_path, overwrite=False) def decode_str_or_unicode(str_or_unicode): - return str_or_unicode.decode() if hasattr(str_or_unicode, 'decode') else str_or_unicode + return ( + str_or_unicode.decode() if hasattr(str_or_unicode, "decode") else str_or_unicode + ) def longest_common_prefix(strings, split_character): - """ - Args: - string (list): The list of strings to find the longest common prefix of - split_character (str): If not None, require that the return string end in this character or - be the length of the entire string - Returns: - The string corresponding to the longest common prefix - """ - sorted_strings = sorted(strings) - s1, s2 = sorted_strings[0], sorted_strings[-1] - if s1 == s2: - # If the strings are the same, just return the full string - out = s1 - else: - # If the strings are not the same, return the longest common prefix optionally ending in split_character - ix = 0 - for i in range(min(len(s1), len(s2))): - if s1[i] != s2[i]: - break - if split_character is None or s1[i] == 
split_character:
-        ix = i + 1
-    out = s1[:ix]
-  return out
+    """
+    Args:
+        strings (list): The list of strings to find the longest common prefix of
+        split_character (str): If not None, require that the returned string end in this character or
+            be the length of the entire string
+    Returns:
+        The string corresponding to the longest common prefix
+    """
+    sorted_strings = sorted(strings)
+    s1, s2 = sorted_strings[0], sorted_strings[-1]
+    if s1 == s2:
+        # If the strings are the same, just return the full string
+        out = s1
+    else:
+        # If the strings are not the same, return the longest common prefix optionally ending in split_character
+        ix = 0
+        for i in range(min(len(s1), len(s2))):
+            if s1[i] != s2[i]:
+                break
+            if split_character is None or s1[i] == split_character:
+                ix = i + 1
+        out = s1[:ix]
+    return out


def _expand_prefix(fname, prefix, split_character):
-  if len(fname) == len(prefix):
-    # If the prefix is already the full feature, just take the feature name
-    out = fname
-  elif split_character is None:
-    # Advance the prefix by one character
-    out = fname[:len(prefix) + 1]
-  else:
-    # Advance the prefix to the next instance of split_character or the end of the string
-    for ix in range(len(prefix), len(fname)):
-      if fname[ix] == split_character:
-        break
-    out = fname[:ix + 1]
-  return out
+    if len(fname) == len(prefix):
+        # If the prefix is already the full feature, just take the feature name
+        out = fname
+    elif split_character is None:
+        # Advance the prefix by one character
+        out = fname[: len(prefix) + 1]
+    else:
+        # Advance the prefix to the next instance of split_character or the end of the string
+        for ix in range(len(prefix), len(fname)):
+            if fname[ix] == split_character:
+                break
+        out = fname[: ix + 1]
+    return out
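To make the prefix bookkeeping concrete, here is a small worked example with hypothetical feature names. With split_character=".", the common prefix may only end at a period (or span a whole name), and _expand_prefix advances a group's prefix to the next period:

    # Hypothetical feature names, for illustration only.
    names = ["recap.user.age", "recap.user.bio_length", "recap.tweet.length"]

    longest_common_prefix(strings=names, split_character=".")
    # -> "recap."

    _expand_prefix(fname="recap.user.age", prefix="recap.", split_character=".")
    # -> "recap.user."

Note that only the lexicographically smallest and largest strings are compared: any character shared by those two is shared by every string in between, so the common prefix of the whole sorted list falls out of a single pairwise scan.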


def _get_feature_types_from_records(records, fnames):
-  # This method gets the types of the features in fnames by looking at the datarecords themselves.
-  # The reason why we do this rather than extract the feature types from the feature_config is
-  # that the feature naming conventions in the feature_config are different from those in the
-  # datarecords.
-  fids = [twml.feature_id(fname)[0] for fname in fnames]
-  feature_to_type = {}
-  for record in records:
-    for feature_type, values in record.__dict__.items():
-      if values is not None:
-        included_ids = set(values)
-        for fname, fid in zip(fnames, fids):
-          if fid in included_ids:
-            feature_to_type[fname] = feature_type
-  return feature_to_type
+    # This method gets the types of the features in fnames by looking at the datarecords themselves.
+    # The reason why we do this rather than extract the feature types from the feature_config is
+    # that the feature naming conventions in the feature_config are different from those in the
+    # datarecords.
+    fids = [twml.feature_id(fname)[0] for fname in fnames]
+    feature_to_type = {}
+    for record in records:
+        for feature_type, values in record.__dict__.items():
+            if values is not None:
+                included_ids = set(values)
+                for fname, fid in zip(fnames, fids):
+                    if fid in included_ids:
+                        feature_to_type[fname] = feature_type
+    return feature_to_type


def _get_metrics_hook(trainer):
-  def get_metrics_fn(trainer=trainer):
-    return {k: v[0]for k, v in trainer.current_estimator_spec.eval_metric_ops.items()}
-  return twml.hooks.GetMetricsHook(get_metrics_fn=get_metrics_fn)
+    def get_metrics_fn(trainer=trainer):
+        return {
+            k: v[0] for k, v in trainer.current_estimator_spec.eval_metric_ops.items()
+        }
+
+    return twml.hooks.GetMetricsHook(get_metrics_fn=get_metrics_fn)


def _get_feature_name_from_config(feature_config):
-  """Extract the names of the features on a feature config object
-  """
-  decoded_feature_names = []
-  for f in feature_config.get_feature_spec()['features'].values():
-    try:
-      fname = decode_str_or_unicode(f['featureName'])
-    except UnicodeEncodeError as e:
-      logging.error("Encountered decoding exception when decoding %s: %s" % (f, e))
-    decoded_feature_names.append(fname)
-  return decoded_feature_names
+    """Extract the names of the features on a feature config object"""
+    decoded_feature_names = []
+    for f in feature_config.get_feature_spec()["features"].values():
+        try:
+            fname = decode_str_or_unicode(f["featureName"])
+        except UnicodeEncodeError as e:
+            logging.error(
+                "Encountered decoding exception when decoding %s: %s" % (f, e)
+            )
+        decoded_feature_names.append(fname)
+    return decoded_feature_names
diff --git a/twml/twml/contrib/hooks.py b/twml/twml/contrib/hooks.py
index 6d68831fc..60a46708a 100644
--- a/twml/twml/contrib/hooks.py
+++ b/twml/twml/contrib/hooks.py
@@ -1,42 +1,44 @@
 import datetime
-from absl import logging
 import pytz
 import tensorflow.compat.v1 as tf
+from absl import logging


 class StopAtTimeHook(tf.train.SessionRunHook):
-  """
-  Hook that stops training at a fixed datetime
-  """
-
-  def __init__(self, stop_time):
     """
-    Arguments:
-      stop_time:
-        a datetime.datetime or a datetime.timedelta specifying when to stop.
-        For naive datetime.datetime objects (with no time zone specified),
-        UTC time zone is assumed.
+    Hook that stops training at a fixed datetime
     """
-    if isinstance(stop_time, datetime.timedelta):
-      self._stop_datetime = pytz.utc.localize(datetime.datetime.utcnow() + stop_time)
-    elif isinstance(stop_time, datetime.datetime):
-      if stop_time.tzinfo is None:
-        self._stop_datetime = pytz.utc.localize(stop_time)
-      else:
-        self._stop_datetime = stop_time.astimezone(pytz.UTC)
-    else:
-      raise ValueError("Expecting datetime or timedelta for stop_time arg")
-    self._stop_requested = False

-  def after_run(self, run_context, run_values):
-    delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow())
-    if delta.total_seconds() <= 0:
-      logging.info("StopAtTimeHook reached stop_time; requesting stop")
-      run_context.request_stop()
-      self._stop_requested = True
+    def __init__(self, stop_time):
+        """
+        Arguments:
+            stop_time:
+                a datetime.datetime or a datetime.timedelta specifying when to stop.
+                For naive datetime.datetime objects (with no time zone specified),
+                UTC time zone is assumed.
+        """
+        if isinstance(stop_time, datetime.timedelta):
+            self._stop_datetime = pytz.utc.localize(
+                datetime.datetime.utcnow() + stop_time
+            )
+        elif isinstance(stop_time, datetime.datetime):
+            if stop_time.tzinfo is None:
+                self._stop_datetime = pytz.utc.localize(stop_time)
+            else:
+                self._stop_datetime = stop_time.astimezone(pytz.UTC)
+        else:
+            raise ValueError("Expecting datetime or timedelta for stop_time arg")
+        self._stop_requested = False
+
+    def after_run(self, run_context, run_values):
+        delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow())
+        if delta.total_seconds() <= 0:
+            logging.info("StopAtTimeHook reached stop_time; requesting stop")
+            run_context.request_stop()
+            self._stop_requested = True

-  @property
-  def stop_requested(self):
-    """ true if this hook requested a stop """
-    return self._stop_requested
+    @property
+    def stop_requested(self):
+        """true if this hook requested a stop"""
+        return self._stop_requested
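A minimal usage sketch for the hook; the estimator and input_fn names here are assumptions standing in for a real tf.estimator setup, and the four-hour budget is arbitrary. A timedelta is measured from hook construction time, in UTC:

    import datetime

    # Stop training roughly four hours after the hook is constructed.
    stop_hook = StopAtTimeHook(datetime.timedelta(hours=4))
    # `estimator` and `train_input_fn` are placeholders for an actual setup.
    estimator.train(input_fn=train_input_fn, hooks=[stop_hook])
    if stop_hook.stop_requested:
        print("Training stopped early: time budget exhausted")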
+ """ + if isinstance(stop_time, datetime.timedelta): + self._stop_datetime = pytz.utc.localize( + datetime.datetime.utcnow() + stop_time + ) + elif isinstance(stop_time, datetime.datetime): + if stop_time.tzinfo is None: + self._stop_datetime = pytz.utc.localize(stop_time) + else: + self._stop_datetime = stop_time.astimezone(pytz.UTC) + else: + raise ValueError("Expecting datetime or timedelta for stop_time arg") + self._stop_requested = False + + def after_run(self, run_context, run_values): + delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow()) + if delta.total_seconds() <= 0: + logging.info("StopAtTimeHook reached stop_time; requesting stop") + run_context.request_stop() + self._stop_requested = True - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + @property + def stop_requested(self): + """true if this hook requested a stop""" + return self._stop_requested diff --git a/twml/twml/contrib/initializers.py b/twml/twml/contrib/initializers.py index 52bad3a19..52362cb14 100644 --- a/twml/twml/contrib/initializers.py +++ b/twml/twml/contrib/initializers.py @@ -1,61 +1,67 @@ import numpy as np import tensorflow.compat.v1 as tf - TWML_INIT_FEED_KEY = "TWML_INIT_FEED_COLLECTION" class PartitionConstant(tf.keras.initializers.Constant): - """A constant initializer that supports partitions""" - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if not isinstance(self.value, np.ndarray): - raise ValueError( - "Currently, PartitionConstant only supports " - "partitioning on np.ndarrays. Got {}".format(type(self.value).__name__)) - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value + """A constant initializer that supports partitions""" + + def __call__(self, shape, dtype=None, partition_info=None): + if partition_info is not None: + if not isinstance(self.value, np.ndarray): + raise ValueError( + "Currently, PartitionConstant only supports " + "partitioning on np.ndarrays. 
Got {}".format( + type(self.value).__name__ + ) + ) + offsets = partition_info.var_offset + indices = tuple( + [slice(offset, offset + size) for offset, size in zip(offsets, shape)] + ) + subset = self.value[indices] + return subset + else: + return self.value partition_constant_initializer = PartitionConstant class PlaceholderInitializer(tf.keras.initializers.Initializer): - """A placeholder initializer that supports partitions""" + """A placeholder initializer that supports partitions""" - def __init__(self, shape, dtype): - self.dtype = dtype - self.value = tf.placeholder(dtype=dtype, shape=shape) + def __init__(self, shape, dtype): + self.dtype = dtype + self.value = tf.placeholder(dtype=dtype, shape=shape) - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if self.dtype != dtype: - raise ValueError("dtype does not match placeholder dtype") - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value + def __call__(self, shape, dtype=None, partition_info=None): + if partition_info is not None: + if self.dtype != dtype: + raise ValueError("dtype does not match placeholder dtype") + offsets = partition_info.var_offset + indices = tuple( + [slice(offset, offset + size) for offset, size in zip(offsets, shape)] + ) + subset = self.value[indices] + return subset + else: + return self.value def get_init_feed_dict(): - """Get the init feed dictionary to be used when running the init op.""" - # Get the reference to the collection. - init_feed_collection = tf.get_collection(TWML_INIT_FEED_KEY) - init_feed_dict = {} - for d in init_feed_collection: - init_feed_dict.update(d) - return init_feed_dict + """Get the init feed dictionary to be used when running the init op.""" + # Get the reference to the collection. + init_feed_collection = tf.get_collection(TWML_INIT_FEED_KEY) + init_feed_dict = {} + for d in init_feed_collection: + init_feed_dict.update(d) + return init_feed_dict def clear_init_feed_collection(): - """Clear the init feed collection.""" - init_feed_collection = tf.get_collection_ref(TWML_INIT_FEED_KEY) - while init_feed_collection: - init_feed_collection.pop() + """Clear the init feed collection.""" + init_feed_collection = tf.get_collection_ref(TWML_INIT_FEED_KEY) + while init_feed_collection: + init_feed_collection.pop() diff --git a/twml/twml/contrib/layers/__init__.py b/twml/twml/contrib/layers/__init__.py index aa6e7d7e4..72a292685 100644 --- a/twml/twml/contrib/layers/__init__.py +++ b/twml/twml/contrib/layers/__init__.py @@ -1,11 +1,14 @@ # pylint: disable=wildcard-import """ This module contains all contrib Layers. 
""" +from .embedding_lookup import EmbeddingLookup # noqa: F401 +from .factorization_machine import FactorizationMachine # noqa: F401 +from .full_dense import FullDense, full_dense # noqa: F401 from .hashed_percentile_discretizer import HashedPercentileDiscretizer # noqa: F401 from .hashing_discretizer import HashingDiscretizer # noqa: F401 from .mask_layer import MaskLayer # noqa: F401 -from .embedding_lookup import EmbeddingLookup # noqa: F401 -from .factorization_machine import FactorizationMachine # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 from .stacked_rnn import StackedRNN, stacked_rnn # noqa: F401 -from .zscore_normalization import ZscoreNormalization, zscore_normalization # noqa: F401 +from .zscore_normalization import ( + ZscoreNormalization, # noqa: F401 + zscore_normalization, +) diff --git a/twml/twml/contrib/layers/embedding_lookup.py b/twml/twml/contrib/layers/embedding_lookup.py index c83dc7edd..360bfed3d 100644 --- a/twml/twml/contrib/layers/embedding_lookup.py +++ b/twml/twml/contrib/layers/embedding_lookup.py @@ -1,12 +1,11 @@ import os import re import time - from collections import OrderedDict -from absl import logging import numpy as np import tensorflow.compat.v1 as tf +from absl import logging from tensorflow.python.ops.lookup_ops import index_table_from_tensor import twml @@ -17,403 +16,433 @@ def load_initializers_from_csv( - embedding_path, vocab_size=-1, embedding_size=None, separator=None, vocab=None + embedding_path, vocab_size=-1, embedding_size=None, separator=None, vocab=None ): - """ - Loads embeddings saved in the `glove format `_. - The glove format is a txt file separated by spaces. - Each line looks like: "word 0.00001 0.2334 ...". - - Arguments: - embedding_path: - path to the embeddings file on HDFS (hdfs://default/...) - or its local_path (/path/to/...). - The embedding_path may also specify a pattern. In which case, the embeddings - are read in the lexical order of the filenames that match the order. - vocab_size: - the maximum size of the vocabulary. The top ``vocab_size`` words in the file - are included in the vocabulary. If you specify a positive vocab_size, - the words are expected to be in descending order of frequency. - This allows the embeddings to be easily filtered to top vocab_size words. - Reducing the vocab_size acts as a regularizer, preventing the model to overfit on rarer words. - A negative vocab_size loads all embeddings. - Reducing the vocab_size may also help with memory issues, - allowing the embedding initializers to fit inside the graph. - embedding_size: - Defaults to None. If None, the embedding size is infered from the file name. - For example, ``glove.300d.txt`` and ``glove300d200.txt`` will both infrered - as ``embedding_size=300``. If this can't be done, the ``embedding_size`` is - inferred from the first line in the file. If ``embedding_size`` is provided, - only the last ``embedding_size`` values of each line are considered. This - allows the line parser to recover from partial word parsing errors. - separator: - Specifies the separator to use when splitting each line into values. - Default value is a whitespace (same as glove format). - vocab: - OrderedDict mapping words to np.array embedding vectors. Initializes the vocabulary. - Duplicate words found in the file are ignored. 
- Defaults to a vocabulary of two words:: - - vocab = OrderedDict() - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - Returns: - tuple of (vocab_initializer, weight_initializer, shape) - - vocab_initializer: - A tf.constant_initializer containing a vector of word strings of size vocab_size. - weight_initializer: - A twml.contrib.initializers.partition_constant_initializer containing - the weight matrix of embeddings of size vocab_size x embedding_size. - shape: - A tuple containing of (vocab_size, embedding_size). - - """ - - start = time.time() - - embedding_path = twml.util.sanitize_hdfs_path(embedding_path) - - is_user_vocab = True - if vocab is None: - vocab = OrderedDict() - vocab[''] = True - vocab[''] = True - is_user_vocab = False - elif not isinstance(vocab, OrderedDict): - raise RuntimeError( - "Expecting vocab argument of type OrderedDict or None. " - "Got type %s instead." % type(vocab).__name__ - ) - - if embedding_size is None: - embedding_file = os.path.basename(embedding_path) - match = re.search(r"[^\d]([\d]+)d", embedding_file) - if match is not None: - embedding_size = int(match.group(1)) - - if embedding_size is not None and not isinstance(embedding_size, int): - raise RuntimeError( - "Expecting embedding_size argument of type int or None. " - "Got type %s, instead." % type(embedding_size).__name__ - ) - - embedding_paths = sorted(tf.io.gfile.glob(embedding_path)) - - if len(embedding_paths) > 1: - raise ValueError( - "You are most likely using a the wrong --embedding.path" - ) - - embedding_path = embedding_paths[0] - logging.info("Reading embeddings file from path %s.." % embedding_path) - - with tf.io.gfile.GFile(embedding_path) as f: - lines = f.readlines() - - logging.info("Done reading embeddings file from path %s." % embedding_path) - - logging.info("Parsing vocbulary and embeddings...") - - for line in lines: - # Word and weights separated by space - values = line.strip().split(separator) - # Word is first symbol on each line - word = values[0] - - if word not in vocab: - if embedding_size is None or embedding_size <= 0: - # get all elements after the first one. - word_weights = values[1:] - embedding_size = len(word_weights) - else: - # get the last embedding_size elements - word_weights = values[-min(embedding_size, len(values) - 1) :] - - try: - if len(word_weights) != embedding_size: - raise ValueError + """ + Loads embeddings saved in the `glove format `_. + The glove format is a txt file separated by spaces. + Each line looks like: "word 0.00001 0.2334 ...". - word_weights = np.asarray(word_weights, dtype=np.float32) - vocab[word] = word_weights - except ValueError: - logging.info("Wasn't able to load embeddings for word '%s'. Ignoring it" % word) + Arguments: + embedding_path: + path to the embeddings file on HDFS (hdfs://default/...) + or its local_path (/path/to/...). + The embedding_path may also specify a pattern. In which case, the embeddings + are read in the lexical order of the filenames that match the order. + vocab_size: + the maximum size of the vocabulary. The top ``vocab_size`` words in the file + are included in the vocabulary. If you specify a positive vocab_size, + the words are expected to be in descending order of frequency. + This allows the embeddings to be easily filtered to top vocab_size words. + Reducing the vocab_size acts as a regularizer, preventing the model to overfit on rarer words. + A negative vocab_size loads all embeddings. 
+        Reducing the vocab_size may also help with memory issues,
+        allowing the embedding initializers to fit inside the graph.
+      embedding_size:
+        Defaults to None. If None, the embedding size is inferred from the file name.
+        For example, ``glove.300d.txt`` and ``glove300d200.txt`` will both be inferred
+        as ``embedding_size=300``. If this can't be done, the ``embedding_size`` is
+        inferred from the first line in the file. If ``embedding_size`` is provided,
+        only the last ``embedding_size`` values of each line are considered. This
+        allows the line parser to recover from partial word parsing errors.
+      separator:
+        Specifies the separator to use when splitting each line into values.
+        Default value is a whitespace (same as glove format).
+      vocab:
+        OrderedDict mapping words to np.array embedding vectors. Initializes the vocabulary.
+        Duplicate words found in the file are ignored.
+        Defaults to a vocabulary of two words::
+
+          vocab = OrderedDict()
+          vocab[''] = np.random.randn(embedding_size)
+          vocab[''] = np.random.randn(embedding_size)
-      vocab_len = len(vocab)
-      if vocab_size > 0 and vocab_len == vocab_size:
-        # Limit vocabulary to top terms
-        break
-      elif (vocab_len % 1000) == 0:
-        logging.info("Loaded %d words into vocab" % vocab_len)
+    Returns:
+      tuple of (vocab_initializer, weight_initializer, shape)
-    else:
-      logging.info("found duplicate word: %s" % word)
+      vocab_initializer:
+        A tf.constant_initializer containing a vector of word strings of size vocab_size.
+      weight_initializer:
+        A twml.contrib.initializers.partition_constant_initializer containing
+        the weight matrix of embeddings of size vocab_size x embedding_size.
+      shape:
+        A tuple of (vocab_size, embedding_size).
-  if not is_user_vocab:
-    vocab[''] = np.random.randn(embedding_size)
-    vocab[''] = np.random.randn(embedding_size)
+    """
-  words = list(vocab.keys())
-  weights = list(vocab.values())
+    start = time.time()
-  weights = np.asarray(weights, dtype=np.float32)
-  assert weights.shape[0] == len(vocab)
-  assert weights.shape[1] == embedding_size
+    embedding_path = twml.util.sanitize_hdfs_path(embedding_path)
-  vocab_initializer = tf.constant_initializer(words, tf.string)
-  weight_initializer = twml.contrib.initializers.PartitionConstant(weights, tf.float32)
+    is_user_vocab = True
+    if vocab is None:
+        vocab = OrderedDict()
+        vocab[""] = True
+        vocab[""] = True
+        is_user_vocab = False
+    elif not isinstance(vocab, OrderedDict):
+        raise RuntimeError(
+            "Expecting vocab argument of type OrderedDict or None. "
+            "Got type %s instead." % type(vocab).__name__
+        )
+
+    if embedding_size is None:
+        embedding_file = os.path.basename(embedding_path)
+        match = re.search(r"[^\d]([\d]+)d", embedding_file)
+        if match is not None:
+            embedding_size = int(match.group(1))
+
+    if embedding_size is not None and not isinstance(embedding_size, int):
+        raise RuntimeError(
+            "Expecting embedding_size argument of type int or None. "
+            "Got type %s, instead." % type(embedding_size).__name__
+        )
+
+    embedding_paths = sorted(tf.io.gfile.glob(embedding_path))
+
+    if len(embedding_paths) > 1:
+        raise ValueError("You are most likely using the wrong --embedding.path")
+
+    embedding_path = embedding_paths[0]
+    logging.info("Reading embeddings file from path %s.." % embedding_path)
+
+    with tf.io.gfile.GFile(embedding_path) as f:
+        lines = f.readlines()
+
+    logging.info("Done reading embeddings file from path %s."
% embedding_path) + + logging.info("Parsing vocbulary and embeddings...") + + for line in lines: + # Word and weights separated by space + values = line.strip().split(separator) + # Word is first symbol on each line + word = values[0] + + if word not in vocab: + if embedding_size is None or embedding_size <= 0: + # get all elements after the first one. + word_weights = values[1:] + embedding_size = len(word_weights) + else: + # get the last embedding_size elements + word_weights = values[-min(embedding_size, len(values) - 1) :] + + try: + if len(word_weights) != embedding_size: + raise ValueError + + word_weights = np.asarray(word_weights, dtype=np.float32) + vocab[word] = word_weights + except ValueError: + logging.info( + "Wasn't able to load embeddings for word '%s'. Ignoring it" % word + ) + + vocab_len = len(vocab) + if vocab_size > 0 and vocab_len == vocab_size: + # Limit vocabulary to top terms + break + elif (vocab_len % 1000) == 0: + logging.info("Loaded %d words into vocab" % vocab_len) + + else: + logging.info("found duplicate word: %s" % word) + + if not is_user_vocab: + vocab[""] = np.random.randn(embedding_size) + vocab[""] = np.random.randn(embedding_size) + + words = list(vocab.keys()) + weights = list(vocab.values()) + + weights = np.asarray(weights, dtype=np.float32) + assert weights.shape[0] == len(vocab) + assert weights.shape[1] == embedding_size + + vocab_initializer = tf.constant_initializer(words, tf.string) + weight_initializer = twml.contrib.initializers.PartitionConstant( + weights, tf.float32 + ) - logging.info("Loaded %d embeddings in %d seconds." % (len(vocab), time.time() - start)) - return vocab_initializer, weight_initializer, weights.shape + logging.info( + "Loaded %d embeddings in %d seconds." % (len(vocab), time.time() - start) + ) + return vocab_initializer, weight_initializer, weights.shape def add_parser_arguments(parser): - """ - Adds the embedding.path and embedding.vocab_size command-line arguments to the parser. - These can be used to call an initializer loader function like - the ``load_initializers_from_csv`` function. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument( - "--embedding.path", - "--embedding_path", - dest="embedding_path", - type=str, - default=None, - help="When specified, loads glove embeddings from .txt glove file", - ) - parser.add_argument( - "--embedding.vocab_size", - "--embedding_vocab_size", - dest="embedding_vocab_size", - type=int, - default=-1, - help="Size of vocabulary. Uses this many of the most frequent terms. Defaults to -1 (use full vocab).", - ) - - return parser + """ + Adds the embedding.path and embedding.vocab_size command-line arguments to the parser. + These can be used to call an initializer loader function like + the ``load_initializers_from_csv`` function. + Arguments: + parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser -class EmbeddingLookup(twml.layers.Layer): - """Layer for looking up embeddings. - Transforms a sequence of strings to a sequence of embeddings. - - Arguments: - vocab_size: - The number of word strings and embeddings in the vocabulary. - output_size: - Long or Integer, dimensionality of the output space. The embedding vector size. - vocab_initializer: - Initializer function for the vocabulary. Required. The initializer should - return a list of strings of size vocab_size. 
- weight_initializer: - Initializer function for the weight matrix of size vocab_size x output_size. - This argument defaults to zeros_initializer(). - This is valid when the EmbeddingLookup is the first layer of - parameters but should be changed otherwise. - trainable: - Boolean, if `True` adds variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - Defaults to True: trains the embeddings. - num_oov_buckets: - The number of buckets to use for OOV strings. These bucket ids occur after the vocab bucket - ids. Hashing is used to assign OOV strings to these buckets. If `num_oov_buckets` is not - specified, index `OOV_WORD_ID` is used for OOV strings. - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column, does not support yet) - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - dtype: - Defaults to tf.float32. Specifies the dtype of the weights. - use_placeholder: - Defaults to True. - If set to `True`, the initializer is passed via a placeholder. The initializer in this case needs to be of type `keras.initializers.Constant`. - If set to `False`, the initializer becomes part of the graph. This can sometimes be beyond what protobuf clients support. - checkpoint_dir: - Default to None. - If set to the path of a checkpoint, load embedding from the checkpoint. - convert_to_lowercase: - Default to True. - Converting all string inputs to lowercase. - - Notes: If `use_placeholder` is set to `True`, the feed dictionary can be accessed by calling `twml.contrib.initializers.get_init_feed_dict()`. - """ - - def __init__( - self, - vocab_size, - output_size, - vocab_initializer, - weight_initializer=None, - trainable=True, - num_oov_buckets=None, - oov_word_id=None, - name=None, - num_partitions=1, - partition_axis=0, - weight_regularizer=None, - dtype=None, - use_placeholder=True, - checkpoint_dir=None, - convert_to_lowercase=True, - **kwargs, - ): - if dtype is None: - # prevents a bug where the parent class defaults to the type of the first input tensor. - dtype = tf.float32 - super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. 
- - is_constant_init = isinstance(weight_initializer, tf.keras.initializers.Constant) - if use_placeholder and (not is_constant_init) and (weight_initializer is not None): - raise ValueError("Weight initializer should be a `Constant` or `None`.") - - if weight_initializer is None: - self.weight_initializer = tf.zeros_initializer() - else: - self.weight_initializer = weight_initializer - self.use_placeholder = use_placeholder - self.checkpoint_dir = checkpoint_dir - self.convert_to_lowercase = convert_to_lowercase - - self.vocab_initializer = vocab_initializer - self.vocab_size = vocab_size - self.output_size = output_size - self.num_partitions = num_partitions - self.partition_axis = partition_axis - self.weight_regularizer = weight_regularizer - self.trainable = trainable - self.oov_word_id = oov_word_id - self.num_oov_buckets = num_oov_buckets - - if self.oov_word_id is not None and self.num_oov_buckets is not None: - raise ValueError("At most one of oov_word_id or num_oov_buckets should be specified") - elif self.oov_word_id is None and self.num_oov_buckets is None: - self.oov_word_id = OOV_WORD_ID # use the default OOV word id - - if partition_axis != 0: - raise NotImplementedError("embedding_lookup only supports partition_axis = 0") - - def build(self, input_shapes): - """ - creates the ``vocab`` and ``weight`` Variables - of shape ``[vocab_size]`` and ``[vocab_size, output_size]`` respectively. + Returns: + argparse.ArgumentParser instance with discretizer-specific arguments added """ - partitioner = None - - additional_buckets_for_oov = self.num_oov_buckets if self.num_oov_buckets is not None else 0 - shape = [self.vocab_size + additional_buckets_for_oov, self.output_size] - - if self.use_placeholder: - embedding_weight_initializer = twml.contrib.initializers.PlaceholderInitializer( - shape, self.dtype - ) - tf.add_to_collection( - twml.contrib.initializers.TWML_INIT_FEED_KEY, - {embedding_weight_initializer.value: self.weight_initializer.value}, - ) - else: - embedding_weight_initializer = self.weight_initializer - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self.vocab = self.add_variable( - 'vocab', - initializer=self.vocab_initializer, - shape=[self.vocab_size], - dtype=tf.string, - trainable=False, - ) - self.weight = self.add_variable( - 'weight', - initializer=None if self.checkpoint_dir is not None else embedding_weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=self.trainable, - partitioner=partitioner, + parser.add_argument( + "--embedding.path", + "--embedding_path", + dest="embedding_path", + type=str, + default=None, + help="When specified, loads glove embeddings from .txt glove file", + ) + parser.add_argument( + "--embedding.vocab_size", + "--embedding_vocab_size", + dest="embedding_vocab_size", + type=int, + default=-1, + help="Size of vocabulary. Uses this many of the most frequent terms. 
Defaults to -1 (use full vocab).",
+    )
-    if self.checkpoint_dir is not None:
-      twml.trainers.trainer.init_from_checkpoint(self.checkpoint_dir, {'weight': self.weight.name})
-    self.built = True
+    return parser
-  def call(
-      self, inputs, debug=False, oov_summaries=False, **kwargs
-  ):  # pylint: disable=unused-argument
-    """Converts word strings to word ids using the vocabulary lookup table.
-    Then converts the word ids to their commensurate embedding vector.
-    Arguments:
-      inputs:
-        A tensor of word strings. Typically, of size batch_size x seq_len.
-      debug:
-        When True, prints the input strings and their commensurate input_ids.
-        Defaults to False.
-      oov_summaries:
-        When True, log the out-of-vocabulary (OOV) rate to TensorBoard
-        Defaults to False.
+class EmbeddingLookup(twml.layers.Layer):
+    """Layer for looking up embeddings.
+    Transforms a sequence of strings to a sequence of embeddings.
+
-    Returns:
-      The mapping of input word strings to output embedding vectors.
-      Given an input of shape ``batch_size x seq_len``, the output has shape
-      ``batch_size x seq_len x embedding_size``.
+    Arguments:
+      vocab_size:
+        The number of word strings and embeddings in the vocabulary.
+      output_size:
+        Long or Integer, dimensionality of the output space. The embedding vector size.
+      vocab_initializer:
+        Initializer function for the vocabulary. Required. The initializer should
+        return a list of strings of size vocab_size.
+      weight_initializer:
+        Initializer function for the weight matrix of size vocab_size x output_size.
+        This argument defaults to zeros_initializer().
+        This is valid when the EmbeddingLookup is the first layer of
+        parameters but should be changed otherwise.
+      trainable:
+        Boolean, if `True` adds variables to the graph collection
+        ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable
+        `_).
+        Defaults to True: trains the embeddings.
+      num_oov_buckets:
+        The number of buckets to use for OOV strings. These bucket ids occur after the vocab bucket
+        ids. Hashing is used to assign OOV strings to these buckets. If `num_oov_buckets` is not
+        specified, index `OOV_WORD_ID` is used for OOV strings.
+      name:
+        String, the name of the layer. Layers with the same name will
+        share weights, but to avoid mistakes we require ``reuse=True`` in such cases.
+      num_partitions:
+        Number of partitions to use for the weight variable. Defaults to 1.
+      partition_axis:
+        If num_partitions is specified, the partition axis for the weight variable.
+        Defaults to 0 (partition by row).
+        Must be 0 (row); 1 (column) is not yet supported.
+      weight_regularizer:
+        Regularizer function for the weight matrix.
+        Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect.
+      dtype:
+        Defaults to tf.float32. Specifies the dtype of the weights.
+      use_placeholder:
+        Defaults to True.
+        If set to `True`, the initializer is passed via a placeholder. The initializer in this case needs to be of type `keras.initializers.Constant`.
+        If set to `False`, the initializer becomes part of the graph. This can sometimes be beyond what protobuf clients support.
+      checkpoint_dir:
+        Defaults to None.
+        If set to the path of a checkpoint, loads the embedding from that checkpoint.
+      convert_to_lowercase:
+        Defaults to True.
+        Converts all string inputs to lowercase.
+
+    Notes: If `use_placeholder` is set to `True`, the feed dictionary can be accessed by calling `twml.contrib.initializers.get_init_feed_dict()`.
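+
+    Example (a minimal sketch, not from the test suite; the two-word vocabulary
+    and random weights below are made-up values)::
+
+        import numpy as np
+        import tensorflow.compat.v1 as tf
+        import twml
+
+        vocab_init = tf.constant_initializer(["the", "cat"], tf.string)
+        weight_init = tf.keras.initializers.Constant(
+            np.random.randn(2, 4).astype(np.float32)
+        )
+        layer = twml.contrib.layers.EmbeddingLookup(
+            vocab_size=2,
+            output_size=4,
+            vocab_initializer=vocab_init,
+            weight_initializer=weight_init,  # must be Constant when use_placeholder=True
+        )
+        embeddings = layer(tf.constant([["the", "cat"]]))  # shape [1, 2, 4]
+        with tf.Session() as sess:
+            sess.run(
+                tf.global_variables_initializer(),
+                feed_dict=twml.contrib.initializers.get_init_feed_dict(),
+            )
+            sess.run(tf.tables_initializer())
+            print(sess.run(embeddings).shape)  # (1, 2, 4)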
""" - if self.convert_to_lowercase: - inputs = tf.strings.lower(inputs) - if self.num_oov_buckets is None: - lookup_table = index_table_from_tensor(self.vocab, default_value=self.oov_word_id) - else: - lookup_table = index_table_from_tensor(self.vocab, num_oov_buckets=self.num_oov_buckets) - input_ids = lookup_table.lookup(inputs) - - if oov_summaries: - oov_count = tf.reduce_sum( - tf.cast(tf.math.equal(input_ids, self.oov_word_id), tf.dtypes.float32) - ) - valid_count = tf.reduce_sum( - tf.cast(tf.math.not_equal(input_ids, PAD_WORD_ID), tf.dtypes.float32) - ) - oov_rate = oov_count / valid_count - tf.summary.scalar('OOV_rate', oov_rate) - - if debug: - - def print_debug(): - return tf.print("input_strings:", inputs, "\ninput_ids: ", input_ids, summarize=140) - - with tf.control_dependencies([twml.util.do_every_n_steps(print_debug, 1000)]): - input_ids = tf.identity(input_ids) - - output_embeddings = tf.nn.embedding_lookup( - params=self.weight, ids=input_ids, partition_strategy='div' - ) - - output_shape = inputs.shape.concatenate(tf.TensorShape([self.output_size])) - output_embeddings.set_shape(output_shape) - return output_embeddings + def __init__( + self, + vocab_size, + output_size, + vocab_initializer, + weight_initializer=None, + trainable=True, + num_oov_buckets=None, + oov_word_id=None, + name=None, + num_partitions=1, + partition_axis=0, + weight_regularizer=None, + dtype=None, + use_placeholder=True, + checkpoint_dir=None, + convert_to_lowercase=True, + **kwargs, + ): + if dtype is None: + # prevents a bug where the parent class defaults to the type of the first input tensor. + dtype = tf.float32 + super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs) + # Weights initialization is set to 0s. This is safe for full sparse layers because + # you are supposed to learn your embedding from the label. + + is_constant_init = isinstance( + weight_initializer, tf.keras.initializers.Constant + ) + if ( + use_placeholder + and (not is_constant_init) + and (weight_initializer is not None) + ): + raise ValueError("Weight initializer should be a `Constant` or `None`.") + + if weight_initializer is None: + self.weight_initializer = tf.zeros_initializer() + else: + self.weight_initializer = weight_initializer + self.use_placeholder = use_placeholder + self.checkpoint_dir = checkpoint_dir + self.convert_to_lowercase = convert_to_lowercase + + self.vocab_initializer = vocab_initializer + self.vocab_size = vocab_size + self.output_size = output_size + self.num_partitions = num_partitions + self.partition_axis = partition_axis + self.weight_regularizer = weight_regularizer + self.trainable = trainable + self.oov_word_id = oov_word_id + self.num_oov_buckets = num_oov_buckets + + if self.oov_word_id is not None and self.num_oov_buckets is not None: + raise ValueError( + "At most one of oov_word_id or num_oov_buckets should be specified" + ) + elif self.oov_word_id is None and self.num_oov_buckets is None: + self.oov_word_id = OOV_WORD_ID # use the default OOV word id + + if partition_axis != 0: + raise NotImplementedError( + "embedding_lookup only supports partition_axis = 0" + ) + + def build(self, input_shapes): + """ + creates the ``vocab`` and ``weight`` Variables + of shape ``[vocab_size]`` and ``[vocab_size, output_size]`` respectively. 
+ """ + partitioner = None + + additional_buckets_for_oov = ( + self.num_oov_buckets if self.num_oov_buckets is not None else 0 + ) + shape = [self.vocab_size + additional_buckets_for_oov, self.output_size] + + if self.use_placeholder: + embedding_weight_initializer = ( + twml.contrib.initializers.PlaceholderInitializer(shape, self.dtype) + ) + tf.add_to_collection( + twml.contrib.initializers.TWML_INIT_FEED_KEY, + {embedding_weight_initializer.value: self.weight_initializer.value}, + ) + else: + embedding_weight_initializer = self.weight_initializer + + if self.num_partitions: + partition_axis = int(self.partition_axis) + partitioner = tf.fixed_size_partitioner( + self.num_partitions, axis=partition_axis + ) + else: + # Regular variables do not like it when you pass both constant tensors and shape + if not callable(self.weight_initializer): + shape = None + + self.vocab = self.add_variable( + "vocab", + initializer=self.vocab_initializer, + shape=[self.vocab_size], + dtype=tf.string, + trainable=False, + ) + + self.weight = self.add_variable( + "weight", + initializer=None + if self.checkpoint_dir is not None + else embedding_weight_initializer, + regularizer=self.weight_regularizer, + shape=shape, + dtype=self.dtype, + trainable=self.trainable, + partitioner=partitioner, + ) + if self.checkpoint_dir is not None: + twml.trainers.trainer.init_from_checkpoint( + self.checkpoint_dir, {"weight": self.weight.name} + ) + + self.built = True + + def call( + self, inputs, debug=False, oov_summaries=False, **kwargs + ): # pylint: disable=unused-argument + """Converts word strings to word ids using the vocabulary lookup table. + Then converts the word ids to their commensurate embedding vector. + + Arguments: + inputs: + A tensor of word strings. Typically, of size batch_size x seq_len. + debug: + When True, prints the input strings and their commensurate input_ids. + Defaults to False. + oov_summaries: + When True, log the out-of-vocabulary (OOV) rate to TensorBoard + Defaults to False. + + Returns: + The mapping of input word strings to output embedding vectors. + Given an input of shape ``batch_size x seq_len``, the output has shape + ``batch_size x seq_len x embedding_size``. 
+ """ + if self.convert_to_lowercase: + inputs = tf.strings.lower(inputs) + if self.num_oov_buckets is None: + lookup_table = index_table_from_tensor( + self.vocab, default_value=self.oov_word_id + ) + else: + lookup_table = index_table_from_tensor( + self.vocab, num_oov_buckets=self.num_oov_buckets + ) + input_ids = lookup_table.lookup(inputs) + + if oov_summaries: + oov_count = tf.reduce_sum( + tf.cast(tf.math.equal(input_ids, self.oov_word_id), tf.dtypes.float32) + ) + valid_count = tf.reduce_sum( + tf.cast(tf.math.not_equal(input_ids, PAD_WORD_ID), tf.dtypes.float32) + ) + oov_rate = oov_count / valid_count + tf.summary.scalar("OOV_rate", oov_rate) + + if debug: + + def print_debug(): + return tf.print( + "input_strings:", inputs, "\ninput_ids: ", input_ids, summarize=140 + ) + + with tf.control_dependencies( + [twml.util.do_every_n_steps(print_debug, 1000)] + ): + input_ids = tf.identity(input_ids) + + output_embeddings = tf.nn.embedding_lookup( + params=self.weight, ids=input_ids, partition_strategy="div" + ) + + output_shape = inputs.shape.concatenate(tf.TensorShape([self.output_size])) + output_embeddings.set_shape(output_shape) + + return output_embeddings diff --git a/twml/twml/contrib/layers/factorization_machine.py b/twml/twml/contrib/layers/factorization_machine.py index 3b8adae42..2e4e9322d 100644 --- a/twml/twml/contrib/layers/factorization_machine.py +++ b/twml/twml/contrib/layers/factorization_machine.py @@ -3,177 +3,195 @@ Implementing factorization Layer """ +import tensorflow.compat.v1 as tf from twitter.deepbird.sparse.sparse_ops import _pad_empty_outputs -import tensorflow.compat.v1 as tf import twml from twml.layers.layer import Layer class FactorizationMachine(Layer): - """factorization machine layer class. - This layer implements the factorization machine operation. - The paper is "Factorization Machines" by Steffen Rendle. - TDD: go/tf-fm-tdd - - Arguments: - num_latent_variables: - num of latent variables - The number of parameter in this layer is num_latent_variables x n where n is number of - input features. - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. 
- This parameter can also be a list of binary values if `inputs` passed to `call` a list. - """ - - def __init__(self, - num_latent_variables=10, - weight_initializer=None, - activation=None, - trainable=True, - name=None, - use_sparse_grads=True, - use_binary_values=False, - weight_regularizer=None, - substract_self_cross=True, - **kwargs): - super(FactorizationMachine, self).__init__(trainable=trainable, name=name, **kwargs) - - if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.num_latent_variables = num_latent_variables - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.substract_self_cross = substract_self_cross - - def build(self, input_shape): - """ - creates``weight`` Variable of shape``[input_size, num_latent_variables]``. - - """ - - shape = [input_shape[1], self.num_latent_variables] - - # There is a 2GB limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - requested_size = input_shape[1] * self.num_latent_variables * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor can not be larger than 2GB. " % - "Requested Dimensions(%d, %d) of type %s (%d bytes total)" - (input_shape[1], self.num_latent_variables, dtype.name)) - - if not callable(self.weight_initializer): - shape = None - - # dense tensor - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - ) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. + """factorization machine layer class. + This layer implements the factorization machine operation. + The paper is "Factorization Machines" by Steffen Rendle. + TDD: go/tf-fm-tdd Arguments: - inputs: - A SparseTensor - Returns: - - If `inputs` is `SparseTensor`, then returns a number with cross info + num_latent_variables: + num of latent variables + The number of parameter in this layer is num_latent_variables x n where n is number of + input features. + weight_initializer: + Initializer function for the weight matrix. + This argument defaults to zeros_initializer(). + This is valid when the FullSparse is the first layer of + parameters but should be changed otherwise. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activation: + Activation function (callable). Set it to None to maintain a linear activation. + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. 
This can lead to non-trivial
+        speed up at training time when input_size is large and optimizer handles sparse gradients
+        correctly (e.g. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended
+        to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will
+        be large, so it's better to set it to `True`.
+      use_binary_values:
+        Assume all non zero values are 1. Defaults to False.
+        This can improve training if used in conjunction with MDL.
+        This parameter can also be a list of binary values if `inputs` passed to `call` is a list.
+    """
+
+    def __init__(
+        self,
+        num_latent_variables=10,
+        weight_initializer=None,
+        activation=None,
+        trainable=True,
+        name=None,
+        use_sparse_grads=True,
+        use_binary_values=False,
+        weight_regularizer=None,
+        substract_self_cross=True,
+        **kwargs
+    ):
+        super(FactorizationMachine, self).__init__(
+            trainable=trainable, name=name, **kwargs
+        )
+
+        if weight_initializer is None:
+            weight_initializer = tf.zeros_initializer()
+        self.weight_initializer = weight_initializer
+        self.num_latent_variables = num_latent_variables
+        self.activation = activation
+        self.use_sparse_grads = use_sparse_grads
+        self.use_binary_values = use_binary_values
+        self.weight_regularizer = weight_regularizer
+        self.substract_self_cross = substract_self_cross
+
+    def build(self, input_shape):
+        """
+        creates the ``weight`` Variable of shape ``[input_size, num_latent_variables]``.
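+
+        For intuition, the second-order interaction this weight produces in
+        ``call`` is, per example (a numpy sketch; sizes are illustrative)::
+
+            import numpy as np
+
+            n_features, k = 100, 10
+            V = np.random.randn(n_features, k)  # the ``weight`` variable
+            x = np.random.randn(n_features)     # one densified input row
+            vx = V * x[:, None]                 # v_ik * x_i
+            # Sum_k [Sum_i v_ik x_i]^2 - Sum_k Sum_i (v_ik x_i)^2
+            cross = (vx.sum(axis=0) ** 2).sum() - (vx**2).sum()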
+ + """ + + shape = [input_shape[1], self.num_latent_variables] + + # There is a 2GB limitation for each tensor because of protobuf. + # 2**30 is 1GB. 2 * (2**30) is 2GB. + dtype = tf.as_dtype(self.dtype) + requested_size = input_shape[1] * self.num_latent_variables * dtype.size + if requested_size >= 2**31: + raise ValueError( + "Weight tensor can not be larger than 2GB. " + % "Requested Dimensions(%d, %d) of type %s (%d bytes total)"( + input_shape[1], self.num_latent_variables, dtype.name + ) + ) + + if not callable(self.weight_initializer): + shape = None + + # dense tensor + self.weight = self.add_variable( + "weight", + initializer=self.weight_initializer, + regularizer=self.weight_regularizer, + shape=shape, + dtype=self.dtype, + trainable=True, + ) + + self.built = True + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError + + def call(self, inputs, **kwargs): # pylint: disable=unused-argument + """The logic of the layer lives here. + + Arguments: + inputs: + A SparseTensor + Returns: + - If `inputs` is `SparseTensor`, then returns a number with cross info + """ + # The following are given: + # - inputs is a sparse tensor, we call it sp_x. + # - The dense_v tensor is a dense matrix, whose row i + # corresponds to the vector V_i. + # weights has shape [num_features, k] + sp_x = inputs + if isinstance(inputs, twml.SparseTensor): + sp_x = inputs.to_tf() + elif not isinstance(sp_x, tf.SparseTensor): + raise TypeError( + "The sp_x must be of type tf.SparseTensor or twml.SparseTensor" + ) + + indices = sp_x.indices[:, 1] + batch_ids = sp_x.indices[:, 0] + values = tf.reshape(sp_x.values, [-1, 1], name=self.name) + if self.use_sparse_grads: + v = tf.nn.embedding_lookup(self.weight, indices) + # if (self.use_binary_values): + # values = tf.ones(tf.shape(values), dtype=values.dtype) + v_times_x = v * values + # First term: Sum_k [Sum_i (v_ik * x_i)]^2 + all_crosses = tf.segment_sum(v_times_x, batch_ids, name=self.name) + all_crosses_squared = tf.reduce_sum((all_crosses * all_crosses), 1) + + if self.substract_self_cross: + # Second term: Sum_k Sum_i [ (v_ik * x_i)^2 ] + v_times_x_2 = v_times_x**2 + self_crosses = tf.reduce_sum( + tf.segment_sum(v_times_x_2, batch_ids, name=self.name), 1 + ) + outputs = all_crosses_squared - self_crosses + else: + outputs = all_crosses_squared + else: + # need to check if prediction is faster with code below + crossTerm = tf.reduce_sum( + (tf.sparse_tensor_dense_matmul(sp_x, self.weight) ** 2), 1 + ) + + if self.substract_self_cross: + # compute self-cross term + self_crossTerm = tf.reduce_sum( + tf.segment_sum( + (tf.gather(self.weight, indices) * values) ** 2, batch_ids + ), + 1, + ) + outputs = crossTerm - self_crossTerm + else: + outputs = crossTerm + + if self.activation is not None: + outputs = self.activation(outputs) + + outputs = tf.reshape(outputs, [-1, 1], name=self.name) + outputs = _pad_empty_outputs(outputs, tf.cast(sp_x.dense_shape[0], tf.int32)) + # set more explicit and static shape to avoid shape inference error + # valueError: The last dimension of the inputs to `Dense` should be defined. 
Found `None` + outputs.set_shape([None, 1]) + return outputs diff --git a/twml/twml/contrib/layers/full_dense.py b/twml/twml/contrib/layers/full_dense.py index ad78a91a4..0d498bbe1 100644 --- a/twml/twml/contrib/layers/full_dense.py +++ b/twml/twml/contrib/layers/full_dense.py @@ -2,379 +2,394 @@ """ Implementing Full Dense Layer """ -from twml.layers import Layer - import tensorflow.compat.v1 as tf from tensorflow.python.layers import core +from twml.layers import Layer + class FullDense(Layer): - """ - Full-connected, Dense input layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: - Activation function (callable). Set it to None to maintain a linear activation. - use_bias: - Boolean whether to include a bias parameter in the layer - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weights: - list of underlying weight and bias matrix components. 
no guarantee on order of elements - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. - """ - - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - **kwargs): - super(FullDense, self).__init__(trainable=trainable, name=name, **kwargs) - self._output_sizes = self._get_output_partition_sizes(output_size, num_partitions) - self._units = output_size - self._activation = activation - self._weight_initializer = weight_initializer - self._bias_initializer = bias_initializer - self._weight_regularizer = weight_regularizer - self._bias_regularizer = bias_regularizer - self._weight_constraint = weight_constraint - self._bias_constraint = bias_constraint - self._use_bias = use_bias - # NOTE - many initializers depend on fan_in and fan_out - # - as such, initialization here may be different than - # - for a non-partitioned FullDense - self._parts = [core.Dense(units=out_size, - activation=activation, - use_bias=use_bias, - kernel_initializer=weight_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - **kwargs) for out_size in self._output_sizes] - - @staticmethod - def _get_output_partition_sizes(out_size, num_parts): - """ Returns the appropriate output sizes of the partitions """ - boundaries = [out_size * n // num_parts for n in range(num_parts + 1)] - return [k - j for j, k in zip(boundaries[:], boundaries[1:])] - - def build(self, input_shapes): - """ Create the appropriately sized weights and biases in each layer partition """ - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." 
% input_shapes) - else: - input_shape = input_shapes - - for part in self._parts: - part.build(input_shape) - - self.built = True - - @property - def units(self): - """ Returns the number of output units of the layer """ - return self._units - - @property - def output_size(self): - """ Returns the number of output units of the layer """ - return self._units - - @property - def activation(self): - """ Returns the activation function """ - return self._activation - - @property - def weight_initializer(self): - """ Returns the weight_initializer """ - return self._weight_initializer - - @property - def weight_regularizer(self): - """ Returns the weight_regularizer """ - return self._weight_regularizer - - @property - def weight_constraint(self): - """ Returns the weight_constraint """ - return self._weight_constraint - - @property - def bias_initializer(self): - """ Returns the bias_initializer """ - return self._bias_initializer - - @property - def bias_regularizer(self): - """ Returns the bias_regularizer """ - return self._bias_regularizer - - @property - def bias_constraint(self): - """ Returns the bias_constraint """ - return self._bias_constraint - - @property - def use_bias(self): - """ Returns whether a bias is used in the layer """ - return self._use_bias - - @property - def trainable_variables(self): - """ Returns the trainable variables of the layer """ - trainable_vars = [] - for pt in self._parts: - trainable_vars += pt.trainable_variables - return trainable_vars - - @property - def trainable_weights(self): - """ Returns the trainable variables of the layer """ - return self.trainable_variables - - @property - def non_trainable_variables(self): - """ Returns the non-trainable variables of the layer """ - non_trainable_vars = [] - for pt in self._parts: - non_trainable_vars += pt.non_trainable_variables - return non_trainable_vars - - @property - def non_trainable_weights(self): - """ Returns the non-trainable variables of the layer """ - return self.non_trainable_variables - - @property - def variables(self): - """ Returns a list of all weights and biases in this layer """ - layer_vars = [] - for pt in self._parts: - layer_vars += pt.weights - return layer_vars - - @property - def weights(self): - """ Returns a list of all weights and biases in this layer """ - return self.variables - - @property - def dtype(self): - """ Returns the dtype of the layers weights """ - return self._parts[0].dtype - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. + """ + Full-connected, Dense input layer class. + This layer implements the operation: + + .. code-block:: python + + outputs = activation(inputs.weight + bias) + + Where ``activation`` is the activation function passed as the ``activation`` + argument (if not ``None``), ``weight`` is a weights matrix created by the layer, + and ``bias`` is a bias vector created by the layer. + + However, this layer breaks up ``weight`` into ``num_partitions`` parts, + for the purpose of even disribution of weights across parameter servers + for distributed training. + + Note - This layer is created to allow distributed training optimizations, + but can also be used for single node training (e.g. hogwild) without + code modification + + Arguments: + output_size: + Integer or Long, dimensionality of the output space. + weight_initializer: + Initializer function for the weight matrix. + weight_regularizer: + Regularizer function for the weight matrix. 
+ Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + weight_constraint: + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + num_partitions: + Number of pieces to partition the weights into. This layer does + column partitioning of the weights, which is equivalent to + processing the input tensor with multiple fully connected layers + of smaller output size, and then concatenating these outputs + activation: + Activation function (callable). Set it to None to maintain a linear activation. + use_bias: + Boolean whether to include a bias parameter in the layer + bias_initializer: + Initializer function for the bias. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer: + Regularizer function for the output. + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + + Properties: + output_size: + Python integer, dimensionality of the output space. + activation: + Activation function (callable). + weight_initializer: + Initializer instance (or name) for the weight matrix. + bias_initializer: + Initializer instance (or name) for the bias. + weights: + list of underlying weight and bias matrix components. no guarantee on order of elements + weight_regularizer: + Regularizer instance for the weight matrix (callable) + bias_regularizer: + Regularizer instance for the bias (callable). + activity_regularizer: + Regularizer instance for the output (callable) + weight_constraint: + Constraint function for the weight matrix. + bias_constraint: + Constraint function for the bias. 
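+
+    Example (a minimal sketch with made-up sizes)::
+
+        import tensorflow.compat.v1 as tf
+        import twml
+
+        layer = twml.contrib.layers.FullDense(
+            output_size=64, num_partitions=4, activation=tf.nn.relu
+        )
+        outputs = layer(tf.random.uniform([8, 128]))  # shape [8, 64]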
+ """ + + def __init__( + self, + output_size, + weight_initializer=None, + weight_regularizer=None, + weight_constraint=None, + bias_constraint=None, + num_partitions=3, + activation=None, + use_bias=True, + bias_initializer=tf.zeros_initializer(), + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + **kwargs + ): + super(FullDense, self).__init__(trainable=trainable, name=name, **kwargs) + self._output_sizes = self._get_output_partition_sizes( + output_size, num_partitions + ) + self._units = output_size + self._activation = activation + self._weight_initializer = weight_initializer + self._bias_initializer = bias_initializer + self._weight_regularizer = weight_regularizer + self._bias_regularizer = bias_regularizer + self._weight_constraint = weight_constraint + self._bias_constraint = bias_constraint + self._use_bias = use_bias + # NOTE - many initializers depend on fan_in and fan_out + # - as such, initialization here may be different than + # - for a non-partitioned FullDense + self._parts = [ + core.Dense( + units=out_size, + activation=activation, + use_bias=use_bias, + kernel_initializer=weight_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=weight_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + for out_size in self._output_sizes + ] + + @staticmethod + def _get_output_partition_sizes(out_size, num_parts): + """Returns the appropriate output sizes of the partitions""" + boundaries = [out_size * n // num_parts for n in range(num_parts + 1)] + return [k - j for j, k in zip(boundaries[:], boundaries[1:])] + + def build(self, input_shapes): + """Create the appropriately sized weights and biases in each layer partition""" + if isinstance(input_shapes, (list, tuple)): + input_shape = input_shapes[0] + is_compatible = True + for other_shape in input_shapes[1:]: + is_compatible &= input_shape.is_compatible_with(other_shape) + if not is_compatible: + raise ValueError("Input shapes %s are not compatible." 
% input_shapes) + else: + input_shape = input_shapes + + for part in self._parts: + part.build(input_shape) + + self.built = True + + @property + def units(self): + """Returns the number of output units of the layer""" + return self._units + + @property + def output_size(self): + """Returns the number of output units of the layer""" + return self._units + + @property + def activation(self): + """Returns the activation function""" + return self._activation + + @property + def weight_initializer(self): + """Returns the weight_initializer""" + return self._weight_initializer + + @property + def weight_regularizer(self): + """Returns the weight_regularizer""" + return self._weight_regularizer + + @property + def weight_constraint(self): + """Returns the weight_constraint""" + return self._weight_constraint + + @property + def bias_initializer(self): + """Returns the bias_initializer""" + return self._bias_initializer + + @property + def bias_regularizer(self): + """Returns the bias_regularizer""" + return self._bias_regularizer + + @property + def bias_constraint(self): + """Returns the bias_constraint""" + return self._bias_constraint + + @property + def use_bias(self): + """Returns whether a bias is used in the layer""" + return self._use_bias + + @property + def trainable_variables(self): + """Returns the trainable variables of the layer""" + trainable_vars = [] + for pt in self._parts: + trainable_vars += pt.trainable_variables + return trainable_vars + + @property + def trainable_weights(self): + """Returns the trainable variables of the layer""" + return self.trainable_variables + + @property + def non_trainable_variables(self): + """Returns the non-trainable variables of the layer""" + non_trainable_vars = [] + for pt in self._parts: + non_trainable_vars += pt.non_trainable_variables + return non_trainable_vars + + @property + def non_trainable_weights(self): + """Returns the non-trainable variables of the layer""" + return self.non_trainable_variables + + @property + def variables(self): + """Returns a list of all weights and biases in this layer""" + layer_vars = [] + for pt in self._parts: + layer_vars += pt.weights + return layer_vars + + @property + def weights(self): + """Returns a list of all weights and biases in this layer""" + return self.variables + + @property + def dtype(self): + """Returns the dtype of the layers weights""" + return self._parts[0].dtype + + def call(self, inputs, **kwargs): # pylint: disable=unused-argument + """The logic of the layer lives here. + + Arguments: + inputs: + A dense Tensor or a list of such. + If `inputs` is a list, all tensors must have same `dense_shape`. + + Returns: + - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. + - If `inputs` is a `list[SparseTensor`, then returns + `bias + accumulate_n([sp_a * dense_b for sp_a in inputs])`. + """ + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + outputs = [] + for inp in inputs: + part_outputs = [part(inp) for part in self._parts] + outputs.append(tf.concat(part_outputs, axis=-1)) + + return tf.accumulate_n(outputs) + + +def full_dense( + inputs, + output_size, + weight_initializer=None, + weight_regularizer=None, + weight_constraint=None, + bias_constraint=None, + num_partitions=3, + activation=None, + use_bias=True, + bias_initializer=tf.zeros_initializer(), + bias_regularizer=None, + activity_regularizer=None, + trainable=True, + name=None, + reuse=None, + **kwargs +): + """Functional interface for the fully-connected dense-input layer. 
+ This layer implements the operation: + `outputs = activation(inputs.weight + bias)` + Where `activation` is the activation function passed as the `activation` + argument (if not `None`), `weight` is a weights matrix created by the layer, + and `bias` is a bias vector created by the layer + (only if `use_bias` is `True`). + + However, this layer breaks up ``weight`` into ``num_partitions`` parts, + for the purpose of even disribution of weights across parameter servers + for distributed training. + + Note - This layer is created to allow distributed training optimizations, + but can also be used for single node training (e.g. hogwild) without + code modification Arguments: - inputs: - A dense Tensor or a list of such. - If `inputs` is a list, all tensors must have same `dense_shape`. + inputs: Tensor input. + output_size: Integer or Long, dimensionality of the output space. + weight_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.get_variable`. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + weight_constraint: + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + num_partitions: + Number of pieces to partition the weights into. This layer does + column partitioning of the weights, which is equivalent to + processing the input tensor with multiple fully connected layers + of smaller output size, and then concatenating these outputs + activation: Activation function (callable). Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + bias_initializer: + Initializer function for the bias. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer: + Regularizer function for the output. + trainable: + Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: + String, the name of the layer. + reuse: + Boolean, whether to reuse the weights of a previous layer + by the same name. Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + accumulate_n([sp_a * dense_b for sp_a in inputs])`. + Output tensor with shape `inputs.shape[:-1] + [output_size]`. 
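+
+    Example (a sketch; ``inputs`` is assumed to be a rank-2 float tensor)::
+
+        outputs = full_dense(
+            inputs, output_size=64, num_partitions=4, activation=tf.nn.relu
+        )  # shape [batch, 64]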
""" if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - outputs = [] - for inp in inputs: - part_outputs = [part(inp) for part in self._parts] - outputs.append(tf.concat(part_outputs, axis=-1)) - - return tf.accumulate_n(outputs) - - -def full_dense(inputs, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - reuse=None, - **kwargs): - """Functional interface for the fully-connected dense-input layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - inputs: Tensor input. - output_size: Integer or Long, dimensionality of the output space. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor with shape `inputs.shape[:-1] + [output_size]`. 
- """ - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - dtype = inputs[0].dtype.base_dtype - - layer = FullDense(output_size=output_size, - weight_initializer=weight_initializer, - weight_regularizer=weight_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - num_partitions=num_partitions, - activation=activation, - use_bias=use_bias, - bias_initializer=bias_initializer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - trainable=trainable, - name=name, - dtype=dtype, - _scope=name, - _reuse=reuse, - **kwargs) - - return layer(inputs) + inputs = [inputs] + + dtype = inputs[0].dtype.base_dtype + + layer = FullDense( + output_size=output_size, + weight_initializer=weight_initializer, + weight_regularizer=weight_regularizer, + weight_constraint=weight_constraint, + bias_constraint=bias_constraint, + num_partitions=num_partitions, + activation=activation, + use_bias=use_bias, + bias_initializer=bias_initializer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + dtype=dtype, + _scope=name, + _reuse=reuse, + **kwargs + ) + + return layer(inputs) diff --git a/twml/twml/contrib/layers/hashed_percentile_discretizer.py b/twml/twml/contrib/layers/hashed_percentile_discretizer.py index b32c3be8d..3b89d2702 100644 --- a/twml/twml/contrib/layers/hashed_percentile_discretizer.py +++ b/twml/twml/contrib/layers/hashed_percentile_discretizer.py @@ -4,14 +4,14 @@ """ -from twitter.deepbird.util.hashing import ( - integer_multiplicative_hashing_uniform, - integer_multiplicative_hashing, -) # noqa: F401 - -from libtwml import percentile_discretizer_bin_indices import numpy as np import tensorflow.compat.v1 as tf +from libtwml import percentile_discretizer_bin_indices +from twitter.deepbird.util.hashing import ( # noqa: F401 + integer_multiplicative_hashing, + integer_multiplicative_hashing_uniform, +) + import twml from twml.layers.layer import Layer from twml.layers.partition import Partition @@ -19,199 +19,217 @@ class HashedPercentileDiscretizer(Layer): - """ - HashedPercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator - after accumulating data - and performing minimum description length (PercentileDiscretizer) calibration. - - HashedPercentileDiscretizer takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an HashedPercentileDiscretizer - bin. - Each HashedPercentileDiscretizer input feature is converted to n_bin bins. - Each HashedPercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values - per bin is roughly equal (for each given HashedPercentileDiscretizer feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - The difference between this layer and PercentileDiscretizer is that the - DeterministicPercentileDiscretize always assigns the same output id in the SparseTensor to the - same input feature id + bin. This is useful if you want to user transfer learning on pre-trained - sparse to dense embedding layers, but re-calibrate your discretizer on newer data. - """ - - def __init__(self, n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, - hash_fn=integer_multiplicative_hashing_uniform, **kwargs): """ - Creates a non-initialized `HashedPercentileDiscretizer` object. 
- Before using the table you will have to initialize it. After initialization
- the table will be immutable.
-
- Parent class args:
- see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer)
- for documentation of parent class arguments.
-
- Required args:
- n_feature:
- number of unique features accumulated during HashedPercentileDiscretizer calibration.
- This is the number of features in the hash map.
- Used to initialize bin_values, hash_keys, hash_values,
- bin_ids, bin_values and feature_offsets.
- n_bin:
- number of HashedPercentileDiscretizer bins used for
- HashedPercentileDiscretizer calibration. Used to initialize bin_values, hash_keys,
- hash_values, bin_ids, bin_values and feature_offsets.
- out_bits:
- Determines the maximum value for output feature IDs.
- The dense_shape of the SparseTensor returned by lookup(x)
- will be [x.shape[0], 1 << output_bits].
-
- Optional args:
- hash_keys:
- contains the features ID that HashedPercentileDiscretizer discretizes and knows
- about. The hash map (hash_keys->hash_values) is used for two reasons:
- 1. divide inputs into two feature spaces:
- HashedPercentileDiscretizer vs non-HashedPercentileDiscretizer
- 2. transate the HashedPercentileDiscretizer features into a hash_feature ID that
- HashedPercentileDiscretizer understands.
- The hash_map is expected to contain n_feature items.
- hash_values:
- translates the feature IDs into hash_feature IDs for HashedPercentileDiscretizer.
- bin_ids:
- a 1D Tensor of size n_feature * n_bin + 1 which contains
- unique IDs to which the HashedPercentileDiscretizer features will be translated to.
- For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
- the most efficient output space.
- bin_values:
- a 1D Tensor aligned with bin_ids.
- For a given hash_feature ID j, it's value bin's are indexed between
- `j*n_bin` and `j*n_bin + n_bin-1`.
- As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
- and a inputs value between
- `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
- feature_offsets:
- a 1D Tensor specifying the starting location of bins for a given feature id.
- For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
- hash_fn:
- a function that takes in `feature_ids`, `bucket_indices` and `output_size` and
- hashes the bucketed features into the `output_size` buckets. The default uses knuth's
- multiplicative hashing
+ HashedPercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator
+ after accumulating data
+ and performing minimum description length (PercentileDiscretizer) calibration.
+
+ HashedPercentileDiscretizer takes sparse continuous features and converts them to sparse
+ binary features. Each binary output feature is associated with a HashedPercentileDiscretizer
+ bin.
+ Each HashedPercentileDiscretizer input feature is converted to n_bin bins.
+ Each HashedPercentileDiscretizer calibration tries to find bin delimiters such
+ that the number of feature values
+ per bin is roughly equal (for each given HashedPercentileDiscretizer feature).
+ Note that if an input feature is rarely used, its associated output bins/features will be too.
+ The difference between this layer and PercentileDiscretizer is that
+ HashedPercentileDiscretizer always assigns the same output id in the SparseTensor to the
+ same input feature id + bin. This is useful if you want to use transfer learning on pre-trained
+ sparse to dense embedding layers, but re-calibrate your discretizer on newer data.
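# A minimal numpy sketch of the calibration idea described above: pick bin
# delimiters at percentiles of the observed values so each bin receives a
# roughly equal share, then assign bins by binary search. This only stands in
# for the real PercentileDiscretizerCalibrator; all names are illustrative.
import numpy as np

values = np.random.default_rng(1).lognormal(size=1000)  # skewed continuous feature
n_bin = 4

# interior delimiters at the 25th/50th/75th percentiles -> 4 equal-frequency bins
delimiters = np.percentile(values, 100.0 * np.arange(1, n_bin) / n_bin)
bins = np.searchsorted(delimiters, values)

print(np.bincount(bins))  # each of the 4 bins holds ~250 of the 1000 values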
""" - super(HashedPercentileDiscretizer, self).__init__(**kwargs) - - max_discretizer_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - if not self.built: - self.build(input_shape=None) - - # build variables - self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - self._out_bits = out_bits - - hash_keys = hash_keys - if hash_keys is None: - hash_keys = np.empty(n_feature, dtype=np.int64) - hash_values = hash_values - if hash_values is None: - hash_values = np.empty(n_feature, dtype=np.int64) - - initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) - self.hash_map = tf.lookup.StaticHashTable(initializer, -1) - self.bin_ids = bin_ids - if bin_ids is None: - bin_ids = np.empty(max_discretizer_feature, dtype=np.int64) - - self.bin_values = bin_values - if bin_values is None: - bin_values = np.empty(max_discretizer_feature, dtype=np.float32) - - self.feature_offsets = feature_offsets - if feature_offsets is None: - feature_offsets = np.empty(n_feature, dtype=np.int64) - - self.hash_fn = hash_fn - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer: - hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. - """ - # build layers - self.partition = Partition() - self.stitch = Stitch() - # make sure this is last - self.built = True - - def call(self, inputs, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements HashedPercentileDiscretizer inference where inputs are intersected with a - hash_map. - Part of the inputs are discretized using twml.discretizer - to produce a discretizer_output SparseTensor. - This SparseTensor is then joined with the original inputs SparseTensor, - but only for the inputs keys that did not get discretized. - - Args: - inputs: A 2D SparseTensor that is input to HashedPercentileDiscretizer for - discretization. It has a dense_shape of [batch_size, input_size] - name: A name for the operation (optional). - Returns: - A `SparseTensor` of the same type as `inputs`. - Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
- """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - hashed_keys = self.hash_map.lookup(keys) - hashed_keys = tf.cast(hashed_keys, tf.int64) - - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - found = tf.reshape(found, [-1]) - continuous_feature_ids = tf.boolean_mask(keys, found) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_discretizer_keys, discretizer_in_keys = key - non_discretizer_vals, discretizer_in_vals = vals - - non_discretizer_keys = twml.util.limit_bits(non_discretizer_keys, self._out_bits) - self.non_discretizer_keys = non_discretizer_keys - - # run HashedPercentileDiscretizer on the keys/values it knows about - output = percentile_discretizer_bin_indices(discretizer_in_keys, - discretizer_in_vals, - self.bin_ids, - self.bin_values, - self.feature_offsets) - discretizer_bucket_idxs, discretizer_vals = output - new_discretizer_keys = self.hash_fn(continuous_feature_ids, discretizer_bucket_idxs, - self.output_size) - # Stitch the keys and values from discretizer and non discretizer indices back, with help - # of the Stitch Layer - self.discretizer_out_keys = new_discretizer_keys - - concat_data = self.stitch([non_discretizer_vals, discretizer_vals], - [non_discretizer_keys, new_discretizer_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() + def __init__( + self, + n_feature, + n_bin, + out_bits, + bin_values=None, + hash_keys=None, + hash_values=None, + bin_ids=None, + feature_offsets=None, + hash_fn=integer_multiplicative_hashing_uniform, + **kwargs + ): + """ + Creates a non-initialized `HashedPercentileDiscretizer` object. + Before using the table you will have to initialize it. After initialization + the table will be immutable. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + n_feature: + number of unique features accumulated during HashedPercentileDiscretizer calibration. + This is the number of features in the hash map. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + n_bin: + number of HashedPercentileDiscretizer bins used for + HashedPercentileDiscretizer calibration. Used to initialize bin_values, hash_keys, + hash_values, bin_ids, bin_values and feature_offsets. + out_bits: + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + hash_keys: + contains the features ID that HashedPercentileDiscretizer discretizes and knows + about. The hash map (hash_keys->hash_values) is used for two reasons: + 1. divide inputs into two feature spaces: + HashedPercentileDiscretizer vs non-HashedPercentileDiscretizer + 2. transate the HashedPercentileDiscretizer features into a hash_feature ID that + HashedPercentileDiscretizer understands. + The hash_map is expected to contain n_feature items. 
+ hash_values: + translates the feature IDs into hash_feature IDs for HashedPercentileDiscretizer. + bin_ids: + a 1D Tensor of size n_feature * n_bin + 1 which contains + unique IDs to which the HashedPercentileDiscretizer features will be translated to. + For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce + the most efficient output space. + bin_values: + a 1D Tensor aligned with bin_ids. + For a given hash_feature ID j, it's value bin's are indexed between + `j*n_bin` and `j*n_bin + n_bin-1`. + As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j + and a inputs value between + `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. + feature_offsets: + a 1D Tensor specifying the starting location of bins for a given feature id. + For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). + hash_fn: + a function that takes in `feature_ids`, `bucket_indices` and `output_size` and + hashes the bucketed features into the `output_size` buckets. The default uses knuth's + multiplicative hashing + """ + super(HashedPercentileDiscretizer, self).__init__(**kwargs) + + max_discretizer_feature = n_feature * (n_bin + 1) + self._n_feature = n_feature + self._n_bin = n_bin + + if not self.built: + self.build(input_shape=None) + + # build variables + self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) + self._out_bits = out_bits + + hash_keys = hash_keys + if hash_keys is None: + hash_keys = np.empty(n_feature, dtype=np.int64) + + hash_values = hash_values + if hash_values is None: + hash_values = np.empty(n_feature, dtype=np.int64) + + initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) + self.hash_map = tf.lookup.StaticHashTable(initializer, -1) + self.bin_ids = bin_ids + if bin_ids is None: + bin_ids = np.empty(max_discretizer_feature, dtype=np.int64) + + self.bin_values = bin_values + if bin_values is None: + bin_values = np.empty(max_discretizer_feature, dtype=np.float32) + + self.feature_offsets = feature_offsets + if feature_offsets is None: + feature_offsets = np.empty(n_feature, dtype=np.int64) + + self.hash_fn = hash_fn + + def build(self, input_shape): # pylint: disable=unused-argument + """ + Creates the variables of the layer: + hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. + """ + # build layers + self.partition = Partition() + self.stitch = Stitch() + # make sure this is last + self.built = True + + def call(self, inputs, **kwargs): + """Looks up `keys` in a table, outputs the corresponding values. + + Implements HashedPercentileDiscretizer inference where inputs are intersected with a + hash_map. + Part of the inputs are discretized using twml.discretizer + to produce a discretizer_output SparseTensor. + This SparseTensor is then joined with the original inputs SparseTensor, + but only for the inputs keys that did not get discretized. + + Args: + inputs: A 2D SparseTensor that is input to HashedPercentileDiscretizer for + discretization. It has a dense_shape of [batch_size, input_size] + name: A name for the operation (optional). + Returns: + A `SparseTensor` of the same type as `inputs`. + Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
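# Hedged sketch of the deterministic output-id mapping described above: the
# same (feature id, bin index) pair always hashes to the same id in
# [0, 1 << out_bits). The real default lives in twitter.deepbird.util.hashing;
# the constant and mixing below only illustrate Knuth-style multiplicative
# hashing and are not the production function.
import numpy as np

KNUTH = np.uint64(2654435761)

def toy_hash_fn(feature_ids, bucket_indices, out_bits):
    # mix the pair, then keep the top out_bits bits as the output id
    mixed = feature_ids.astype(np.uint64) * KNUTH + bucket_indices.astype(np.uint64)
    return (mixed * KNUTH) >> np.uint64(64 - out_bits)

ids = np.array([101, 101, 202], dtype=np.int64)
buckets = np.array([0, 1, 0], dtype=np.int64)
print(toy_hash_fn(ids, buckets, out_bits=8))  # stable across re-calibrations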
+ """ + if isinstance(inputs, tf.SparseTensor): + inputs = twml.SparseTensor.from_tf(inputs) + + assert isinstance(inputs, twml.SparseTensor) + + # sparse column indices + ids = inputs.ids + # sparse row indices + keys = inputs.indices + # sparse values + vals = inputs.values + + hashed_keys = self.hash_map.lookup(keys) + hashed_keys = tf.cast(hashed_keys, tf.int64) + + found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) + partition_ids = tf.cast(found, tf.int32) + + found = tf.reshape(found, [-1]) + continuous_feature_ids = tf.boolean_mask(keys, found) + + vals, key, indices = self.partition( + partition_ids, vals, tf.where(found, hashed_keys, keys) + ) + non_discretizer_keys, discretizer_in_keys = key + non_discretizer_vals, discretizer_in_vals = vals + + non_discretizer_keys = twml.util.limit_bits( + non_discretizer_keys, self._out_bits + ) + self.non_discretizer_keys = non_discretizer_keys + + # run HashedPercentileDiscretizer on the keys/values it knows about + output = percentile_discretizer_bin_indices( + discretizer_in_keys, + discretizer_in_vals, + self.bin_ids, + self.bin_values, + self.feature_offsets, + ) + discretizer_bucket_idxs, discretizer_vals = output + new_discretizer_keys = self.hash_fn( + continuous_feature_ids, discretizer_bucket_idxs, self.output_size + ) + # Stitch the keys and values from discretizer and non discretizer indices back, with help + # of the Stitch Layer + self.discretizer_out_keys = new_discretizer_keys + + concat_data = self.stitch( + [non_discretizer_vals, discretizer_vals], + [non_discretizer_keys, new_discretizer_keys], + indices, + ) + + concat_vals, concat_keys = concat_data + + # Generate output shape using _compute_output_shape + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_shape = [batch_size, self.output_size] + return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() diff --git a/twml/twml/contrib/layers/hashing_discretizer.py b/twml/twml/contrib/layers/hashing_discretizer.py index 2a8244f4b..e6be4b75c 100644 --- a/twml/twml/contrib/layers/hashing_discretizer.py +++ b/twml/twml/contrib/layers/hashing_discretizer.py @@ -6,151 +6,165 @@ import libtwml import tensorflow.compat.v1 as tf + import twml from twml.constants import HashingDiscretizerOptions from twml.layers.layer import Layer class HashingDiscretizer(Layer): - """A layer that discretizes continuous features, with hashed feature assignments - - HashingDiscretizer converts sparse continuous features into sparse - binary features. Each binary output feature indicates the presence of a - value in a HashingDiscretizer bin. - - Each calibrated HashingDiscretizer input feature is converted to n_bin+1 bins. - - - n_bin bin boundaries for each feature (i.e. 
len(bin_vals[id])==n_bin) defines n_bin+1 bins - - bin assignment = sum(bin_vals 0: - # pass all inputs to the c++ op - # the op determines whether to discretize (when a feature is calibrated), - # or whether to simply limit bits and pass through (when not calibrated) - # NOTE - Hashing is done in C++ - discretizer_keys, discretizer_vals = libtwml.ops.hashing_discretizer( - input_ids=keys, # Input - input_vals=vals, # Input - bin_vals=self._bin_vals, # Input - feature_ids=tf.make_tensor_proto(self._feature_ids), # Attr - n_bin=self._n_bin, # Attr - output_bits=self._out_bits, # Attr - cost_per_unit=self.cost_per_unit, # Attr - options=self._options, # Attr - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_size = tf.convert_to_tensor(1 << self._out_bits, tf.int64) - output_shape = [batch_size, output_size] - - return twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() + def __init__( + self, + feature_ids, + bin_vals, + n_bin, + out_bits, + cost_per_unit=500, + options=None, + **kwargs + ): + """ + Creates a non-initialized `HashingDiscretizer` object. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + feature_ids (1D int64 numpy array): + - list of feature IDs that have been calibrated and have corresponding + bin boundary values in the bin_vals array + - bin values for feature feature_ids[i] live at bin_vals[i*n_bin:(i+1)*n_bin] + bin_vals (1D float numpy array): + - These are the bin boundary values for each calibrated feature + - len(bin_vals) = n_bin*len(feature_ids) + n_bin (int): + - number of HashingDiscretizer bins is actually n_bin + 1 + - ***Note*** that if a value N is passed for the value of n_bin to + HashingDiscretizerCalibrator, then HashingDiscretizerCalibrator + will generate N+1 bin boundaries for each feature, and hence there + will actually be N+2 potential bins for each feature + out_bits (int): + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + cost_per_unit (int): + - heuristic for intra op multithreading. approximate nanoseconds per input value. + options (int or None for default): + - Selects behavior of the op. Default is lower_bound and integer_multiplicative_hashing. + - Use values in twml.constants.HashingDiscretizerOptions to select options as follows + choose exactly one of HashingDiscretizerOptions.{SEARCH_LOWER_BOUND, SEARCH_LINEAR, SEARCH_UPPER_BOUND} + choose exactly one of HashingDiscretizerOptions.{HASH_32BIT, HASH_64BIT} + Bitwise OR these together to construct the options input. 
+ For example, `options=(HashingDiscretizerOptions.SEARCH_UPPER_BOUND | HashingDiscretizerOptions.HASH_64BIT)` + """ + super(HashingDiscretizer, self).__init__(**kwargs) + + self._feature_ids = feature_ids + self._bin_vals = bin_vals + self._n_bin = n_bin + self._out_bits = out_bits + self.cost_per_unit = cost_per_unit + if options is None: + options = ( + HashingDiscretizerOptions.SEARCH_LOWER_BOUND + | HashingDiscretizerOptions.HASH_32BIT + ) + self._options = options + + if not self.built: + self.build(input_shape=None) + + def build(self, input_shape): # pylint: disable=unused-argument + """ + Creates the variables of the layer + """ + # make sure this is last + self.built = True + + def call(self, inputs, **kwargs): + """ + Implements HashingDiscretizer inference on a twml.SparseTensor. + Alternatively, accepts a tf.SparseTensor that can be converted + to twml.SparseTensor. + + Performs discretization of input values. + i.e. bucket_val = bucket(val | feature_id) + + This bucket mapping depends on the calibration (i.e. the bin boundaries). + However, (feature_id, bucket_val) pairs are mapped to new_feature_id in + a way that is independent of the calibration procedure + + Args: + inputs: A 2D SparseTensor that is input to HashingDiscretizer for + discretization. It has a dense_shape of [batch_size, input_size] + name: A name for the operation (optional). + Returns: + A tf.SparseTensor, created from twml.SparseTensor.to_tf() + Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. + """ + if isinstance(inputs, tf.SparseTensor): + inputs = twml.SparseTensor.from_tf(inputs) + + assert isinstance(inputs, twml.SparseTensor) + + # sparse column indices + ids = inputs.ids + # sparse row indices + keys = inputs.indices + # sparse values + vals = inputs.values + + if len(self._feature_ids) > 0: + # pass all inputs to the c++ op + # the op determines whether to discretize (when a feature is calibrated), + # or whether to simply limit bits and pass through (when not calibrated) + # NOTE - Hashing is done in C++ + discretizer_keys, discretizer_vals = libtwml.ops.hashing_discretizer( + input_ids=keys, # Input + input_vals=vals, # Input + bin_vals=self._bin_vals, # Input + feature_ids=tf.make_tensor_proto(self._feature_ids), # Attr + n_bin=self._n_bin, # Attr + output_bits=self._out_bits, # Attr + cost_per_unit=self.cost_per_unit, # Attr + options=self._options, # Attr + ) + else: + discretizer_keys = twml.util.limit_bits(keys, self._out_bits) + discretizer_vals = vals + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_size = tf.convert_to_tensor(1 << self._out_bits, tf.int64) + output_shape = [batch_size, output_size] + + return twml.SparseTensor( + ids, discretizer_keys, discretizer_vals, output_shape + ).to_tf() diff --git a/twml/twml/contrib/layers/mask_layer.py b/twml/twml/contrib/layers/mask_layer.py index f5e788c7b..70be78fc3 100644 --- a/twml/twml/contrib/layers/mask_layer.py +++ b/twml/twml/contrib/layers/mask_layer.py @@ -3,27 +3,27 @@ class MaskLayer(Layer): - """ - This layer corresponds to `twml.contrib.pruning.apply_mask`. - - It applies a binary mask to mask out channels of a given tensor. The masks can be - optimized using `twml.contrib.trainers.PruningDataRecordTrainer`. - """ + """ + This layer corresponds to `twml.contrib.pruning.apply_mask`. - def call(self, inputs, **kwargs): + It applies a binary mask to mask out channels of a given tensor. The masks can be + optimized using `twml.contrib.trainers.PruningDataRecordTrainer`. 
""" - Applies a binary mask to the channels of the input. - Arguments: - inputs: - input tensor - **kwargs: - additional keyword arguments + def call(self, inputs, **kwargs): + """ + Applies a binary mask to the channels of the input. - Returns: - Masked tensor - """ - return apply_mask(inputs) + Arguments: + inputs: + input tensor + **kwargs: + additional keyword arguments + + Returns: + Masked tensor + """ + return apply_mask(inputs) - def compute_output_shape(self, input_shape): - return input_shape + def compute_output_shape(self, input_shape): + return input_shape diff --git a/twml/twml/contrib/layers/stacked_rnn.py b/twml/twml/contrib/layers/stacked_rnn.py index e05f5d853..d141be6c2 100644 --- a/twml/twml/contrib/layers/stacked_rnn.py +++ b/twml/twml/contrib/layers/stacked_rnn.py @@ -1,189 +1,204 @@ - +import tensorflow +import tensorflow.compat.v1 as tf from twitter.deepbird.compat.v1.rnn import stack_bidirectional_dynamic_rnn -import tensorflow.compat.v1 as tf -import tensorflow import twml def _get_rnn_cell_creator(cell_type): - if cell_type == "LSTM": - Cell = tf.nn.rnn_cell.LSTMCell - elif cell_type == "GRU": - Cell = tf.nn.rnn_cell.GRUCell - else: - raise ValueError("cell_type: %s is not supported." - "It should be one of 'LSTM' or 'GRU'." % cell_type) - return Cell + if cell_type == "LSTM": + Cell = tf.nn.rnn_cell.LSTMCell + elif cell_type == "GRU": + Cell = tf.nn.rnn_cell.GRUCell + else: + raise ValueError( + "cell_type: %s is not supported." + "It should be one of 'LSTM' or 'GRU'." % cell_type + ) + return Cell def _apply_dropout_wrapper(rnn_cells, dropout): - """ Apply dropout wrapper around each cell if necessary """ - if rnn_cells is None: - return None + """Apply dropout wrapper around each cell if necessary""" + if rnn_cells is None: + return None - cells = [] - for i, dropout_rate in enumerate(dropout): - cell = rnn_cells[i] - if dropout_rate > 0: - cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=(1.0 - dropout_rate)) - cells.append(cell) - return cells + cells = [] + for i, dropout_rate in enumerate(dropout): + cell = rnn_cells[i] + if dropout_rate > 0: + cell = tf.nn.rnn_cell.DropoutWrapper( + cell, input_keep_prob=(1.0 - dropout_rate) + ) + cells.append(cell) + return cells def _create_bidirectional_rnn_cell(num_units, dropout, cell_type): - scope_name = "lstm" if cell_type else "gru" - with tf.variable_scope(scope_name): - Cell = _get_rnn_cell_creator(cell_type) - cells_forward = [Cell(output_size) for output_size in num_units] - cells_backward = [Cell(output_size) for output_size in num_units] - cells_forward = _apply_dropout_wrapper(cells_forward, dropout) - cells_backward = _apply_dropout_wrapper(cells_backward, dropout) - - def stacked_rnn_cell(inputs, sequence_lengths): + scope_name = "lstm" if cell_type else "gru" with tf.variable_scope(scope_name): - outputs, final_states, _ = stack_bidirectional_dynamic_rnn( - cells_fw=cells_forward, cells_bw=cells_backward, inputs=inputs, - sequence_length=sequence_lengths, dtype=inputs.dtype) - return final_states[-1][-1] - - return stacked_rnn_cell + Cell = _get_rnn_cell_creator(cell_type) + cells_forward = [Cell(output_size) for output_size in num_units] + cells_backward = [Cell(output_size) for output_size in num_units] + cells_forward = _apply_dropout_wrapper(cells_forward, dropout) + cells_backward = _apply_dropout_wrapper(cells_backward, dropout) + + def stacked_rnn_cell(inputs, sequence_lengths): + with tf.variable_scope(scope_name): + outputs, final_states, _ = stack_bidirectional_dynamic_rnn( + 
cells_fw=cells_forward, + cells_bw=cells_backward, + inputs=inputs, + sequence_length=sequence_lengths, + dtype=inputs.dtype, + ) + return final_states[-1][-1] + + return stacked_rnn_cell def _create_unidirectional_rnn_cell(num_units, dropout, cell_type): - scope_name = "lstm" if cell_type else "gru" - with tf.variable_scope(scope_name): - Cell = _get_rnn_cell_creator(cell_type) - cells = [Cell(output_size) for output_size in num_units] - cells = _apply_dropout_wrapper(cells, dropout) - multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells) - - def stacked_rnn_cell(inputs, sequence_lengths): + scope_name = "lstm" if cell_type else "gru" with tf.variable_scope(scope_name): - outputs, final_states = tf.nn.static_rnn( - multi_cell, - tf.unstack(inputs, axis=1), - dtype=inputs.dtype, - sequence_length=sequence_lengths) - return final_states[-1].h + Cell = _get_rnn_cell_creator(cell_type) + cells = [Cell(output_size) for output_size in num_units] + cells = _apply_dropout_wrapper(cells, dropout) + multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells) - return stacked_rnn_cell + def stacked_rnn_cell(inputs, sequence_lengths): + with tf.variable_scope(scope_name): + outputs, final_states = tf.nn.static_rnn( + multi_cell, + tf.unstack(inputs, axis=1), + dtype=inputs.dtype, + sequence_length=sequence_lengths, + ) + return final_states[-1].h + return stacked_rnn_cell -def _create_regular_rnn_cell(num_units, dropout, cell_type, is_bidirectional): - if is_bidirectional: - return _create_bidirectional_rnn_cell(num_units, dropout, cell_type) - else: - return _create_unidirectional_rnn_cell(num_units, dropout, cell_type) - -class StackedRNN(twml.layers.Layer): - """ - Layer for stacking RNN modules. - This layer provides a unified interface for RNN modules that perform well on CPUs and GPUs. - - Arguments: - num_units: - A list specifying the number of units per layer. - dropout: - Dropout applied to the input of each cell. - If list, has to dropout used for each layer. - If number, the same amount of dropout is used everywhere. - Defaults to 0. - is_training: - Flag to specify if the layer is used in training mode or not. - cell_type: - Sepcifies the type of RNN. Can be "LSTM". "GRU" is not yet implemented. - is_bidirectional: - Specifies if the stacked RNN layer is bidirectional. - This is for forward compatibility, this is not yet implemented. - Defaults to False. 
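# Small sketch of the dropout handling used by these cell builders: a scalar
# rate is broadcast to one rate per stacked layer, and each rate becomes an
# input keep probability of (1 - rate) on the DropoutWrapper. Values below
# are illustrative.
num_units = [64, 32]  # two stacked layers
dropout = 0.2         # scalar -> same rate for every layer

if not isinstance(dropout, (list, tuple)):
    dropout = [dropout] * len(num_units)

keep_probs = [1.0 - rate for rate in dropout]
print(keep_probs)  # [0.8, 0.8]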
- """ - - def __init__(self, - num_units, - dropout=0, - is_training=True, - cell_type="LSTM", - is_bidirectional=False, - name="stacked_rnn"): - - super(StackedRNN, self).__init__(name=name) - - if (is_bidirectional): - raise NotImplementedError("Bidirectional RNN is not yet implemented") - - if (cell_type != "LSTM"): - raise NotImplementedError("Only LSTMs are supported") - - if not isinstance(num_units, (list, tuple)): - num_units = [num_units] - else: - num_units = num_units - - self.num_layers = len(num_units) - if not isinstance(dropout, (tuple, list)): - dropout = [dropout] * self.num_layers +def _create_regular_rnn_cell(num_units, dropout, cell_type, is_bidirectional): + if is_bidirectional: + return _create_bidirectional_rnn_cell(num_units, dropout, cell_type) else: - dropout = dropout - - self.is_training = is_training - - is_gpu_available = twml.contrib.utils.is_gpu_available() - same_unit_size = all(size == num_units[0] for size in num_units) - same_dropout_rate = any(val == dropout[0] for val in dropout) + return _create_unidirectional_rnn_cell(num_units, dropout, cell_type) - self.stacked_rnn_cell = None - self.num_units = num_units - self.dropout = dropout - self.cell_type = cell_type - self.is_bidirectional = is_bidirectional - def build(self, input_shape): - self.stacked_rnn_cell = _create_regular_rnn_cell(self.num_units, - self.dropout, - self.cell_type, - self.is_bidirectional) +class StackedRNN(twml.layers.Layer): + """ + Layer for stacking RNN modules. + This layer provides a unified interface for RNN modules that perform well on CPUs and GPUs. - def call(self, inputs, sequence_lengths): + Arguments: + num_units: + A list specifying the number of units per layer. + dropout: + Dropout applied to the input of each cell. + If list, has to dropout used for each layer. + If number, the same amount of dropout is used everywhere. + Defaults to 0. + is_training: + Flag to specify if the layer is used in training mode or not. + cell_type: + Sepcifies the type of RNN. Can be "LSTM". "GRU" is not yet implemented. + is_bidirectional: + Specifies if the stacked RNN layer is bidirectional. + This is for forward compatibility, this is not yet implemented. + Defaults to False. """ + + def __init__( + self, + num_units, + dropout=0, + is_training=True, + cell_type="LSTM", + is_bidirectional=False, + name="stacked_rnn", + ): + super(StackedRNN, self).__init__(name=name) + + if is_bidirectional: + raise NotImplementedError("Bidirectional RNN is not yet implemented") + + if cell_type != "LSTM": + raise NotImplementedError("Only LSTMs are supported") + + if not isinstance(num_units, (list, tuple)): + num_units = [num_units] + else: + num_units = num_units + + self.num_layers = len(num_units) + if not isinstance(dropout, (tuple, list)): + dropout = [dropout] * self.num_layers + else: + dropout = dropout + + self.is_training = is_training + + is_gpu_available = twml.contrib.utils.is_gpu_available() + same_unit_size = all(size == num_units[0] for size in num_units) + same_dropout_rate = any(val == dropout[0] for val in dropout) + + self.stacked_rnn_cell = None + self.num_units = num_units + self.dropout = dropout + self.cell_type = cell_type + self.is_bidirectional = is_bidirectional + + def build(self, input_shape): + self.stacked_rnn_cell = _create_regular_rnn_cell( + self.num_units, self.dropout, self.cell_type, self.is_bidirectional + ) + + def call(self, inputs, sequence_lengths): + """ + Arguments: + inputs: + A tensor of size [batch_size, max_sequence_length, embedding_size]. 
+ sequence_lengths:
+ The length of each input sequence in the batch. Should be of size [batch_size].
+ Returns:
+ final_output
+ The output at the end of the sequence.
+ """
+ return self.stacked_rnn_cell(inputs, sequence_lengths)
+
+
+def stacked_rnn(
+ inputs,
+ sequence_lengths,
+ num_units,
+ dropout=0,
+ is_training=True,
+ cell_type="LSTM",
+ is_bidirectional=False,
+ name="stacked_rnn",
+):
+ """Functional interface for StackedRNN
 Arguments:
 inputs:
 A tensor of size [batch_size, max_sequence_length, embedding_size].
 sequence_lengths:
 The length of each input sequence in the batch. Should be of size [batch_size].
- Returns:
- final_output
- The output of at the end of sequence_length.
+ num_units:
+ A list specifying the number of units per layer.
+ dropout:
+ Dropout applied to the input of each cell.
+ If a list, it must give the dropout used for each layer.
+ If a number, the same amount of dropout is used everywhere.
+ Defaults to 0.
+ is_training:
+ Flag to specify if the layer is used in training mode or not.
+ cell_type:
+ Specifies the type of RNN. Can be "LSTM" or "GRU".
+ is_bidirectional:
+ Specifies if the stacked RNN layer is bidirectional.
+ Defaults to False.
+ Returns:
+ outputs, state.
 """
- return self.stacked_rnn_cell(inputs, sequence_lengths)
-
-
-def stacked_rnn(inputs, sequence_lengths, num_units,
- dropout=0, is_training=True,
- cell_type="LSTM", is_bidirectional=False, name="stacked_rnn"):
- """Functional interface for StackedRNN
- Arguments:
- inputs:
- A tensor of size [batch_size, max_sequence_length, embedding_size].
- sequence_lengths:
- The length of each input sequence in the batch. Should be of size [batch_size].
- num_units:
- A list specifying the number of units per layer.
- dropout:
- Dropout applied to the input of each cell.
- If list, has to dropout used for each layer.
- If number, the same amount of dropout is used everywhere.
- Defaults to 0.
- is_training:
- Flag to specify if the layer is used in training mode or not.
- cell_type:
- Sepcifies the type of RNN. Can be "LSTM" or "GRU".
- is_bidirectional:
- Specifies if the stacked RNN layer is bidirectional.
- Defaults to False.
- Returns
- outputs, state.
- """
- rnn = StackedRNN(num_units, dropout, is_training, cell_type, is_bidirectional, name)
- return rnn(inputs, sequence_lengths)
+ rnn = StackedRNN(num_units, dropout, is_training, cell_type, is_bidirectional, name)
+ return rnn(inputs, sequence_lengths)
diff --git a/twml/twml/contrib/layers/zscore_normalization.py b/twml/twml/contrib/layers/zscore_normalization.py
index 8a1064965..b6153e219 100644
--- a/twml/twml/contrib/layers/zscore_normalization.py
+++ b/twml/twml/contrib/layers/zscore_normalization.py
@@ -1,186 +1,237 @@
 """
 Contains the twml.layers.ZscoreNormalization layer.
 """
-from twml.layers.layer import Layer
 import tensorflow.compat.v1 as tf
-
 from tensorflow.python.training import moving_averages
+from twml.layers.layer import Layer
+

 # This is copied from tensorflow.contrib.framework.python.ops.add_model_variable in 1.15
 # Not available in 2.x
 # TODO: Figure out if this is really necessary.
 def _add_model_variable(var):
- """Adds a variable to the `GraphKeys.MODEL_VARIABLES` collection.
- Args:
- var: a variable.
+ """Adds a variable to the `GraphKeys.MODEL_VARIABLES` collection.
+ Args:
+ var: a variable.
+ """ + if var not in tf.get_collection(tf.GraphKeys.MODEL_VARIABLES): + tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, var) def update_moving_variable(batch_var, moving_var, decay, zero_debias=True, name=None): - update_op = moving_averages.assign_moving_average( - moving_var, batch_var, decay, zero_debias=zero_debias, name=None) - _add_model_variable(moving_var) - with tf.control_dependencies([update_op]): - return tf.identity(moving_var) + update_op = moving_averages.assign_moving_average( + moving_var, batch_var, decay, zero_debias=zero_debias, name=None + ) + _add_model_variable(moving_var) + with tf.control_dependencies([update_op]): + return tf.identity(moving_var) class ZscoreNormalization(Layer): - """ - Perform z-score normalization using moving mean and std. - Missing values are not included during mean/std calculation - This layer should only be used right after input layer. + """ + Perform z-score normalization using moving mean and std. + Missing values are not included during mean/std calculation + This layer should only be used right after input layer. - Args: - decay: - using large decay to include longer moving means. - data_type: - use float64 to prevent overflow during variance calculation. - name: - Layer name - Returns: - A layer representing the output of the ZscoreNormalization transformation. - """ + Args: + decay: + using large decay to include longer moving means. + data_type: + use float64 to prevent overflow during variance calculation. + name: + Layer name + Returns: + A layer representing the output of the ZscoreNormalization transformation. + """ - def __init__( - self, - decay=0.9999, - data_type=tf.float64, - name=None, - **kwargs): - super(ZscoreNormalization, self).__init__(name=name, **kwargs) - self.epsilon = tf.constant(1., data_type) - self.decay = decay - self.data_type = data_type - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the moving_mean and moving_var tf.Variables of the layer.""" - input_dim = input_shape[1] - self.moving_mean = self.add_variable( - '{}_mean/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.moving_var = self.add_variable( - '{}_variance/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.built = True + def __init__(self, decay=0.9999, data_type=tf.float64, name=None, **kwargs): + super(ZscoreNormalization, self).__init__(name=name, **kwargs) + self.epsilon = tf.constant(1.0, data_type) + self.decay = decay + self.data_type = data_type - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + def build(self, input_shape): # pylint: disable=unused-argument + """Creates the moving_mean and moving_var tf.Variables of the layer.""" + input_dim = input_shape[1] + self.moving_mean = self.add_variable( + "{}_mean/EMA".format(self.name), + initializer=tf.constant_initializer(), + shape=[input_dim], + dtype=self.data_type, + trainable=False, + ) + self.moving_var = self.add_variable( + "{}_variance/EMA".format(self.name), + initializer=tf.constant_initializer(), + shape=[input_dim], + dtype=self.data_type, + trainable=False, + ) + self.built = True - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). 
+ def compute_output_shape(self, input_shape):
+ """Computes the output shape of the layer given the input shape.
+
+ Args:
+ input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+ be fully defined (e.g. the batch size may be unknown).
+
+ """
+
+ return input_shape
+
+ def _training_pass(
+ self, input, dense_mask, input_dtype, handle_single, zero_debias
+ ):
+ epsilon = self.epsilon
+ moving_mean, moving_var = self.moving_mean, self.moving_var
+ # count the number of existing (non-missing) values for each feature
+ tensor_batch_num = tf.reduce_sum(tf.cast(dense_mask, self.data_type), axis=0)
+ mask_ones = tf.cast(tensor_batch_num, tf.bool)
+ eps_vector = tf.fill(tf.shape(tensor_batch_num), epsilon)
+ # the following fills zero counts with epsilon
+ tensor_batch_num_eps = tf.where(mask_ones, tensor_batch_num, eps_vector)
+ tensor_batch_num_eps_broadcast = tf.expand_dims(tensor_batch_num_eps, 0)
+ tensor_batch_divided = input / tensor_batch_num_eps_broadcast
+ tensor_batch_mean = tf.reduce_sum(tensor_batch_divided, axis=0)
+
+ # update the moving mean here, and use it to calculate the std.
+ tensor_moving_mean = update_moving_variable(
+ tensor_batch_mean, moving_mean, self.decay, zero_debias, name="mean_ema_op"
+ )
+
+ tensor_batch_sub_mean = input - tf.expand_dims(tensor_moving_mean, 0)
+ tensor_batch_sub_mean = tf.where(
+ dense_mask, tensor_batch_sub_mean, tf.zeros_like(tensor_batch_sub_mean)
+ )
+ # divide by sqrt(n) before squaring, then sum, for numeric stability.
+ broad_sqrt_num_eps = tf.expand_dims(tf.sqrt(tensor_batch_num_eps), 0)
+ tensor_batch_sub_mean_div = tensor_batch_sub_mean / broad_sqrt_num_eps
+ tensor_batch_sub_mean_div_square = tf.square(tensor_batch_sub_mean_div)
+ tensor_batch_var = tf.reduce_sum(tensor_batch_sub_mean_div_square, axis=0)

- """
-
- return input_shape
-
- def _training_pass(self, input, dense_mask, input_dtype, handle_single, zero_debias):
- epsilon = self.epsilon
- moving_mean, moving_var = self.moving_mean, self.moving_var
- # calculate the number of exisiting value for each feature
- tensor_batch_num = tf.reduce_sum(tf.cast(dense_mask, self.data_type), axis=0)
- mask_ones = tf.cast(tensor_batch_num, tf.bool)
- eps_vector = tf.fill(tf.shape(tensor_batch_num), epsilon)
- # the following filled 0 with epision
- tensor_batch_num_eps = tf.where(mask_ones,
- tensor_batch_num,
- eps_vector
- )
- tensor_batch_num_eps_broacast = tf.expand_dims(tensor_batch_num_eps, 0)
- tensor_batch_divided = input / tensor_batch_num_eps_broacast
- tensor_batch_mean = tf.reduce_sum(tensor_batch_divided, axis=0)
-
- # update moving mean here, and use it to calculate the std.
- tensor_moving_mean = update_moving_variable(tensor_batch_mean, moving_mean, self.decay,
- zero_debias, name="mean_ema_op")
-
- tensor_batch_sub_mean = input - tf.expand_dims(tensor_moving_mean, 0)
- tensor_batch_sub_mean = tf.where(dense_mask,
- tensor_batch_sub_mean,
- tf.zeros_like(tensor_batch_sub_mean))
- # divided by sqrt(n) before square, and then do summation for numeric stability.
- broad_sqrt_num_eps = tf.expand_dims(tf.sqrt(tensor_batch_num_eps), 0)
- tensor_batch_sub_mean_div = tensor_batch_sub_mean / broad_sqrt_num_eps
- tensor_batch_sub_mean_div_square = tf.square(tensor_batch_sub_mean_div)
- tensor_batch_var = tf.reduce_sum(tensor_batch_sub_mean_div_square, axis=0)
-
- # update moving var here, dont replace 0 with eps before updating.
- tensor_moving_var = update_moving_variable(tensor_batch_var, moving_var, self.decay, - zero_debias, name="var_ema_op") - - # if std is 0, replace it with epsilon - tensor_moving_std = tf.sqrt(tensor_moving_var) - tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0), - eps_vector, - tensor_moving_std) - - missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0) - - if handle_single: - # if std==0 and value not missing, reset it to 1. - moving_var_mask_zero = tf.math.equal(tensor_moving_var, 0) - moving_var_mask_zero = tf.expand_dims(moving_var_mask_zero, 0) - missing_input_norm = tf.where( - tf.math.logical_and(dense_mask, moving_var_mask_zero), - tf.ones_like(missing_input_norm), - missing_input_norm - ) - if input_dtype != self.data_type: - missing_input_norm = tf.cast(missing_input_norm, input_dtype) - return missing_input_norm - - def _infer_pass(self, input, dense_mask, input_dtype, handle_single): - epsilon = tf.cast(self.epsilon, input_dtype) - testing_moving_mean = tf.cast(self.moving_mean, input_dtype) - tensor_moving_std = tf.cast(tf.sqrt(self.moving_var), input_dtype) - - broad_mean = tf.expand_dims(testing_moving_mean, 0) - tensor_batch_sub_mean = input - broad_mean - - tensor_batch_sub_mean = tf.where(dense_mask, - tensor_batch_sub_mean, - tf.zeros_like(tensor_batch_sub_mean) - ) - tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0), - tf.fill(tf.shape(tensor_moving_std), epsilon), - tensor_moving_std) - missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0) - if handle_single: - # if std==0 and value not missing, reset it to 1. - moving_var_broad = tf.expand_dims(tensor_moving_std, 0) - moving_var_mask_zero = tf.math.logical_not(tf.cast(moving_var_broad, tf.bool)) - - missing_input_norm = tf.where(tf.math.logical_and(dense_mask, moving_var_mask_zero), - tf.ones_like(missing_input_norm), - missing_input_norm - ) - return missing_input_norm - - def call( - self, + # update moving var here, dont replace 0 with eps before updating. + tensor_moving_var = update_moving_variable( + tensor_batch_var, moving_var, self.decay, zero_debias, name="var_ema_op" + ) + + # if std is 0, replace it with epsilon + tensor_moving_std = tf.sqrt(tensor_moving_var) + tensor_moving_std_eps = tf.where( + tf.equal(tensor_moving_std, 0), eps_vector, tensor_moving_std + ) + + missing_input_norm = tensor_batch_sub_mean / tf.expand_dims( + tensor_moving_std_eps, 0 + ) + + if handle_single: + # if std==0 and value not missing, reset it to 1. 
+ moving_var_mask_zero = tf.math.equal(tensor_moving_var, 0)
+ moving_var_mask_zero = tf.expand_dims(moving_var_mask_zero, 0)
+ missing_input_norm = tf.where(
+ tf.math.logical_and(dense_mask, moving_var_mask_zero),
+ tf.ones_like(missing_input_norm),
+ missing_input_norm,
+ )
+ if input_dtype != self.data_type:
+ missing_input_norm = tf.cast(missing_input_norm, input_dtype)
+ return missing_input_norm
+
+ def _infer_pass(self, input, dense_mask, input_dtype, handle_single):
+ epsilon = tf.cast(self.epsilon, input_dtype)
+ testing_moving_mean = tf.cast(self.moving_mean, input_dtype)
+ tensor_moving_std = tf.cast(tf.sqrt(self.moving_var), input_dtype)
+
+ broad_mean = tf.expand_dims(testing_moving_mean, 0)
+ tensor_batch_sub_mean = input - broad_mean
+
+ tensor_batch_sub_mean = tf.where(
+ dense_mask, tensor_batch_sub_mean, tf.zeros_like(tensor_batch_sub_mean)
+ )
+ tensor_moving_std_eps = tf.where(
+ tf.equal(tensor_moving_std, 0),
+ tf.fill(tf.shape(tensor_moving_std), epsilon),
+ tensor_moving_std,
+ )
+ missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(
+ tensor_moving_std_eps, 0
+ )
+ if handle_single:
+ # if std==0 and value not missing, reset it to 1.
+ moving_var_broad = tf.expand_dims(tensor_moving_std, 0)
+ moving_var_mask_zero = tf.math.logical_not(
+ tf.cast(moving_var_broad, tf.bool)
+ )
+
+ missing_input_norm = tf.where(
+ tf.math.logical_and(dense_mask, moving_var_mask_zero),
+ tf.ones_like(missing_input_norm),
+ missing_input_norm,
+ )
+ return missing_input_norm
+
+ def call(
+ self, input, is_training, dense_mask=None, zero_debias=True, handle_single=False
+ ):
+ """
+ Args:
+ -----------
+ input: B x D : float32/float64
+ missing value must be set to 0.
+ is_training: bool
+ training phase or testing phase
+ dense_mask: B x D : bool
+ missing value should be marked as 0, non-missing as 1. same shape as input
+ zero_debias: bool
+ bias correction of the moving average. (biased towards 0 in the beginning.
+ see adam paper. https://arxiv.org/abs/1412.6980)
+ handle_single: bool
+ if std == 0 and the feature is not missing, set the value to 1 instead of 0.
+ This is super rare if the input only consists of continuous features.
+ But if one-hot features are included,
+ they will all have the same value 1; in that case, make sure to set handle_single to true.
+ """
+
+ if dense_mask is None:
+ dense_mask = tf.math.logical_not(tf.equal(input, 0))
+ input_dtype = input.dtype
+
+ if is_training:
+ if input_dtype != self.data_type:
+ input = tf.cast(input, self.data_type)
+ return self._training_pass(
+ input, dense_mask, input_dtype, handle_single, zero_debias
+ )
+ else:
+ return self._infer_pass(input, dense_mask, input_dtype, handle_single)
+
+
+def zscore_normalization(
 input,
 is_training,
+ decay=0.9999,
+ data_type=tf.float64,
+ name=None,
 dense_mask=None,
 zero_debias=True,
- handle_single=False):
+ handle_single=False,
+ **kwargs
+):
 """
 Args:
- -----------
+ ------------
 input: B x D : float32/float64
 missing value must be set to 0.
 is_training: bool
 training phase or testing phase
+ decay:
+ using large decay to include longer moving means.
+ data_type:
+ use float64 to prevent overflow during variance calculation.
+ name:
+ Layer name
 dense_mask: B x D : bool
 missing value should be marked as 0, non-missing as 1. same shape as input
 zero_debias: bool
@@ -193,55 +244,13 @@ def call(
 they will all have same values 1, in that case, make sure to set handle_single to true.
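# Numpy sketch of the masked z-score computed above, with batch statistics
# standing in for the moving averages; values are illustrative. Statistics
# ignore missing (zero) entries, a zero std falls back to epsilon, and
# handle_single resets constant non-missing columns to 1 instead of 0.
import numpy as np

x = np.array([[1.0, 5.0], [3.0, 0.0], [0.0, 5.0]])  # 0.0 marks missing
mask = x != 0
eps = 1.0

count = np.maximum(mask.sum(axis=0), eps)            # missing entries excluded
mean = np.where(mask, x, 0.0).sum(axis=0) / count
var = (np.where(mask, x - mean, 0.0) ** 2).sum(axis=0) / count
std = np.sqrt(var)
std_eps = np.where(std == 0, eps, std)

norm = np.where(mask, (x - mean) / std_eps, 0.0)
norm = np.where(mask & (std == 0), 1.0, norm)        # handle_single behavior
print(norm)  # column 2 is constant where present, so it normalizes to 1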
""" - if dense_mask is None: - dense_mask = tf.math.logical_not(tf.equal(input, 0)) - input_dtype = input.dtype - - if is_training: - if input_dtype != self.data_type: - input = tf.cast(input, self.data_type) - return self._training_pass(input, dense_mask, input_dtype, handle_single, zero_debias) - else: - return self._infer_pass(input, dense_mask, input_dtype, handle_single) - - -def zscore_normalization( - input, - is_training, - decay=0.9999, - data_type=tf.float64, - name=None, - dense_mask=None, - zero_debias=True, - handle_single=False, **kwargs): - """ - Args: - ------------ - input: B x D : float32/float64 - missing value must be set to 0. - is_training: bool - training phase or testing phase - decay: - using large decay to include longer moving means. - data_type: - use float64 to zprevent overflow during variance calculation. - name: - Layer name - dense_mask: B x D : bool - missing value should be marked as 0, non-missing as 1. same shape as input - zero_debias: bool - bias correction of the moving average. (biased towards 0 in the beginning. - see adam paper. https://arxiv.org/abs/1412.6980) - handle_single: bool - if std==0, and feature is not missing value, set the value to 1, instead of 0. - This is super rare if input only consists of continous feature. - But if one-hot feature is included, - they will all have same values 1, in that case, make sure to set handle_single to true. - """ - - norm_layer = ZscoreNormalization(decay=decay, data_type=data_type, name=name, **kwargs) - return norm_layer(input, - is_training, - dense_mask=dense_mask, - zero_debias=zero_debias, - handle_single=handle_single) + norm_layer = ZscoreNormalization( + decay=decay, data_type=data_type, name=name, **kwargs + ) + return norm_layer( + input, + is_training, + dense_mask=dense_mask, + zero_debias=zero_debias, + handle_single=handle_single, + ) diff --git a/twml/twml/contrib/metrics/__init__.py b/twml/twml/contrib/metrics/__init__.py index 37e6563c9..f2e26dafe 100644 --- a/twml/twml/contrib/metrics/__init__.py +++ b/twml/twml/contrib/metrics/__init__.py @@ -1,5 +1,5 @@ # pylint: disable=wildcard-import """This module contains experimental metric(s) for search and ranking""" -from .search_metrics import get_search_metric_fn, ndcg # noqa: F401 from .metrics import * # noqa: F401 +from .search_metrics import get_search_metric_fn, ndcg # noqa: F401 diff --git a/twml/twml/contrib/metrics/metrics.py b/twml/twml/contrib/metrics/metrics.py index dea1a5273..506379a85 100644 --- a/twml/twml/contrib/metrics/metrics.py +++ b/twml/twml/contrib/metrics/metrics.py @@ -15,195 +15,239 @@ from collections import OrderedDict import tensorflow.compat.v1 as tf -from twml.metrics import get_multi_binary_class_metric_fn +from twml.metrics import get_multi_binary_class_metric_fn # checkstyle: noqa -def get_partial_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1, predcols=None): - - def get_eval_metric_ops(graph_output, labels, weights): - if predcols is None: - preds = graph_output['output'] - else: - if isinstance(predcols, int): - predcol_list=[predcols] - else: - predcol_list=list(predcols) - for col in predcol_list: - assert 0 <= col < graph_output['output'].shape[class_dim], 'Invalid Prediction Column Index !' 
- preds = tf.gather(graph_output['output'], indices=predcol_list, axis=class_dim) # [batchSz, num_col] - labels = tf.gather(labels, indices=predcol_list, axis=class_dim) # [batchSz, num_col] - - predInfo = {'output': preds} - if 'threshold' in graph_output: - predInfo['threshold'] = graph_output['threshold'] - if 'hard_output' in graph_output: - predInfo['hard_output'] = graph_output['hard_output'] - - metrics_op = get_multi_binary_class_metric_fn(metrics, classes, class_dim) - metrics_op_res = metrics_op(predInfo, labels, weights) - return metrics_op_res - - return get_eval_metric_ops - +def get_partial_multi_binary_class_metric_fn( + metrics, classes=None, class_dim=1, predcols=None +): + def get_eval_metric_ops(graph_output, labels, weights): + if predcols is None: + preds = graph_output["output"] + else: + if isinstance(predcols, int): + predcol_list = [predcols] + else: + predcol_list = list(predcols) + for col in predcol_list: + assert ( + 0 <= col < graph_output["output"].shape[class_dim] + ), "Invalid Prediction Column Index !" + preds = tf.gather( + graph_output["output"], indices=predcol_list, axis=class_dim + ) # [batchSz, num_col] + labels = tf.gather( + labels, indices=predcol_list, axis=class_dim + ) # [batchSz, num_col] + + predInfo = {"output": preds} + if "threshold" in graph_output: + predInfo["threshold"] = graph_output["threshold"] + if "hard_output" in graph_output: + predInfo["hard_output"] = graph_output["hard_output"] + + metrics_op = get_multi_binary_class_metric_fn(metrics, classes, class_dim) + metrics_op_res = metrics_op(predInfo, labels, weights) + return metrics_op_res + + return get_eval_metric_ops # Numeric Prediction Performance among TopK Predictions def mean_numeric_label_topK(labels, predictions, weights, name, topK_id): - top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0) # [topK, 1] - return tf.metrics.mean(values=top_k_labels, name=name) + top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0) # [topK, 1] + return tf.metrics.mean(values=top_k_labels, name=name) + def mean_gated_numeric_label_topK(labels, predictions, weights, name, topK_id, bar=2.0): - assert isinstance(bar, int) or isinstance(bar, float), "bar must be int or float" - top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0) # [topK, 1] - gated_top_k_labels = tf.cast(top_k_labels > bar*1.0, tf.int32) - return tf.metrics.mean(values=gated_top_k_labels, name=name) + assert isinstance(bar, int) or isinstance(bar, float), "bar must be int or float" + top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0) # [topK, 1] + gated_top_k_labels = tf.cast(top_k_labels > bar * 1.0, tf.int32) + return tf.metrics.mean(values=gated_top_k_labels, name=name) + SUPPORTED_NUMERIC_METRICS = { - 'mean_numeric_label_topk': mean_numeric_label_topK, - 'mean_gated_numeric_label_topk': mean_gated_numeric_label_topK + "mean_numeric_label_topk": mean_numeric_label_topK, + "mean_gated_numeric_label_topk": mean_gated_numeric_label_topK, } -DEFAULT_NUMERIC_METRICS = ['mean_numeric_label_topk', 'mean_gated_numeric_label_topk'] - - - -def get_metric_topK_fn_helper(targetMetrics, supportedMetrics_op, metrics=None, topK=(5,5,5), predcol=None, labelcol=None): - """ - :param targetMetrics: Target Metric List - :param supportedMetrics_op: Supported Metric Operators Dict - :param metrics: Metric Set to evaluate - :param topK: (topK_min, topK_max, topK_delta) Tuple - :param predcol: Prediction Column Index - :param labelcol: Label Column Index - :return: - """ - # pylint: 
disable=dict-keys-not-iterating - if targetMetrics is None or supportedMetrics_op is None: - raise ValueError("Invalid Target Metric List/op !") - - targetMetrics = set([m.lower() for m in targetMetrics]) - if metrics is None: - metrics = list(targetMetrics) - else: - metrics = [m.lower() for m in metrics if m.lower() in targetMetrics] +DEFAULT_NUMERIC_METRICS = ["mean_numeric_label_topk", "mean_gated_numeric_label_topk"] - num_k = int((topK[1]-topK[0])/topK[2]+1) - topK_list = [topK[0]+d*topK[2] for d in range(num_k)] - if 1 not in topK_list: - topK_list = [1] + topK_list - - def get_eval_metric_ops(graph_output, labels, weights): +def get_metric_topK_fn_helper( + targetMetrics, + supportedMetrics_op, + metrics=None, + topK=(5, 5, 5), + predcol=None, + labelcol=None, +): """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + :param targetMetrics: Target Metric List + :param supportedMetrics_op: Supported Metric Operators Dict + :param metrics: Metric Set to evaluate + :param topK: (topK_min, topK_max, topK_delta) Tuple + :param predcol: Prediction Column Index + :param labelcol: Label Column Index + :return: """ - eval_metric_ops = OrderedDict() + # pylint: disable=dict-keys-not-iterating + if targetMetrics is None or supportedMetrics_op is None: + raise ValueError("Invalid Target Metric List/op !") - if predcol is None: - pred = graph_output['output'] + targetMetrics = set([m.lower() for m in targetMetrics]) + if metrics is None: + metrics = list(targetMetrics) else: - assert 0 <= predcol < graph_output['output'].shape[1], 'Invalid Prediction Column Index !' - assert labelcol is not None - pred = tf.reshape(graph_output['output'][:, predcol], shape=[-1, 1]) - labels = tf.reshape(labels[:, labelcol], shape=[-1, 1]) - numOut = graph_output['output'].shape[1] - pred_score = tf.reshape(graph_output['output'][:, numOut-1], shape=[-1, 1]) - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - if metric_name in supportedMetrics_op: - metric_factory = supportedMetrics_op.get(metric_name) - - if 'topk' not in metric_name: - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) + metrics = [m.lower() for m in metrics if m.lower() in targetMetrics] + + num_k = int((topK[1] - topK[0]) / topK[2] + 1) + topK_list = [topK[0] + d * topK[2] for d in range(num_k)] + if 1 not in topK_list: + topK_list = [1] + topK_list + + def get_eval_metric_ops(graph_output, labels, weights): + """ + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples.. 
+        """
+        eval_metric_ops = OrderedDict()
+
+        if predcol is None:
+            pred = graph_output["output"]
+        else:
+            assert (
+                0 <= predcol < graph_output["output"].shape[1]
+            ), "Invalid Prediction Column Index !"
+            assert labelcol is not None
+            pred = tf.reshape(graph_output["output"][:, predcol], shape=[-1, 1])
+            labels = tf.reshape(labels[:, labelcol], shape=[-1, 1])
+            numOut = graph_output["output"].shape[1]
+            pred_score = tf.reshape(graph_output["output"][:, numOut - 1], shape=[-1, 1])
+
+        # add metrics to eval_metric_ops dict
+        for metric_name in metrics:
+            metric_name = metric_name.lower()  # metric names are case insensitive.
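        # [editor's note] Illustrative expansion of the (min, max, delta) topK
        # tuple computed above (not part of this diff): topK=(5, 15, 5) gives
        # num_k = (15 - 5) / 5 + 1 = 3 and topK_list = [5, 10, 15]; since 1 is
        # missing it is prepended, yielding [1, 5, 10, 15].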
+ + if metric_name in supportedMetrics_op: + metric_factory = supportedMetrics_op.get(metric_name) + + if "topk" not in metric_name: + value_op, update_op = metric_factory( + labels=labels, + predictions=pred, + weights=weights, + name=metric_name, + ) + eval_metric_ops[metric_name] = (value_op, update_op) + else: + for K in topK_list: + K_min = tf.minimum(K, tf.shape(pred_score)[0]) + topK_id = tf.nn.top_k( + tf.reshape(pred_score, shape=[-1]), k=K_min + )[ + 1 + ] # [topK] + value_op, update_op = metric_factory( + labels=labels, + predictions=pred, + weights=weights, + name=metric_name + "__k_" + str(K), + topK_id=topK_id, + ) + eval_metric_ops[metric_name + "__k_" + str(K)] = ( + value_op, + update_op, + ) + + else: + raise ValueError("Cannot find the metric named " + metric_name) + + return eval_metric_ops + + return get_eval_metric_ops + + +def get_numeric_metric_fn(metrics=None, topK=(5, 5, 5), predcol=None, labelcol=None): + if metrics is None: + metrics = list(DEFAULT_NUMERIC_METRICS) + metrics = list(set(metrics)) + + metric_op = get_metric_topK_fn_helper( + targetMetrics=list(DEFAULT_NUMERIC_METRICS), + supportedMetrics_op=SUPPORTED_NUMERIC_METRICS, + metrics=metrics, + topK=topK, + predcol=predcol, + labelcol=labelcol, + ) + return metric_op + + +def get_single_binary_task_metric_fn( + metrics, classnames, topK=(5, 5, 5), use_topK=False +): + """ + graph_output['output']: [BatchSz, 1] [pred_Task1] + labels: [BatchSz, 2] [Task1, NumericLabel] + """ - if use_topK: - metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=2, labelcol=2) - metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) - metrics_base_res.update(metrics_numeric_res) - return metrics_base_res + def get_eval_metric_ops(graph_output, labels, weights): + metric_op_base = get_partial_multi_binary_class_metric_fn( + metrics, predcols=0, classes=classnames + ) + classnames_unw = ["unweighted_" + cs for cs in classnames] + metric_op_unw = get_partial_multi_binary_class_metric_fn( + metrics, predcols=0, classes=classnames_unw + ) + + metrics_base_res = metric_op_base(graph_output, labels, weights) + metrics_unw_res = metric_op_unw(graph_output, labels, None) + metrics_base_res.update(metrics_unw_res) + + if use_topK: + metric_op_numeric = get_numeric_metric_fn( + metrics=None, topK=topK, predcol=0, labelcol=1 + ) + metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) + metrics_base_res.update(metrics_numeric_res) + return metrics_base_res + + return get_eval_metric_ops + + +def get_dual_binary_tasks_metric_fn( + metrics, classnames, topK=(5, 5, 5), use_topK=False +): + """ + graph_output['output']: [BatchSz, 3] [pred_Task1, pred_Task2, Score] + labels: [BatchSz, 3] [Task1, Task2, NumericLabel] + """ - return get_eval_metric_ops + def get_eval_metric_ops(graph_output, labels, weights): + metric_op_base = get_partial_multi_binary_class_metric_fn( + metrics, predcols=[0, 1], classes=classnames + ) + classnames_unw = ["unweighted_" + cs for cs in classnames] + metric_op_unw = get_partial_multi_binary_class_metric_fn( + metrics, predcols=[0, 1], classes=classnames_unw + ) + + metrics_base_res = metric_op_base(graph_output, labels, weights) + metrics_unw_res = metric_op_unw(graph_output, labels, None) + metrics_base_res.update(metrics_unw_res) + + if use_topK: + metric_op_numeric = get_numeric_metric_fn( + metrics=None, topK=topK, predcol=2, labelcol=2 + ) + metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) + 
metrics_base_res.update(metrics_numeric_res) + return metrics_base_res + + return get_eval_metric_ops diff --git a/twml/twml/contrib/metrics/search_metrics.py b/twml/twml/contrib/metrics/search_metrics.py index 7d7a502f1..e558c6513 100644 --- a/twml/twml/contrib/metrics/search_metrics.py +++ b/twml/twml/contrib/metrics/search_metrics.py @@ -19,274 +19,291 @@ from tensorflow.python.eager import context from tensorflow.python.framework import dtypes, ops from tensorflow.python.ops import array_ops, state_ops + import twml from twml.contrib.utils import math_fns -def ndcg(labels, predictions, - metrics_collections=None, - updates_collections=None, - name=None, - top_k_int=1): - # pylint: disable=unused-argument - """ - Compute full normalized discounted cumulative gain (ndcg) based on predictions - ndcg = dcg_k/idcg_k, k is a cut off ranking postion - There are a few variants of ndcg - The dcg (discounted cumulative gain) formula used in - twml.contrib.metrics.ndcg is:: - - \\sum_{i=1}^k \frac{2^{relevance\\_score} -1}{\\log_{2}(i + 1)} - - k is the length of items to be ranked in a batch/query - Notice that whether k will be replaced with a fixed value requires discussions - The scores in predictions are transformed to order and relevance scores to calculate ndcg - A relevance score means how relevant a DataRecord is to a particular query - - Arguments: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Returns: - ndcg: A `Tensor` representing the ndcg score. - update_op: A update operation used to accumulate data into this metric. 
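# [editor's note] Worked example of the dcg formula in the docstring below
# (illustrative only): for relevance scores [3, 2, 0] ranked in that order,
#   dcg_3 = (2**3 - 1)/log2(2) + (2**2 - 1)/log2(3) + (2**0 - 1)/log2(4)
#         = 7.0 + 1.893 + 0.0 ~= 8.893
# The ideal ordering is the same here, so idcg_3 = dcg_3 and ndcg = 1.0.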
-  """
-  with tf.variable_scope(name, 'ndcg', (labels, predictions)):
-    label_scores = tf.to_float(labels, name='label_to_float')
-    predicted_scores = tf.to_float(predictions, name='predictions_to_float')
-
-    if context.executing_eagerly():
-      raise RuntimeError('ndcg is not supported when eager execution '
-                         'is enabled.')
-
-    total_ndcg = _metric_variable([], dtypes.float32, name='total_ndcg')
-    count_query = _metric_variable([], dtypes.float32, name='query_count')
-
-    # actual ndcg cutoff position top_k_int
-    max_prediction_size = array_ops.size(predicted_scores)
-    top_k_int = tf.minimum(max_prediction_size, top_k_int)
-    # the ndcg score of the batch
-    ndcg = math_fns.cal_ndcg(label_scores,
-                             predicted_scores, top_k_int=top_k_int)
-    # add ndcg of the current batch to total_ndcg
-    update_total_op = state_ops.assign_add(total_ndcg, ndcg)
-    with ops.control_dependencies([ndcg]):
-      # count_query stores the number of queries
-      # count_query increases by 1 for each batch/query
-      update_count_op = state_ops.assign_add(count_query, 1)
-
-    mean_ndcg = math_fns.safe_div(total_ndcg, count_query, 'mean_ndcg')
-    update_op = math_fns.safe_div(update_total_op, update_count_op, 'update_mean_ndcg_op')
-
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, mean_ndcg)
-
-    if updates_collections:
-      ops.add_to_collections(updates_collections, update_op)
-
-    return mean_ndcg, update_op
+def ndcg(
+    labels,
+    predictions,
+    metrics_collections=None,
+    updates_collections=None,
+    name=None,
+    top_k_int=1,
+):
+    # pylint: disable=unused-argument
+    """
+    Compute the full normalized discounted cumulative gain (ndcg) based on predictions.
+    ndcg = dcg_k/idcg_k, where k is a cut-off ranking position.
+    There are a few variants of ndcg.
+    The dcg (discounted cumulative gain) formula used in
+    twml.contrib.metrics.ndcg is::
+
+    \\sum_{i=1}^k \\frac{2^{relevance\\_score} -1}{\\log_{2}(i + 1)}
+
+    k is the number of items to be ranked in a batch/query.
+    Whether k should be replaced with a fixed value is still under discussion.
+    The scores in predictions are transformed to order and relevance scores to calculate ndcg.
+    A relevance score means how relevant a DataRecord is to a particular query.
+
+    Arguments:
+      labels: the ground truth value.
+      predictions: the predicted values, whose shape must match labels. Ignored for CTR computation.
+      metrics_collections: optional list of collections to add this metric into.
+      updates_collections: optional list of collections to add the associated update_op into.
+      name: an optional variable_scope name.
+
+    Returns:
+      ndcg: A `Tensor` representing the ndcg score.
+      update_op: An update operation used to accumulate data into this metric.
+    """
+    with tf.variable_scope(name, "ndcg", (labels, predictions)):
+        label_scores = tf.to_float(labels, name="label_to_float")
+        predicted_scores = tf.to_float(predictions, name="predictions_to_float")
+
+        if context.executing_eagerly():
+            raise RuntimeError(
+                "ndcg is not supported when eager execution " "is enabled."
+ ) + + total_ndcg = _metric_variable([], dtypes.float32, name="total_ndcg") + count_query = _metric_variable([], dtypes.float32, name="query_count") + + # actual ndcg cutoff position top_k_int + max_prediction_size = array_ops.size(predicted_scores) + top_k_int = tf.minimum(max_prediction_size, top_k_int) + # the ndcg score of the batch + ndcg = math_fns.cal_ndcg(label_scores, predicted_scores, top_k_int=top_k_int) + # add ndcg of the current batch to total_ndcg + update_total_op = state_ops.assign_add(total_ndcg, ndcg) + with ops.control_dependencies([ndcg]): + # count_query stores the number of queries + # count_query increases by 1 for each batch/query + update_count_op = state_ops.assign_add(count_query, 1) + + mean_ndcg = math_fns.safe_div(total_ndcg, count_query, "mean_ndcg") + update_op = math_fns.safe_div( + update_total_op, update_count_op, "update_mean_ndcg_op" + ) + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_ndcg) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_ndcg, update_op # Copied from metrics_impl.py with minor modifications. # https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39 def _metric_variable(shape, dtype, validate_shape=True, name=None): - """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" + """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" - return tf.Variable( - lambda: tf.zeros(shape, dtype), - trainable=False, - collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], - validate_shape=validate_shape, - name=name) + return tf.Variable( + lambda: tf.zeros(shape, dtype), + trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], + validate_shape=validate_shape, + name=name, + ) # binary metric_name: (metric, requires thresholded output) SUPPORTED_BINARY_CLASS_METRICS = { - # TWML binary metrics - 'rce': (twml.metrics.rce, False), - 'nrce': (partial(twml.metrics.rce, normalize=True), False), - # CTR measures positive sample ratio. This terminology is inherited from Ads. - 'ctr': (twml.metrics.ctr, False), - # predicted CTR measures predicted positive ratio. - 'predicted_ctr': (twml.metrics.predicted_ctr, False), - # thresholded metrics - 'accuracy': (tf.metrics.accuracy, True), - 'precision': (tf.metrics.precision, True), - 'recall': (tf.metrics.recall, True), - # tensorflow metrics - 'roc_auc': (partial(tf.metrics.auc, curve='ROC'), False), - 'pr_auc': (partial(tf.metrics.auc, curve='PR'), False), + # TWML binary metrics + "rce": (twml.metrics.rce, False), + "nrce": (partial(twml.metrics.rce, normalize=True), False), + # CTR measures positive sample ratio. This terminology is inherited from Ads. + "ctr": (twml.metrics.ctr, False), + # predicted CTR measures predicted positive ratio. 
+ "predicted_ctr": (twml.metrics.predicted_ctr, False), + # thresholded metrics + "accuracy": (tf.metrics.accuracy, True), + "precision": (tf.metrics.precision, True), + "recall": (tf.metrics.recall, True), + # tensorflow metrics + "roc_auc": (partial(tf.metrics.auc, curve="ROC"), False), + "pr_auc": (partial(tf.metrics.auc, curve="PR"), False), } # search metric_name: metric SUPPORTED_SEARCH_METRICS = { - # TWML search metrics - # ndcg needs the raw prediction scores to sort - 'ndcg': ndcg, + # TWML search metrics + # ndcg needs the raw prediction scores to sort + "ndcg": ndcg, } -def get_search_metric_fn(binary_metrics=None, search_metrics=None, - ndcg_top_ks=[1, 3, 5, 10], use_binary_metrics=False): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for ranking. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions. Required. - threshold: - Only used in SUPPORTED_BINARY_CLASS_METRICS - If the lables are 0s and 1s - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - Only used in SUPPORTED_BINARY_CLASS_METRICS - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Arguments: - only used in pointwise learning-to-rank - - binary_metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When binary_metrics is None (the default), it defaults to all supported metrics - - search_metrics (list of String): - a list of metrics of interest. E.g. ['ndcg'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ndcg - - NOTE: ndcg works for ranking-relatd problems. - A batch contains all DataRecords that belong to the same query - If pair_in_batch_mode used in scalding -- a batch contains a pair of DataRecords - that belong to the same query and have different labels -- ndcg does not apply in here. 
-
-    When search_metrics is None (the default), it defaults to all supported search metrics
-    currently only 'ndcg'
-
-    ndcg_top_ks (list of integers):
-      The cut-off ranking postions for a query
-      When ndcg_top_ks is None or empty (the default), it defaults to [1, 3, 5, 10]
-
-    use_binary_metrics:
-      False (default)
-      Only set it to true in pointwise learning-to-rank
-  """
-  # pylint: disable=dict-keys-not-iterating
-
-  if ndcg_top_ks is None or not ndcg_top_ks:
-    ndcg_top_ks = [1, 3, 5, 10]
-
-  if search_metrics is None:
-    search_metrics = list(SUPPORTED_SEARCH_METRICS.keys())
-
-  if binary_metrics is None and use_binary_metrics:
-    # Added SUPPORTED_BINARY_CLASS_METRICS in twml.metics as well
-    # they are only used in pointwise learing-to-rank
-    binary_metrics = list(SUPPORTED_BINARY_CLASS_METRICS.keys())
-
-  def get_eval_metric_ops(graph_output, labels, weights):
+def get_search_metric_fn(
+    binary_metrics=None,
+    search_metrics=None,
+    ndcg_top_ks=[1, 3, 5, 10],
+    use_binary_metrics=False,
+):
     """
-    graph_output:
-      dict that is returned by build_graph given input features.
-    labels:
-      target labels associated to batch.
-    weights:
-      weights of the samples..
+    Returns a function having signature:
+
+    .. code-block:: python
+
+      def get_eval_metric_ops(graph_output, labels, weights):
+        ...
+        return eval_metric_ops
+
+    where the returned eval_metric_ops is a dict of common evaluation metric
+    Ops for ranking. See `tf.estimator.EstimatorSpec
+    `_
+    for a description of eval_metric_ops. The graph_output is the result
+    dict returned by build_graph. Labels and weights are tf.Tensors.
+
+    The following graph_output keys are recognized:
+      output:
+        the raw predictions. Required.
+      threshold:
+        Only used in SUPPORTED_BINARY_CLASS_METRICS.
+        If the labels are 0s and 1s,
+        a value between 0 and 1 used to threshold the output into a hard_output.
+        Defaults to 0.5 when threshold and hard_output are missing.
+        Either threshold or hard_output can be provided, but not both.
+      hard_output:
+        Only used in SUPPORTED_BINARY_CLASS_METRICS.
+        A thresholded output. Either threshold or hard_output can be provided, but not both.
+
+    Arguments:
+      only used in pointwise learning-to-rank
+
+      binary_metrics (list of String):
+        a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+        These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+        Supported metrics:
+          - ctr (same as positive sample ratio.)
+          - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+          - nrce (normalized rce, do not use this one if you do not understand what it is)
+          - pr_auc
+          - roc_auc
+          - accuracy (percentage of predictions that are correct)
+          - precision (true positives) / (true positives + false positives)
+          - recall (true positives) / (true positives + false negatives)
+
+        NOTE: accuracy / precision / recall apply to binary classification problems only.
+          I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+          is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+          precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+          your predictions into hard 0/1 labels.
+
+        When binary_metrics is None (the default), it defaults to all supported metrics.
+
+      search_metrics (list of String):
+        a list of metrics of interest. E.g. ['ndcg']
+        These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+        Supported metrics:
+          - ndcg
+
+        NOTE: ndcg works for ranking-related problems.
+          A batch contains all DataRecords that belong to the same query.
+          If pair_in_batch_mode is used in scalding -- a batch contains a pair of DataRecords
+          that belong to the same query and have different labels -- ndcg does not apply here.
+
+        When search_metrics is None (the default), it defaults to all supported search metrics,
+        currently only 'ndcg'.
+
+      ndcg_top_ks (list of integers):
+        The cut-off ranking positions for a query.
+        When ndcg_top_ks is None or empty (the default), it defaults to [1, 3, 5, 10]
+
+      use_binary_metrics:
+        False (default)
+        Only set it to True in pointwise learning-to-rank
+    """
+    # pylint: disable=dict-keys-not-iterating
+
+    if ndcg_top_ks is None or not ndcg_top_ks:
+        ndcg_top_ks = [1, 3, 5, 10]
+
+    if search_metrics is None:
+        search_metrics = list(SUPPORTED_SEARCH_METRICS.keys())
+
+    if binary_metrics is None and use_binary_metrics:
+        # Added SUPPORTED_BINARY_CLASS_METRICS in twml.metrics as well;
+        # they are only used in pointwise learning-to-rank
+        binary_metrics = list(SUPPORTED_BINARY_CLASS_METRICS.keys())
+
+    def get_eval_metric_ops(graph_output, labels, weights):
+        """
+        graph_output:
+            dict that is returned by build_graph given input features.
+        labels:
+            target labels associated with the batch.
+        weights:
+            weights of the samples.
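        (Editor's illustrative note, not in the original docstring: a typical
        wiring of the closure returned by get_search_metric_fn is
        metric_fn = get_search_metric_fn(ndcg_top_ks=[1, 3, 10]);
        eval_metric_ops = metric_fn(graph_output, labels, weights).)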
+ """ + + eval_metric_ops = OrderedDict() + + preds = graph_output["output"] + + threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5 + + hard_preds = graph_output.get("hard_output") + # hard_preds is a tensor + # check hard_preds is None and then check if it is empty + if hard_preds is None or tf.equal(tf.size(hard_preds), 0): + hard_preds = tf.greater_equal(preds, threshold) + + # add search metrics to eval_metric_ops dict + for metric_name in search_metrics: + metric_name = metric_name.lower() # metric name are case insensitive. + + if metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + search_metric_factory = SUPPORTED_SEARCH_METRICS.get(metric_name) + if search_metric_factory: + if metric_name == "ndcg": + for top_k in ndcg_top_ks: + # metric name will show as ndcg_1, ndcg_10, ... + metric_name_ndcg_top_k = metric_name + "_" + str(top_k) + top_k_int = tf.constant(top_k, dtype=tf.int32) + # Note: having weights in ndcg does not make much sense + # Because ndcg already has position weights/discounts + # Thus weights are not applied in ndcg metric + value_op, update_op = search_metric_factory( + labels=labels, + predictions=preds, + name=metric_name_ndcg_top_k, + top_k_int=top_k_int, + ) + eval_metric_ops[metric_name_ndcg_top_k] = (value_op, update_op) + else: + raise ValueError("Cannot find the search metric named " + metric_name) + + if use_binary_metrics: + # add binary metrics to eval_metric_ops dict + for metric_name in binary_metrics: + if metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + metric_name = metric_name.lower() # metric name are case insensitive. + ( + binary_metric_factory, + requires_threshold, + ) = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) + if binary_metric_factory: + value_op, update_op = binary_metric_factory( + labels=labels, + predictions=(hard_preds if requires_threshold else preds), + weights=weights, + name=metric_name, + ) + eval_metric_ops[metric_name] = (value_op, update_op) + else: + raise ValueError( + "Cannot find the binary metric named " + metric_name + ) + + return eval_metric_ops + + return get_eval_metric_ops diff --git a/twml/twml/contrib/optimizers/__init__.py b/twml/twml/contrib/optimizers/__init__.py index 112b2b410..c140e55af 100644 --- a/twml/twml/contrib/optimizers/__init__.py +++ b/twml/twml/contrib/optimizers/__init__.py @@ -1,4 +1,6 @@ # pylint: disable=wildcard-import """This module contains experimental optimizer classes""" -from .deep_gradient_compression_optimizer import DeepGradientCompressionOptimizer # noqa: F401 +from .deep_gradient_compression_optimizer import ( + DeepGradientCompressionOptimizer, +) # noqa: F401 from .pruning_optimizer import PruningOptimizer # noqa: F401 diff --git a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py index 2c71ed13f..16a45183c 100644 --- a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py +++ b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py @@ -12,169 +12,196 @@ def compute_threshold(grad, density): - """ - A utility function to compute the threshold for gradient sparsification, given the gradient - tensor and the density. - Args: - grad(tf.Tensor): - Gradient tensor for some variable. - density(float): - Density degree when sparsifying gradients. - Returns(float): - Threshold for gradient sparsification. 
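# [editor's note] Numeric illustration of the thresholding above (not part of
# this diff): for grad = [-0.5, 2.0, 0.1, -3.0] and density = 0.5,
# k = max(1, int(0.5 * 4)) = 2, and the 2nd-largest magnitude, 2.0, becomes the
# sparsification threshold.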
- """ - flat_grad = tf.reshape(grad, [-1]) - abs_flat_grad = tf.abs(flat_grad) - size = tf.shape(abs_flat_grad)[0] - k = tf.maximum(tf.constant(1), - tf.cast(tf.scalar_mul(density, tf.cast(size, tf.float32)), tf.int32)) - topk, _ = tf.nn.top_k(abs_flat_grad, k, False) - return topk[-1] + """ + A utility function to compute the threshold for gradient sparsification, given the gradient + tensor and the density. + Args: + grad(tf.Tensor): + Gradient tensor for some variable. + density(float): + Density degree when sparsifying gradients. + Returns(float): + Threshold for gradient sparsification. + """ + flat_grad = tf.reshape(grad, [-1]) + abs_flat_grad = tf.abs(flat_grad) + size = tf.shape(abs_flat_grad)[0] + k = tf.maximum( + tf.constant(1), + tf.cast(tf.scalar_mul(density, tf.cast(size, tf.float32)), tf.int32), + ) + topk, _ = tf.nn.top_k(abs_flat_grad, k, False) + return topk[-1] def get_top_row_indices(values, density): - """ - A utility function to get indices of most significant rows, given the density degree. - Args: - values(tf.Tensor): - Gradient or locally accumulated gradient for some variable. - density(float): - Density degree when filtering out rows. - Returns(list(int)): - Indices of most significant rows. - """ - abs_values = tf.abs(values) - - try: - row_num = tf.shape(abs_values)[0] - k = tf.maximum(tf.constant(1), - tf.cast(tf.scalar_mul(density, tf.cast(row_num, tf.float32)), tf.int32)) - row_sums = tf.squeeze(tf.reduce_sum(values, axis=1, keepdims=True)) - _, top_row_indices = tf.nn.top_k(row_sums, k=k, sorted=False) - # print "abs_values", abs_values, "row_sums", row_sums - return top_row_indices - # return tf.range(row_num) - - except ValueError: # if the tensor is 0-D or 1-D - return None - - -class DeepGradientCompressionOptimizer(tf.train.GradientDescentOptimizer): - """ - A custom optimizer to implement Deep Gradient Compression (https://arxiv.org/abs/1712.01887). - """ - - def __init__(self, learning_rate, use_locking=False, name="Sparse", - density=1.0, - density_decay=False, - density_decay_steps=10000, - density_decay_rate=0.5, - min_density=0.1, - accumulation=False): - super(DeepGradientCompressionOptimizer, self).__init__(learning_rate, use_locking, name) - self._initial_density_t = tf.convert_to_tensor(density) - self._density_decay = density_decay - dtype = self._initial_density_t.dtype - self._density_decay_steps_t = tf.convert_to_tensor(density_decay_steps, dtype) - self._density_decay_rate_t = tf.convert_to_tensor(density_decay_rate, dtype) - self._min_density_t = tf.convert_to_tensor(min_density, dtype) - self._accumulation = accumulation - - def _prepare(self): - super(DeepGradientCompressionOptimizer, self)._prepare() - if not self._density_decay: - self._density_t = self._initial_density_t - else: - dtype = self._initial_density_t.dtype - global_step = tf.cast(tf.train.get_global_step(), dtype) - p = tf.floor(tf.divide(global_step, self._density_decay_steps_t)) - decayed_density = tf.multiply(self._initial_density_t, - tf.pow(self._density_decay_rate_t, p)) - self._density_t = tf.maximum(self._min_density_t, decayed_density) - - def _create_slots(self, var_list): """ - Create a slot variable to accumulate gradients locally for each variable in `var_list`. + A utility function to get indices of most significant rows, given the density degree. Args: - var_list(list(tf.Variable)): - List of variables to accumulate gradients locally for. + values(tf.Tensor): + Gradient or locally accumulated gradient for some variable. 
+ density(float): + Density degree when filtering out rows. + Returns(list(int)): + Indices of most significant rows. """ - for var in var_list: - self._zeros_slot(var, "g_buffer", self._name) - - def _apply_dense(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(grad, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - else: - g_buffer = self.get_slot(var, "g_buffer") - - g_buffer = tf.assign_add(g_buffer, grad) + abs_values = tf.abs(values) - top_row_indices = get_top_row_indices(g_buffer, self._density_t) + try: + row_num = tf.shape(abs_values)[0] + k = tf.maximum( + tf.constant(1), + tf.cast(tf.scalar_mul(density, tf.cast(row_num, tf.float32)), tf.int32), + ) + row_sums = tf.squeeze(tf.reduce_sum(values, axis=1, keepdims=True)) + _, top_row_indices = tf.nn.top_k(row_sums, k=k, sorted=False) + # print "abs_values", abs_values, "row_sums", row_sums + return top_row_indices + # return tf.range(row_num) - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) + except ValueError: # if the tensor is 0-D or 1-D + return None - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) - - return tf.group(*[update_var, update_g_buffer]) - - def _apply_sparse_duplicate_indices(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad.values, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices(grad, var) # noqa: E501 - - sparsified_values = tf.gather(grad.values, top_row_indices) - sparsified_indices = tf.gather(grad.indices, top_row_indices) - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - else: - g_buffer = self.get_slot(var, "g_buffer") +class DeepGradientCompressionOptimizer(tf.train.GradientDescentOptimizer): + """ + A custom optimizer to implement Deep Gradient Compression (https://arxiv.org/abs/1712.01887). 
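    (Editor's illustrative note, not in the original docstring: with the
    schedule implemented in _prepare() below and density=1.0,
    density_decay_steps=10000, density_decay_rate=0.5, min_density=0.1, the
    density at global step 25000 is
    max(0.1, 1.0 * 0.5 ** floor(25000 / 10000)) = 0.25.)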
+ """ - g_buffer = tf.scatter_update(g_buffer, grad.indices, grad.values) + def __init__( + self, + learning_rate, + use_locking=False, + name="Sparse", + density=1.0, + density_decay=False, + density_decay_steps=10000, + density_decay_rate=0.5, + min_density=0.1, + accumulation=False, + ): + super(DeepGradientCompressionOptimizer, self).__init__( + learning_rate, use_locking, name + ) + self._initial_density_t = tf.convert_to_tensor(density) + self._density_decay = density_decay + dtype = self._initial_density_t.dtype + self._density_decay_steps_t = tf.convert_to_tensor(density_decay_steps, dtype) + self._density_decay_rate_t = tf.convert_to_tensor(density_decay_rate, dtype) + self._min_density_t = tf.convert_to_tensor(min_density, dtype) + self._accumulation = accumulation + + def _prepare(self): + super(DeepGradientCompressionOptimizer, self)._prepare() + if not self._density_decay: + self._density_t = self._initial_density_t + else: + dtype = self._initial_density_t.dtype + global_step = tf.cast(tf.train.get_global_step(), dtype) + p = tf.floor(tf.divide(global_step, self._density_decay_steps_t)) + decayed_density = tf.multiply( + self._initial_density_t, tf.pow(self._density_decay_rate_t, p) + ) + self._density_t = tf.maximum(self._min_density_t, decayed_density) + + def _create_slots(self, var_list): + """ + Create a slot variable to accumulate gradients locally for each variable in `var_list`. + Args: + var_list(list(tf.Variable)): + List of variables to accumulate gradients locally for. + """ + for var in var_list: + self._zeros_slot(var, "g_buffer", self._name) + + def _apply_dense(self, grad, var): + if not self._accumulation: + top_row_indices = get_top_row_indices(grad, self._density_t) + + if top_row_indices is None: + return super(DeepGradientCompressionOptimizer, self)._apply_dense( + grad, var + ) + + sparsified_values = tf.gather(grad, top_row_indices) + sparsified_indices = top_row_indices + + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + else: + g_buffer = self.get_slot(var, "g_buffer") + + g_buffer = tf.assign_add(g_buffer, grad) + + top_row_indices = get_top_row_indices(g_buffer, self._density_t) + + if top_row_indices is None: + return super(DeepGradientCompressionOptimizer, self)._apply_dense( + grad, var + ) + + sparsified_values = tf.gather(g_buffer, top_row_indices) + sparsified_indices = top_row_indices + + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + update_var = super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + update_g_buffer = tf.scatter_update( + g_buffer, sparsified_indices, tf.zeros_like(sparsified_values) + ) + + return tf.group(*[update_var, update_g_buffer]) + + def _apply_sparse_duplicate_indices(self, grad, var): + if not self._accumulation: + top_row_indices = get_top_row_indices(grad.values, self._density_t) + + if top_row_indices is None: + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices( + grad, var + ) # noqa: E501 + + sparsified_values = tf.gather(grad.values, top_row_indices) + sparsified_indices = tf.gather(grad.indices, top_row_indices) + + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + else: + g_buffer = 
self.get_slot(var, "g_buffer") + + g_buffer = tf.scatter_update(g_buffer, grad.indices, grad.values) - top_row_indices = get_top_row_indices(g_buffer, self._density_t) + top_row_indices = get_top_row_indices(g_buffer, self._density_t) - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, - self)._apply_sparse_duplicate_indices(grad, var) + if top_row_indices is None: + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(grad, var) - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices + sparsified_values = tf.gather(g_buffer, top_row_indices) + sparsified_indices = top_row_indices - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) + update_var = super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) + update_g_buffer = tf.scatter_update( + g_buffer, sparsified_indices, tf.zeros_like(sparsified_values) + ) - return tf.group(*[update_var, update_g_buffer]) + return tf.group(*[update_var, update_g_buffer]) diff --git a/twml/twml/contrib/optimizers/pruning_optimizer.py b/twml/twml/contrib/optimizers/pruning_optimizer.py index 2bcd612ed..8fdf70f15 100644 --- a/twml/twml/contrib/optimizers/pruning_optimizer.py +++ b/twml/twml/contrib/optimizers/pruning_optimizer.py @@ -23,142 +23,160 @@ import tensorflow.compat.v1 as tf -from twml.contrib.pruning import computational_cost, prune, update_pruning_signals -from twml.contrib.pruning import MASK_COLLECTION +from twml.contrib.pruning import ( + MASK_COLLECTION, + computational_cost, + prune, + update_pruning_signals, +) class PruningOptimizer(tf.train.MomentumOptimizer): - """ - Updates parameters with SGD and pruning masks using Fisher pruning. - - Arguments: - learning_rate: float - Learning rate of SGD - - momentum: float - Momentum used by SGD - - use_locking: bool - If `True`, use locks for update operations - - name: str - Optional name prefix for the operations created when applying gradients - - use_nesterov: bool - If `True`, use Nesterov momentum - """ - - def __init__( - self, - learning_rate, - momentum=0.9, - use_locking=False, - name="PruningOptimizer", - use_nesterov=False): - super(PruningOptimizer, self).__init__( - learning_rate=learning_rate, - momentum=momentum, - use_locking=use_locking, - name=name, - use_nesterov=use_nesterov) - - def minimize( - self, - loss, - prune_every=100, - burn_in=0, - decay=.96, - flops_weight='AUTO', - flops_target=0, - update_params=None, - method='Fisher', - *args, - **kwargs): """ - Create operations to minimize loss and to prune features. - - A pruning signal measures the importance of feature maps. This is weighed against the - computational cost of computing a feature map. Features are then iteratively pruned - based on a weighted average of feature importance S and computational cost C (in FLOPs): - - $$S + w * C$$ - - Setting `flops_weight` to 'AUTO' is the most convenient and recommended option, but not - necessarily optimal. + Updates parameters with SGD and pruning masks using Fisher pruning. 
Arguments: - loss: tf.Tensor - The value to minimize - - prune_every: int - One entry of a mask is set to zero only every few update steps - - burn_in: int - Pruning starts only after this many parameter updates + learning_rate: float + Learning rate of SGD - decay: float - Controls exponential moving average of pruning signals + momentum: float + Momentum used by SGD - flops_weight: float or str - Controls the targeted trade-off between computational complexity and performance + use_locking: bool + If `True`, use locks for update operations - flops_target: float - Stop pruning when computational complexity is less or this many floating point ops + name: str + Optional name prefix for the operations created when applying gradients - update_params: tf.Operation - Optional training operation used instead of MomentumOptimizer to update parameters - - method: str - Method used to compute pruning signal (currently only supports 'Fisher') - - Returns: - A `tf.Operation` updating parameters and pruning masks - - References: - * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 + use_nesterov: bool + If `True`, use Nesterov momentum """ - # gradient-based updates of parameters - if update_params is None: - update_params = super(PruningOptimizer, self).minimize(loss, *args, **kwargs) - - masks = tf.get_collection(MASK_COLLECTION) - - with tf.variable_scope('pruning_opt', reuse=True): - # estimate computational cost per data point - batch_size = tf.cast(tf.shape(masks[0].tensor), loss.dtype)[0] - cost = tf.divide(computational_cost(loss), batch_size, name='computational_cost') - - tf.summary.scalar('computational_cost', cost) - - if masks: - signals = update_pruning_signals(loss, masks=masks, decay=decay, method=method) - - # estimate computational cost per feature map - costs = tf.gradients(cost, masks) - - # trade off computational complexity and performance - if flops_weight.upper() == 'AUTO': - signals = [s / (c + 1e-6) for s, c in zip(signals, costs)] - elif not isinstance(flops_weight, float) or flops_weight != 0.: - signals = [s - flops_weight * c for s, c in zip(signals, costs)] - - counter = tf.Variable(0, name='pruning_counter') - counter = tf.assign_add(counter, 1, use_locking=True) - - # only prune every so often after a burn-in phase - pruning_cond = tf.logical_and(counter > burn_in, tf.equal(counter % prune_every, 0)) - - # stop pruning after reaching threshold - if flops_target > 0: - pruning_cond = tf.logical_and(pruning_cond, tf.greater(cost, flops_target)) - - update_masks = tf.cond( - pruning_cond, - lambda: prune(signals, masks=masks), - lambda: tf.group(masks)) + def __init__( + self, + learning_rate, + momentum=0.9, + use_locking=False, + name="PruningOptimizer", + use_nesterov=False, + ): + super(PruningOptimizer, self).__init__( + learning_rate=learning_rate, + momentum=momentum, + use_locking=use_locking, + name=name, + use_nesterov=use_nesterov, + ) + + def minimize( + self, + loss, + prune_every=100, + burn_in=0, + decay=0.96, + flops_weight="AUTO", + flops_target=0, + update_params=None, + method="Fisher", + *args, + **kwargs + ): + """ + Create operations to minimize loss and to prune features. + + A pruning signal measures the importance of feature maps. This is weighed against the + computational cost of computing a feature map. 
Features are then iteratively pruned
+        based on a weighted average of feature importance S and computational cost C (in FLOPs):
+
+        $$S + w * C$$
+
+        Setting `flops_weight` to 'AUTO' is the most convenient and recommended option, but not
+        necessarily optimal. (With 'AUTO', each pruning signal is divided by its
+        feature map's FLOP cost, i.e. importance per FLOP; see the code below.)
+
+        Arguments:
+          loss: tf.Tensor
+            The value to minimize
+
+          prune_every: int
+            One entry of a mask is set to zero only every few update steps
+
+          burn_in: int
+            Pruning starts only after this many parameter updates
+
+          decay: float
+            Controls exponential moving average of pruning signals
+
+          flops_weight: float or str
+            Controls the targeted trade-off between computational complexity and performance
+
+          flops_target: float
+            Stop pruning when computational complexity is less than this many floating-point ops
+
+          update_params: tf.Operation
+            Optional training operation used instead of MomentumOptimizer to update parameters
+
+          method: str
+            Method used to compute pruning signal (currently only supports 'Fisher')
+
+        Returns:
+          A `tf.Operation` updating parameters and pruning masks
+
+        References:
+          * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018
+        """
+
+        # gradient-based updates of parameters
+        if update_params is None:
+            update_params = super(PruningOptimizer, self).minimize(
+                loss, *args, **kwargs
+            )
+
+        masks = tf.get_collection(MASK_COLLECTION)
+
+        with tf.variable_scope("pruning_opt", reuse=True):
+            # estimate computational cost per data point
+            batch_size = tf.cast(tf.shape(masks[0].tensor), loss.dtype)[0]
+            cost = tf.divide(
+                computational_cost(loss), batch_size, name="computational_cost"
+            )
+
+            tf.summary.scalar("computational_cost", cost)
+
+            if masks:
+                signals = update_pruning_signals(
+                    loss, masks=masks, decay=decay, method=method
+                )
+
+                # estimate computational cost per feature map
+                costs = tf.gradients(cost, masks)
+
+                # trade off computational complexity and performance
+                if flops_weight.upper() == "AUTO":
+                    signals = [s / (c + 1e-6) for s, c in zip(signals, costs)]
+                elif not isinstance(flops_weight, float) or flops_weight != 0.0:
+                    signals = [s - flops_weight * c for s, c in zip(signals, costs)]
+
+                counter = tf.Variable(0, name="pruning_counter")
+                counter = tf.assign_add(counter, 1, use_locking=True)
+
+                # only prune every so often after a burn-in phase
+                pruning_cond = tf.logical_and(
+                    counter > burn_in, tf.equal(counter % prune_every, 0)
+                )
+
+                # stop pruning after reaching threshold
+                if flops_target > 0:
+                    pruning_cond = tf.logical_and(
+                        pruning_cond, tf.greater(cost, flops_target)
+                    )
+
+                update_masks = tf.cond(
+                    pruning_cond,
+                    lambda: prune(signals, masks=masks),
+                    lambda: tf.group(masks),
+                )

-      return tf.group([update_params, update_masks])
+                return tf.group([update_params, update_masks])

-    # no masks found
-    return update_params
+        # no masks found
+        return update_params
diff --git a/twml/twml/contrib/parsers.py b/twml/twml/contrib/parsers.py
index a27f2acbd..0f7deb01a 100644
--- a/twml/twml/contrib/parsers.py
+++ b/twml/twml/contrib/parsers.py
@@ -1,21 +1,33 @@
-'''
+"""
Contains implementations of functions to parse the contrib.FeatureConfig

Modelers can use the functions in this module as the train/eval_parse_fn of the
DataRecordTrainer constructor to customize how to parse their datasets.

Modelers may also provide custom implementations of train/eval_parse_fn using these as reference.
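(Editor's note: the functions below are re-exported from
twitter.deepbird.io.legacy.contrib.parsers so that existing call sites, e.g.
`from twml.contrib.parsers import get_string_tensor_parse_fn`, keep working.)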
-''' +""" from twitter.deepbird.io.legacy.contrib.parsers import ( - _convert_to_fixed_length_tensor, # noqa: F401 - _get_input_receiver_fn_feature_dict, # noqa: F401 - _merge_dictionaries, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_keras_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_string_tensor_parse_fn, # noqa: F401 - get_string_tensor_serving_input_receiver_fn, # noqa: F401 - get_supervised_input_receiver_fn_feature_dict, # noqa: F401 - parse_string_tensor, # noqa: F401 -) + _convert_to_fixed_length_tensor, +) # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import ( + _get_input_receiver_fn_feature_dict, +) # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import _merge_dictionaries # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import ( + get_features_as_tensor_dict, +) # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import get_keras_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import ( + get_serving_input_receiver_fn_feature_dict, +) # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import ( + get_string_tensor_parse_fn, +) # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import ( + get_string_tensor_serving_input_receiver_fn, +) # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import ( + get_supervised_input_receiver_fn_feature_dict, +) # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import parse_string_tensor # noqa: F401 diff --git a/twml/twml/contrib/pruning.py b/twml/twml/contrib/pruning.py index b6ddee693..83d2c4ce4 100644 --- a/twml/twml/contrib/pruning.py +++ b/twml/twml/contrib/pruning.py @@ -36,328 +36,353 @@ import numpy as np import tensorflow.compat.v1 as tf -MASK_COLLECTION = 'pruning/masks' -MASK_EXTENDED_COLLECTION = 'pruning/masks_extended' -OP_COLLECTION = 'pruning/ops' +MASK_COLLECTION = "pruning/masks" +MASK_EXTENDED_COLLECTION = "pruning/masks_extended" +OP_COLLECTION = "pruning/ops" -def apply_mask(tensor, name='pruning'): - """ - Point-wise multiplies a tensor with a binary mask. +def apply_mask(tensor, name="pruning"): + """ + Point-wise multiplies a tensor with a binary mask. - During training, pruning is simulated by setting entries of the mask to zero. + During training, pruning is simulated by setting entries of the mask to zero. 
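# [editor's note] Hypothetical usage sketch for apply_mask (not part of this
# diff; assumes a tf.compat.v1 graph):
#   h = tf.layers.dense(x, 128)
#   h = apply_mask(h)   # the 128 output channels become prunable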
-  Arguments:
-    tensor: tf.Tensor
-      A tensor where the last dimension represents channels which will be masked
+    Arguments:
+        tensor: tf.Tensor
+            A tensor where the last dimension represents channels which will be masked

-  Returns:
-    `tf.Tensor` with same shape as `tensor`
-  """
+    Returns:
+        `tf.Tensor` with same shape as `tensor`
+    """

-  tensor_shape = tensor.shape
+    tensor_shape = tensor.shape

-  with tf.variable_scope(name, reuse=True):
-    # allocate masks and corresponding pruning signals
-    mask = tf.Variable(tf.ones(tensor.shape.as_list()[-1]), trainable=False, name='mask')
-    pruning_signal = tf.Variable(tf.zeros_like(mask), trainable=False, name='signal')
+    with tf.variable_scope(name, reuse=True):
+        # allocate masks and corresponding pruning signals
+        mask = tf.Variable(
+            tf.ones(tensor.shape.as_list()[-1]), trainable=False, name="mask"
+        )
+        pruning_signal = tf.Variable(
+            tf.zeros_like(mask), trainable=False, name="signal"
+        )

-    # extending masks is a trick to get a separate gradient for each data point
-    mask_extended = extend_mask(mask, tensor)
+        # extending masks is a trick to get a separate gradient for each data point
+        mask_extended = extend_mask(mask, tensor)

-    # store extended mask, pruning signal, and other vars for easy access later
-    mask.extended = mask_extended
-    mask.pruning_signal = pruning_signal
-    mask.tensor = tensor
+        # store extended mask, pruning signal, and other vars for easy access later
+        mask.extended = mask_extended
+        mask.pruning_signal = pruning_signal
+        mask.tensor = tensor

-    # mask tensor
-    tensor = tf.multiply(tensor, mask_extended)
-    tensor.set_shape(tensor_shape)
-    tensor._mask = mask
+        # mask tensor
+        tensor = tf.multiply(tensor, mask_extended)
+        tensor.set_shape(tensor_shape)
+        tensor._mask = mask

-    tf.add_to_collection(MASK_COLLECTION, mask)
-    tf.add_to_collection(MASK_EXTENDED_COLLECTION, mask.extended)
-    tf.add_to_collection(OP_COLLECTION, tensor.op)
+        tf.add_to_collection(MASK_COLLECTION, mask)
+        tf.add_to_collection(MASK_EXTENDED_COLLECTION, mask.extended)
+        tf.add_to_collection(OP_COLLECTION, tensor.op)

-  return tensor
+    return tensor


 def extend_mask(mask, tensor):
-  """
-  Repeats the mask for each data point stored in a tensor.
+    """
+    Repeats the mask for each data point stored in a tensor.

-  If `tensor` is AxBxC dimensional and `mask` is C dimensional, returns an Ax1xC dimensional
-  tensor with A copies or `mask`.
+    If `tensor` is AxBxC dimensional and `mask` is C dimensional, returns an Ax1xC dimensional
+    tensor with A copies of `mask`.

-  Arguments:
-    mask: tf.Tensor
-      The mask which will be extended
+    Arguments:
+        mask: tf.Tensor
+            The mask which will be extended

-    tensor: tf.Tensor
-      The tensor to which the extended mask will be applied
+        tensor: tf.Tensor
+            The tensor to which the extended mask will be applied

-  Returns:
-    The extended mask
-  """
+    Returns:
+        The extended mask
+    """

-  batch_size = tf.shape(tensor)[:1]
-  ones = tf.ones([tf.rank(tensor) - 1], dtype=batch_size.dtype)
-  multiples = tf.concat([batch_size, ones], 0)
-  mask_shape = tf.concat([ones, [-1]], 0)
-  return tf.tile(tf.reshape(mask, mask_shape), multiples)
+    batch_size = tf.shape(tensor)[:1]
+    ones = tf.ones([tf.rank(tensor) - 1], dtype=batch_size.dtype)
+    multiples = tf.concat([batch_size, ones], 0)
+    mask_shape = tf.concat([ones, [-1]], 0)
+    return tf.tile(tf.reshape(mask, mask_shape), multiples)


 def find_input_mask(tensor):
-  """
-  Find ancestral mask affecting the number of pruned channels of a tensor.
- - Arguments: - tensor: tf.Tensor - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if hasattr(tensor, '_mask'): - return tensor._mask - if tensor.op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D', 'Transpose']: - # op produces a new number of channels, preceding mask therefore irrelevant - return None - if not tensor.op.inputs: - return None - for input in tensor.op.inputs: - mask = find_input_mask(input) - if mask is not None: - return mask + """ + Find ancestral mask affecting the number of pruned channels of a tensor. + + Arguments: + tensor: tf.Tensor + Tensor for which to identify relevant mask + + Returns: + A `tf.Tensor` or `None` + """ + + if hasattr(tensor, "_mask"): + return tensor._mask + if tensor.op.type in ["MatMul", "Conv1D", "Conv2D", "Conv3D", "Transpose"]: + # op produces a new number of channels, preceding mask therefore irrelevant + return None + if not tensor.op.inputs: + return None + for input in tensor.op.inputs: + mask = find_input_mask(input) + if mask is not None: + return mask def find_output_mask(tensor): - """ - Find mask applied to the tensor or one of its descendants if it affects the tensor's pruned shape. - - Arguments: - tensor: tf.Tensor or tf.Variable - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if isinstance(tensor, tf.Variable): - return find_output_mask(tensor.op.outputs[0]) - if hasattr(tensor, '_mask'): - return tensor._mask - for op in tensor.consumers(): - if len(op.outputs) != 1: - continue - if op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D']: - # masks of descendants are only relevant if tensor is right-multiplied - if tensor == op.inputs[1]: - return find_output_mask(op.outputs[0]) - return None - mask = find_output_mask(op.outputs[0]) - if mask is not None: - return mask + """ + Find mask applied to the tensor or one of its descendants if it affects the tensor's pruned shape. + + Arguments: + tensor: tf.Tensor or tf.Variable + Tensor for which to identify relevant mask + + Returns: + A `tf.Tensor` or `None` + """ + + if isinstance(tensor, tf.Variable): + return find_output_mask(tensor.op.outputs[0]) + if hasattr(tensor, "_mask"): + return tensor._mask + for op in tensor.consumers(): + if len(op.outputs) != 1: + continue + if op.type in ["MatMul", "Conv1D", "Conv2D", "Conv3D"]: + # masks of descendants are only relevant if tensor is right-multiplied + if tensor == op.inputs[1]: + return find_output_mask(op.outputs[0]) + return None + mask = find_output_mask(op.outputs[0]) + if mask is not None: + return mask def find_mask(tensor): - """ - Returns masks indicating channels of the tensor that are effectively removed from the graph. + """ + Returns masks indicating channels of the tensor that are effectively removed from the graph. 
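+
+    For example, a weight matrix that is right-multiplied into a masked activation picks up
+    the downstream mask (a sketch; `x` is a hypothetical input tensor):
+
+    >>> w = tf.get_variable("w", [128, 64])
+    >>> y = apply_mask(tf.matmul(x, w))
+    >>> find_mask(w)  # returns the mask of `y`, since it disables columns of `w`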
- Arguments: - tensor: tf.Tensor - Tensor for which to compute a mask + Arguments: + tensor: tf.Tensor + Tensor for which to compute a mask - Returns: - A `tf.Tensor` with binary entries indicating disabled channels - """ + Returns: + A `tf.Tensor` with binary entries indicating disabled channels + """ - input_mask = find_input_mask(tensor) - output_mask = find_output_mask(tensor) - if input_mask is None: - return output_mask - if output_mask is None: - return input_mask - if input_mask is output_mask: - return input_mask - return input_mask * output_mask + input_mask = find_input_mask(tensor) + output_mask = find_output_mask(tensor) + if input_mask is None: + return output_mask + if output_mask is None: + return input_mask + if input_mask is output_mask: + return input_mask + return input_mask * output_mask def pruned_shape(tensor): - """ - Computes the shape of a tensor after taking into account pruning of channels. + """ + Computes the shape of a tensor after taking into account pruning of channels. - Note that the shape will only differ in the last dimension, even if other dimensions are also - effectively disabled by pruning masks. + Note that the shape will only differ in the last dimension, even if other dimensions are also + effectively disabled by pruning masks. - Arguments: - tensor: tf.Tensor - Tensor for which to compute a pruned shape + Arguments: + tensor: tf.Tensor + Tensor for which to compute a pruned shape - Returns: - A `tf.Tensor[tf.float32]` representing the pruned shape - """ + Returns: + A `tf.Tensor[tf.float32]` representing the pruned shape + """ - mask = find_mask(tensor) + mask = find_mask(tensor) - if mask is None: - return tf.cast(tf.shape(tensor), tf.float32) + if mask is None: + return tf.cast(tf.shape(tensor), tf.float32) - return tf.concat([ - tf.cast(tf.shape(tensor)[:-1], mask.dtype), - tf.reduce_sum(mask, keepdims=True)], 0) + return tf.concat( + [ + tf.cast(tf.shape(tensor)[:-1], mask.dtype), + tf.reduce_sum(mask, keepdims=True), + ], + 0, + ) def computational_cost(op_or_tensor, _observed=None): - """ - Estimates the computational complexity of a pruned graph (number of floating point operations). + """ + Estimates the computational complexity of a pruned graph (number of floating point operations). + + This function currently only supports sequential graphs such as those of MLPs and + simple CNNs with 2D convolutions in NHWC format. + + Note that the computational cost returned by this function is proportional to batch size. 
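+
+    For a dense layer this reduces to the usual matrix-multiply count: a [B, m] x [m, n]
+    MatMul contributes B * n * (2m - 1) FLOPs. A rough usage sketch (the model and inputs
+    are hypothetical):
+
+    >>> logits = model(inputs)              # a sequential MLP or simple CNN
+    >>> flops = computational_cost(logits)  # FLOPs for the whole batch
+    >>> flops_per_example = flops / tf.cast(tf.shape(inputs)[0], flops.dtype)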
+ + Arguments: + op_or_tensor: tf.Tensor or tf.Operation + Root node of graph for which to compute computational cost + + Returns: + A `tf.Tensor` representing a number of floating point operations + """ + + cost = tf.constant(0.0) + + # exclude cost of computing extended pruning masks + masks_extended = [mask.extended for mask in tf.get_collection(MASK_COLLECTION)] + if op_or_tensor in masks_extended: + return cost + + # convert tensor to op + op = ( + op_or_tensor.op + if isinstance(op_or_tensor, (tf.Tensor, tf.Variable)) + else op_or_tensor + ) + + # make sure cost of op will not be counted twice + if _observed is None: + _observed = [] + elif op in _observed: + return cost + _observed.append(op) + + # compute cost of computing inputs + for tensor in op.inputs: + cost = cost + computational_cost(tensor, _observed) + + # add cost of operation + if op.op_def is None or op in tf.get_collection(OP_COLLECTION): + # exclude cost of undefined ops and pruning ops + return cost + + elif op.op_def.name == "MatMul": + shape_a = pruned_shape(op.inputs[0]) + shape_b = pruned_shape(op.inputs[1]) + return cost + shape_a[0] * shape_b[1] * (2.0 * shape_a[1] - 1.0) + + elif op.op_def.name in ["Add", "Mul", "BiasAdd"]: + return cost + tf.cond( + tf.size(op.inputs[0]) > tf.size(op.inputs[1]), + lambda: tf.reduce_prod(pruned_shape(op.inputs[0])), + lambda: tf.reduce_prod(pruned_shape(op.inputs[1])), + ) + + elif op.op_def.name in ["Conv2D"]: + output_shape = pruned_shape(op.outputs[0]) + input_shape = pruned_shape(op.inputs[0]) + kernel_shape = pruned_shape(op.inputs[1]) + inner_prod_cost = tf.reduce_prod(kernel_shape[:2]) * input_shape[-1] * 2.0 - 1.0 + return cost + tf.reduce_prod(output_shape) * inner_prod_cost - This function currently only supports sequential graphs such as those of MLPs and - simple CNNs with 2D convolutions in NHWC format. - - Note that the computational cost returned by this function is proportional to batch size. - - Arguments: - op_or_tensor: tf.Tensor or tf.Operation - Root node of graph for which to compute computational cost - - Returns: - A `tf.Tensor` representing a number of floating point operations - """ - - cost = tf.constant(0.) - - # exclude cost of computing extended pruning masks - masks_extended = [mask.extended for mask in tf.get_collection(MASK_COLLECTION)] - if op_or_tensor in masks_extended: return cost - # convert tensor to op - op = op_or_tensor.op if isinstance(op_or_tensor, (tf.Tensor, tf.Variable)) else op_or_tensor - - # make sure cost of op will not be counted twice - if _observed is None: - _observed = [] - elif op in _observed: - return cost - _observed.append(op) - - # compute cost of computing inputs - for tensor in op.inputs: - cost = cost + computational_cost(tensor, _observed) - - # add cost of operation - if op.op_def is None or op in tf.get_collection(OP_COLLECTION): - # exclude cost of undefined ops and pruning ops - return cost - - elif op.op_def.name == 'MatMul': - shape_a = pruned_shape(op.inputs[0]) - shape_b = pruned_shape(op.inputs[1]) - return cost + shape_a[0] * shape_b[1] * (2. * shape_a[1] - 1.) 
- - elif op.op_def.name in ['Add', 'Mul', 'BiasAdd']: - return cost + tf.cond( - tf.size(op.inputs[0]) > tf.size(op.inputs[1]), - lambda: tf.reduce_prod(pruned_shape(op.inputs[0])), - lambda: tf.reduce_prod(pruned_shape(op.inputs[1]))) - - elif op.op_def.name in ['Conv2D']: - output_shape = pruned_shape(op.outputs[0]) - input_shape = pruned_shape(op.inputs[0]) - kernel_shape = pruned_shape(op.inputs[1]) - inner_prod_cost = (tf.reduce_prod(kernel_shape[:2]) * input_shape[-1] * 2. - 1.) - return cost + tf.reduce_prod(output_shape) * inner_prod_cost - return cost +def update_pruning_signals(loss, decay=0.96, masks=None, method="Fisher"): + """ + For each mask, computes corresponding pruning signals indicating the importance of a feature. + Arguments: + loss: tf.Tensor + Any cross-entropy loss -def update_pruning_signals(loss, decay=.96, masks=None, method='Fisher'): - """ - For each mask, computes corresponding pruning signals indicating the importance of a feature. + decay: float + Controls exponential moving average of pruning signals - Arguments: - loss: tf.Tensor - Any cross-entropy loss + method: str + Method used to compute pruning signal (currently only supports 'Fisher') - decay: float - Controls exponential moving average of pruning signals + Returns: + A `list[tf.Tensor]` of pruning signals corresponding to masks - method: str - Method used to compute pruning signal (currently only supports 'Fisher') + References: + * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 + """ - Returns: - A `list[tf.Tensor]` of pruning signals corresponding to masks + if masks is None: + masks = tf.get_collection(MASK_COLLECTION) - References: - * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 - """ + if method not in ["Fisher"]: + raise ValueError("Pruning method '{0}' not supported.".format(method)) - if masks is None: - masks = tf.get_collection(MASK_COLLECTION) + if not masks: + return [] - if method not in ['Fisher']: - raise ValueError('Pruning method \'{0}\' not supported.'.format(method)) + with tf.variable_scope("pruning_opt", reuse=True): + # compute gradients of extended masks (yields separate gradient for each data point) + grads = tf.gradients(loss, [m.extended for m in masks]) - if not masks: - return [] + # estimate Fisher pruning signals from batch + signals_batch = [tf.squeeze(tf.reduce_mean(tf.square(g), 0)) for g in grads] - with tf.variable_scope('pruning_opt', reuse=True): - # compute gradients of extended masks (yields separate gradient for each data point) - grads = tf.gradients(loss, [m.extended for m in masks]) + # update pruning signals + signals = [m.pruning_signal for m in masks] + signals = [ + tf.assign(s, decay * s + (1.0 - decay) * f, use_locking=True) + for s, f in zip(signals, signals_batch) + ] - # estimate Fisher pruning signals from batch - signals_batch = [tf.squeeze(tf.reduce_mean(tf.square(g), 0)) for g in grads] - - # update pruning signals - signals = [m.pruning_signal for m in masks] - signals = [tf.assign(s, decay * s + (1. - decay) * f, use_locking=True) - for s, f in zip(signals, signals_batch)] - - return signals + return signals def prune(signals, masks=None): - """ - Prunes a single feature by zeroing the mask entry with the smallest pruning signal. 
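+
+    Together with `update_pruning_signals` above, this supports a manual pruning loop
+    (a sketch; `loss`, `train_op` and `session` are hypothetical, and
+    `PruningOptimizer.minimize` wires up the same steps automatically):
+
+    >>> signals = update_pruning_signals(loss)  # refresh EMA of Fisher pruning signals
+    >>> prune_op = prune(signals)               # zero the least important mask entry
+    >>> session.run([train_op, prune_op])       # typically only every `prune_every` steps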
- - Arguments: - signals: list[tf.Tensor] - A list of pruning signals - - masks: list[tf.Tensor] - A list of corresponding masks, defaults to `tf.get_collection(MASK_COLLECTION)` - - Returns: - A `tf.Operation` which updates masks - """ - - if masks is None: - masks = tf.get_collection(MASK_COLLECTION) - - with tf.variable_scope('pruning_opt', reuse=True): - # make sure we don't select already pruned units - signals = [tf.where(m > .5, s, tf.zeros_like(s) + np.inf) for m, s in zip(masks, signals)] - - # find units with smallest pruning signal in each layer - min_idx = [tf.argmin(s) for s in signals] - min_signals = [s[i] for s, i in zip(signals, min_idx)] - - # find layer with smallest pruning signal - l = tf.argmin(min_signals) - - # construct pruning operations, one for each mask - updates = [] - for k, i in enumerate(min_idx): - # set mask of layer l to 0 where pruning signal is smallest - updates.append( - tf.cond( - tf.equal(l, k), - lambda: tf.scatter_update( - masks[k], tf.Print(i, [i], message="Pruning layer [{0}] at index ".format(k)), 0.), - lambda: masks[k])) - - updates = tf.group(updates, name='prune') - - return updates + """ + Prunes a single feature by zeroing the mask entry with the smallest pruning signal. + + Arguments: + signals: list[tf.Tensor] + A list of pruning signals + + masks: list[tf.Tensor] + A list of corresponding masks, defaults to `tf.get_collection(MASK_COLLECTION)` + + Returns: + A `tf.Operation` which updates masks + """ + + if masks is None: + masks = tf.get_collection(MASK_COLLECTION) + + with tf.variable_scope("pruning_opt", reuse=True): + # make sure we don't select already pruned units + signals = [ + tf.where(m > 0.5, s, tf.zeros_like(s) + np.inf) + for m, s in zip(masks, signals) + ] + + # find units with smallest pruning signal in each layer + min_idx = [tf.argmin(s) for s in signals] + min_signals = [s[i] for s, i in zip(signals, min_idx)] + + # find layer with smallest pruning signal + l = tf.argmin(min_signals) + + # construct pruning operations, one for each mask + updates = [] + for k, i in enumerate(min_idx): + # set mask of layer l to 0 where pruning signal is smallest + updates.append( + tf.cond( + tf.equal(l, k), + lambda: tf.scatter_update( + masks[k], + tf.Print( + i, [i], message="Pruning layer [{0}] at index ".format(k) + ), + 0.0, + ), + lambda: masks[k], + ) + ) + + updates = tf.group(updates, name="prune") + + return updates diff --git a/twml/twml/contrib/readers/batch_prediction_request.py b/twml/twml/contrib/readers/batch_prediction_request.py index 4408b33b4..3341cc851 100644 --- a/twml/twml/contrib/readers/batch_prediction_request.py +++ b/twml/twml/contrib/readers/batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.contrib.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) + BatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/contrib/readers/data_record.py b/twml/twml/contrib/readers/data_record.py index ae8cc0b68..dc339c83a 100644 --- a/twml/twml/contrib/readers/data_record.py +++ b/twml/twml/contrib/readers/data_record.py @@ -5,6 +5,8 @@ """ from twitter.deepbird.io.legacy.contrib.readers.data_record import ( - SUPPORTED_DENSE_FEATURE_TYPES, # noqa: F401 - DataRecord, # noqa: F401 -) + SUPPORTED_DENSE_FEATURE_TYPES, +) # noqa: F401 +from twitter.deepbird.io.legacy.contrib.readers.data_record import ( + DataRecord, +) # noqa: F401 diff --git a/twml/twml/contrib/readers/hashed_batch_prediction_request.py 
b/twml/twml/contrib/readers/hashed_batch_prediction_request.py index 3454f8483..d97c47a2f 100644 --- a/twml/twml/contrib/readers/hashed_batch_prediction_request.py +++ b/twml/twml/contrib/readers/hashed_batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.contrib.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) + HashedBatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/contrib/trainers/__init__.py b/twml/twml/contrib/trainers/__init__.py index 3226cd805..cc9508628 100644 --- a/twml/twml/contrib/trainers/__init__.py +++ b/twml/twml/contrib/trainers/__init__.py @@ -1,5 +1,7 @@ # pylint: disable=wildcard-import """This module contains experimental trainer classes""" -from .batch_prediction_request_trainer import BatchPredictionRequestTrainer # noqa: F401 +from .batch_prediction_request_trainer import ( + BatchPredictionRequestTrainer, +) # noqa: F401 from .pruning_data_record_trainer import PruningDataRecordTrainer # noqa: F401 -from .trainer_utils import build_keras_trainer # noqa: F401 +from .trainer_utils import build_keras_trainer # noqa: F401 diff --git a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py b/twml/twml/contrib/trainers/batch_prediction_request_trainer.py index 2effa87ed..f86186fd9 100644 --- a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py +++ b/twml/twml/contrib/trainers/batch_prediction_request_trainer.py @@ -8,173 +8,198 @@ from twml.trainers import DataRecordTrainer -class BatchPredictionRequestTrainer(DataRecordTrainer): # pylint: disable=abstract-method - """ - The ``BatchPredictionRequestTrainer`` implementation is intended to satisfy use cases - that input is BatchPredictionRequest at Twitter and also where only the build_graph methods - needs to be overridden. For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): +class BatchPredictionRequestTrainer( + DataRecordTrainer +): # pylint: disable=abstract-method """ - The BatchPredictionRequestTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, BatchPredictionRequestTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer and twml.DataRecordTrainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. 
+    The ``BatchPredictionRequestTrainer`` implementation is intended to satisfy use cases
+    where the input is a BatchPredictionRequest at Twitter and where only the build_graph method
+    needs to be overridden. For this reason, ``Trainer.[train,eval]_input_fn`` methods
+    assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format.
+
+    For use-cases that differ from this common Twitter use-case,
+    further Trainer methods can be overridden.
+    If that still doesn't provide enough flexibility, the user can always
+    use tf.estimator.Estimator or tf.session.run directly.
    """

-    # Check and update train_batch_size and eval_batch_size in params before initialization
-    # to print correct parameter logs and does not stop running
-    # This overwrites batch_size parameter constrains in twml.trainers.Trainer.check_params
-    updated_params = self.check_batch_size_params(params)
-    super(BatchPredictionRequestTrainer, self).__init__(
-      name=name, params=updated_params, build_graph_fn=build_graph_fn, **kwargs)
-
-  def check_batch_size_params(self, params):
-    """ Verify that params has the correct key,values """
-    # updated_params is an instance of tensorflow.contrib.training.HParams
-    updated_params = twml.util.convert_to_hparams(params)
-    param_values = updated_params.values()
-
-    # twml.trainers.Trainer.check_params already checks other constraints,
-    # such as being an integer
-    if 'train_batch_size' in param_values:
-      if not isinstance(updated_params.train_batch_size, int):
-        raise ValueError("Expecting params.train_batch_size to be an integer.")
-      if param_values['train_batch_size'] != 1:
-        # This can be a bit annoying to force users to pass the batch sizes,
-        # but it is good to let them know what they actually use in the models
-        # Use warning instead of ValueError in there to continue the run
-        # and print out that train_batch_size is changed
-        warnings.warn('You are processing BatchPredictionRequest data, '
-                      'train_batch_size is always 1.\n'
-                      'The number of DataRecords in a batch is determined by the size '
-                      'of each BatchPredictionRequest.\n'
-                      'If you did not pass train.batch_size or eval.batch_size, and '
-                      'the default batch_size 32 was in use,\n'
-                      'please pass --train.batch_size 1 --eval.batch_size 1')
-        # If the upper error warning, change/pass --train.batch_size 1
-        # so that train_batch_size = 1
-        updated_params.train_batch_size = 1
-
-    if 'eval_batch_size' in param_values:
-      if not isinstance(updated_params.train_batch_size, int):
-        raise ValueError('Expecting params.eval_batch_size to be an integer.')
-      if param_values['eval_batch_size'] != 1:
-        # This can be a bit annoying to force users to pass the batch sizes,
-        # but it is good to let them know what they actually use in the models
-        # Use warning instead of ValueError in there to continue the run
-        # and print out that eval_batch_size is changed
-        warnings.warn('You are processing BatchPredictionRequest data, '
-                      'eval_batch_size is also always 1.\n'
-                      'The number of DataRecords in a batch is determined by the size '
-                      'of each BatchPredictionRequest.\n'
-                      'If you did not pass train.batch_size or eval.batch_size, and '
-                      'the default batch_size 32 was in use,\n'
-                      'please pass --train.batch_size 1 --eval.batch_size 1')
-        # If the upper warning raises, change/pass --eval.batch_size 1
-        # so that eval_batch_size = 1
-        updated_params.eval_batch_size = 1
-
-    if 'eval_batch_size' not in param_values:
-      updated_params.eval_batch_size = 1
-
-    if not updated_params.eval_batch_size:
updated_params.eval_batch_size = 1 - - return updated_params - - @staticmethod - def add_batch_prediction_request_arguments(): - """ - Add commandline args to parse typically for the BatchPredictionRequestTrainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = super(BatchPredictionRequestTrainer, - BatchPredictionRequestTrainer).add_parser_arguments() - - # mlp arguments - parser.add_argument( - '--model.use_existing_discretizer', action='store_true', - dest="model_use_existing_discretizer", - help='Load a pre-trained calibration or train a new one') - parser.add_argument( - '--model.use_binary_values', action='store_true', - dest='model_use_binary_values', - help='Use the use_binary_values optimization') - - # control hom many featues we keep in sparse tensors - # 12 is enough for learning-to-rank for now - parser.add_argument( - '--input_size_bits', type=int, default=12, - help='Number of bits allocated to the input size') - - parser.add_argument( - '--loss_function', type=str, default='ranknet', - dest='loss_function', - help='Options are pairwise: ranknet (default), lambdarank, ' - 'listnet, listmle, attrank, ' - 'pointwise') - - # whether convert sparse tensors to dense tensor - # in order to use dense normalization methods - parser.add_argument( - '--use_dense_tensor', action='store_true', - dest='use_dense_tensor', - default=False, - help='If use_dense_tensor is False, ' - 'sparse tensor and spare normalization are in use. ' - 'If use_dense_tensor is True, ' - 'dense tensor and dense normalization are in use.') - - parser.add_argument( - '--dense_normalization', type=str, default='mean_max_normalizaiton', - dest='dense_normalization', - help='Options are mean_max_normalizaiton (default), standard_normalizaiton') - - parser.add_argument( - '--sparse_normalization', type=str, default='SparseMaxNorm', - dest='sparse_normalization', - help='Options are SparseMaxNorm (default), SparseBatchNorm') - - # so far only used in pairwise learning-to-rank - parser.add_argument( - '--mask', type=str, default='full_mask', - dest='mask', - help='Options are full_mask (default), diag_mask') - - return parser + def __init__(self, name, params, build_graph_fn, feature_config=None, **kwargs): + """ + The BatchPredictionRequestTrainer constructor builds a + ``tf.estimator.Estimator`` and stores it in self.estimator. + For this reason, BatchPredictionRequestTrainer accepts the same Estimator constructor arguments. + It also accepts additional arguments to facilitate metric evaluation and multi-phase training + (init_from_dir, init_map). + + Args: + parent arguments: + See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation + for a full list of arguments accepted by the parent class. + name, params, build_graph_fn (and other parent class args): + see documentation for twml.Trainer and twml.DataRecordTrainer doc. + feature_config: + An object of type FeatureConfig describing what features to decode. + Defaults to None. 
But it is needed in the following cases:
+                - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn`
+                - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`.
+
+            **kwargs:
+                further kwargs can be specified and passed to the Estimator constructor.
+        """
+
+        # Check and update train_batch_size and eval_batch_size in params before initialization
+        # so that the correct parameters are logged and the run does not stop
+        # This overwrites batch_size parameter constraints in twml.trainers.Trainer.check_params
+        updated_params = self.check_batch_size_params(params)
+        super(BatchPredictionRequestTrainer, self).__init__(
+            name=name, params=updated_params, build_graph_fn=build_graph_fn, **kwargs
+        )
+
+    def check_batch_size_params(self, params):
+        """Verify that params has the correct keys and values"""
+        # updated_params is an instance of tensorflow.contrib.training.HParams
+        updated_params = twml.util.convert_to_hparams(params)
+        param_values = updated_params.values()
+
+        # twml.trainers.Trainer.check_params already checks other constraints,
+        # such as being an integer
+        if "train_batch_size" in param_values:
+            if not isinstance(updated_params.train_batch_size, int):
+                raise ValueError("Expecting params.train_batch_size to be an integer.")
+            if param_values["train_batch_size"] != 1:
+                # This can be a bit annoying to force users to pass the batch sizes,
+                # but it is good to let them know what they actually use in the models
+                # Use a warning instead of a ValueError here to continue the run
+                # and print out that train_batch_size is changed
+                warnings.warn(
+                    "You are processing BatchPredictionRequest data, "
+                    "train_batch_size is always 1.\n"
+                    "The number of DataRecords in a batch is determined by the size "
+                    "of each BatchPredictionRequest.\n"
+                    "If you did not pass train.batch_size or eval.batch_size, and "
+                    "the default batch_size 32 was in use,\n"
+                    "please pass --train.batch_size 1 --eval.batch_size 1"
+                )
+                # If the above warning fires, change/pass --train.batch_size 1
+                # so that train_batch_size = 1
+                updated_params.train_batch_size = 1
+
+        if "eval_batch_size" in param_values:
+            if not isinstance(updated_params.eval_batch_size, int):
+                raise ValueError("Expecting params.eval_batch_size to be an integer.")
+            if param_values["eval_batch_size"] != 1:
+                # This can be a bit annoying to force users to pass the batch sizes,
+                # but it is good to let them know what they actually use in the models
+                # Use a warning instead of a ValueError here to continue the run
+                # and print out that eval_batch_size is changed
+                warnings.warn(
+                    "You are processing BatchPredictionRequest data, "
+                    "eval_batch_size is also always 1.\n"
+                    "The number of DataRecords in a batch is determined by the size "
+                    "of each BatchPredictionRequest.\n"
+                    "If you did not pass train.batch_size or eval.batch_size, and "
+                    "the default batch_size 32 was in use,\n"
+                    "please pass --train.batch_size 1 --eval.batch_size 1"
+                )
+                # If the above warning fires, change/pass --eval.batch_size 1
+                # so that eval_batch_size = 1
+                updated_params.eval_batch_size = 1
+
+        if "eval_batch_size" not in param_values:
+            updated_params.eval_batch_size = 1
+
+        if not updated_params.eval_batch_size:
+            updated_params.eval_batch_size = 1
+
+        return updated_params
+
+    @staticmethod
+    def add_batch_prediction_request_arguments():
+        """
+        Add command-line args typically needed for the BatchPredictionRequestTrainer class.
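+
+        For example (a sketch; flag values are illustrative, and --train.batch_size /
+        --eval.batch_size come from the parent Trainer parser):
+
+        >>> parser = BatchPredictionRequestTrainer.add_batch_prediction_request_arguments()
+        >>> args = parser.parse_args(
+        ...     ["--train.batch_size", "1", "--eval.batch_size", "1",
+        ...      "--loss_function", "ranknet", "--mask", "full_mask"]
+        ... )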
+        Typically, the user calls this function and then parses cmd-line arguments
+        into an argparse.Namespace object which is then passed to the Trainer constructor
+        via the params argument.
+
+        See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_
+        for a list and description of all cmd-line arguments.
+
+        Returns:
+            argparse.ArgumentParser instance with some useful args already added.
+        """
+        parser = super(
+            BatchPredictionRequestTrainer, BatchPredictionRequestTrainer
+        ).add_parser_arguments()
+
+        # mlp arguments
+        parser.add_argument(
+            "--model.use_existing_discretizer",
+            action="store_true",
+            dest="model_use_existing_discretizer",
+            help="Load a pre-trained calibration or train a new one",
+        )
+        parser.add_argument(
+            "--model.use_binary_values",
+            action="store_true",
+            dest="model_use_binary_values",
+            help="Use the use_binary_values optimization",
+        )
+
+        # control how many features we keep in sparse tensors
+        # 12 is enough for learning-to-rank for now
+        parser.add_argument(
+            "--input_size_bits",
+            type=int,
+            default=12,
+            help="Number of bits allocated to the input size",
+        )
+
+        parser.add_argument(
+            "--loss_function",
+            type=str,
+            default="ranknet",
+            dest="loss_function",
+            help="Options are pairwise: ranknet (default), lambdarank, "
+            "listnet, listmle, attrank, "
+            "pointwise",
+        )
+
+        # whether to convert sparse tensors to dense tensors
+        # in order to use dense normalization methods
+        parser.add_argument(
+            "--use_dense_tensor",
+            action="store_true",
+            dest="use_dense_tensor",
+            default=False,
+            help="If use_dense_tensor is False, "
+            "sparse tensors and sparse normalization are in use. "
+            "If use_dense_tensor is True, "
+            "dense tensors and dense normalization are in use.",
+        )
+
+        parser.add_argument(
+            "--dense_normalization",
+            type=str,
+            default="mean_max_normalizaiton",
+            dest="dense_normalization",
+            help="Options are mean_max_normalizaiton (default), standard_normalizaiton",
+        )
+
+        parser.add_argument(
+            "--sparse_normalization",
+            type=str,
+            default="SparseMaxNorm",
+            dest="sparse_normalization",
+            help="Options are SparseMaxNorm (default), SparseBatchNorm",
+        )
+
+        # so far only used in pairwise learning-to-rank
+        parser.add_argument(
+            "--mask",
+            type=str,
+            default="full_mask",
+            dest="mask",
+            help="Options are full_mask (default), diag_mask",
+        )
+
+        return parser
diff --git a/twml/twml/contrib/trainers/pruning_data_record_trainer.py b/twml/twml/contrib/trainers/pruning_data_record_trainer.py
index 4796e5390..55a39f9b2 100644
--- a/twml/twml/contrib/trainers/pruning_data_record_trainer.py
+++ b/twml/twml/contrib/trainers/pruning_data_record_trainer.py
@@ -1,59 +1,77 @@
 import tensorflow.compat.v1 as tf

-from twml.trainers import DataRecordTrainer
 from twml.contrib.optimizers import PruningOptimizer
+from twml.trainers import DataRecordTrainer


 class PruningDataRecordTrainer(DataRecordTrainer):
-  @staticmethod
-  def get_train_op(params, loss):
-    train_op = DataRecordTrainer.get_train_op(params, loss)
-
-    optimizer = PruningOptimizer(learning_rate=params.get('learning_rate'))
-
-    return optimizer.minimize(
-      loss=loss,
-      prune_every=params.get('pruning_iter', 5000),
-      burn_in=params.get('pruning_burn_in', 100000),
-      decay=params.get('pruning_decay', .9999),
-      flops_target=params.get('pruning_flops_target', 250000),
-      update_params=train_op,
-      global_step=tf.train.get_global_step())
-
-  def __init__(self, name, params, build_graph_fn, feature_config=None, **kwargs):
-    kwargs['optimize_loss_fn'] = self.get_train_op
-
-    
super(PruningDataRecordTrainer, self).__init__(
-      name=name,
-      params=params,
-      build_graph_fn=build_graph_fn,
-      feature_config=feature_config,
-      **kwargs)
-
-  def export_model(self, *args, **kwargs):
-    # TODO: modify graph before exporting to take into account masks
-    return super(PruningDataRecordTrainer, self).export_model(*args, **kwargs)
-
-  @staticmethod
-  def add_parser_arguments():
-    parser = DataRecordTrainer.add_parser_arguments()
-    parser.add_argument(
-      "--pruning.iter", "--pruning_iter", type=int, default=5000,
-      dest="pruning_iter",
-      help="A single feature or feature map is pruned every this many iterations")
-    parser.add_argument(
-      "--pruning.burn_in", "--pruning_burn_in", type=int, default=100000,
-      dest="pruning_burn_in",
-      help="Only start pruning after collecting statistics for this many training steps")
-    parser.add_argument(
-      "--pruning.flops_target", "--pruning_flops_target", type=int, default=250000,
-      dest="pruning_flops_target",
-      help="Stop pruning when estimated number of floating point operations reached this target. \
-      For example, a small feed-forward network might require 250,000 FLOPs to run.")
-    parser.add_argument(
-      "--pruning.decay", "--pruning_decay", type=float, default=.9999,
-      dest="pruning_decay",
-      help="A float value in [0.0, 1.0) controlling an exponential moving average of pruning \
+    @staticmethod
+    def get_train_op(params, loss):
+        train_op = DataRecordTrainer.get_train_op(params, loss)
+
+        optimizer = PruningOptimizer(learning_rate=params.get("learning_rate"))
+
+        return optimizer.minimize(
+            loss=loss,
+            prune_every=params.get("pruning_iter", 5000),
+            burn_in=params.get("pruning_burn_in", 100000),
+            decay=params.get("pruning_decay", 0.9999),
+            flops_target=params.get("pruning_flops_target", 250000),
+            update_params=train_op,
+            global_step=tf.train.get_global_step(),
+        )
+
+    def __init__(self, name, params, build_graph_fn, feature_config=None, **kwargs):
+        kwargs["optimize_loss_fn"] = self.get_train_op
+
+        super(PruningDataRecordTrainer, self).__init__(
+            name=name,
+            params=params,
+            build_graph_fn=build_graph_fn,
+            feature_config=feature_config,
+            **kwargs
+        )
+
+    def export_model(self, *args, **kwargs):
+        # TODO: modify graph before exporting to take into account masks
+        return super(PruningDataRecordTrainer, self).export_model(*args, **kwargs)
+
+    @staticmethod
+    def add_parser_arguments():
+        parser = DataRecordTrainer.add_parser_arguments()
+        parser.add_argument(
+            "--pruning.iter",
+            "--pruning_iter",
+            type=int,
+            default=5000,
+            dest="pruning_iter",
+            help="A single feature or feature map is pruned every this many iterations",
+        )
+        parser.add_argument(
+            "--pruning.burn_in",
+            "--pruning_burn_in",
+            type=int,
+            default=100000,
+            dest="pruning_burn_in",
+            help="Only start pruning after collecting statistics for this many training steps",
+        )
+        parser.add_argument(
+            "--pruning.flops_target",
+            "--pruning_flops_target",
+            type=int,
+            default=250000,
+            dest="pruning_flops_target",
+            help="Stop pruning when the estimated number of floating point operations reaches this target. \
+            For example, a small feed-forward network might require 250,000 FLOPs to run.",
+        )
+        parser.add_argument(
+            "--pruning.decay",
+            "--pruning_decay",
+            type=float,
+            default=0.9999,
+            dest="pruning_decay",
+            help="A float value in [0.0, 1.0) controlling an exponential moving average of pruning \
signal statistics. 
A value of 0.9999 can be thought of as averaging statistics over 10,000 \ - steps.") - return parser + steps.", + ) + return parser diff --git a/twml/twml/contrib/trainers/trainer_utils.py b/twml/twml/contrib/trainers/trainer_utils.py index f279571be..9be8fa8d4 100644 --- a/twml/twml/contrib/trainers/trainer_utils.py +++ b/twml/twml/contrib/trainers/trainer_utils.py @@ -24,88 +24,82 @@ .. note: this util handles the most common case. If you have cases not satisfied by this util, consider writing your own build_graph to wrap your keras models. """ -from twitter.deepbird.hparam import HParams - import tensorflow # noqa: F401 import tensorflow.compat.v2 as tf +from twitter.deepbird.hparam import HParams import twml def build_keras_trainer( - name, - model_factory, - save_dir, - loss_fn=None, - metrics_fn=None, - **kwargs): - """ - Compile the given model_factory into a twml Trainer. + name, model_factory, save_dir, loss_fn=None, metrics_fn=None, **kwargs +): + """ + Compile the given model_factory into a twml Trainer. - Args: - name: a string name for the returned twml Trainer. + Args: + name: a string name for the returned twml Trainer. - model_factory: a callable that returns a keras model when called. - This keras model is expected to solve a binary classification problem. - This keras model takes a dict of tensors as input, and outputs a logit or probability. + model_factory: a callable that returns a keras model when called. + This keras model is expected to solve a binary classification problem. + This keras model takes a dict of tensors as input, and outputs a logit or probability. - save_dir: a directory where the trainer saves data. Can be an HDFS path. + save_dir: a directory where the trainer saves data. Can be an HDFS path. - loss_fn: the loss function to use. Defaults to tf.keras.losses.BinaryCrossentropy. + loss_fn: the loss function to use. Defaults to tf.keras.losses.BinaryCrossentropy. - metrics_fn: metrics function used by TensorFlow estimators. - Defaults to twml.metrics.get_binary_class_metric_fn(). + metrics_fn: metrics function used by TensorFlow estimators. + Defaults to twml.metrics.get_binary_class_metric_fn(). - **kwargs: for people familiar with twml Trainer's options, they can be passed in here - as kwargs, and they will be forwarded to Trainer as opts. - See https://cgit.twitter.biz/source/tree/twml/twml/argument_parser.py#n43 for available args. + **kwargs: for people familiar with twml Trainer's options, they can be passed in here + as kwargs, and they will be forwarded to Trainer as opts. + See https://cgit.twitter.biz/source/tree/twml/twml/argument_parser.py#n43 for available args. - Returns: - a twml.trainers.Trainer object which can be used for training and exporting models. - """ - build_graph = create_build_graph_fn(model_factory, loss_fn) + Returns: + a twml.trainers.Trainer object which can be used for training and exporting models. 
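+
+    A minimal sketch (hypothetical model factory and save path); the model takes a dict of
+    tensors and outputs a probability:
+
+    >>> def model_factory():
+    ...     inputs = {"continuous": tf.keras.Input(shape=(10,))}
+    ...     output = tf.keras.layers.Dense(1, activation="sigmoid")(inputs["continuous"])
+    ...     return tf.keras.Model(inputs, output)
+    >>> trainer = build_keras_trainer(
+    ...     name="example", model_factory=model_factory, save_dir="/tmp/example"
+    ... )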
+ """ + build_graph = create_build_graph_fn(model_factory, loss_fn) - if metrics_fn is None: - metrics_fn = twml.metrics.get_binary_class_metric_fn() + if metrics_fn is None: + metrics_fn = twml.metrics.get_binary_class_metric_fn() - opts = HParams(**kwargs) - opts.add_hparam('save_dir', save_dir) + opts = HParams(**kwargs) + opts.add_hparam("save_dir", save_dir) - return twml.trainers.Trainer( - name, - opts, - build_graph_fn=build_graph, - save_dir=save_dir, - metric_fn=metrics_fn) + return twml.trainers.Trainer( + name, opts, build_graph_fn=build_graph, save_dir=save_dir, metric_fn=metrics_fn + ) def create_build_graph_fn(model_factory, loss_fn=None): - """Create a build graph function from the given keras model.""" - - def build_graph(features, label, mode, params, config=None): - # create model from model factory. - model = model_factory() - - # create loss function if the user didn't specify one. - if loss_fn is None: - build_graph_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False) - else: - build_graph_loss_fn = loss_fn - - output = model(features) - if mode == 'infer': - loss = None - else: - weights = features.get('weights', None) - loss = build_graph_loss_fn(y_true=label, y_pred=output, sample_weight=weights) - - if isinstance(output, dict): - if loss is None: - return output - else: - output['loss'] = loss - return output - else: - return {'output': output, 'loss': loss} - - return build_graph + """Create a build graph function from the given keras model.""" + + def build_graph(features, label, mode, params, config=None): + # create model from model factory. + model = model_factory() + + # create loss function if the user didn't specify one. + if loss_fn is None: + build_graph_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False) + else: + build_graph_loss_fn = loss_fn + + output = model(features) + if mode == "infer": + loss = None + else: + weights = features.get("weights", None) + loss = build_graph_loss_fn( + y_true=label, y_pred=output, sample_weight=weights + ) + + if isinstance(output, dict): + if loss is None: + return output + else: + output["loss"] = loss + return output + else: + return {"output": output, "loss": loss} + + return build_graph diff --git a/twml/twml/contrib/utils/__init__.py b/twml/twml/contrib/utils/__init__.py index 56a083048..faa93031c 100644 --- a/twml/twml/contrib/utils/__init__.py +++ b/twml/twml/contrib/utils/__init__.py @@ -1,18 +1,24 @@ # pylint: disable=wildcard-import """This module contains experimental util functions for contrib.""" -from .math_fns import safe_div, safe_log, cal_ndcg, cal_swapped_ndcg # noqa: F401 -from .masks import diag_mask, full_mask # noqa: F401 -from .normalizer import mean_max_normalizaiton, standard_normalizaiton # noqa: F401 -from .scores import get_pairwise_scores, get_pairwise_label_scores # noqa: F401 -# pointwise functions -from .loss_fns import get_pointwise_loss # noqa: F401 -# ranknet functions -from .loss_fns import get_pair_loss # noqa: F401 -# listwise functions -from .loss_fns import get_attrank_loss, get_listnet_loss, get_listmle_loss # noqa: F401 +from . 
import interp # noqa: F401 +from .device import ( + get_device_map, + get_gpu_count, + get_gpu_list, # noqa: F401 + is_gpu_available, +) + # lambdarank functions +# listwise functions +# ranknet functions +# pointwise functions from .loss_fns import get_lambda_pair_loss # noqa: F401 -from .device import get_device_map, get_gpu_list, get_gpu_count, is_gpu_available # noqa: F401 +from .loss_fns import get_pair_loss # noqa: F401 +from .loss_fns import get_pointwise_loss # noqa: F401 +from .loss_fns import get_attrank_loss, get_listmle_loss, get_listnet_loss # noqa: F401 +from .masks import diag_mask, full_mask # noqa: F401 +from .math_fns import cal_ndcg, cal_swapped_ndcg, safe_div, safe_log # noqa: F401 +from .normalizer import mean_max_normalizaiton, standard_normalizaiton # noqa: F401 +from .scores import get_pairwise_label_scores, get_pairwise_scores # noqa: F401 from .similarities import cosine_similarity # noqa: F401 -from . import interp # noqa: F401 diff --git a/twml/twml/contrib/utils/datasets.py b/twml/twml/contrib/utils/datasets.py index d31ea3ae4..3c3d8022a 100644 --- a/twml/twml/contrib/utils/datasets.py +++ b/twml/twml/contrib/utils/datasets.py @@ -6,88 +6,89 @@ def resolve_train_and_eval_files_overlap( - train_files, eval_files, fraction_kept_for_eval, seed=None + train_files, eval_files, fraction_kept_for_eval, seed=None ): - """Resolve any overlap between train and eval files. + """Resolve any overlap between train and eval files. - Specifically, if there's an overlap between `train_files` and `eval_files`, then a fraction of - the overlap (i.e. `fraction_kept_for_eval`) will be randomly assigned (exclusively) to the - `eval_files`. + Specifically, if there's an overlap between `train_files` and `eval_files`, then a fraction of + the overlap (i.e. `fraction_kept_for_eval`) will be randomly assigned (exclusively) to the + `eval_files`. - The following example demonstrates its usage: + The following example demonstrates its usage: - >>> orig_train_files = ['f1', 'f2', 'f3', 'f4'] - >>> orig_eval_files = ['f1', 'f2', 'f3'] - >>> resolved_train_files, resolved_eval_files = resolve_train_and_eval_files_overlap( - ... orig_train_files, orig_eval_files, 0.5 - ... ) - >>> set(resolved_train_files) & set(resolved_eval_files) == set() - True - >>> len(resolved_train_files) == 3 - True - >>> len(resolved_eval_files) == 2 - True + >>> orig_train_files = ['f1', 'f2', 'f3', 'f4'] + >>> orig_eval_files = ['f1', 'f2', 'f3'] + >>> resolved_train_files, resolved_eval_files = resolve_train_and_eval_files_overlap( + ... orig_train_files, orig_eval_files, 0.5 + ... ) + >>> set(resolved_train_files) & set(resolved_eval_files) == set() + True + >>> len(resolved_train_files) == 3 + True + >>> len(resolved_eval_files) == 2 + True - Args: - train_files: A list of the files used for training. - eval_files: A list of the files used for validation. - fraction_kept_for_eval: A fraction of files in the intersection between `train_files` and - `eval_files` exclusively kept for evaluation. - seed: A seed for generating random numbers. + Args: + train_files: A list of the files used for training. + eval_files: A list of the files used for validation. + fraction_kept_for_eval: A fraction of files in the intersection between `train_files` and + `eval_files` exclusively kept for evaluation. + seed: A seed for generating random numbers. - Returns: - A tuple `(new_train_files, new_eval_files)` with the overlapping resolved. 
- """ + Returns: + A tuple `(new_train_files, new_eval_files)` with the overlapping resolved. + """ - rng = random.Random(seed) + rng = random.Random(seed) - train_files = set(train_files) - eval_files = set(eval_files) - overlapping_files = train_files & eval_files - train_files_selected_for_eval = set(rng.sample( - overlapping_files, - int(len(overlapping_files) * fraction_kept_for_eval) - )) - train_files = train_files - train_files_selected_for_eval - eval_files = (eval_files - overlapping_files) | train_files_selected_for_eval - return list(train_files), list(eval_files) + train_files = set(train_files) + eval_files = set(eval_files) + overlapping_files = train_files & eval_files + train_files_selected_for_eval = set( + rng.sample( + overlapping_files, int(len(overlapping_files) * fraction_kept_for_eval) + ) + ) + train_files = train_files - train_files_selected_for_eval + eval_files = (eval_files - overlapping_files) | train_files_selected_for_eval + return list(train_files), list(eval_files) def get_time_based_dataset_files_for_train_and_eval( - base_path, - train_start_datetime, - train_end_datetime, - eval_start_datetime, - eval_end_datetime, - fraction_kept_for_eval, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1 + base_path, + train_start_datetime, + train_end_datetime, + eval_start_datetime, + eval_end_datetime, + fraction_kept_for_eval, + datetime_prefix_format="%Y/%m/%d/%H", + extension="lzo", + parallelism=1, ): - """Get train/eval dataset files organized with a time-based prefix. + """Get train/eval dataset files organized with a time-based prefix. - This is just a convenience built around `get_dataset_files_prefixed_by_time` and - `resolve_train_and_eval_files_overlap`. Please refer to these functions for documentation. - """ + This is just a convenience built around `get_dataset_files_prefixed_by_time` and + `resolve_train_and_eval_files_overlap`. Please refer to these functions for documentation. 
+ """ - train_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=train_start_datetime, - end_datetime=train_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - eval_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=eval_start_datetime, - end_datetime=eval_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - return resolve_train_and_eval_files_overlap( - train_files=train_files, - eval_files=eval_files, - fraction_kept_for_eval=fraction_kept_for_eval - ) + train_files = get_time_based_dataset_files( + base_path=base_path, + start_datetime=train_start_datetime, + end_datetime=train_end_datetime, + datetime_prefix_format=datetime_prefix_format, + extension=extension, + parallelism=parallelism, + ) + eval_files = get_time_based_dataset_files( + base_path=base_path, + start_datetime=eval_start_datetime, + end_datetime=eval_end_datetime, + datetime_prefix_format=datetime_prefix_format, + extension=extension, + parallelism=parallelism, + ) + return resolve_train_and_eval_files_overlap( + train_files=train_files, + eval_files=eval_files, + fraction_kept_for_eval=fraction_kept_for_eval, + ) diff --git a/twml/twml/contrib/utils/device.py b/twml/twml/contrib/utils/device.py index ab189c98a..c3f358b23 100644 --- a/twml/twml/contrib/utils/device.py +++ b/twml/twml/contrib/utils/device.py @@ -6,22 +6,22 @@ def get_device_map(): - """Returns the map of device name to device type""" - local_device_protos = device_lib.list_local_devices() - return {x.name: x.device_type for x in local_device_protos} + """Returns the map of device name to device type""" + local_device_protos = device_lib.list_local_devices() + return {x.name: x.device_type for x in local_device_protos} def get_gpu_list(): - """Returns the list of GPUs available""" - device_map = get_device_map() - return [name for name in device_map if device_map[name] == 'GPU'] + """Returns the list of GPUs available""" + device_map = get_device_map() + return [name for name in device_map if device_map[name] == "GPU"] def get_gpu_count(): - """Returns the count of GPUs available""" - return len(get_gpu_list()) + """Returns the count of GPUs available""" + return len(get_gpu_list()) def is_gpu_available(): - """Returns if GPUs are available""" - return get_gpu_count() > 0 + """Returns if GPUs are available""" + return get_gpu_count() > 0 diff --git a/twml/twml/contrib/utils/interp.py b/twml/twml/contrib/utils/interp.py index 419d89030..35af00988 100644 --- a/twml/twml/contrib/utils/interp.py +++ b/twml/twml/contrib/utils/interp.py @@ -4,91 +4,97 @@ import libtwml import tensorflow.compat.v1 as tf + import twml def linear_interp1(inputs, ref_inputs, ref_outputs): - """ - Perform 1D linear interpolation. - Arguments: - inputs: - The query input values. - ref_inputs: - Reference grid points used for interpolation. - ref_outputs: - Reference output values used for interpolation. - - Returns: - The interpolated outputs for the requested input values. - """ - - inputs = tf.convert_to_tensor(inputs) - ref_inputs = tf.convert_to_tensor(ref_inputs) - ref_outputs = tf.convert_to_tensor(ref_outputs) - - ndims = inputs.shape.ndims - ref_inputs_ndims = ref_inputs.shape.ndims - ref_outputs_ndims = ref_inputs.shape.ndims - - if (ref_inputs_ndims != ndims): - raise ValueError("Dimension mismatch. 
inputs: %d, ref_inputs: %d" % (ndims, ref_inputs_ndims))
-
-  if (ref_outputs_ndims != ndims):
-    raise ValueError("Dimension mismatch. inputs: %d, ref_outputs: %d" % (ndims, ref_outputs_ndims))
-
-  if ndims > 2:
-    raise ValueError("Input dimensions should be < 2D. But got %d." % ndims)
-
-  original_input_shape = tf.shape(inputs)
-  # This is needed because isotonic_calibration expects:
-  # - inputs of size [num_samples, num_classes]
-  # - ref_inputs, ref_outputs of size [num_classes, num_bins]
-  inputs = tf.reshape(inputs, [-1, 1])
-  ref_inputs = tf.reshape(ref_inputs, [1, -1])
-  ref_outputs = tf.reshape(ref_outputs, [1, -1])
-
-  # isotonic_calibration is simply doing linear interpolation.
-  # This needs to be renamed in the future to make it consistent.
-  outputs = libtwml.ops.isotonic_calibration(inputs, ref_inputs, ref_outputs)
-  return tf.reshape(outputs, original_input_shape)
+    """
+    Perform 1D linear interpolation.
+    Arguments:
+        inputs:
+            The query input values.
+        ref_inputs:
+            Reference grid points used for interpolation.
+        ref_outputs:
+            Reference output values used for interpolation.
+
+    Returns:
+        The interpolated outputs for the requested input values.
+    """
+
+    inputs = tf.convert_to_tensor(inputs)
+    ref_inputs = tf.convert_to_tensor(ref_inputs)
+    ref_outputs = tf.convert_to_tensor(ref_outputs)
+
+    ndims = inputs.shape.ndims
+    ref_inputs_ndims = ref_inputs.shape.ndims
+    ref_outputs_ndims = ref_outputs.shape.ndims
+
+    if ref_inputs_ndims != ndims:
+        raise ValueError(
+            "Dimension mismatch. inputs: %d, ref_inputs: %d" % (ndims, ref_inputs_ndims)
+        )
+
+    if ref_outputs_ndims != ndims:
+        raise ValueError(
+            "Dimension mismatch. inputs: %d, ref_outputs: %d"
+            % (ndims, ref_outputs_ndims)
+        )
+
+    if ndims > 2:
+        raise ValueError("Input dimensions should be at most 2D. But got %d." % ndims)
+
+    original_input_shape = tf.shape(inputs)
+    # This is needed because isotonic_calibration expects:
+    # - inputs of size [num_samples, num_classes]
+    # - ref_inputs, ref_outputs of size [num_classes, num_bins]
+    inputs = tf.reshape(inputs, [-1, 1])
+    ref_inputs = tf.reshape(ref_inputs, [1, -1])
+    ref_outputs = tf.reshape(ref_outputs, [1, -1])
+
+    # isotonic_calibration is simply doing linear interpolation.
+    # This needs to be renamed in the future to make it consistent.
+    outputs = libtwml.ops.isotonic_calibration(inputs, ref_inputs, ref_outputs)
+    return tf.reshape(outputs, original_input_shape)


 def linear_interp1_by_class(inputs, input_classes, ref_inputs, ref_outputs):
-  """
-  Perform 1D linear interpolation.
-  Arguments:
-    inputs:
-      The query input values.
-    input_classes:
-      The class index to use from the reference grid.
-    ref_inputs:
-      Reference 2D grid points used for interpolation.
-      Each row denotes the grid from a different class.
-    ref_outputs:
-      Reference 2D output values used for interpolation.
-      Each row denotes the grid from a different class.
-
-  Returns:
-    The interpolated outputs for the requested input values.
-  """
-
-  inputs = tf.convert_to_tensor(inputs)
-  input_classes = tf.convert_to_tensor(input_classes)
-  ref_inputs = tf.convert_to_tensor(ref_inputs)
-  ref_outputs = tf.convert_to_tensor(ref_outputs)
-
-  original_input_shape = tf.shape(inputs)
-
-  # pass through
-  def in_func(x):
-    return x
-
-  # indexed function
-  def cond_func(i, fn):
-    idx = input_classes[i]
-    x = tf.expand_dims(fn(), axis=0)
-    return linear_interp1(x, ref_inputs[idx], ref_outputs[idx])
-
-  # Use while loop for now, needs to be replace by a custom C++ op later.
- outputs = twml.util.batch_apply(in_func, inputs, cond_func=cond_func) - return tf.reshape(outputs, original_input_shape) + """ + Perform 1D linear interpolation. + Arguments: + inputs: + The query input values. + input_classes: + The class index to use from the reference grid. + ref_inputs: + Reference 2D grid points used for interpolation. + Each row denotes the grid from a different class. + ref_outputs: + Reference 2D output values used for interpolation. + Each row denotes the grid from a different class. + + Returns: + The interpolated outputs for the requested input values. + """ + + inputs = tf.convert_to_tensor(inputs) + input_classes = tf.convert_to_tensor(input_classes) + ref_inputs = tf.convert_to_tensor(ref_inputs) + ref_outputs = tf.convert_to_tensor(ref_outputs) + + original_input_shape = tf.shape(inputs) + + # pass through + def in_func(x): + return x + + # indexed function + def cond_func(i, fn): + idx = input_classes[i] + x = tf.expand_dims(fn(), axis=0) + return linear_interp1(x, ref_inputs[idx], ref_outputs[idx]) + + # Use while loop for now, needs to be replace by a custom C++ op later. + outputs = twml.util.batch_apply(in_func, inputs, cond_func=cond_func) + return tf.reshape(outputs, original_input_shape) diff --git a/twml/twml/contrib/utils/loss_fns.py b/twml/twml/contrib/utils/loss_fns.py index eb25b430a..f41be8a11 100644 --- a/twml/twml/contrib/utils/loss_fns.py +++ b/twml/twml/contrib/utils/loss_fns.py @@ -1,302 +1,335 @@ import tensorflow.compat.v1 as tf + from twml.contrib.utils import masks, math_fns -def get_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params): - """ - Paiwise learning-to-rank ranknet loss - Check paper https://www.microsoft.com/en-us/research/publication/ - learning-to-rank-using-gradient-descent/ - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count)) - return loss - - -def get_lambda_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params, swapped_ndcg): - """ - Paiwise learning-to-rank lambdarank loss - faster than the previous gradient method - Note: this loss depends on ranknet cross-entropy - delta NDCG is applied to ranknet cross-entropy - Hence, it is still a gradient descent method - Check paper http://citeseerx.ist.psu.edu/viewdoc/ - download?doi=10.1.1.180.634&rep=rep1&type=pdf for more information - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - 
swapped_ndcg: swapped ndcg of shape [n_data, n_data] - ndcg values when swapping each pair in the prediction ranking order - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count, swapped_ndcg)) - return loss - - -def _get_average_cross_entropy_loss(pairwise_label_scores, pairwise_predicted_scores, - mask, pair_count, swapped_ndcg=None): - """ - Average the loss for a batchPredictionRequest based on a desired number of pairs - """ - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=pairwise_label_scores, - logits=pairwise_predicted_scores) - loss = mask * loss - if swapped_ndcg is not None: - loss = loss * swapped_ndcg - loss = tf.reduce_sum(loss) / pair_count - return loss +def get_pair_loss(pairwise_label_scores, pairwise_predicted_scores, params): + """ + Paiwise learning-to-rank ranknet loss + Check paper https://www.microsoft.com/en-us/research/publication/ + learning-to-rank-using-gradient-descent/ + for more information + Args: + pairwise_label_scores: a dense tensor of shape [n_data, n_data] + pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] + n_data is the number of tweet candidates in a BatchPredictionRequest + params: network parameters + mask options: full_mask and diag_mask + Returns: + average loss over pairs defined by the masks + """ + n_data = tf.shape(pairwise_label_scores)[0] + if params.mask == "full_mask": + # full_mask that only covers pairs that have different labels + # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) + mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) + else: + # diag_mask that covers all pairs + # (only selfs/diags are 0s) + mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) + + # pairwise sigmoid_cross_entropy_with_logits loss + loss = tf.cond( + tf.equal(pair_count, 0), + lambda: 0.0, + lambda: _get_average_cross_entropy_loss( + pairwise_label_scores, pairwise_predicted_scores, mask, pair_count + ), + ) + return loss + + +def get_lambda_pair_loss( + pairwise_label_scores, pairwise_predicted_scores, params, swapped_ndcg +): + """ + Paiwise learning-to-rank lambdarank loss + faster than the previous gradient method + Note: this loss depends on ranknet cross-entropy + delta NDCG is applied to ranknet cross-entropy + Hence, it is still a gradient descent method + Check paper http://citeseerx.ist.psu.edu/viewdoc/ + download?doi=10.1.1.180.634&rep=rep1&type=pdf for more information + for more information + Args: + pairwise_label_scores: a dense tensor of shape [n_data, n_data] + pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] + n_data is the number of tweet candidates in a BatchPredictionRequest + params: network parameters + swapped_ndcg: swapped ndcg of shape [n_data, n_data] + ndcg values when swapping each pair in the prediction ranking order + mask options: full_mask and diag_mask + 
Returns: + average loss over pairs defined by the masks + """ + n_data = tf.shape(pairwise_label_scores)[0] + if params.mask == "full_mask": + # full_mask that only covers pairs that have different labels + # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) + mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) + else: + # diag_mask that covers all pairs + # (only selfs/diags are 0s) + mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) + + # pairwise sigmoid_cross_entropy_with_logits loss + loss = tf.cond( + tf.equal(pair_count, 0), + lambda: 0.0, + lambda: _get_average_cross_entropy_loss( + pairwise_label_scores, + pairwise_predicted_scores, + mask, + pair_count, + swapped_ndcg, + ), + ) + return loss + + +def _get_average_cross_entropy_loss( + pairwise_label_scores, + pairwise_predicted_scores, + mask, + pair_count, + swapped_ndcg=None, +): + """ + Average the loss for a batchPredictionRequest based on a desired number of pairs + """ + loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=pairwise_label_scores, logits=pairwise_predicted_scores + ) + loss = mask * loss + if swapped_ndcg is not None: + loss = loss * swapped_ndcg + loss = tf.reduce_sum(loss) / pair_count + return loss def get_listmle_loss(labels, predicted_scores): - r""" - listwise learning-to-rank listMLE loss - Note: Simplified MLE formula is used in here (omit the proof in here) - \sum_{s=1}^{n-1} (-predicted_scores + ln(\sum_{i=s}^n exp(predicted_scores))) - n is tf.shape(predicted_scores)[0] - Check paper http://icml2008.cs.helsinki.fi/papers/167.pdf for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - Returns: - average loss - """ - labels = tf.reshape(labels, [-1, 1]) - n_data = tf.shape(labels)[0] - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - - predicted_scores_ordered_by_labels = _get_ordered_predicted_scores(labels, - predicted_scores, n_data) - - loss = (-1) * tf.reduce_sum(predicted_scores) - # sum over 1 to n_data - 1 - temp = tf.gather(predicted_scores_ordered_by_labels, [n_data - 1]) - temp = tf.reshape(temp, []) - loss = tf.add(loss, temp) - - exps = tf.exp(predicted_scores_ordered_by_labels) - exp_sum = tf.reduce_sum(exps) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - - iteration = tf.constant(0) - - def _cond(iteration, loss, exp_sum, exp): - return tf.less(iteration, n_data - 2) - - def _gen_loop_body(): - def loop_body(iteration, loss, exp_sum, exps): - temp = tf.gather(exps, [iteration]) - temp = tf.reshape(temp, []) - exp_sum = tf.subtract(exp_sum, temp) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - return tf.add(iteration, 1), loss, exp_sum, exps - return loop_body - - iteration, loss, exp_sum, exps = tf.while_loop(_cond, _gen_loop_body(), - (iteration, loss, exp_sum, exps)) - loss = loss / tf.cast(n_data, dtype=tf.float32) - return loss + r""" + listwise learning-to-rank listMLE loss + Note: Simplified MLE formula is used in here (omit the proof in here) + \sum_{s=1}^{n-1} (-predicted_scores + ln(\sum_{i=s}^n exp(predicted_scores))) + n is tf.shape(predicted_scores)[0] + Check paper http://icml2008.cs.helsinki.fi/papers/167.pdf for more information + Args: + labels: a dense tensor of shape [n_data, 1] + n_data is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: a dense tensor 
of same shape and type as labels + Returns: + average loss + """ + labels = tf.reshape(labels, [-1, 1]) + n_data = tf.shape(labels)[0] + predicted_scores = tf.reshape(predicted_scores, [-1, 1]) + + predicted_scores_ordered_by_labels = _get_ordered_predicted_scores( + labels, predicted_scores, n_data + ) + + loss = (-1) * tf.reduce_sum(predicted_scores) + # sum over 1 to n_data - 1 + temp = tf.gather(predicted_scores_ordered_by_labels, [n_data - 1]) + temp = tf.reshape(temp, []) + loss = tf.add(loss, temp) + + exps = tf.exp(predicted_scores_ordered_by_labels) + exp_sum = tf.reduce_sum(exps) + # clip exp_sum for safer log + loss = tf.add(loss, math_fns.safe_log(exp_sum)) + + iteration = tf.constant(0) + + def _cond(iteration, loss, exp_sum, exp): + return tf.less(iteration, n_data - 2) + + def _gen_loop_body(): + def loop_body(iteration, loss, exp_sum, exps): + temp = tf.gather(exps, [iteration]) + temp = tf.reshape(temp, []) + exp_sum = tf.subtract(exp_sum, temp) + # clip exp_sum for safer log + loss = tf.add(loss, math_fns.safe_log(exp_sum)) + return tf.add(iteration, 1), loss, exp_sum, exps + + return loop_body + + iteration, loss, exp_sum, exps = tf.while_loop( + _cond, _gen_loop_body(), (iteration, loss, exp_sum, exps) + ) + loss = loss / tf.cast(n_data, dtype=tf.float32) + return loss def _get_ordered_predicted_scores(labels, predicted_scores, n_data): - """ - Order predicted_scores based on sorted labels - """ - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(labels), k=n_data) - ordered_labels_indices = tf.transpose(ordered_labels_indices) - predicted_scores_ordered_by_labels = tf.gather_nd(predicted_scores, - ordered_labels_indices) - return predicted_scores_ordered_by_labels + """ + Order predicted_scores based on sorted labels + """ + sorted_labels, ordered_labels_indices = tf.nn.top_k(tf.transpose(labels), k=n_data) + ordered_labels_indices = tf.transpose(ordered_labels_indices) + predicted_scores_ordered_by_labels = tf.gather_nd( + predicted_scores, ordered_labels_indices + ) + return predicted_scores_ordered_by_labels def get_attrank_loss(labels, predicted_scores, weights=None): - """ - Modified listwise learning-to-rank AttRank loss - Check paper https://arxiv.org/abs/1804.05936 for more information - Note: there is an inconsistency between the paper statement and - their public code - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # The authors immeplemented the following, which is basically listnet - # attention_labels = _get_attentions(labels) - # attention_labels = tf.reshape(attention_labels, [1, -1]) - # predicted_scores = tf.reshape(predicted_scores, [1, -1]) - # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=attention_labels, - # logits=predicted_scores)) - - # The paper proposed the following - # attention_labels = _get_attentions(labels) - # # However the following line is wrong based on their statement - # # as _get_attentions can give 0 results when input < 0 - # # and the result cannot be used in _get_attrank_cross_entropy - # # log(a_i^S) - # # attention_predicted_scores = _get_attentions(predicted_scores) - # loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - # # the range of attention_predicted_scores is [0, 1) - # # this gives sigmoid [0.5, 0.732) - # # hence, 
it is not good to use in sigmoid_cross_entropy_with_logits either - - # Implemented the following instead - # _get_attentions is applied to labels - # softmax is applied to predicted_scores - reshaped_labels = tf.reshape(labels, [1, -1]) - attention_labels = _get_attentions(reshaped_labels) - reshaped_predicted_scores = tf.reshape(predicted_scores, [1, -1]) - attention_predicted_scores = tf.nn.softmax(reshaped_predicted_scores) - loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - return loss + """ + Modified listwise learning-to-rank AttRank loss + Check paper https://arxiv.org/abs/1804.05936 for more information + Note: there is an inconsistency between the paper statement and + their public code + Args: + labels: a dense tensor of shape [n_data, 1] + n_data is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: a dense tensor of same shape and type as labels + weights: a dense tensor of the same shape as labels + Returns: + average loss + """ + # The authors immeplemented the following, which is basically listnet + # attention_labels = _get_attentions(labels) + # attention_labels = tf.reshape(attention_labels, [1, -1]) + # predicted_scores = tf.reshape(predicted_scores, [1, -1]) + # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=attention_labels, + # logits=predicted_scores)) + + # The paper proposed the following + # attention_labels = _get_attentions(labels) + # # However the following line is wrong based on their statement + # # as _get_attentions can give 0 results when input < 0 + # # and the result cannot be used in _get_attrank_cross_entropy + # # log(a_i^S) + # # attention_predicted_scores = _get_attentions(predicted_scores) + # loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) + # # the range of attention_predicted_scores is [0, 1) + # # this gives sigmoid [0.5, 0.732) + # # hence, it is not good to use in sigmoid_cross_entropy_with_logits either + + # Implemented the following instead + # _get_attentions is applied to labels + # softmax is applied to predicted_scores + reshaped_labels = tf.reshape(labels, [1, -1]) + attention_labels = _get_attentions(reshaped_labels) + reshaped_predicted_scores = tf.reshape(predicted_scores, [1, -1]) + attention_predicted_scores = tf.nn.softmax(reshaped_predicted_scores) + loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) + return loss def _get_attentions(raw_scores): - """ - Used in attention weights in AttRank loss - for a query/batch/batchPreidictionRequest - (a rectified softmax function) - """ - not_consider = tf.less_equal(raw_scores, 0) - mask = tf.ones(tf.shape(raw_scores)) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - expon_labels = mask * tf.exp(raw_scores) - - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = math_fns.safe_div(expon_labels, expon_label_sum) - return attentions + """ + Used in attention weights in AttRank loss + for a query/batch/batchPreidictionRequest + (a rectified softmax function) + """ + not_consider = tf.less_equal(raw_scores, 0) + mask = tf.ones(tf.shape(raw_scores)) - tf.cast(not_consider, dtype=tf.float32) + mask = tf.cast(mask, dtype=tf.float32) + expon_labels = mask * tf.exp(raw_scores) + + expon_label_sum = tf.reduce_sum(expon_labels) + # expon_label_sum is safe as a denominator + attentions = math_fns.safe_div(expon_labels, expon_label_sum) + return attentions def 
_get_attrank_cross_entropy(labels, logits): - # logits is not safe based on their satement - # do not use this function directly elsewhere - results = labels * math_fns.safe_log(logits) + (1 - labels) * math_fns.safe_log(1 - logits) - results = (-1) * results - results = tf.reduce_mean(results) - return results + # logits is not safe based on their satement + # do not use this function directly elsewhere + results = labels * math_fns.safe_log(logits) + (1 - labels) * math_fns.safe_log( + 1 - logits + ) + results = (-1) * results + results = tf.reduce_mean(results) + return results def get_listnet_loss(labels, predicted_scores, weights=None): - """ - Listwise learning-to-rank listet loss - Check paper https://www.microsoft.com/en-us/research/ - wp-content/uploads/2016/02/tr-2007-40.pdf - for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # top one probability is the same as softmax - labels_top_one_probs = _get_top_one_probs(labels) - predicted_scores_top_one_probs = _get_top_one_probs(predicted_scores) - - if weights is None: + """ + Listwise learning-to-rank listet loss + Check paper https://www.microsoft.com/en-us/research/ + wp-content/uploads/2016/02/tr-2007-40.pdf + for more information + Args: + labels: a dense tensor of shape [n_data, 1] + n_data is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: a dense tensor of same shape and type as labels + weights: a dense tensor of the same shape as labels + Returns: + average loss + """ + # top one probability is the same as softmax + labels_top_one_probs = _get_top_one_probs(labels) + predicted_scores_top_one_probs = _get_top_one_probs(predicted_scores) + + if weights is None: + loss = tf.reduce_mean( + _get_listnet_cross_entropy( + labels=labels_top_one_probs, logits=predicted_scores_top_one_probs + ) + ) + return loss + loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs)) + _get_listnet_cross_entropy( + labels=labels_top_one_probs, logits=predicted_scores_top_one_probs + ) + * weights + ) / tf.reduce_mean(weights) return loss - loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs) * weights) / tf.reduce_mean(weights) - return loss - def _get_top_one_probs(labels): - """ - Used in listnet top-one probabilities - for a query/batch/batchPreidictionRequest - (essentially a softmax function) - """ - expon_labels = tf.exp(labels) - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = expon_labels / expon_label_sum - return attentions + """ + Used in listnet top-one probabilities + for a query/batch/batchPreidictionRequest + (essentially a softmax function) + """ + expon_labels = tf.exp(labels) + expon_label_sum = tf.reduce_sum(expon_labels) + # expon_label_sum is safe as a denominator + attentions = expon_labels / expon_label_sum + return attentions def _get_listnet_cross_entropy(labels, logits): - """ - Used in listnet - cross entropy on top-one probabilities - between ideal/label top-one probabilities - and predicted/logits top-one probabilities - for a query/batch/batchPreidictionRequest - """ - # it is safe to use log on logits - # that come from _get_top_one_probs - # do not 
use this function directly elsewhere - results = (-1) * labels * math_fns.safe_log(logits) - return results + """ + Used in listnet + cross entropy on top-one probabilities + between ideal/label top-one probabilities + and predicted/logits top-one probabilities + for a query/batch/batchPreidictionRequest + """ + # it is safe to use log on logits + # that come from _get_top_one_probs + # do not use this function directly elsewhere + results = (-1) * labels * math_fns.safe_log(logits) + return results def get_pointwise_loss(labels, predicted_scores, weights=None): - """ - Pointwise learning-to-rank pointwise loss - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - if weights is None: + """ + Pointwise learning-to-rank pointwise loss + Args: + labels: a dense tensor of shape [n_data, 1] + n_data is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: a dense tensor of same shape and type as labels + weights: a dense tensor of the same shape as labels + Returns: + average loss + """ + if weights is None: + loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits( + labels=labels, logits=predicted_scores + ) + ) + return loss loss = tf.reduce_mean( - tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores)) + tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=predicted_scores) + * weights + ) / tf.reduce_mean(weights) return loss - loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores) * weights) / tf.reduce_mean(weights) - return loss diff --git a/twml/twml/contrib/utils/masks.py b/twml/twml/contrib/utils/masks.py index f3143dc52..5083633c0 100644 --- a/twml/twml/contrib/utils/masks.py +++ b/twml/twml/contrib/utils/masks.py @@ -2,37 +2,37 @@ def diag_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pariwise learning-to-rank - Args: - n_data: a int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. - Returns: - values in pairwise_label_scores except the diagonal - each cell contains a paiwise score difference - only selfs/diags are 0s - """ - mask = tf.ones([n_data, n_data]) - tf.diag(tf.ones([n_data])) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.to_float(n_data) * (tf.to_float(n_data) - 1) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count + """ + This is so far only used in pariwise learning-to-rank + Args: + n_data: a int `Tensor`. + pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. + Returns: + values in pairwise_label_scores except the diagonal + each cell contains a paiwise score difference + only selfs/diags are 0s + """ + mask = tf.ones([n_data, n_data]) - tf.diag(tf.ones([n_data])) + mask = tf.cast(mask, dtype=tf.float32) + pair_count = tf.to_float(n_data) * (tf.to_float(n_data) - 1) + pair_count = tf.cast(pair_count, dtype=tf.float32) + return mask, pair_count def full_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pariwise learning-to-rank - Args: - n_data: a int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. 
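A plain-NumPy sketch of what `get_pair_loss` computes under `diag_mask`, using toy scores and labels. `sigmoid_xent` reimplements the numerically stable formula behind `tf.nn.sigmoid_cross_entropy_with_logits`, and the pairwise targets follow `get_pairwise_label_scores` from scores.py further below:

```python
import numpy as np

def sigmoid_xent(z, x):
    # Stable form of tf.nn.sigmoid_cross_entropy_with_logits,
    # for labels z and logits x
    return np.maximum(x, 0) - x * z + np.log1p(np.exp(-np.abs(x)))

scores = np.array([[2.0], [0.5], [1.0]])  # predicted scores, [n_data, 1]
labels = np.array([[1.0], [0.0], [0.0]])  # graded labels,    [n_data, 1]

pairwise_logits = scores - scores.T  # s_i - s_j
pairwise_labels = 0.5 * (1.0 + np.clip(labels - labels.T, -1.0, 1.0))

n = len(scores)
mask = 1.0 - np.eye(n)       # diag_mask: zero out self-pairs
pair_count = n * (n - 1)     # number of off-diagonal pairs

loss = (mask * sigmoid_xent(pairwise_labels, pairwise_logits)).sum() / pair_count
print(loss)
```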
- Returns: - values in pairwise_label_scores except pairs that have the same labels - each cell contains a paiwise score difference - all pairwise_label_scores = 0.5: selfs and same labels are 0s - """ - not_consider = tf.equal(pairwise_label_scores, 0.5) - mask = tf.ones([n_data, n_data]) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.reduce_sum(mask) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count + """ + This is so far only used in pariwise learning-to-rank + Args: + n_data: a int `Tensor`. + pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. + Returns: + values in pairwise_label_scores except pairs that have the same labels + each cell contains a paiwise score difference + all pairwise_label_scores = 0.5: selfs and same labels are 0s + """ + not_consider = tf.equal(pairwise_label_scores, 0.5) + mask = tf.ones([n_data, n_data]) - tf.cast(not_consider, dtype=tf.float32) + mask = tf.cast(mask, dtype=tf.float32) + pair_count = tf.reduce_sum(mask) + pair_count = tf.cast(pair_count, dtype=tf.float32) + return mask, pair_count diff --git a/twml/twml/contrib/utils/math_fns.py b/twml/twml/contrib/utils/math_fns.py index 2d9e72282..abe263da0 100644 --- a/twml/twml/contrib/utils/math_fns.py +++ b/twml/twml/contrib/utils/math_fns.py @@ -5,167 +5,171 @@ # Copied from metrics_impl.py # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/metrics_impl.py#L216 def safe_div(numerator, denominator, name=None): - """ - Example usage: calculating NDCG = DCG / IDCG to handle cases when - IDCG = 0 returns 0 instead of Infinity - Do not use this dividing funciton unless it makes sense to your problem - Divides two tensors element-wise, returns 0 if the denominator is <= 0. - Args: - numerator: a real `Tensor`. - denominator: a real `Tensor`, with dtype matching `numerator`. - name: Name for the returned op. - Returns: - 0 if `denominator` <= 0, else `numerator` / `denominator` - """ - t = math_ops.truediv(numerator, denominator) - zero = array_ops.zeros_like(t, dtype=denominator.dtype) - condition = math_ops.greater(denominator, zero) - zero = math_ops.cast(zero, t.dtype) - return array_ops.where(condition, t, zero, name=name) + """ + Example usage: calculating NDCG = DCG / IDCG to handle cases when + IDCG = 0 returns 0 instead of Infinity + Do not use this dividing funciton unless it makes sense to your problem + Divides two tensors element-wise, returns 0 if the denominator is <= 0. + Args: + numerator: a real `Tensor`. + denominator: a real `Tensor`, with dtype matching `numerator`. + name: Name for the returned op. + Returns: + 0 if `denominator` <= 0, else `numerator` / `denominator` + """ + t = math_ops.truediv(numerator, denominator) + zero = array_ops.zeros_like(t, dtype=denominator.dtype) + condition = math_ops.greater(denominator, zero) + zero = math_ops.cast(zero, t.dtype) + return array_ops.where(condition, t, zero, name=name) def cal_ndcg(label_scores, predicted_scores, top_k_int=1): - """ - Calculate NDCG score for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds DCG / IDCG. 
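The NDCG in `cal_ndcg` reduces to gains of `2**label - 1` over `log2(i + 1)` discounts, with `safe_div` guarding the case where every label is zero. A NumPy sketch with illustrative values:

```python
import numpy as np

def safe_div(num, den):
    # 0 when the denominator is <= 0, mirroring math_fns.safe_div
    return np.where(den > 0, num / np.maximum(den, 1e-38), 0.0)

labels = np.array([3.0, 1.0, 0.0, 2.0])  # graded relevance labels
scores = np.array([0.8, 0.6, 0.9, 0.1])  # predicted scores

k = len(labels)
discount = np.log2(np.arange(1, k + 1) + 1.0)  # log2(i + 1), i in [1, k]

predicted_order = labels[np.argsort(-scores)]  # labels in predicted rank order
ideal_order = np.sort(labels)[::-1]            # labels in ideal rank order

dcg = ((2.0 ** predicted_order - 1.0) / discount).sum()
idcg = ((2.0 ** ideal_order - 1.0) / discount).sum()
print(safe_div(dcg, idcg))  # NDCG@k; 0 if every label is 0
```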
- """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - dcg = _dcg_idcg(predicted_relevance, cg_discount) - idcg = _dcg_idcg(sorted_relevance, cg_discount) - # the ndcg score of the batch - # idcg is 0 if label_scores are all 0 - ndcg = safe_div(dcg, idcg, 'one_ndcg') - return ndcg + """ + Calculate NDCG score for top_k_int ranking positions + Args: + label_scores: a real `Tensor`. + predicted_scores: a real `Tensor`, with dtype matching label_scores + top_k_int: An int or an int `Tensor`. + Returns: + a `Tensor` that holds DCG / IDCG. + """ + sorted_labels, predicted_order = _get_ranking_orders( + label_scores, predicted_scores, top_k_int=top_k_int + ) + + predicted_relevance = _get_relevance_scores(predicted_order) + sorted_relevance = _get_relevance_scores(sorted_labels) + + cg_discount = _get_cg_discount(top_k_int) + + dcg = _dcg_idcg(predicted_relevance, cg_discount) + idcg = _dcg_idcg(sorted_relevance, cg_discount) + # the ndcg score of the batch + # idcg is 0 if label_scores are all 0 + ndcg = safe_div(dcg, idcg, "one_ndcg") + return ndcg def cal_swapped_ndcg(label_scores, predicted_scores, top_k_int): - """ - Calculate swapped NDCG score in Lambda Rank for full/top k ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds swapped NDCG by . - """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) + """ + Calculate swapped NDCG score in Lambda Rank for full/top k ranking positions + Args: + label_scores: a real `Tensor`. + predicted_scores: a real `Tensor`, with dtype matching label_scores + top_k_int: An int or an int `Tensor`. + Returns: + a `Tensor` that holds swapped NDCG by . 
+ """ + sorted_labels, predicted_order = _get_ranking_orders( + label_scores, predicted_scores, top_k_int=top_k_int + ) - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) + predicted_relevance = _get_relevance_scores(predicted_order) + sorted_relevance = _get_relevance_scores(sorted_labels) - cg_discount = _get_cg_discount(top_k_int) + cg_discount = _get_cg_discount(top_k_int) - # cg_discount is safe as a denominator - dcg_k = predicted_relevance / cg_discount - dcg = tf.reduce_sum(dcg_k) + # cg_discount is safe as a denominator + dcg_k = predicted_relevance / cg_discount + dcg = tf.reduce_sum(dcg_k) - idcg_k = sorted_relevance / cg_discount - idcg = tf.reduce_sum(idcg_k) + idcg_k = sorted_relevance / cg_discount + idcg = tf.reduce_sum(idcg_k) - ndcg = safe_div(dcg, idcg, 'ndcg_in_lambdarank_training') + ndcg = safe_div(dcg, idcg, "ndcg_in_lambdarank_training") - # remove the gain from label i then add the gain from label j - tiled_ij = tf.tile(dcg_k, [1, top_k_int]) - new_ij = (predicted_relevance / tf.transpose(cg_discount)) + # remove the gain from label i then add the gain from label j + tiled_ij = tf.tile(dcg_k, [1, top_k_int]) + new_ij = predicted_relevance / tf.transpose(cg_discount) - tiled_ji = tf.tile(tf.transpose(dcg_k), [top_k_int, 1]) - new_ji = tf.transpose(predicted_relevance) / cg_discount + tiled_ji = tf.tile(tf.transpose(dcg_k), [top_k_int, 1]) + new_ji = tf.transpose(predicted_relevance) / cg_discount - # if swap i and j, remove the stale cg for i, then add the new cg for i, - # remove the stale cg for j, and then add the new cg for j - new_dcg = dcg - tiled_ij + new_ij - tiled_ji + new_ji + # if swap i and j, remove the stale cg for i, then add the new cg for i, + # remove the stale cg for j, and then add the new cg for j + new_dcg = dcg - tiled_ij + new_ij - tiled_ji + new_ji - new_ndcg = safe_div(new_dcg, idcg, 'new_ndcg_in_lambdarank_training') - swapped_ndcg = tf.abs(ndcg - new_ndcg) - return swapped_ndcg + new_ndcg = safe_div(new_dcg, idcg, "new_ndcg_in_lambdarank_training") + swapped_ndcg = tf.abs(ndcg - new_ndcg) + return swapped_ndcg def _dcg_idcg(relevance_scores, cg_discount): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - relevance_scores: a real `Tensor`. - cg_discount: a real `Tensor`, with dtype matching relevance_scores - Returns: - a `Tensor` that holds \\sum_{i=1}^k \frac{relevance_scores_k}{cg_discount} - """ - # cg_discount is safe - dcg_k = relevance_scores / cg_discount - return tf.reduce_sum(dcg_k) + """ + Calculate DCG scores for top_k_int ranking positions + Args: + relevance_scores: a real `Tensor`. + cg_discount: a real `Tensor`, with dtype matching relevance_scores + Returns: + a `Tensor` that holds \\sum_{i=1}^k \frac{relevance_scores_k}{cg_discount} + """ + # cg_discount is safe + dcg_k = relevance_scores / cg_discount + return tf.reduce_sum(dcg_k) def _get_ranking_orders(label_scores, predicted_scores, top_k_int=1): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: an integer or an int `Tensor`. 
-  Returns:
-    two `Tensors` that hold sorted_labels: the ground truth relevance socres
-    and predicted_order: relevance socres based on sorted predicted_scores
-  """
-  # sort predictions_scores and label_scores
-  # size [batch_size/num of DataRecords, 1]
-  label_scores = tf.reshape(label_scores, [-1, 1])
-  predicted_scores = tf.reshape(predicted_scores, [-1, 1])
-  # sorted_labels contians the relevance scores of the correct order
-  sorted_labels, ordered_labels_indices = tf.nn.top_k(
-      tf.transpose(label_scores), k=top_k_int)
-  sorted_labels = tf.transpose(sorted_labels)
-  # sort predicitons and use the indices to obtain the relevance scores of the predicted order
-  sorted_predictions, ordered_predictions_indices = tf.nn.top_k(
-      tf.transpose(predicted_scores), k=top_k_int)
-  ordered_predictions_indices_for_labels = tf.transpose(ordered_predictions_indices)
-  # predicted_order contians the relevance scores of the predicted order
-  predicted_order = tf.gather_nd(label_scores, ordered_predictions_indices_for_labels)
-  return sorted_labels, predicted_order
+    """
+    Compute the ranking orders used for DCG over top_k_int ranking positions
+    Args:
+        label_scores: a real `Tensor`.
+        predicted_scores: a real `Tensor`, with dtype matching label_scores
+        top_k_int: an integer or an int `Tensor`.
+    Returns:
+        two `Tensors` that hold sorted_labels: the ground truth relevance scores
+        and predicted_order: relevance scores based on sorted predicted_scores
+    """
+    # sort predicted_scores and label_scores
+    # size [batch_size/num of DataRecords, 1]
+    label_scores = tf.reshape(label_scores, [-1, 1])
+    predicted_scores = tf.reshape(predicted_scores, [-1, 1])
+    # sorted_labels contains the relevance scores of the correct order
+    sorted_labels, ordered_labels_indices = tf.nn.top_k(
+        tf.transpose(label_scores), k=top_k_int
+    )
+    sorted_labels = tf.transpose(sorted_labels)
+    # sort predictions and use the indices to obtain the relevance scores of the predicted order
+    sorted_predictions, ordered_predictions_indices = tf.nn.top_k(
+        tf.transpose(predicted_scores), k=top_k_int
+    )
+    ordered_predictions_indices_for_labels = tf.transpose(ordered_predictions_indices)
+    # predicted_order contains the relevance scores of the predicted order
+    predicted_order = tf.gather_nd(label_scores, ordered_predictions_indices_for_labels)
+    return sorted_labels, predicted_order


def _get_cg_discount(top_k_int=1):
-  r"""
-  Calculate discounted gain factor for ranking position till top_k_int
-  Args:
-    top_k_int: An int or an int `Tensor`.
-  Returns:
-    a `Tensor` that holds \log_{2}(i + 1), i \in [1, k]
-  """
-  log_2 = tf.log(tf.constant(2.0, dtype=tf.float32))
-  # top_k_range needs to start from 1 to top_k_int
-  top_k_range = tf.range(top_k_int) + 1
-  top_k_range = tf.reshape(top_k_range, [-1, 1])
-  # cast top_k_range to float
-  top_k_range = tf.cast(top_k_range, dtype=tf.float32)
-  cg_discount = tf.log(top_k_range + 1.0) / log_2
-  return cg_discount
+    r"""
+    Calculate discounted gain factor for ranking position up to top_k_int
+    Args:
+        top_k_int: An int or an int `Tensor`.
+ Returns: + a `Tensor` that holds \log_{2}(i + 1), i \in [1, k] + """ + log_2 = tf.log(tf.constant(2.0, dtype=tf.float32)) + # top_k_range needs to start from 1 to top_k_int + top_k_range = tf.range(top_k_int) + 1 + top_k_range = tf.reshape(top_k_range, [-1, 1]) + # cast top_k_range to float + top_k_range = tf.cast(top_k_range, dtype=tf.float32) + cg_discount = tf.log(top_k_range + 1.0) / log_2 + return cg_discount def _get_relevance_scores(scores): - return 2 ** scores - 1 + return 2**scores - 1 def safe_log(raw_scores, name=None): - """ - Calculate log of a tensor, handling cases that - raw_scores are close to 0s - Args: - raw_scores: An float `Tensor`. - Returns: - A float `Tensor` that hols the safe log base e of input - """ - epsilon = 1E-8 - clipped_raw_scores = tf.maximum(raw_scores, epsilon) - return tf.log(clipped_raw_scores) + """ + Calculate log of a tensor, handling cases that + raw_scores are close to 0s + Args: + raw_scores: An float `Tensor`. + Returns: + A float `Tensor` that hols the safe log base e of input + """ + epsilon = 1e-8 + clipped_raw_scores = tf.maximum(raw_scores, epsilon) + return tf.log(clipped_raw_scores) diff --git a/twml/twml/contrib/utils/normalizer.py b/twml/twml/contrib/utils/normalizer.py index a6a7035b8..4c91bbf4f 100644 --- a/twml/twml/contrib/utils/normalizer.py +++ b/twml/twml/contrib/utils/normalizer.py @@ -1,39 +1,41 @@ import tensorflow.compat.v1 as tf + from twml.contrib.utils import math_fns def mean_max_normalizaiton(dense_tensor): - """ - In-batch normalization - Args: - dense_tensor: A dense `Tensor`. - Returns: - (dense_tensor - mean) / abs(max value) - Note: - when dense_tensor is of size [1, ?] it will give 0 - If this is not what you want handle it outside the function - """ - dense_mean = tf.reduce_mean(dense_tensor, reduction_indices=[0]) - dense_abs_max = tf.abs(tf.reduce_max(dense_tensor, reduction_indices=[0])) - dense_tensor = math_fns.safe_div(dense_tensor - dense_mean, dense_abs_max, - 'mean_max_normalization_in_batch') - return dense_tensor + """ + In-batch normalization + Args: + dense_tensor: A dense `Tensor`. + Returns: + (dense_tensor - mean) / abs(max value) + Note: + when dense_tensor is of size [1, ?] it will give 0 + If this is not what you want handle it outside the function + """ + dense_mean = tf.reduce_mean(dense_tensor, reduction_indices=[0]) + dense_abs_max = tf.abs(tf.reduce_max(dense_tensor, reduction_indices=[0])) + dense_tensor = math_fns.safe_div( + dense_tensor - dense_mean, dense_abs_max, "mean_max_normalization_in_batch" + ) + return dense_tensor def standard_normalizaiton(dense_tensor): - """ - In-batch normalization - z-normalization or standard_normalization in batch - Args: - dense_tensor: A dense `Tensor`. - Returns: - (dense_tensor - mean) / variance - Note: - when dense_tensor is of size [1, ?] it will give 0 - If this is not what you want handle it outside the function - """ - epsilon = 1E-7 - dense_mean, dense_variance = tf.nn.moments(dense_tensor, 0) - # using epsilon is safer than math_fns.safe_div in here - dense_tensor = (dense_tensor - dense_mean) / (dense_variance + epsilon) - return dense_tensor + """ + In-batch normalization + z-normalization or standard_normalization in batch + Args: + dense_tensor: A dense `Tensor`. + Returns: + (dense_tensor - mean) / variance + Note: + when dense_tensor is of size [1, ?] 
it will give 0 + If this is not what you want handle it outside the function + """ + epsilon = 1e-7 + dense_mean, dense_variance = tf.nn.moments(dense_tensor, 0) + # using epsilon is safer than math_fns.safe_div in here + dense_tensor = (dense_tensor - dense_mean) / (dense_variance + epsilon) + return dense_tensor diff --git a/twml/twml/contrib/utils/scores.py b/twml/twml/contrib/utils/scores.py index 84e792c13..cfe96d603 100644 --- a/twml/twml/contrib/utils/scores.py +++ b/twml/twml/contrib/utils/scores.py @@ -2,32 +2,32 @@ def get_pairwise_scores(tensor_input): - """ - This is so far used in pariwise learning-to-rank + """ + This is so far used in pariwise learning-to-rank - Arguments: - tensor_input: a dense `Tensor` of shape [n_data, 1] - n_data is the number of teet candidates + Arguments: + tensor_input: a dense `Tensor` of shape [n_data, 1] + n_data is the number of teet candidates - Returns: - pairwise scores: a dense `Tensor` of shape [n_data, n_data]. - """ - return tensor_input - tf.transpose(tensor_input) + Returns: + pairwise scores: a dense `Tensor` of shape [n_data, n_data]. + """ + return tensor_input - tf.transpose(tensor_input) def get_pairwise_label_scores(labels): - """ - This is so far used in pariwise learning-to-rank - Args: - labels: a dense `Tensor` of shape [n_data, 1] - n_data is the number of teet candidates - Returns: - pairwise label scores: a dense `Tensor` of shape [n_data, n_data]. - each value is within [0, 1] - """ - # raw pairwise label scores/differences - pairwise_label_scores = get_pairwise_scores(labels) - # sanity check to make sure values in differences_ij are [-1, 1] - differences_ij = tf.maximum(tf.minimum(1.0, pairwise_label_scores), -1.0) - # values in pairwise_label_scores are within [0, 1] for cross entropy - return (1.0 / 2.0) * (1.0 + differences_ij) + """ + This is so far used in pariwise learning-to-rank + Args: + labels: a dense `Tensor` of shape [n_data, 1] + n_data is the number of teet candidates + Returns: + pairwise label scores: a dense `Tensor` of shape [n_data, n_data]. + each value is within [0, 1] + """ + # raw pairwise label scores/differences + pairwise_label_scores = get_pairwise_scores(labels) + # sanity check to make sure values in differences_ij are [-1, 1] + differences_ij = tf.maximum(tf.minimum(1.0, pairwise_label_scores), -1.0) + # values in pairwise_label_scores are within [0, 1] for cross entropy + return (1.0 / 2.0) * (1.0 + differences_ij) diff --git a/twml/twml/contrib/utils/similarities.py b/twml/twml/contrib/utils/similarities.py index 212065f88..341639bf3 100644 --- a/twml/twml/contrib/utils/similarities.py +++ b/twml/twml/contrib/utils/similarities.py @@ -2,16 +2,16 @@ def cosine_similarity(x1, x2, axis): - """ - cosine similarity of two tensors. + """ + cosine similarity of two tensors. - Arguments: - x1: - A tf.Tensor - x2: - A tf.Tensor - axis: Dimension along which to normalize. - """ - normalize_x1 = tf.nn.l2_normalize(x1, axis=axis) - normalize_x2 = tf.nn.l2_normalize(x2, axis=axis) - return tf.reduce_sum(tf.multiply(normalize_x1, normalize_x2), axis=axis) + Arguments: + x1: + A tf.Tensor + x2: + A tf.Tensor + axis: Dimension along which to normalize. 
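A quick NumPy sketch of the two helpers in scores.py and similarities.py, with illustrative inputs: the pairwise targets are clamped score differences mapped into [0, 1], and cosine similarity is a dot product of L2-normalized operands:

```python
import numpy as np

labels = np.array([[1.0], [0.0], [2.0]])

# get_pairwise_scores: s_ij = l_i - l_j
diffs = labels - labels.T
# clamp to [-1, 1], then map into [0, 1] so the values work as
# cross-entropy targets; selfs and equal labels land on 0.5
pairwise_label_scores = 0.5 * (1.0 + np.clip(diffs, -1.0, 1.0))
print(pairwise_label_scores)

# cosine_similarity: dot product of L2-normalized operands
x1 = np.array([1.0, 2.0, 3.0])
x2 = np.array([2.0, 4.0, 6.0])
print((x1 / np.linalg.norm(x1)) @ (x2 / np.linalg.norm(x2)))  # 1.0 for parallel vectors
```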
+ """ + normalize_x1 = tf.nn.l2_normalize(x1, axis=axis) + normalize_x2 = tf.nn.l2_normalize(x2, axis=axis) + return tf.reduce_sum(tf.multiply(normalize_x1, normalize_x2), axis=axis) diff --git a/twml/twml/dataset.py b/twml/twml/dataset.py index 4356fdc7c..3e23197a1 100644 --- a/twml/twml/dataset.py +++ b/twml/twml/dataset.py @@ -3,370 +3,431 @@ """ import numbers +import tensorflow.compat.v1 as tf from absl import logging from kazoo.client import KazooClient from libtwml import OPLIB -import tensorflow.compat.v1 as tf + from twml.constants import DEFAULT_ZOOKEEPER_BASE_ZNODE, DEFAULT_ZOOKEEPER_HOST class BlockFormatDataset(tf.data.Dataset): - """A ``tf.data.Dataset`` comprising records from one or more TFRecord files.""" + """A ``tf.data.Dataset`` comprising records from one or more TFRecord files.""" + + def __init__(self, filenames, compression_type="auto", buffer_size=1 << 20): + """ + Creates a ``BlockFormatDataset``. + + Args: + filenames: + A `tf.string` tensor containing one or more filenames. + compression_type: + A string specifying the compression type. + Can be one of 'gz' (or 'gzip'), 'none', 'auto' (default). + When compression_type == 'auto', it is inferred from file extension. + buffer_size: + Buffer size to be used during decompression. default: 1<<20. + """ + self._filenames = tf.convert_to_tensor( + filenames, dtype=tf.string, name="filenames" + ) + self._compression_type = tf.convert_to_tensor( + compression_type.lower(), name="compression_type" + ) + self._buffer_size = tf.convert_to_tensor( + buffer_size, dtype=tf.int64, name="buffer_size" + ) + # Parent class calss self._as_variant_tensor in init. So call this at the end. + super(BlockFormatDataset, self).__init__() + + def _as_variant_tensor(self): + """ + Create the resource handle for the dataset. + """ + try: + block_format_dataset = __import__( + "libtwml_internal" + ).OPLIB.block_format_dataset + return block_format_dataset(self._filenames) + except ImportError: + block_format_dataset = OPLIB.block_format_dataset_v2 + return block_format_dataset( + self._filenames, self._compression_type, self._buffer_size + ) + + def _inputs(self): + return [] + + @property + def output_shapes(self): + """Return output shapes""" + return tf.TensorShape([]) + + @property + def output_types(self): + """Return output types""" + return tf.string + + @property + def output_classes(self): + """Return output classes""" + return tf.Tensor - def __init__(self, filenames, compression_type="auto", buffer_size=1 << 20): - """ - Creates a ``BlockFormatDataset``. - Args: - filenames: - A `tf.string` tensor containing one or more filenames. - compression_type: - A string specifying the compression type. - Can be one of 'gz' (or 'gzip'), 'none', 'auto' (default). - When compression_type == 'auto', it is inferred from file extension. - buffer_size: - Buffer size to be used during decompression. default: 1<<20. +def downsample_dataset(dataset, sample_rate, rate_name): + """ + Downsample a tf.data.Dataset at sample_rate """ - self._filenames = tf.convert_to_tensor(filenames, dtype=tf.string, name="filenames") - self._compression_type = tf.convert_to_tensor(compression_type.lower(), name="compression_type") - self._buffer_size = tf.convert_to_tensor(buffer_size, dtype=tf.int64, name="buffer_size") - # Parent class calss self._as_variant_tensor in init. So call this at the end. 
- super(BlockFormatDataset, self).__init__() + if sample_rate is None or sample_rate == 1.0: + return dataset + elif not isinstance(sample_rate, numbers.Real): + raise TypeError("dataset %s must be a real number" % rate_name) + elif sample_rate <= 0 or sample_rate > 1: + raise ValueError("dataset %s must be in range (0, 1])" % rate_name) + return dataset.filter(lambda _: tf.squeeze(tf.random_uniform([1])) < sample_rate) - def _as_variant_tensor(self): + +def _filenames_dataset(files, shards=None, shard_index=None): """ - Create the resource handle for the dataset. + Get a tf.data.Dataset with file names from a list of files + Optionally shard the file list (see stream_block_format_dataset) """ - try: - block_format_dataset = __import__("libtwml_internal").OPLIB.block_format_dataset - return block_format_dataset(self._filenames) - except ImportError: - block_format_dataset = OPLIB.block_format_dataset_v2 - return block_format_dataset(self._filenames, self._compression_type, self._buffer_size) + files = tf.data.Dataset.from_tensor_slices(files) + + if [shards, shard_index] != [None, None]: + logging.info( + "Sharding files dataset (index: %d, shards: %d)" % (shard_index, shards) + ) + files = files.shard(num_shards=shards, index=shard_index) - def _inputs(self): - return [] + return files - @property - def output_shapes(self): - """Return output shapes""" - return tf.TensorShape([]) - @property - def output_types(self): - """Return output types""" - return tf.string +def stream_block_format_dataset( + files, + parse_fn, + batch_size, + num_threads, + shuffle=True, + repeat=False, + block_length=None, + part_file_parallelism=None, + file_shuffle_size=None, + record_shuffle_size=None, + dataset_fn=None, + keep_rate=None, + parts_downsampling_rate=None, + prefetch_size=2, + shards=None, + shard_index=None, + shuffle_files=True, + interleave=True, +): + """ + Helper function to stream a list of part files. + + Args: + files: + List of input files which will create a dataset. + parse_fn: + A function that takes a byte tensor containing a datarecord and decodes it. + batch_size: + The batch size for each step. + num_threads: + Number of threads working on the data in parallel. + shuffle: + Shuffle records within each file using ``record_shuffle_size``. Defaults to True. + repeat: + Repeat the dataset indefinitely. Defaults to False. + Useful when you want to use an ``[train,eval]_steps`` greater than the size of the dataset + (otherwise ``Estimator.[train,evaluate]`` stop when the end of the dataset is reached). + block_length (optional): + Number of consecutive records to pull from a single part file. + Defaults to batch_size. + part_file_parallelism (optional): + Number of part files to read from in parallel. Once a part file is completely read, it will + be replaced by the next part file in the part file list. + + ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies + the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or + equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, + if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader + thread pool will be underutilized, since it can never be the case that every reader thread has + a part file to read from. + + file_shuffle_size (optional): + the buffer_size used for shuffling of the list of files. + Defaults to 1000. 
For example, if you have 2000 files, the first + 1000 files are shuffled together, iterated through, then the next 1000 files are shuffled + and iterated through. + record_shuffle_size (optional): + the ``buffer_size`` used for shuffling records in each thread. + Defaults to ``batch_size * 8`` records. + dataset_fn (optional): + A function of that modifies the dataset after it reads different interleaved parts files. + Defaults to: + + .. code-block:: python + + def dataset_fn(dataset, parse_fn, batch_size): + return dataset.batch(batch_size).map(parse_fn, 1) + + keep_rate (optional): + A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli + distribution with p = 1 - keep_rate. + Defaults to None (no records dropped). + + parts_downsampling_rate (optional): + A float value in ``(0.0, 1.0]`` that indicates the factor by which to downsample part files. + For example, a value of 0.2 means only 20 percent of part files become part of the dataset. + + Note that this argument is only useful in conjunction with a [train,eval]_steps of -1 + (that is, when the entire dataset is used). Furthermore, note that even in this case, each + epoch will see a different set of part files. This is because new part files are re-sampled + every epoch. In other words, this argument is only provided for backwards compatibility with + DeepBird v1. We recommend you use a smaller [train,eval]_steps (or specify a keep_rate) + instead. + + shards (optional): + Number of partitions to shard the dataset into. This is useful for codistillation and other + techniques that require each worker to train on disjoint partitions of the dataset. + The dataset is not sharded by default. + + shard_index (optional): + Which partition of the dataset to use if ``shards`` is set. + + shuffle_files (optional): + Shuffle the list of files. Defaults to True. + When False, files are iterated in the order they are passed in. + + interleave (optional): + Interleave records from multiple files in parallel. Defaults to True. + + Returns: + tf.data.DataSet of batches of HashedDataRecord resource handles decoded and streamed online. + """ + # Creating a dataset from an input directory - @property - def output_classes(self): - """Return output classes""" - return tf.Tensor + files = _filenames_dataset(files, shards=shards, shard_index=shard_index) + file_shuffle_size = file_shuffle_size if file_shuffle_size is not None else 100000 + record_shuffle_size = ( + record_shuffle_size if record_shuffle_size is not None else (batch_size * 8) + ) + block_length = block_length if block_length is not None else batch_size -def downsample_dataset(dataset, sample_rate, rate_name): - """ - Downsample a tf.data.Dataset at sample_rate - """ - if sample_rate is None or sample_rate == 1.0: - return dataset - elif not isinstance(sample_rate, numbers.Real): - raise TypeError("dataset %s must be a real number" % rate_name) - elif sample_rate <= 0 or sample_rate > 1: - raise ValueError("dataset %s must be in range (0, 1])" % rate_name) - return dataset.filter(lambda _: tf.squeeze(tf.random_uniform([1])) < sample_rate) + logging.info("NUM_THREADS: %d", num_threads) + if repeat: + files = files.repeat() -def _filenames_dataset(files, shards=None, shard_index=None): - """ - Get a tf.data.Dataset with file names from a list of files - Optionally shard the file list (see stream_block_format_dataset) - """ - files = tf.data.Dataset.from_tensor_slices(files) + if shuffle_files: + # Randomly shuffle the files list. 
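The interleaved read pattern described above, `cycle_length` part files each yielding `block_length` consecutive records before the next file gets a turn, can be sketched without TF. The file and record names below are hypothetical:

```python
from itertools import islice

def interleave(file_readers, cycle_length, block_length):
    # Round-robin over cycle_length open readers, pulling block_length
    # records from each before moving on; an exhausted reader is
    # replaced by the next pending file. This approximates the access
    # pattern tf.data.Dataset.interleave produces.
    active = [iter(r) for r in file_readers[:cycle_length]]
    pending = iter(file_readers[cycle_length:])
    while active:
        for reader in list(active):
            block = list(islice(reader, block_length))
            if not block:
                active.remove(reader)
                nxt = next(pending, None)
                if nxt is not None:
                    active.append(iter(nxt))
                continue
            yield from block

files = [["f%d-r%d" % (i, j) for j in range(4)] for i in range(3)]
print(list(interleave(files, cycle_length=2, block_length=2)))
```

With `block_length == batch_size`, each batch tends to be filled from a single part file, which is why that is the default.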
+ files = files.shuffle(buffer_size=file_shuffle_size) - if [shards, shard_index] != [None, None]: - logging.info("Sharding files dataset (index: %d, shards: %d)" % (shard_index, shards)) - files = files.shard(num_shards=shards, index=shard_index) + # Downsample parts files + files = downsample_dataset( + files, parts_downsampling_rate, "parts_downsampling_rate" + ) - return files + # Interleave the result from BlockFormatDataset + # block_length == batch_size results in batch_size records being read from a single file. + def map_fn(filenames): + """function that maps each filename to a BlockFormatDataset""" + # reach each file using BlockFormatDataset + dataset = BlockFormatDataset(filenames) + # early prefetching can sometimes improve performance (like on GCS) + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) -def stream_block_format_dataset( - files, parse_fn, batch_size, num_threads, - shuffle=True, repeat=False, - block_length=None, part_file_parallelism=None, file_shuffle_size=None, - record_shuffle_size=None, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, prefetch_size=2, - shards=None, shard_index=None, shuffle_files=True, interleave=True): - """ - Helper function to stream a list of part files. - - Args: - files: - List of input files which will create a dataset. - parse_fn: - A function that takes a byte tensor containing a datarecord and decodes it. - batch_size: - The batch size for each step. - num_threads: - Number of threads working on the data in parallel. - shuffle: - Shuffle records within each file using ``record_shuffle_size``. Defaults to True. - repeat: - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use an ``[train,eval]_steps`` greater than the size of the dataset - (otherwise ``Estimator.[train,evaluate]`` stop when the end of the dataset is reached). - block_length (optional): - Number of consecutive records to pull from a single part file. - Defaults to batch_size. - part_file_parallelism (optional): - Number of part files to read from in parallel. Once a part file is completely read, it will - be replaced by the next part file in the part file list. - - ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies - the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or - equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, - if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader - thread pool will be underutilized, since it can never be the case that every reader thread has - a part file to read from. - - file_shuffle_size (optional): - the buffer_size used for shuffling of the list of files. - Defaults to 1000. For example, if you have 2000 files, the first - 1000 files are shuffled together, iterated through, then the next 1000 files are shuffled - and iterated through. - record_shuffle_size (optional): - the ``buffer_size`` used for shuffling records in each thread. - Defaults to ``batch_size * 8`` records. - dataset_fn (optional): - A function of that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. 
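For reference, the sharding applied earlier by `_filenames_dataset` has simple striding semantics. A list-based sketch with hypothetical part file names:

```python
def shard(files, num_shards, index):
    # tf.data.Dataset.shard keeps the elements whose position modulo
    # num_shards equals index, so each worker sees a disjoint stripe.
    return files[index::num_shards]

files = ["part-%05d" % i for i in range(10)]
print(shard(files, num_shards=4, index=1))  # ['part-00001', 'part-00005', 'part-00009']
```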
- Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in ``(0.0, 1.0]`` that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - Note that this argument is only useful in conjunction with a [train,eval]_steps of -1 - (that is, when the entire dataset is used). Furthermore, note that even in this case, each - epoch will see a different set of part files. This is because new part files are re-sampled - every epoch. In other words, this argument is only provided for backwards compatibility with - DeepBird v1. We recommend you use a smaller [train,eval]_steps (or specify a keep_rate) - instead. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation and other - techniques that require each worker to train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. - - Returns: - tf.data.DataSet of batches of HashedDataRecord resource handles decoded and streamed online. - """ - # Creating a dataset from an input directory - - files = _filenames_dataset(files, shards=shards, shard_index=shard_index) - - file_shuffle_size = file_shuffle_size if file_shuffle_size is not None else 100000 - record_shuffle_size = record_shuffle_size if record_shuffle_size is not None else (batch_size * 8) - block_length = block_length if block_length is not None else batch_size - - logging.info("NUM_THREADS: %d", num_threads) - - if repeat: - files = files.repeat() - - if shuffle_files: - # Randomly shuffle the files list. - files = files.shuffle(buffer_size=file_shuffle_size) - - # Downsample parts files - files = downsample_dataset(files, parts_downsampling_rate, "parts_downsampling_rate") - - # Interleave the result from BlockFormatDataset - # block_length == batch_size results in batch_size records being read from a single file. - def map_fn(filenames): - '''function that maps each filename to a BlockFormatDataset''' - # reach each file using BlockFormatDataset - dataset = BlockFormatDataset(filenames) - - # early prefetching can sometimes improve performance (like on GCS) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - - # Shuffling before repeating ensures strong ordering. - if shuffle: - dataset = dataset.shuffle(buffer_size=record_shuffle_size) + # Shuffling before repeating ensures strong ordering. 
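The `record_shuffle_size` buffer gives an approximate shuffle, not a full one. A plain-Python sketch of the fixed-size buffer scheme that `tf.data.Dataset.shuffle` uses:

```python
import random

def buffered_shuffle(records, buffer_size):
    # Once the buffer is full, emit a uniformly chosen element and let
    # the next record from the stream take its slot; drain the buffer
    # in random order at the end.
    buf = []
    for record in records:
        buf.append(record)
        if len(buf) >= buffer_size:
            yield buf.pop(random.randrange(len(buf)))
    random.shuffle(buf)
    yield from buf

print(list(buffered_shuffle(range(10), buffer_size=4)))
```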
+ if shuffle: + dataset = dataset.shuffle(buffer_size=record_shuffle_size) - return dataset + return dataset - if interleave: - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - dataset = files.interleave( - map_fn, cycle_length=part_file_parallelism, block_length=block_length, num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) + if interleave: + part_file_parallelism = ( + num_threads if part_file_parallelism is None else part_file_parallelism + ) + dataset = files.interleave( + map_fn, + cycle_length=part_file_parallelism, + block_length=block_length, + num_parallel_calls=num_threads, + ) + else: + dataset = files.flat_map(map_fn) - # Downsample DataRecords - dataset = downsample_dataset(dataset, keep_rate, "keep_rate") + # Downsample DataRecords + dataset = downsample_dataset(dataset, keep_rate, "keep_rate") - if dataset_fn is None: - # Create a batch of datarecords and decode them - return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).prefetch(prefetch_size) + if dataset_fn is None: + # Create a batch of datarecords and decode them + return ( + dataset.batch(batch_size) + .map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .prefetch(prefetch_size) + ) - return dataset_fn(dataset, parse_fn, batch_size) + return dataset_fn(dataset, parse_fn, batch_size) def cx_zk_path(path): - if path is None: - raise ValueError("Path for zookeeper dataset pointer is None. You must specify a path.") - return_path = "/".join([DEFAULT_ZOOKEEPER_BASE_ZNODE, path]) - logging.info("Zookeeper path is: {}".format(return_path)) - return return_path + if path is None: + raise ValueError( + "Path for zookeeper dataset pointer is None. You must specify a path." + ) + return_path = "/".join([DEFAULT_ZOOKEEPER_BASE_ZNODE, path]) + logging.info("Zookeeper path is: {}".format(return_path)) + return return_path def zookeeper_ordered_dataset( - files, parse_fn, batch_size, zk_counter_path, repeat=False, - num_threads=2, block_length=None, part_file_parallelism=None, - batch_shuffle_size=None, file_keep_rate=None, record_keep_rate=None, - prefetch_size=2, interleave=False, dataset_fn=None, verbose=False): - """ - Make a tf.Dataset given an ordered list of filenames, using Zookeeper to keep track of - which file to read, and to coordinate multiple workers. - - Args: - files: - ordered list of (typically HDFS) filenames. This must remain consistent - between different workers, and between worker restarts (e.g. in the case - of instance failure or preemption). - To ensure this remains consistent, consider using the --train.files_list - option from DataRecordTrainer. - parse_fn: - A function that takes a byte tensor containing a datarecord and decodes it. - batch_size: - The batch size for each step. - zk_counter_path: - Path under the root node for the underlying zookeeper shared counter that - is used to coordinate distributed iteration over the list of files. - Full path will be `'/'.join([DEFAULT_ZOOKEEPER_BASE_ZNODE, zk_counter_path])`. - repeat: - Default False. Set True to repeat over the files forever. - num_threads: - Default 2. Number of threads working on the data in parallel. - Only used if interleave=True. - block_length: - Default None. Number of consecutive records to pull from a single part file. - If None, then block_length=batch_size will be used. - Only used if interleave=True. - part_file_parallelism: - Default None. Number of part files to read from in parallel. 
Once a part file is completely - read, it will be replaced by the next part file indicated by the zookeeper counter. - Only used if interleave=True. - - ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies - the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or - equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, - if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader - thread pool will be underutilized, since it can never be the case that every reader thread has - a part file to read from. - - batch_shuffle_size: - Default None. Size of shuffle buffer, for shuffling that will be applied after batching. - if None, then batches will not be shuffled. Ignored if dataset_fn is provided. - file_keep_rate: - Default None. Fraction of files to keep, or None to keep all files. - record_keep_rate: - Default None. Fraction of records to keep, or None to keep all records. - prefetch_size: - Default 2. Number of parsed batches to prefetch. Ignored if dataset_fn is provided. - interleave: - Default False. Set True to use tf.data.Dataset.interleave rather than flat_map. - dataset_fn: - A function that is applied to the dataset of individual records, after - these have been read from the parts files. - If ``None`` (the default), the behavior will be as though dataset_fn were set to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, tf.data.experimental.AUTOTUNE) - if batch_shuffle_size: - dataset = dataset.shuffle(batch_shuffle_size) - return dataset.prefetch(prefetch_size) - - verbose: - Default False. Set True to log the names of files loaded by TF. - """ - block_length = batch_size if block_length is None else block_length - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - - def zk_index_generator(my_files=files): - zk = KazooClient(hosts=DEFAULT_ZOOKEEPER_HOST) - zk.start() - my_counter = zk.Counter(cx_zk_path(zk_counter_path), default=0) - while True: - my_counter += 1 - counter_pre_value = my_counter.pre_value - if repeat: - counter_pre_value = counter_pre_value % len(my_files) - if counter_pre_value >= len(my_files): - break - else: - chosen_file = my_files[counter_pre_value] - if verbose: - logging.info("{}. 
yielding {}".format(counter_pre_value, chosen_file)) - yield chosen_file - zk.stop() - - files = tf.data.Dataset.from_generator(zk_index_generator, tf.string) - - # Downsample parts files - files = downsample_dataset(files, file_keep_rate, "file_keep_rate") - - def map_fn(filenames): - return BlockFormatDataset(filenames).prefetch(20) - - # Dont interleave for sequential training - if interleave: - dataset = files.interleave( - map_fn, - cycle_length=part_file_parallelism, - block_length=block_length, - num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, record_keep_rate, "record_keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) - # shuffle after batching and parsing for performance reasons - # faster b/c 1 random selection is made per batch rather than per record - if batch_shuffle_size: - dataset = dataset.shuffle(buffer_size=batch_shuffle_size) - dataset = dataset.prefetch(prefetch_size) - - else: - dataset = dataset_fn(dataset, parse_fn, batch_size) - - return dataset + files, + parse_fn, + batch_size, + zk_counter_path, + repeat=False, + num_threads=2, + block_length=None, + part_file_parallelism=None, + batch_shuffle_size=None, + file_keep_rate=None, + record_keep_rate=None, + prefetch_size=2, + interleave=False, + dataset_fn=None, + verbose=False, +): + """ + Make a tf.Dataset given an ordered list of filenames, using Zookeeper to keep track of + which file to read, and to coordinate multiple workers. + + Args: + files: + ordered list of (typically HDFS) filenames. This must remain consistent + between different workers, and between worker restarts (e.g. in the case + of instance failure or preemption). + To ensure this remains consistent, consider using the --train.files_list + option from DataRecordTrainer. + parse_fn: + A function that takes a byte tensor containing a datarecord and decodes it. + batch_size: + The batch size for each step. + zk_counter_path: + Path under the root node for the underlying zookeeper shared counter that + is used to coordinate distributed iteration over the list of files. + Full path will be `'/'.join([DEFAULT_ZOOKEEPER_BASE_ZNODE, zk_counter_path])`. + repeat: + Default False. Set True to repeat over the files forever. + num_threads: + Default 2. Number of threads working on the data in parallel. + Only used if interleave=True. + block_length: + Default None. Number of consecutive records to pull from a single part file. + If None, then block_length=batch_size will be used. + Only used if interleave=True. + part_file_parallelism: + Default None. Number of part files to read from in parallel. Once a part file is completely + read, it will be replaced by the next part file indicated by the zookeeper counter. + Only used if interleave=True. + + ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies + the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or + equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, + if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader + thread pool will be underutilized, since it can never be the case that every reader thread has + a part file to read from. + + batch_shuffle_size: + Default None. 
Size of shuffle buffer, for shuffling that will be applied after batching.
+            if None, then batches will not be shuffled. Ignored if dataset_fn is provided.
+        file_keep_rate:
+            Default None. Fraction of files to keep, or None to keep all files.
+        record_keep_rate:
+            Default None. Fraction of records to keep, or None to keep all records.
+        prefetch_size:
+            Default 2. Number of parsed batches to prefetch. Ignored if dataset_fn is provided.
+        interleave:
+            Default False. Set True to use tf.data.Dataset.interleave rather than flat_map.
+        dataset_fn:
+            A function that is applied to the dataset of individual records, after
+            these have been read from the parts files.
+            If ``None`` (the default), the behavior will be as though dataset_fn were set to:
+
+            .. code-block:: python
+
+                def dataset_fn(dataset, parse_fn, batch_size):
+                    dataset = dataset.batch(batch_size)
+                    dataset = dataset.map(parse_fn, tf.data.experimental.AUTOTUNE)
+                    if batch_shuffle_size:
+                        dataset = dataset.shuffle(batch_shuffle_size)
+                    return dataset.prefetch(prefetch_size)
+
+        verbose:
+            Default False. Set True to log the names of files loaded by TF.
+    """
+    block_length = batch_size if block_length is None else block_length
+    part_file_parallelism = (
+        num_threads if part_file_parallelism is None else part_file_parallelism
+    )
+
+    def zk_index_generator(my_files=files):
+        zk = KazooClient(hosts=DEFAULT_ZOOKEEPER_HOST)
+        zk.start()
+        my_counter = zk.Counter(cx_zk_path(zk_counter_path), default=0)
+        while True:
+            my_counter += 1
+            counter_pre_value = my_counter.pre_value
+            if repeat:
+                counter_pre_value = counter_pre_value % len(my_files)
+            if counter_pre_value >= len(my_files):
+                break
+            else:
+                chosen_file = my_files[counter_pre_value]
+                if verbose:
+                    logging.info(
+                        "{}. yielding {}".format(counter_pre_value, chosen_file)
+                    )
+                yield chosen_file
+        zk.stop()
+
+    files = tf.data.Dataset.from_generator(zk_index_generator, tf.string)
+
+    # Downsample parts files
+    files = downsample_dataset(files, file_keep_rate, "file_keep_rate")
+
+    def map_fn(filenames):
+        return BlockFormatDataset(filenames).prefetch(20)
+
+    # Don't interleave for sequential training
+    if interleave:
+        dataset = files.interleave(
+            map_fn,
+            cycle_length=part_file_parallelism,
+            block_length=block_length,
+            num_parallel_calls=num_threads,
+        )
+    else:
+        dataset = files.flat_map(map_fn)
+
+    # Downsample DataRecords
+    dataset = downsample_dataset(dataset, record_keep_rate, "record_keep_rate")
+
+    if dataset_fn is None:
+        # Create a batch of datarecords and decode them
+        dataset = dataset.batch(batch_size)
+        dataset = dataset.map(
+            parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE
+        )
+        # shuffle after batching and parsing for performance reasons
+        # faster b/c 1 random selection is made per batch rather than per record
+        if batch_shuffle_size:
+            dataset = dataset.shuffle(buffer_size=batch_shuffle_size)
+        dataset = dataset.prefetch(prefetch_size)
+
+    else:
+        dataset = dataset_fn(dataset, parse_fn, batch_size)
+
+    return dataset
diff --git a/twml/twml/errors.py b/twml/twml/errors.py
index 9b50fcd79..ff4aecdb1 100644
--- a/twml/twml/errors.py
+++ b/twml/twml/errors.py
@@ -4,10 +4,12 @@


 class EarlyStopError(Exception):
-  """Exception used to indicate evaluator needs to early stop."""
-  pass
+    """Exception used to indicate evaluator needs to early stop."""
+
+    pass


 class CheckpointNotFoundError(Exception):
-  """Exception used to indicate a checkpoint hasnt been found."""
-  pass
+    """Exception used to indicate a checkpoint hasn't been found."""
+
+    pass
diff 
--git a/twml/twml/export_output_fns.py b/twml/twml/export_output_fns.py index f72e1d0fe..21e3b6899 100644 --- a/twml/twml/export_output_fns.py +++ b/twml/twml/export_output_fns.py @@ -1,4 +1,4 @@ -''' +""" Contains implemenations of DataRecordTrainer.get_export_output_fns that specify how to export model graph outputs from build_graph to DataRecords for prediction servers. @@ -6,12 +6,16 @@ the DataRecordTrainer constructor to customize how to export their model outputs. Modelers may also provide a custom implementation of export_output_fn using these as reference. -''' +""" # pylint: disable=invalid-name from twitter.deepbird.io.legacy.export_output_fns import ( - batch_prediction_continuous_output_fn, # noqa: F401 - batch_prediction_tensor_output_fn, # noqa: F401 - default_output_fn, # noqa: F401 - variable_length_continuous_output_fn, # noqa: F401 -) + batch_prediction_continuous_output_fn, +) # noqa: F401 +from twitter.deepbird.io.legacy.export_output_fns import ( + batch_prediction_tensor_output_fn, +) # noqa: F401 +from twitter.deepbird.io.legacy.export_output_fns import default_output_fn # noqa: F401 +from twitter.deepbird.io.legacy.export_output_fns import ( + variable_length_continuous_output_fn, +) # noqa: F401 diff --git a/twml/twml/feature_config.py b/twml/twml/feature_config.py index 37004f442..a1a0e6a4e 100644 --- a/twml/twml/feature_config.py +++ b/twml/twml/feature_config.py @@ -10,45 +10,45 @@ class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() - # Override the class in the spec. - doc["class"] = "twml.FeatureConfig" - return doc + def get_feature_spec(self): + """ + Generates a serialization-friendly dict representing this FeatureConfig. + """ + doc = super(FeatureConfig, self).get_feature_spec() + # Override the class in the spec. + doc["class"] = "twml.FeatureConfig" + return doc class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - def build(self): - # Overwrite self.build() to return twml.FeatureConfig instead - """ - Builds and returns FeatureConfig object. - """ - - ( - features, - tensor_types, - sparse_tensor_types, - feature_map, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() - - return FeatureConfig( - features=features, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=tensor_types, - sparse_tensor_types=sparse_tensor_types, - feature_types=feature_map, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=self._feature_name_to_feature_parser, - feature_in_bq_name=self._feature_in_bq_name, - ) + def build(self): + # Overwrite self.build() to return twml.FeatureConfig instead + """ + Builds and returns FeatureConfig object. 
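# ---------------------------------------------------------------------------
# [editor's sketch] The get_feature_spec override above only re-labels the
# serialized spec so it round-trips as twml.FeatureConfig. Illustration with
# a hypothetical, already-built config object:
#
#     spec = my_feature_config.get_feature_spec()   # my_feature_config: FeatureConfig
#     assert spec["class"] == "twml.FeatureConfig"  # overridden by the subclass
# ---------------------------------------------------------------------------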
+ """ + + ( + features, + tensor_types, + sparse_tensor_types, + feature_map, + feature_name_to_feature_parser, + feature_in_bq_name, + ) = self._build() + + return FeatureConfig( + features=features, + labels=self._labels, + weight=self._weight, + filters=self._filter_features, + tensor_types=tensor_types, + sparse_tensor_types=sparse_tensor_types, + feature_types=feature_map, + decode_mode=self._decode_mode, + legacy_sparse=self._legacy_sparse, + feature_name_to_feature_parser=self._feature_name_to_feature_parser, + feature_in_bq_name=self._feature_in_bq_name, + ) _name_to_id = feature_config._name_to_id diff --git a/twml/twml/filters.py b/twml/twml/filters.py index e48633808..fdf20f405 100644 --- a/twml/twml/filters.py +++ b/twml/twml/filters.py @@ -1,9 +1,10 @@ -''' +""" Includes functions to filter features dict build from data records. -''' +""" from twitter.deepbird.io.legacy.filters import ( - balance_binary_class_samples, # noqa: F401 - sparse_keep_feature_if, # noqa: F401 - sparse_keep_sample_if) # noqa: F401 + balance_binary_class_samples, +) # noqa: F401 +from twitter.deepbird.io.legacy.filters import sparse_keep_feature_if # noqa: F401 +from twitter.deepbird.io.legacy.filters import sparse_keep_sample_if # noqa: F401 diff --git a/twml/twml/hooks.py b/twml/twml/hooks.py index cdf733535..dc11af521 100644 --- a/twml/twml/hooks.py +++ b/twml/twml/hooks.py @@ -1,562 +1,590 @@ """ This file contains tf.train.SessionRunHooks defined by TWML """ -from datetime import datetime import json import operator import os +from datetime import datetime -from absl import logging import numpy as np import tensorflow.compat.v1 as tf -from tensorflow.python.training.basic_session_run_hooks import NeverTriggerTimer, SecondOrStepTimer +from absl import logging +from tensorflow.python.training.basic_session_run_hooks import ( + NeverTriggerTimer, + SecondOrStepTimer, +) + import twml class StepProgressHook(tf.train.SessionRunHook): - """Hook that displays a progress bar to monitor global step progress """ - - def __init__(self, max_step): - """ - Initializes a `StepProgressHook`. - This hook displays a progress bar for max_steps. - - Note that this hook only works for training and calibration. - - Args: - max_steps: - maximum steps to monitor in progress bar. - When this many steps is reached, the progress bar will be full. - """ - self._max_step = max_step - self._start_step = 0 - self._global_step_tensor = None - self._progress_bar = None - - def begin(self): - """ sets the global_step_tensor """ - self._global_step_tensor = tf.train.get_or_create_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use StepProgressHook.") - - def after_create_session(self, session, coord): - """ creates the progress bar and keeps track of the first global step upon session creation """ - global_step = session.run(self._global_step_tensor) - self._start_step = global_step - self._progress_bar = tf.keras.utils.Progbar(self._max_step) - - def before_run(self, run_context): # pylint: disable=unused-argument - """ invoked before calling session.run """ - return tf.train.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): - """ invoked after run is called. Updates the progress bar. 
""" - step = run_context.session.run(self._global_step_tensor) - self._progress_bar.update(step - self._start_step) + """Hook that displays a progress bar to monitor global step progress""" + + def __init__(self, max_step): + """ + Initializes a `StepProgressHook`. + This hook displays a progress bar for max_steps. + + Note that this hook only works for training and calibration. + + Args: + max_steps: + maximum steps to monitor in progress bar. + When this many steps is reached, the progress bar will be full. + """ + self._max_step = max_step + self._start_step = 0 + self._global_step_tensor = None + self._progress_bar = None + + def begin(self): + """sets the global_step_tensor""" + self._global_step_tensor = tf.train.get_or_create_global_step() + if self._global_step_tensor is None: + raise RuntimeError("Global step should be created to use StepProgressHook.") + + def after_create_session(self, session, coord): + """creates the progress bar and keeps track of the first global step upon session creation""" + global_step = session.run(self._global_step_tensor) + self._start_step = global_step + self._progress_bar = tf.keras.utils.Progbar(self._max_step) + + def before_run(self, run_context): # pylint: disable=unused-argument + """invoked before calling session.run""" + return tf.train.SessionRunArgs(self._global_step_tensor) + + def after_run(self, run_context, run_values): + """invoked after run is called. Updates the progress bar.""" + step = run_context.session.run(self._global_step_tensor) + self._progress_bar.update(step - self._start_step) class GetMetricsHook(tf.train.SessionRunHook): - """ - Hook used to obtain evaluation metrics. - Typically used for early-stopping by obtaining the value of a - metric at the end of an epoch. - Note that the metric tensor and its commensurate update Op - are responsible for aggregating the metric during the session - (one session per epoch). Used for evaluation. - """ - - def __init__(self, get_metrics_fn): - """GetMetricsHook constructor. - - Args: - get_metrics_fn: - Function that returns a dict mapping metric keys to - tensors as a tf.Tensor. - See Trainer.learn for an example use-case. + """ + Hook used to obtain evaluation metrics. + Typically used for early-stopping by obtaining the value of a + metric at the end of an epoch. + Note that the metric tensor and its commensurate update Op + are responsible for aggregating the metric during the session + (one session per epoch). Used for evaluation. """ - self._get_metrics_fn = get_metrics_fn - self._metric_tensors = None - self.metric_values = None + def __init__(self, get_metrics_fn): + """GetMetricsHook constructor. - def begin(self): - """ sets the global_step_tensor and metric tensor""" - self._metric_tensors = self._get_metrics_fn() - assert isinstance(self._metric_tensors, dict) + Args: + get_metrics_fn: + Function that returns a dict mapping metric keys to + tensors as a tf.Tensor. + See Trainer.learn for an example use-case. + """ - def end(self, session): - self.metric_values = session.run(self._metric_tensors) + self._get_metrics_fn = get_metrics_fn + self._metric_tensors = None + self.metric_values = None + def begin(self): + """sets the global_step_tensor and metric tensor""" + self._metric_tensors = self._get_metrics_fn() + assert isinstance(self._metric_tensors, dict) -class EarlyStopHook(GetMetricsHook): - """ - A GetMetricsHook augmented with early-stopping logic for use - within the Trainer.learn method. 
- """ - - def __init__(self, - metric, - patience, - minimize, - get_estimator_spec_fn, - checkpoint_dir, - file_path=None, - exit_on_end=True, - start_epoch=0, - tolerance=0): - """ - Prepare early-stopping hook and variables. + def end(self, session): + self.metric_values = session.run(self._metric_tensors) - Args: - metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - minimize: - Set this to True for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - tolerance: - A non-negative tolerance for comparing early_stop_metric. - e.g. when maximizing the condition is current_metric > best_metric + tolerance." - Defaults to 0. - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - checkpoint_dir: - path to directory containing the Estimator checkpoints. - file_path: - path to file that is used by this hook to communicate early-stopping - to StopIfExistsHook. This hook would be used for evaluation, while - the StopIfExistsHooks (the listeners) would be used for training. - When the file is created, the StopIfExistsHooks detect and terminate training. - This argument is used by ``Trainer.train_and_evaluate``. - exit_on_end: - when the end() method is called to indicate that the session is terminating, - and exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the evaluation job. - This is set to False by the trainer for non distributed jobs. - start_epoch: - Specifies the starting epoch number. This is used for logging purposes only. 
- """ - if not isinstance(metric, str): - raise ValueError("Expecting string for metric arg") - if not isinstance(patience, int): - raise ValueError("Expecting positive number for metric arg") - - self.should_stop = False - self._metric = metric - self._patience = patience - self._current_patience = patience - self._checkpoint_dir = checkpoint_dir - self._exit_on_end = exit_on_end - self._latest_checkpoint_path = None - # used for distributed training (tf.estimator.train_and_evaluate) - self._file_path = file_path - self._epoch = start_epoch - if self._file_path is not None: - # TODO try to read epoch from a file that we create - if tf.io.gfile.exists(self._file_path): - # delete the file if it exists (not sure this makes sense) - logging.info("EarlyStopHook: Removing existing file: %s.", self._file_path) - tf.io.gfile.remove(self._file_path) - - # best_checkpoint dir will contain the best checkpoint - self._best_checkpoint_path = os.path.join(checkpoint_dir, 'best_checkpoint') - self._eval_checkpoint_path = os.path.join(checkpoint_dir, 'eval_checkpoint') - self._best_metric_path = os.path.join(self._best_checkpoint_path, self._metric) - - if tf.io.gfile.exists(self._best_metric_path): - with tf.io.gfile.GFile(self._best_metric_path, mode="r") as f: - best_metric_from_file = float(f.read()) - else: - best_metric_from_file = None - - if minimize: - # current < best : is better - self._is_better_than = operator.lt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = np.inf - else: - self._best_metric = best_metric_from_file - tolerance - # used for printing - self._early_stop_name = "minimum" - else: - # current > best : is better - self._is_better_than = operator.gt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = -np.inf - else: - self._best_metric = best_metric_from_file + tolerance - # used for printing - self._early_stop_name = "maximum" - - def get_metrics_fn(): - """ function to get metric tensors to early-stopping """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - if metric not in eval_metric_ops: - raise ValueError( - "Expecting early_stop_metric '%s' key in eval_metric_ops dict" - % (metric)) - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - - # initialize GetMetricsHook to get current value of metric from session - super(EarlyStopHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def early_stop(self, epoch): - """ - Looks at the current value of the early stopping metric. - Decrements current patience. If metric improves, patience is reset - and latest checkpoint is moved to checkpoint_dir/best_checkpoint. - If current patience reaches zero, returns True. - Args: - epoch: - The current epoch number. - - Returns: - True when early-stopped. False otherwise. 
- """ - # decrement patience - self._current_patience -= 1 - - # get the current metric value - current_metric = self.metric_values[self._metric] - - if self._is_better_than(current_metric, self._best_metric): - # save best version of model - self._best_metric = current_metric - logging.info( - "Found new %s %s=%f @ epoch %d", - self._early_stop_name, self._metric, self._best_metric, epoch) - # backup the file to checkpoint_dir/best_checkpoint - assert self._latest_checkpoint_path, "expecting latest checkpoint" - logging.info("Backing up " + self._latest_checkpoint_path) - - try: - eval_checkpoint = tf.train.latest_checkpoint(self._eval_checkpoint_path) - twml.util.backup_checkpoint( - checkpoint_path_prefix=eval_checkpoint, - backup_path=self._best_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) - - tf.io.gfile.makedirs(os.path.dirname(self._best_metric_path)) - with tf.io.gfile.GFile(self._best_metric_path, mode="w") as f: - # Write with enough precision - f.write("%.8f" % self._best_metric) - - # reset patience - self._current_patience = self._patience - - elif self._current_patience > 0: - logging.info("No new %s found after %d epochs", - self._early_stop_name, self._patience - self._current_patience) - elif self._current_patience == 0: - logging.info( - "No new %s found after %d epochs. Early-stopping experiment.", - self._early_stop_name, self._patience) - return True - - return False - - def cleanup_checkpoints(self): +class EarlyStopHook(GetMetricsHook): """ - makes it so that the best checkpoint is the only checkpoint - in checkpoint_dir. + A GetMetricsHook augmented with early-stopping logic for use + within the Trainer.learn method. """ - raise NotImplementedError("cleanup_checkpoints is no longer supported") - def end(self, session): - """ - This method is called at the end of an evaluation/epoch. - When file_path constructor argument is provided, this - will call ``early_stop()``. - When ``early_stop()`` returns True, it creates the file_path, - which will be detected by StopIfExistsHooks - and stop training for all workers and the chief. It will - also call ``cleanup_checkpoints()``. - """ - super(EarlyStopHook, self).end(session) - - # Checks for early stopping criteria and makes a backup - self.should_stop = self.early_stop(self._epoch) - - if self._file_path is not None: - if self.should_stop: - # create a file to inform workers - with tf.io.gfile.GFile(self._file_path, "wb") as gfile: - gfile.write("early-stop\n") - # makes the best checkpoint the only checkpoint in save_dir. - msg = "early-stopping evaluation at epoch %d" % self._epoch - logging.info(msg) - if self._exit_on_end: - raise twml.errors.EarlyStopError(msg) - else: + def __init__( + self, + metric, + patience, + minimize, + get_estimator_spec_fn, + checkpoint_dir, + file_path=None, + exit_on_end=True, + start_epoch=0, + tolerance=0, + ): + """ + Prepare early-stopping hook and variables. + + Args: + metric: + String specifying the metric to early-stop on. Required with positive + ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. + The string is used to extract the relevant tensor Op from the dict returned by + the get_eval_metric_ops method. For ``metrics`` pass to the constructor, + the string is one of those. For multi-class (that is, multi-metric) + metrics, the string may be appended with a ``_0``, ``_1``, etc. 
or one + of the ``multi_metric_names`` (one per class). + patience: + Maximum number of epochs to wait for an improvement in the early_stop_metric + before breaking off training. For example, a patience of 10 means that + training will have 10 epochs to improve the metric before it is killed. + Whenever the metric is improved before running out of patience, + patience is reset to ``early_stop_patience``. + minimize: + Set this to True for metrics that need to be minimized + (like ``loss``). Metrics like ``accuracy`` that need to be maximized + should set this to False. + tolerance: + A non-negative tolerance for comparing early_stop_metric. + e.g. when maximizing the condition is current_metric > best_metric + tolerance." + Defaults to 0. + get_estimator_spec_fn: + function that returns the current EstimatorSpec. + The EstimatorSpec is used to obtain the current eval_metric_ops. + checkpoint_dir: + path to directory containing the Estimator checkpoints. + file_path: + path to file that is used by this hook to communicate early-stopping + to StopIfExistsHook. This hook would be used for evaluation, while + the StopIfExistsHooks (the listeners) would be used for training. + When the file is created, the StopIfExistsHooks detect and terminate training. + This argument is used by ``Trainer.train_and_evaluate``. + exit_on_end: + when the end() method is called to indicate that the session is terminating, + and exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the evaluation job. + This is set to False by the trainer for non distributed jobs. + start_epoch: + Specifies the starting epoch number. This is used for logging purposes only. + """ + if not isinstance(metric, str): + raise ValueError("Expecting string for metric arg") + if not isinstance(patience, int): + raise ValueError("Expecting positive number for metric arg") + + self.should_stop = False + self._metric = metric + self._patience = patience + self._current_patience = patience + self._checkpoint_dir = checkpoint_dir + self._exit_on_end = exit_on_end self._latest_checkpoint_path = None - - self._epoch += 1 - - def begin(self): - """ - Saves the latest_checkpoint in case it gets superseded by another checkpoint. - Remember that when used with train_and_evaluate, the chief saves checkpoints - continuouly. The chief could save a checkpoint after evaluation started. - So saving the checkpoint at the beginning of evaluation ensures that we - later save the correct best checkpoint. 
- """ - super(EarlyStopHook, self).begin() - self._latest_checkpoint_path = tf.train.latest_checkpoint(self._checkpoint_dir) - - assert self._latest_checkpoint_path, "expecting latest checkpoint" - # Backup to temporary directory - try: - twml.util.backup_checkpoint( - checkpoint_path_prefix=self._latest_checkpoint_path, - backup_path=self._eval_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) + # used for distributed training (tf.estimator.train_and_evaluate) + self._file_path = file_path + self._epoch = start_epoch + if self._file_path is not None: + # TODO try to read epoch from a file that we create + if tf.io.gfile.exists(self._file_path): + # delete the file if it exists (not sure this makes sense) + logging.info( + "EarlyStopHook: Removing existing file: %s.", self._file_path + ) + tf.io.gfile.remove(self._file_path) + + # best_checkpoint dir will contain the best checkpoint + self._best_checkpoint_path = os.path.join(checkpoint_dir, "best_checkpoint") + self._eval_checkpoint_path = os.path.join(checkpoint_dir, "eval_checkpoint") + self._best_metric_path = os.path.join(self._best_checkpoint_path, self._metric) + + if tf.io.gfile.exists(self._best_metric_path): + with tf.io.gfile.GFile(self._best_metric_path, mode="r") as f: + best_metric_from_file = float(f.read()) + else: + best_metric_from_file = None + + if minimize: + # current < best : is better + self._is_better_than = operator.lt + # worse metric possible + if best_metric_from_file is None: + self._best_metric = np.inf + else: + self._best_metric = best_metric_from_file - tolerance + # used for printing + self._early_stop_name = "minimum" + else: + # current > best : is better + self._is_better_than = operator.gt + # worse metric possible + if best_metric_from_file is None: + self._best_metric = -np.inf + else: + self._best_metric = best_metric_from_file + tolerance + # used for printing + self._early_stop_name = "maximum" + + def get_metrics_fn(): + """function to get metric tensors to early-stopping""" + estimator_spec = get_estimator_spec_fn() + eval_metric_ops = estimator_spec.eval_metric_ops + if metric not in eval_metric_ops: + raise ValueError( + "Expecting early_stop_metric '%s' key in eval_metric_ops dict" + % (metric) + ) + # get the value_op from the (value_op, update_op) value + return {k: v[0] for k, v in eval_metric_ops.items()} + + # initialize GetMetricsHook to get current value of metric from session + super(EarlyStopHook, self).__init__(get_metrics_fn=get_metrics_fn) + + def early_stop(self, epoch): + """ + Looks at the current value of the early stopping metric. + Decrements current patience. If metric improves, patience is reset + and latest checkpoint is moved to checkpoint_dir/best_checkpoint. + If current patience reaches zero, returns True. + + Args: + epoch: + The current epoch number. + + Returns: + True when early-stopped. False otherwise. 
+ """ + # decrement patience + self._current_patience -= 1 + + # get the current metric value + current_metric = self.metric_values[self._metric] + + if self._is_better_than(current_metric, self._best_metric): + # save best version of model + self._best_metric = current_metric + logging.info( + "Found new %s %s=%f @ epoch %d", + self._early_stop_name, + self._metric, + self._best_metric, + epoch, + ) + # backup the file to checkpoint_dir/best_checkpoint + assert self._latest_checkpoint_path, "expecting latest checkpoint" + logging.info("Backing up " + self._latest_checkpoint_path) + + try: + eval_checkpoint = tf.train.latest_checkpoint(self._eval_checkpoint_path) + twml.util.backup_checkpoint( + checkpoint_path_prefix=eval_checkpoint, + backup_path=self._best_checkpoint_path, + ) + except twml.errors.CheckpointNotFoundError as ex: + msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" + raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) + + tf.io.gfile.makedirs(os.path.dirname(self._best_metric_path)) + with tf.io.gfile.GFile(self._best_metric_path, mode="w") as f: + # Write with enough precision + f.write("%.8f" % self._best_metric) + + # reset patience + self._current_patience = self._patience + + elif self._current_patience > 0: + logging.info( + "No new %s found after %d epochs", + self._early_stop_name, + self._patience - self._current_patience, + ) + elif self._current_patience == 0: + logging.info( + "No new %s found after %d epochs. Early-stopping experiment.", + self._early_stop_name, + self._patience, + ) + return True + + return False + + def cleanup_checkpoints(self): + """ + makes it so that the best checkpoint is the only checkpoint + in checkpoint_dir. + """ + raise NotImplementedError("cleanup_checkpoints is no longer supported") + + def end(self, session): + """ + This method is called at the end of an evaluation/epoch. + When file_path constructor argument is provided, this + will call ``early_stop()``. + When ``early_stop()`` returns True, it creates the file_path, + which will be detected by StopIfExistsHooks + and stop training for all workers and the chief. It will + also call ``cleanup_checkpoints()``. + """ + super(EarlyStopHook, self).end(session) + + # Checks for early stopping criteria and makes a backup + self.should_stop = self.early_stop(self._epoch) + + if self._file_path is not None: + if self.should_stop: + # create a file to inform workers + with tf.io.gfile.GFile(self._file_path, "wb") as gfile: + gfile.write("early-stop\n") + # makes the best checkpoint the only checkpoint in save_dir. + msg = "early-stopping evaluation at epoch %d" % self._epoch + logging.info(msg) + if self._exit_on_end: + raise twml.errors.EarlyStopError(msg) + else: + self._latest_checkpoint_path = None + + self._epoch += 1 + + def begin(self): + """ + Saves the latest_checkpoint in case it gets superseded by another checkpoint. + Remember that when used with train_and_evaluate, the chief saves checkpoints + continuouly. The chief could save a checkpoint after evaluation started. + So saving the checkpoint at the beginning of evaluation ensures that we + later save the correct best checkpoint. 
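# ---------------------------------------------------------------------------
# [editor's sketch] The patience/tolerance bookkeeping above, replayed
# outside any TF session. With minimize=False, tolerance is added once to a
# best metric restored from disk, so a resumed run must beat the old best by
# at least `tolerance` before a new best_checkpoint is written:

import operator

patience, tolerance = 2, 0.001
best = 0.80 + tolerance          # hypothetical value restored from best_checkpoint
is_better_than = operator.gt     # maximizing, as for 'accuracy'
current_patience = patience
for epoch, metric in enumerate([0.8005, 0.8120, 0.8110, 0.8115]):
    current_patience -= 1
    if is_better_than(metric, best):
        best = metric            # improvement: back up checkpoint, reset patience
        current_patience = patience
    elif current_patience == 0:
        print("early stop at epoch", epoch)  # prints: early stop at epoch 3
        break
# ---------------------------------------------------------------------------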
+ """ + super(EarlyStopHook, self).begin() + self._latest_checkpoint_path = tf.train.latest_checkpoint(self._checkpoint_dir) + + assert self._latest_checkpoint_path, "expecting latest checkpoint" + # Backup to temporary directory + try: + twml.util.backup_checkpoint( + checkpoint_path_prefix=self._latest_checkpoint_path, + backup_path=self._eval_checkpoint_path, + ) + except twml.errors.CheckpointNotFoundError as ex: + msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" + raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) class MetricsUpdateHook(GetMetricsHook): - """ - A GetMetricsHook augmented with logic to map SessionRun events to metrics updates. - It is mainly used by `TrackRun` to persist model metrics via Model Repo. - """ - - def __init__(self, - get_estimator_spec_fn, - add_metrics_fn, - every_n_iter=None, - every_n_secs=None - ): """ - Args: - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - add_metrics_fn: `function` callback used to report metrics, called automatically - at the end of every epoch. - every_n_iter: `int`, log the metrics once every N local - steps taken in the current epoch. - every_n_secs: `int` or `float`, log the metrics once every N - seconds passed in the current epoch. Exactly one of `every_n_iter` and `every_n_secs` - should be provided. - Raises: - ValueError: if `every_n_iter` is non-positive or if not exactly one of `every_n_iter` and - `every_n_secs` is set when `add_progress_metrics_fn` is provided. + A GetMetricsHook augmented with logic to map SessionRun events to metrics updates. + It is mainly used by `TrackRun` to persist model metrics via Model Repo. """ - only_log_at_end = (every_n_iter is None) and (every_n_secs is None) - - if (not only_log_at_end and every_n_iter and every_n_secs): - raise ValueError( - 'exactly one of every_n_iter and every_n_secs must be provided' - ) - - # TODO: should have a minimum to avoid too many calls to ModelRepo? - if every_n_iter is not None and every_n_iter <= 0: - raise ValueError("invalid every_n_iter=%s." % every_n_iter) - - self._timer = ( - NeverTriggerTimer() if only_log_at_end else - SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter) - ) - self._should_trigger = False - self._iter_count = 0 - - self._add_metrics_fn = add_metrics_fn - - def get_metrics_fn(): - """ - Function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - super(MetricsUpdateHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def report_metrics(self): - """ - Triggers a metrics report. - """ - self._timer.update_last_triggered_step(self._iter_count) - if self.metric_values is not None: - self._add_metrics_fn(self.metric_values) + def __init__( + self, + get_estimator_spec_fn, + add_metrics_fn, + every_n_iter=None, + every_n_secs=None, + ): + """ + Args: + get_estimator_spec_fn: + function that returns the current EstimatorSpec. + The EstimatorSpec is used to obtain the current eval_metric_ops. + add_metrics_fn: `function` callback used to report metrics, called automatically + at the end of every epoch. + every_n_iter: `int`, log the metrics once every N local + steps taken in the current epoch. 
+ every_n_secs: `int` or `float`, log the metrics once every N + seconds passed in the current epoch. Exactly one of `every_n_iter` and `every_n_secs` + should be provided. + Raises: + ValueError: if `every_n_iter` is non-positive or if not exactly one of `every_n_iter` and + `every_n_secs` is set when `add_progress_metrics_fn` is provided. + """ + only_log_at_end = (every_n_iter is None) and (every_n_secs is None) + + if not only_log_at_end and every_n_iter and every_n_secs: + raise ValueError( + "exactly one of every_n_iter and every_n_secs must be provided" + ) + + # TODO: should have a minimum to avoid too many calls to ModelRepo? + if every_n_iter is not None and every_n_iter <= 0: + raise ValueError("invalid every_n_iter=%s." % every_n_iter) + + self._timer = ( + NeverTriggerTimer() + if only_log_at_end + else SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter) + ) + + self._should_trigger = False + self._iter_count = 0 + + self._add_metrics_fn = add_metrics_fn + + def get_metrics_fn(): + """ + Function that returns the current EstimatorSpec. + The EstimatorSpec is used to obtain the current eval_metric_ops. + """ + estimator_spec = get_estimator_spec_fn() + eval_metric_ops = estimator_spec.eval_metric_ops + # get the value_op from the (value_op, update_op) value + return {k: v[0] for k, v in eval_metric_ops.items()} + + super(MetricsUpdateHook, self).__init__(get_metrics_fn=get_metrics_fn) + + def report_metrics(self): + """ + Triggers a metrics report. + """ + self._timer.update_last_triggered_step(self._iter_count) + if self.metric_values is not None: + self._add_metrics_fn(self.metric_values) + + def begin(self): + """ + Triggered before each epoch. + """ + self._timer.reset() + self._iter_count = 0 + return super(MetricsUpdateHook, self).begin() + + def before_run(self, run_context): + """ + Triggered before each step. + """ + self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) + return super(MetricsUpdateHook, self).before_run(run_context) + + def after_run(self, run_context, run_values): + """ + Triggered after each step. + """ + if self._should_trigger: + self.report_metrics() + self._iter_count += 1 + return super(MetricsUpdateHook, self).after_run(run_context, run_values) + + def end(self, session): + """ + Triggered after each epoch. + """ + self.report_metrics() + return super(MetricsUpdateHook, self).end(session) - def begin(self): - """ - Triggered before each epoch. - """ - self._timer.reset() - self._iter_count = 0 - return super(MetricsUpdateHook, self).begin() - def before_run(self, run_context): - """ - Triggered before each step. +class EarlyStopDuration(tf.train.SessionRunHook): """ - self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) - return super(MetricsUpdateHook, self).before_run(run_context) + Hook that can be used to terminate a job (training or validation) after a certain duration. + The hook is fault tolerant, i.e., if a job is allotted 1 hour to run and fails after 45 minutes, + then it will only run for 15 minutes once restarted. - def after_run(self, run_context, run_values): - """ - Triggered after each step. - """ - if self._should_trigger: - self.report_metrics() - self._iter_count += 1 - return super(MetricsUpdateHook, self).after_run(run_context, run_values) + Args: + max_duration: + A float. When this argument is defined, the job will automatically terminate after + `max_duration` seconds if it has not already compeleted. + + overwrite: + A boolean. 
If set to True, this hook will overwrite the file containing the elapsed time + since the beginning of the job. In a distributed setting, this will be used so only one + job writes to the file while all others will have read access. In a distributed setting, + if all executors have this parameter set to False, then it just means that the hook will + not be fault tolerant. When restarted, the job will restart the clock from 0. + + save_dir: + String. A directory (located on a file system that is Tensorflow compatible) where + we can store the file which contains the record of the elapsed time. This file is what makes + the hook faul tolerant. - def end(self, session): - """ - Triggered after each epoch. + exit_on_end: + when exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the job. + This is usually set to True to kill a validation job in a distributed setting. """ - self.report_metrics() - return super(MetricsUpdateHook, self).end(session) - -class EarlyStopDuration(tf.train.SessionRunHook): - """ - Hook that can be used to terminate a job (training or validation) after a certain duration. - The hook is fault tolerant, i.e., if a job is allotted 1 hour to run and fails after 45 minutes, - then it will only run for 15 minutes once restarted. - - Args: - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - overwrite: - A boolean. If set to True, this hook will overwrite the file containing the elapsed time - since the beginning of the job. In a distributed setting, this will be used so only one - job writes to the file while all others will have read access. In a distributed setting, - if all executors have this parameter set to False, then it just means that the hook will - not be fault tolerant. When restarted, the job will restart the clock from 0. - - save_dir: - String. A directory (located on a file system that is Tensorflow compatible) where - we can store the file which contains the record of the elapsed time. This file is what makes - the hook faul tolerant. - - exit_on_end: - when exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the job. - This is usually set to True to kill a validation job in a distributed setting. 
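# ---------------------------------------------------------------------------
# [editor's note] Concretely, the fault tolerance described above works by
# persisting a JSON record to <save_dir>/early_stop_duration.txt, e.g.
#
#     {"elapsed_time": 2700.0, "max_duration": 3600.0}
#
# If a job with max_duration=3600.0 dies after ~45 minutes, the restarted
# hook re-reads this record, so elapsed_time() resumes near 2700s and only
# ~900s of budget remain before early_stop() returns True.
# ---------------------------------------------------------------------------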
- """ - - def __init__(self, max_duration: float, exit_on_end: bool, save_dir: str, overwrite: bool): - self._overwrite = overwrite - self._save_dir = save_dir - self._exit_on_end = exit_on_end - self._max_duration = max_duration - self._last_time_check = datetime.now() - - # Initialize elapse time file - if overwrite: - self.elapsed_time() - - @property - def elapsed_file_path(self): - return os.path.join(self._save_dir, "early_stop_duration.txt") - - def early_stop(self) -> bool: - return self.elapsed_time() > self._max_duration - - def elapsed_time(self) -> float: - # Recorded elapsed time is 0 unless it's been recorded in a file already - recorded_elapsed_time = 0 - if tf.io.gfile.exists(self.elapsed_file_path): - with tf.io.gfile.GFile(self.elapsed_file_path, mode="r") as file: - recorded_elapsed_time = json.loads(file.read())["elapsed_time"] - - elapsed_time = recorded_elapsed_time + (datetime.now() - self._last_time_check).total_seconds() - self._last_time_check = datetime.now() - - if self._overwrite: - # Record the actualized new elapsed time to the file - tf.io.gfile.makedirs(os.path.dirname(self.elapsed_file_path)) - with tf.io.gfile.GFile(self.elapsed_file_path, mode="w") as file: - record = { - "elapsed_time": elapsed_time, - "max_duration": self._max_duration - } - file.write(json.dumps(record, indent=2)) - - return elapsed_time - - def before_run(self, run_context: tf.estimator.SessionRunContext) -> None: - if self.early_stop(): - message = f""" + def __init__( + self, max_duration: float, exit_on_end: bool, save_dir: str, overwrite: bool + ): + self._overwrite = overwrite + self._save_dir = save_dir + self._exit_on_end = exit_on_end + self._max_duration = max_duration + self._last_time_check = datetime.now() + + # Initialize elapse time file + if overwrite: + self.elapsed_time() + + @property + def elapsed_file_path(self): + return os.path.join(self._save_dir, "early_stop_duration.txt") + + def early_stop(self) -> bool: + return self.elapsed_time() > self._max_duration + + def elapsed_time(self) -> float: + # Recorded elapsed time is 0 unless it's been recorded in a file already + recorded_elapsed_time = 0 + if tf.io.gfile.exists(self.elapsed_file_path): + with tf.io.gfile.GFile(self.elapsed_file_path, mode="r") as file: + recorded_elapsed_time = json.loads(file.read())["elapsed_time"] + + elapsed_time = ( + recorded_elapsed_time + + (datetime.now() - self._last_time_check).total_seconds() + ) + self._last_time_check = datetime.now() + + if self._overwrite: + # Record the actualized new elapsed time to the file + tf.io.gfile.makedirs(os.path.dirname(self.elapsed_file_path)) + with tf.io.gfile.GFile(self.elapsed_file_path, mode="w") as file: + record = { + "elapsed_time": elapsed_time, + "max_duration": self._max_duration, + } + file.write(json.dumps(record, indent=2)) + + return elapsed_time + + def before_run(self, run_context: tf.estimator.SessionRunContext) -> None: + if self.early_stop(): + message = f""" Stopping job which now exceeded the maximum duration of {self._max_duration} seconds. """ - logging.info(message) - run_context.request_stop() + logging.info(message) + run_context.request_stop() - if self._exit_on_end: - raise twml.errors.EarlyStopError(message) + if self._exit_on_end: + raise twml.errors.EarlyStopError(message) class StopAtStepHook(tf.train.StopAtStepHook): - """ - Overrides ``tf.train.StopAtStepHook`` so that - a ``stop_requested`` property can be accessed to determine - if this hook requested a stop. 
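# ---------------------------------------------------------------------------
# [editor's sketch] Why the ``stop_requested`` property described above is
# useful, with a hypothetical training run: after estimator.train(...)
# returns, the caller can tell whether this hook caused the termination:
#
#     hook = StopAtStepHook(last_step=100000)
#     estimator.train(input_fn, hooks=[hook])
#     if hook.stop_requested:
#         print("stopped because last_step was reached")
# ---------------------------------------------------------------------------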
- """ + """ + Overrides ``tf.train.StopAtStepHook`` so that + a ``stop_requested`` property can be accessed to determine + if this hook requested a stop. + """ - def __init__(self, *args, **kwargs): - super(StopAtStepHook, self).__init__(*args, **kwargs) - self._stop_requested = False + def __init__(self, *args, **kwargs): + super(StopAtStepHook, self).__init__(*args, **kwargs) + self._stop_requested = False - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + @property + def stop_requested(self): + """true if this hook requested a stop""" + return self._stop_requested - def after_run(self, run_context, run_values): - """ sets self.stop_requested to true when requesting a stop """ - super(StopAtStepHook, self).after_run(run_context, run_values) - self._stop_requested = run_context.stop_requested + def after_run(self, run_context, run_values): + """sets self.stop_requested to true when requesting a stop""" + super(StopAtStepHook, self).after_run(run_context, run_values) + self._stop_requested = run_context.stop_requested class StopIfExistsHook(tf.train.SessionRunHook): - """ - Hook that requests stop if a file exists. - This hook is used with the EarlyStopHook to implement - early-stopping for distributed training (tf.estimator.train_and_evaluate). - """ - - def __init__(self, file_path): """ - Arguments: - file_path: - path to file. When this hook detects that the file exists, - it requests a stop, which effectively kills this worker. + Hook that requests stop if a file exists. + This hook is used with the EarlyStopHook to implement + early-stopping for distributed training (tf.estimator.train_and_evaluate). """ - self._file_path = file_path - self._stop_requested = False - - def after_run(self, run_context, run_values): - if tf.io.gfile.exists(self._file_path): - logging.info("Early-stopping file detected; requesting stop") - run_context.request_stop() - self._stop_requested = True - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + + def __init__(self, file_path): + """ + Arguments: + file_path: + path to file. When this hook detects that the file exists, + it requests a stop, which effectively kills this worker. + """ + self._file_path = file_path + self._stop_requested = False + + def after_run(self, run_context, run_values): + if tf.io.gfile.exists(self._file_path): + logging.info("Early-stopping file detected; requesting stop") + run_context.request_stop() + self._stop_requested = True + + @property + def stop_requested(self): + """true if this hook requested a stop""" + return self._stop_requested diff --git a/twml/twml/input_fns.py b/twml/twml/input_fns.py index 394fc8674..c842985a2 100644 --- a/twml/twml/input_fns.py +++ b/twml/twml/input_fns.py @@ -1,129 +1,140 @@ -''' +""" Contains implementations of functions to read input data. -''' -from .dataset import stream_block_format_dataset - +""" import tensorflow.compat.v1 as tf +from .dataset import stream_block_format_dataset + def data_record_input_fn( - files, batch_size, parse_fn, - num_threads=2, repeat=False, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, - shards=None, shard_index=None, shuffle=True, shuffle_files=True, interleave=True, - initializable=False, log_tf_data_summaries=False, - **kwargs): - """ - Returns a nested structure of tf.Tensors containing the next element. - Used by ``train_input_fn`` and ``eval_input_fn`` in DataRecordTrainer. 
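# ---------------------------------------------------------------------------
# [editor's sketch] How EarlyStopHook and StopIfExistsHook pair up under
# tf.estimator.train_and_evaluate, with a hypothetical shared path:
#
#     sentinel = "hdfs://namenode/jobs/my_job/early_stop"  # hypothetical
#
#     # evaluator: writes the sentinel once patience is exhausted
#     eval_hook = EarlyStopHook(metric="loss", patience=5, minimize=True,
#                               get_estimator_spec_fn=get_spec,   # hypothetical
#                               checkpoint_dir=ckpt_dir,          # hypothetical
#                               file_path=sentinel)
#
#     # each training worker: polls for the sentinel and requests a stop
#     train_hook = StopIfExistsHook(sentinel)
# ---------------------------------------------------------------------------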
- By default, works with DataRecord dataset for compressed partition files. - - Args: - files: - List of files that will be parsed. - batch_size: - number of samples per batch. - parse_fn: - function passed to data loading for parsing individual data records. - Usually one of the decoder functions like ``parsers.get_sparse_parse_fn``. - num_threads (optional): - number of threads used for loading data. Defaults to 2. - repeat (optional): - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use ``train_steps`` or ``eval_steps`` - greater than the size of the dataset - (otherwise Estimator.[train,evaluate] stops when the end of the dataset is reached). - dataset_fn (optional): - A function that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in (0.0, 1.0] that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation - (https://arxiv.org/pdf/1804.03235.pdf) and other techniques that require each worker to - train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle (optional): - Whether to shuffle the records. Defaults to True. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. - - initializable (optional): - A boolean indicator. When the Dataset Iterator depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value (false) - is used for most plain iterators. - - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - Iterator of elements of the dataset. 
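# ---------------------------------------------------------------------------
# [editor's sketch] A typical call, with hypothetical files and parse_fn
# (usually built via twml.parsers); note that log_tf_data_summaries
# requires initializable=True, as enforced in the function body:
#
#     def train_input_fn():
#         return data_record_input_fn(
#             files=part_files,        # hypothetical list of part files
#             batch_size=128,
#             parse_fn=my_parse_fn,    # hypothetical decoder
#             repeat=True,
#             initializable=True,
#             log_tf_data_summaries=True,  # emit tf.data pipeline summaries
#         )
# ---------------------------------------------------------------------------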
- """ - if not parse_fn: - raise ValueError("default_input_fn requires a parse_fn") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - dataset = stream_block_format_dataset( - files=files, - parse_fn=parse_fn, - batch_size=batch_size, - repeat=repeat, - num_threads=num_threads, - dataset_fn=dataset_fn, - keep_rate=keep_rate, - parts_downsampling_rate=parts_downsampling_rate, - shards=shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=shuffle_files, - interleave=interleave, + files, + batch_size, + parse_fn, + num_threads=2, + repeat=False, + dataset_fn=None, + keep_rate=None, + parts_downsampling_rate=None, + shards=None, + shard_index=None, + shuffle=True, + shuffle_files=True, + interleave=True, + initializable=False, + log_tf_data_summaries=False, **kwargs - ) - - # Add a tf.data.experimental.StatsAggregator - # https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/data/experimental/StatsAggregator - if log_tf_data_summaries: - aggregator = tf.data.experimental.StatsAggregator() - options = tf.data.Options() - options.experimental_stats.aggregator = aggregator - dataset = dataset.with_options(options) - stats_summary = aggregator.get_summary() - tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) - - if initializable: - # when the data parsing dpends on some HashTable or Tensor, the iterator is initalizable and - # therefore we need to be run explicitly - iterator = dataset.make_initializable_iterator() - tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) - else: - iterator = dataset.make_one_shot_iterator() - return iterator.get_next() +): + """ + Returns a nested structure of tf.Tensors containing the next element. + Used by ``train_input_fn`` and ``eval_input_fn`` in DataRecordTrainer. + By default, works with DataRecord dataset for compressed partition files. + + Args: + files: + List of files that will be parsed. + batch_size: + number of samples per batch. + parse_fn: + function passed to data loading for parsing individual data records. + Usually one of the decoder functions like ``parsers.get_sparse_parse_fn``. + num_threads (optional): + number of threads used for loading data. Defaults to 2. + repeat (optional): + Repeat the dataset indefinitely. Defaults to False. + Useful when you want to use ``train_steps`` or ``eval_steps`` + greater than the size of the dataset + (otherwise Estimator.[train,evaluate] stops when the end of the dataset is reached). + dataset_fn (optional): + A function that modifies the dataset after it reads different interleaved parts files. + Defaults to: + + .. code-block:: python + + def dataset_fn(dataset, parse_fn, batch_size): + return dataset.batch(batch_size).map(parse_fn, 1) + + keep_rate (optional): + A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli + distribution with p = 1 - keep_rate. + Defaults to None (no records dropped). + + parts_downsampling_rate (optional): + A float value in (0.0, 1.0] that indicates the factor by which to downsample part files. + For example, a value of 0.2 means only 20 percent of part files become part of the dataset. + + shards (optional): + Number of partitions to shard the dataset into. This is useful for codistillation + (https://arxiv.org/pdf/1804.03235.pdf) and other techniques that require each worker to + train on disjoint partitions of the dataset. + The dataset is not sharded by default. 
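+
+            For example, two codistillation workers could each read a disjoint
+            half of the part files (a sketch; ``my_files`` and ``my_parse_fn``
+            are hypothetical stand-ins for your file list and parse function):
+
+            .. code-block:: python
+
+                # on worker i (i is 0 or 1)
+                batch = data_record_input_fn(
+                    files=my_files,
+                    batch_size=32,
+                    parse_fn=my_parse_fn,
+                    shards=2,
+                    shard_index=i,
+                )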
+
+        shard_index (optional):
+            Which partition of the dataset to use if ``shards`` is set.
+
+        shuffle (optional):
+            Whether to shuffle the records. Defaults to True.
+
+        shuffle_files (optional):
+            Shuffle the list of files. Defaults to True.
+            When False, files are iterated in the order they are passed in.
+
+        interleave (optional):
+            Interleave records from multiple files in parallel. Defaults to True.
+
+        initializable (optional):
+            A boolean indicator. When the Dataset Iterator depends on some resource, e.g. a HashTable or
+            a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, the default value
+            (False) is used for most plain iterators.
+
+        log_tf_data_summaries (optional):
+            A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the
+            tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output
+            events files. This requires that `initializable` is `True` above.
+
+    Returns:
+        Iterator of elements of the dataset.
+    """
+    if not parse_fn:
+        raise ValueError("default_input_fn requires a parse_fn")
+
+    if log_tf_data_summaries and not initializable:
+        raise ValueError("Require `initializable` if `log_tf_data_summaries`.")
+
+    dataset = stream_block_format_dataset(
+        files=files,
+        parse_fn=parse_fn,
+        batch_size=batch_size,
+        repeat=repeat,
+        num_threads=num_threads,
+        dataset_fn=dataset_fn,
+        keep_rate=keep_rate,
+        parts_downsampling_rate=parts_downsampling_rate,
+        shards=shards,
+        shard_index=shard_index,
+        shuffle=shuffle,
+        shuffle_files=shuffle_files,
+        interleave=interleave,
+        **kwargs
+    )
+
+    # Add a tf.data.experimental.StatsAggregator
+    # https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/data/experimental/StatsAggregator
+    if log_tf_data_summaries:
+        aggregator = tf.data.experimental.StatsAggregator()
+        options = tf.data.Options()
+        options.experimental_stats.aggregator = aggregator
+        dataset = dataset.with_options(options)
+        stats_summary = aggregator.get_summary()
+        tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
+
+    if initializable:
+        # When the data parsing depends on some HashTable or Tensor, the iterator is
+        # initializable and therefore its initializer needs to be run explicitly.
+        iterator = dataset.make_initializable_iterator()
+        tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer)
+    else:
+        iterator = dataset.make_one_shot_iterator()
+    return iterator.get_next()
 
 
 default_input_fn = data_record_input_fn  # pylint: disable=invalid-name
diff --git a/twml/twml/layers/__init__.py b/twml/twml/layers/__init__.py
index 917c61867..ad7e798c5 100644
--- a/twml/twml/layers/__init__.py
+++ b/twml/twml/layers/__init__.py
@@ -9,13 +9,13 @@
 from .batch_prediction_tensor_writer import BatchPredictionTensorWriter  # noqa: F401
 from .batch_prediction_writer import BatchPredictionWriter  # noqa: F401
 from .data_record_tensor_writer import DataRecordTensorWriter  # noqa: F401
-from .full_dense import full_dense, FullDense  # noqa: F401
-from .full_sparse import full_sparse, FullSparse  # noqa: F401
+from .full_dense import FullDense, full_dense  # noqa: F401
+from .full_sparse import FullSparse, full_sparse  # noqa: F401
 from .isotonic import Isotonic  # noqa: F401
 from .layer import Layer  # noqa: F401
 from .mdl import MDL  # noqa: F401
 from .partition import Partition  # noqa: F401
 from .percentile_discretizer import PercentileDiscretizer  # noqa: F401
 from .sequential import Sequential  # noqa: F401
-from .sparse_max_norm import MaxNorm, sparse_max_norm, SparseMaxNorm  # noqa: F401
+from .sparse_max_norm import MaxNorm, SparseMaxNorm, sparse_max_norm  # noqa: F401
 from .stitch import Stitch  # noqa: F401
diff --git a/twml/twml/layers/batch_prediction_tensor_writer.py b/twml/twml/layers/batch_prediction_tensor_writer.py
index 3f6633a8e..da54a8b13 100644
--- a/twml/twml/layers/batch_prediction_tensor_writer.py
+++ b/twml/twml/layers/batch_prediction_tensor_writer.py
@@ -2,50 +2,54 @@
 """
 Implementing Writer Layer
 """
-from .layer import Layer
-
 import libtwml
 
+from .layer import Layer
+
 
 class BatchPredictionTensorWriter(Layer):
-  """
-  A layer that packages keys and dense tensors into a BatchPredictionResponse.
-  Typically used at the out of an exported model for use in a the PredictionEngine
-  (that is, in production) when model predictions are dense tensors.
+    """
+    A layer that packages keys and dense tensors into a BatchPredictionResponse.
+    Typically used at the output of an exported model for use in the PredictionEngine
+    (that is, in production) when model predictions are dense tensors.
 
-  Arguments:
-    keys:
-      keys to hashmap
-  Output:
-    output:
-      a BatchPredictionResponse serialized using Thrift into a uint8 tensor.
-  """
+    Arguments:
+        keys:
+            keys to hashmap
+    Output:
+        output:
+            a BatchPredictionResponse serialized using Thrift into a uint8 tensor.
+    """
 
-  def __init__(self, keys, **kwargs):  # pylint: disable=useless-super-delegation
-    super(BatchPredictionTensorWriter, self).__init__(**kwargs)
-    self.keys = keys
+    def __init__(self, keys, **kwargs):  # pylint: disable=useless-super-delegation
+        super(BatchPredictionTensorWriter, self).__init__(**kwargs)
+        self.keys = keys
 
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
+    def compute_output_shape(self, input_shape):
+        """Computes the output shape of the layer given the input shape.
 
-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`. It need not
-        be fully defined (e.g. the batch size may be unknown).
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
 
-    Raise NotImplementedError.
+        Raises NotImplementedError.
 
-    """
-    raise NotImplementedError
+        """
+        raise NotImplementedError
 
-  def call(self, values, **kwargs):  # pylint: disable=unused-argument, arguments-differ
-    """The logic of the layer lives here.
+    def call(
+        self, values, **kwargs
+    ):  # pylint: disable=unused-argument, arguments-differ
+        """The logic of the layer lives here.
 
-    Arguments:
-      values:
-        dense tensors corresponding to keys in hashmap
+        Arguments:
+            values:
+                dense tensors corresponding to keys in hashmap
 
-    Returns:
-      The output from the layer
-    """
-    write_op = libtwml.ops.batch_prediction_tensor_response_writer(self.keys, values)
-    return write_op
+        Returns:
+            The output from the layer
+        """
+        write_op = libtwml.ops.batch_prediction_tensor_response_writer(
+            self.keys, values
+        )
+        return write_op
diff --git a/twml/twml/layers/batch_prediction_writer.py b/twml/twml/layers/batch_prediction_writer.py
index 118d21921..123947b28 100644
--- a/twml/twml/layers/batch_prediction_writer.py
+++ b/twml/twml/layers/batch_prediction_writer.py
@@ -2,50 +2,52 @@
 """
 Implementing Writer Layer
 """
-from .layer import Layer
-
 import libtwml
 
+from .layer import Layer
+
 
 class BatchPredictionWriter(Layer):
-  """
-  A layer that packages keys and values into a BatchPredictionResponse.
-  Typically used at the out of an exported model for use in a the PredictionEngine
-  (that is, in production).
+    """
+    A layer that packages keys and values into a BatchPredictionResponse.
+    Typically used at the output of an exported model for use in the PredictionEngine
+    (that is, in production).
 
-  Arguments:
-    keys:
-      keys to hashmap
-  Output:
-    output:
-      a BatchPredictionResponse serialized using Thrift into a uint8 tensor.
-  """
+    Arguments:
+        keys:
+            keys to hashmap
+    Output:
+        output:
+            a BatchPredictionResponse serialized using Thrift into a uint8 tensor.
+    """
 
-  def __init__(self, keys, **kwargs):  # pylint: disable=useless-super-delegation
-    super(BatchPredictionWriter, self).__init__(**kwargs)
-    self.keys = keys
+    def __init__(self, keys, **kwargs):  # pylint: disable=useless-super-delegation
+        super(BatchPredictionWriter, self).__init__(**kwargs)
+        self.keys = keys
 
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
+    def compute_output_shape(self, input_shape):
+        """Computes the output shape of the layer given the input shape.
 
-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`. It need not
-        be fully defined (e.g. the batch size may be unknown).
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
 
-    Raise NotImplementedError.
+        Raises NotImplementedError.
 
-    """
-    raise NotImplementedError
+        """
+        raise NotImplementedError
 
-  def call(self, values, **kwargs):  # pylint: disable=unused-argument, arguments-differ
-    """The logic of the layer lives here.
+    def call(
+        self, values, **kwargs
+    ):  # pylint: disable=unused-argument, arguments-differ
+        """The logic of the layer lives here.
 
-    Arguments:
-      values:
-        values corresponding to keys in hashmap
+        Arguments:
+            values:
+                values corresponding to keys in hashmap
 
-    Returns:
-      The output from the layer
-    """
-    write_op = libtwml.ops.batch_prediction_response_writer(self.keys, values)
-    return write_op
+        Returns:
+            The output from the layer
+        """
+        write_op = libtwml.ops.batch_prediction_response_writer(self.keys, values)
+        return write_op
diff --git a/twml/twml/layers/data_record_tensor_writer.py b/twml/twml/layers/data_record_tensor_writer.py
index 0f70186b4..e6063379e 100644
--- a/twml/twml/layers/data_record_tensor_writer.py
+++ b/twml/twml/layers/data_record_tensor_writer.py
@@ -2,49 +2,51 @@
 """
 Implementing Writer Layer
 """
-from .layer import Layer
-
 import libtwml
 
+from .layer import Layer
+
 
 class DataRecordTensorWriter(Layer):
-  """
-  A layer that packages keys and dense tensors into a DataRecord.
-  This layer was initially added to support exporting user embeddings as tensors.
-
-  Arguments:
-    keys:
-      keys to hashmap
-  Output:
-    output:
-      a DataRecord serialized using Thrift into a uint8 tensor
-  """
+    """
+    A layer that packages keys and dense tensors into a DataRecord.
+    This layer was initially added to support exporting user embeddings as tensors.
+
+    Arguments:
+        keys:
+            keys to hashmap
+    Output:
+        output:
+            a DataRecord serialized using Thrift into a uint8 tensor
+    """
 
-  def __init__(self, keys, **kwargs):  # pylint: disable=useless-super-delegation
-    super(DataRecordTensorWriter, self).__init__(**kwargs)
-    self.keys = keys
+    def __init__(self, keys, **kwargs):  # pylint: disable=useless-super-delegation
+        super(DataRecordTensorWriter, self).__init__(**kwargs)
+        self.keys = keys
 
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
+ def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). - Raises NotImplementedError. + Raises NotImplementedError. - """ - raise NotImplementedError + """ + raise NotImplementedError - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. + def call( + self, values, **kwargs + ): # pylint: disable=unused-argument, arguments-differ + """The logic of the layer lives here. - Arguments: - values: - dense tensors corresponding to keys in hashmap + Arguments: + values: + dense tensors corresponding to keys in hashmap - Returns: - The output from the layer - """ - write_op = libtwml.ops.data_record_tensor_writer(self.keys, values) - return write_op + Returns: + The output from the layer + """ + write_op = libtwml.ops.data_record_tensor_writer(self.keys, values) + return write_op diff --git a/twml/twml/layers/full_dense.py b/twml/twml/layers/full_dense.py index 9c354ad3e..951252c73 100644 --- a/twml/twml/layers/full_dense.py +++ b/twml/twml/layers/full_dense.py @@ -2,258 +2,273 @@ """ Implementing Full Dense Layer """ -from tensorflow.python.layers import core as core_layers -from tensorflow.python.ops import init_ops +import tensorflow.compat.v1 as tf from tensorflow.python.framework import tensor_shape from tensorflow.python.keras.engine.base_layer import InputSpec -import tensorflow.compat.v1 as tf +from tensorflow.python.layers import core as core_layers +from tensorflow.python.ops import init_ops class FullDense(core_layers.Dense): - """ - Densely-connected layer class. - This is wrapping tensorflow.python.layers.core.Dense - This layer implements the operation: + """ + Densely-connected layer class. + This is wrapping tensorflow.python.layers.core.Dense + This layer implements the operation: - .. code-block:: python + .. code-block:: python - outputs = activation(inputs.weight + bias) + outputs = activation(inputs.weight + bias) - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. + Where ``activation`` is the activation function passed as the ``activation`` + argument (if not ``None``), ``weight`` is a weights matrix created by the layer, + and ``bias`` is a bias vector created by the layer. - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - weight_initializer: - Initializer function for the weight matrix. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. 
used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + Arguments: + output_size: + Integer or Long, dimensionality of the output space. + activation: + Activation function (callable). Set it to None to maintain a linear activation. + weight_initializer: + Initializer function for the weight matrix. + bias_initializer: + Initializer function for the bias. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer: + Regularizer function for the output. + weight_constraint: + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weight: - Weight matrix (TensorFlow variable or tensor). (weight) - bias: - Bias vector, if applicable (TensorFlow variable or tensor). - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. + Properties: + output_size: + Python integer, dimensionality of the output space. + activation: + Activation function (callable). + weight_initializer: + Initializer instance (or name) for the weight matrix. + bias_initializer: + Initializer instance (or name) for the bias. + weight: + Weight matrix (TensorFlow variable or tensor). (weight) + bias: + Bias vector, if applicable (TensorFlow variable or tensor). + weight_regularizer: + Regularizer instance for the weight matrix (callable) + bias_regularizer: + Regularizer instance for the bias (callable). 
+ activity_regularizer: + Regularizer instance for the output (callable) + weight_constraint: + Constraint function for the weight matrix. + bias_constraint: + Constraint function for the bias. - """ + """ - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=None, - **kwargs): - super(FullDense, self).__init__(units=output_size, - kernel_initializer=weight_initializer, - kernel_regularizer=weight_regularizer, - kernel_constraint=weight_constraint, - **kwargs) - self._num_partitions = num_partitions + def __init__( + self, + output_size, + weight_initializer=None, + weight_regularizer=None, + weight_constraint=None, + bias_constraint=None, + num_partitions=None, + **kwargs + ): + super(FullDense, self).__init__( + units=output_size, + kernel_initializer=weight_initializer, + kernel_regularizer=weight_regularizer, + kernel_constraint=weight_constraint, + **kwargs + ) + self._num_partitions = num_partitions - def build(self, input_shape): - ''' - code adapted from TF 1.12 Keras Dense layer: - https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L930-L956 - ''' - input_shape = tensor_shape.TensorShape(input_shape) - if input_shape[-1] is None: - raise ValueError('The last dimension of the inputs to `Dense` ' - 'should be defined. Found `None`.') - self.input_spec = InputSpec(min_ndim=2, - axes={-1: input_shape[-1]}) + def build(self, input_shape): + """ + code adapted from TF 1.12 Keras Dense layer: + https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L930-L956 + """ + input_shape = tensor_shape.TensorShape(input_shape) + if input_shape[-1] is None: + raise ValueError( + "The last dimension of the inputs to `Dense` " + "should be defined. Found `None`." 
+ ) + self.input_spec = InputSpec(min_ndim=2, axes={-1: input_shape[-1]}) - partitioner = None - if self._num_partitions: - partitioner = tf.fixed_size_partitioner(self._num_partitions) + partitioner = None + if self._num_partitions: + partitioner = tf.fixed_size_partitioner(self._num_partitions) - self.kernel = self.add_weight( - 'kernel', - shape=[input_shape[-1], self.units], - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - dtype=self.dtype, - partitioner=partitioner, - trainable=True) + self.kernel = self.add_weight( + "kernel", + shape=[input_shape[-1], self.units], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + dtype=self.dtype, + partitioner=partitioner, + trainable=True, + ) - if self.use_bias: - self.bias = self.add_weight( - 'bias', - shape=[self.units, ], - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - dtype=self.dtype, - trainable=True) - else: - self.bias = None - self.built = True + if self.use_bias: + self.bias = self.add_weight( + "bias", + shape=[ + self.units, + ], + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + dtype=self.dtype, + trainable=True, + ) + else: + self.bias = None + self.built = True - @property - def output_size(self): - """ - Returns output_size - """ - return self.units + @property + def output_size(self): + """ + Returns output_size + """ + return self.units - @property - def weight(self): - """ - Returns weight - """ - return self.kernel + @property + def weight(self): + """ + Returns weight + """ + return self.kernel - @property - def weight_regularizer(self): - """ - Returns weight_regularizer - """ - return self.kernel_regularizer + @property + def weight_regularizer(self): + """ + Returns weight_regularizer + """ + return self.kernel_regularizer - @property - def weight_initializer(self): - """ - Returns weight_initializer - """ - return self.kernel_initializer + @property + def weight_initializer(self): + """ + Returns weight_initializer + """ + return self.kernel_initializer - @property - def weight_constraint(self): - """ - Returns weight_constraint - """ - return self.kernel_constraint + @property + def weight_constraint(self): + """ + Returns weight_constraint + """ + return self.kernel_constraint -def full_dense(inputs, output_size, - activation=None, - use_bias=True, - weight_initializer=None, - bias_initializer=init_ops.zeros_initializer(), - weight_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - weight_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - num_partitions=None, - reuse=None): - """Functional interface for the densely-connected layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). 
+def full_dense( + inputs, + output_size, + activation=None, + use_bias=True, + weight_initializer=None, + bias_initializer=init_ops.zeros_initializer(), + weight_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + weight_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + num_partitions=None, + reuse=None, +): + """Functional interface for the densely-connected layer. + This layer implements the operation: + `outputs = activation(inputs.weight + bias)` + Where `activation` is the activation function passed as the `activation` + argument (if not `None`), `weight` is a weights matrix created by the layer, + and `bias` is a bias vector created by the layer + (only if `use_bias` is `True`). - Arguments: - inputs: Tensor input. - units: Integer or Long, dimensionality of the output space. - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. + Arguments: + inputs: Tensor input. + units: Integer or Long, dimensionality of the output space. + activation: Activation function (callable). Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + weight_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.get_variable`. + bias_initializer: + Initializer function for the bias. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer: + Regularizer function for the output. + weight_constraint: + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). 
The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: + Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: + String, the name of the layer. + reuse: + Boolean, whether to reuse the weights of a previous layer + by the same name. - Returns: - Output tensor the same shape as `inputs` except the last dimension is of - size `units`. + Returns: + Output tensor the same shape as `inputs` except the last dimension is of + size `units`. - Raises: - ValueError: if eager execution is enabled. - """ - layer = FullDense(output_size, - activation=activation, - use_bias=use_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - dtype=inputs.dtype.base_dtype, - num_partitions=num_partitions, - _scope=name, - _reuse=reuse) - return layer.apply(inputs) + Raises: + ValueError: if eager execution is enabled. + """ + layer = FullDense( + output_size, + activation=activation, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + weight_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + weight_constraint=weight_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + dtype=inputs.dtype.base_dtype, + num_partitions=num_partitions, + _scope=name, + _reuse=reuse, + ) + return layer.apply(inputs) diff --git a/twml/twml/layers/full_sparse.py b/twml/twml/layers/full_sparse.py index 4f0f21930..447350c1e 100644 --- a/twml/twml/layers/full_sparse.py +++ b/twml/twml/layers/full_sparse.py @@ -5,366 +5,406 @@ import math +import tensorflow.compat.v1 as tf from twitter.deepbird.sparse import sparse_dense_matmul -from .layer import Layer - -import tensorflow.compat.v1 as tf import twml +from .layer import Layer -class FullSparse(Layer): - """Fully-sparse layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Arguments: - output_size: - Long or Integer, dimensionality of the output space. - input_size: - The number of input units. (Deprecated) - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. 
- This argument defaults to tf.constant_initializer(1/output_size) - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - This parameter can also be a list of binary values if `inputs` passed to `call` a list. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - use_binary_sparse_dense_matmul: - If binary sparse dense matmul op is to be used. It will only be enabled if - `use_binary_values` is set true. It only should be used for inference, best practice is - to set `use_binary_sparse_dense_matmul = not is_training`. - """ - - def __init__(self, - output_size, - input_size=None, - weight_initializer=None, - activation=None, - bias_initializer=None, - trainable=True, - name=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - bias_regularizer=None, - weight_regularizer=None, - use_compression=False, - use_binary_sparse_dense_matmul=False, - **kwargs): - super(FullSparse, self).__init__(trainable=trainable, name=name, **kwargs) - # TODO - remove input_size warning. - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - # The bias initialization and weights initialization is set to match v1's implementation. - if bias_initializer is None: - bias_initializer = tf.constant_initializer(1 / output_size) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. 
- if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.bias_initializer = bias_initializer - self.output_size = output_size - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.num_partitions = num_partitions - if partition_axis != 0 and partition_axis != 1: - raise ValueError('partition_axis must be 0 or 1') - self.partition_axis = partition_axis - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.bias_regularizer = bias_regularizer - self._use_compression = use_compression - self._cast_indices_dtype = tf.int32 if self._use_compression else None - self.use_binary_sparse_dense_matmul = use_binary_sparse_dense_matmul - - def _make_weight_var(self, shape, partitioner): - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - partitioner=partitioner, - ) - - def build(self, input_shapes): - """ - creates the ``bias`` and ``weight`` Variables - of shape ``[output_size]`` and ``[input_size, output_size]`` respectively. - """ - - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." % input_shapes) - else: - input_shape = input_shapes - - self.bias = self.add_variable( - 'bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - shape=[self.output_size, ], - dtype=self.dtype, - trainable=True - ) - partitioner = None - shape = [input_shape[1], self.output_size] - - # There is a 2gb limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - num_partitions = 1 if self.num_partitions is None else self.num_partitions - in_shape = input_shape[1] - out_shape = self.output_size - - # when v2 behavior is disabled, in_shape is tf.Dimension. otherwise it is int. - if isinstance(in_shape, tf.Dimension): - in_shape = in_shape.value - - if in_shape is None: - raise ValueError("Input tensor should have shape." - " You can set it using twml.util.limit_sparse_tensor_size") - - (split_dim, other_dim) = (in_shape, out_shape) if self.partition_axis == 0 else (out_shape, in_shape) - requested_size = math.ceil(float(split_dim) / num_partitions) * other_dim * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor partitions cannot be larger than 2GB.\n" - "Requested Dimensions(%d, %d) of type %s (%d bytes total) over %d partitions.\n" - "Possible solutions:\n" - "- reduce the params.output_size_bits\n" - "- reduce the output_size of the sparse_layer\n" - "- specify a larger num_partitions argument\n" - "- reduce input_size_bits" % - (in_shape, self.output_size, dtype.name, requested_size, num_partitions)) - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self._make_weight_var(shape, partitioner) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. 
- - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. +class FullSparse(Layer): + """Fully-sparse layer class. + This layer implements the operation: - """ - raise NotImplementedError + .. code-block:: python - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. + outputs = activation(inputs.weight + bias) Arguments: - inputs: - A SparseTensor or a list of SparseTensors. - If `inputs` is a list, all tensors must have same `dense_shape`. - - Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + add_n([sp_a * dense_b for sp_a in inputs])`. - + output_size: + Long or Integer, dimensionality of the output space. + input_size: + The number of input units. (Deprecated) + weight_initializer: + Initializer function for the weight matrix. + This argument defaults to zeros_initializer(). + This is valid when the FullSparse is the first layer of + parameters but should be changed otherwise. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect + activation: + Activation function (callable). Set it to None to maintain a linear activation. + bias_initializer: + Initializer function for the bias. + This argument defaults to tf.constant_initializer(1/output_size) + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial + speed up at training time when input_size is large and optimizer handles sparse gradients + correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended + to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will + be large, so it's better to set it to `True` + num_partitions: + Number of partitions to use for the weight variable. Defaults to 1. + partition_axis: + If num_partitions is specified, the partition axis for the weight variable + Defaults to 0 (partition by row). + Must be 0 (row) or 1 (column) + use_binary_values: + Assume all non zero values are 1. Defaults to False. + This can improve training if used in conjunction with MDL. + This parameter can also be a list of binary values if `inputs` passed to `call` a list. + use_compression: + Default False. Set True to enable data compression techniques for + optimization of network traffic for distributed training. + use_binary_sparse_dense_matmul: + If binary sparse dense matmul op is to be used. It will only be enabled if + `use_binary_values` is set true. It only should be used for inference, best practice is + to set `use_binary_sparse_dense_matmul = not is_training`. 
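+
+    Example (a minimal sketch; ``sp_input`` stands for a hypothetical
+    ``tf.SparseTensor`` of shape ``[batch_size, input_size]``):
+
+    .. code-block:: python
+
+        layer = FullSparse(output_size=128, activation=tf.nn.relu)
+        outputs = layer(sp_input)  # dense Tensor of shape [batch_size, 128]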
""" - if isinstance(inputs, (list, tuple)): - - if isinstance(self.use_binary_values, (list, tuple)): - use_binary_values = self.use_binary_values - else: - use_binary_values = [self.use_binary_values] * len(inputs) - - num_inputs = len(inputs) - if num_inputs != len(use_binary_values): - raise ValueError("#inputs is %d while #use_binary_values is %d" - % (num_inputs, len(use_binary_values))) - - outputs = [] - for n in range(num_inputs): - outputs.append(sparse_dense_matmul(inputs[n], self.weight, - self.use_sparse_grads, - use_binary_values[n], - name='sparse_mm_' + str(n), - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul)) - outputs = tf.accumulate_n(outputs) - else: - - if isinstance(self.use_binary_values, (list, tuple)): - raise ValueError("use_binary_values can not be %s when inputs is %s" % - (type(self.use_binary_values), type(inputs))) - - outputs = sparse_dense_matmul(inputs, self.weight, - self.use_sparse_grads, - self.use_binary_values, - name='sparse_mm', - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul) - - if self.bias is not None: - outputs = tf.nn.bias_add(outputs, self.bias) - - if self.activation is not None: - return self.activation(outputs) # pylint: disable=not-callable - return outputs - -def full_sparse( - inputs, output_size, + def __init__( + self, + output_size, input_size=None, + weight_initializer=None, activation=None, - bias_regularizer=None, - weight_regularizer=None, bias_initializer=None, - weight_initializer=None, trainable=True, name=None, - reuse=None, use_sparse_grads=True, num_partitions=None, partition_axis=0, use_binary_values=False, - use_compression=False): - """Functional interface for the sparsely-connected layer. - - Arguments: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - output_size: - Long or Integer, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). 
If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - Returns: - Outputs a ``tf.Tensor`` of size ``[batch_size x output_size]``. - """ - # TODO - remove input_size warning. - if input_size: - raise ValueError('input_size is deprecated - it is now \ - automatically inferred from your input.') - - dtype = None - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - dtype = inputs.dtype.base_dtype - - if isinstance(inputs, (list, tuple)): - inputs = [inp.to_tf() if isinstance(inp, twml.SparseTensor) else inp for inp in inputs] - dtype = inputs[0].dtype.base_dtype - - layer = FullSparse(output_size=output_size, - activation=activation, - trainable=trainable, - name=name, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - dtype=dtype, - _scope=name, - _reuse=reuse, - use_sparse_grads=use_sparse_grads, - num_partitions=num_partitions, - partition_axis=partition_axis, - use_compression=use_compression, - use_binary_values=use_binary_values) - return layer(inputs) + bias_regularizer=None, + weight_regularizer=None, + use_compression=False, + use_binary_sparse_dense_matmul=False, + **kwargs + ): + super(FullSparse, self).__init__(trainable=trainable, name=name, **kwargs) + # TODO - remove input_size warning. + if input_size: + raise ValueError( + "input_size is deprecated - it is now automatically \ + inferred from your input." + ) + + # The bias initialization and weights initialization is set to match v1's implementation. + if bias_initializer is None: + bias_initializer = tf.constant_initializer(1 / output_size) + # Weights initialization is set to 0s. This is safe for full sparse layers because + # you are supposed to learn your embedding from the label. 
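+        # (Unlike a hidden dense layer, zero-initializing here causes no
+        # symmetry problem: each sparse input feature addresses its own rows
+        # of the weight matrix, so gradients already differ per feature.)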
+ if weight_initializer is None: + weight_initializer = tf.zeros_initializer() + self.weight_initializer = weight_initializer + self.bias_initializer = bias_initializer + self.output_size = output_size + self.activation = activation + self.use_sparse_grads = use_sparse_grads + self.num_partitions = num_partitions + if partition_axis != 0 and partition_axis != 1: + raise ValueError("partition_axis must be 0 or 1") + self.partition_axis = partition_axis + self.use_binary_values = use_binary_values + self.weight_regularizer = weight_regularizer + self.bias_regularizer = bias_regularizer + self._use_compression = use_compression + self._cast_indices_dtype = tf.int32 if self._use_compression else None + self.use_binary_sparse_dense_matmul = use_binary_sparse_dense_matmul + + def _make_weight_var(self, shape, partitioner): + self.weight = self.add_variable( + "weight", + initializer=self.weight_initializer, + regularizer=self.weight_regularizer, + shape=shape, + dtype=self.dtype, + trainable=True, + partitioner=partitioner, + ) + + def build(self, input_shapes): + """ + creates the ``bias`` and ``weight`` Variables + of shape ``[output_size]`` and ``[input_size, output_size]`` respectively. + """ + + if isinstance(input_shapes, (list, tuple)): + input_shape = input_shapes[0] + is_compatible = True + for other_shape in input_shapes[1:]: + is_compatible &= input_shape.is_compatible_with(other_shape) + if not is_compatible: + raise ValueError("Input shapes %s are not compatible." % input_shapes) + else: + input_shape = input_shapes + + self.bias = self.add_variable( + "bias", + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + shape=[ + self.output_size, + ], + dtype=self.dtype, + trainable=True, + ) + + partitioner = None + shape = [input_shape[1], self.output_size] + + # There is a 2gb limitation for each tensor because of protobuf. + # 2**30 is 1GB. 2 * (2**30) is 2GB. + dtype = tf.as_dtype(self.dtype) + num_partitions = 1 if self.num_partitions is None else self.num_partitions + in_shape = input_shape[1] + out_shape = self.output_size + + # when v2 behavior is disabled, in_shape is tf.Dimension. otherwise it is int. + if isinstance(in_shape, tf.Dimension): + in_shape = in_shape.value + + if in_shape is None: + raise ValueError( + "Input tensor should have shape." 
+ " You can set it using twml.util.limit_sparse_tensor_size" + ) + + (split_dim, other_dim) = ( + (in_shape, out_shape) if self.partition_axis == 0 else (out_shape, in_shape) + ) + requested_size = ( + math.ceil(float(split_dim) / num_partitions) * other_dim * dtype.size + ) + if requested_size >= 2**31: + raise ValueError( + "Weight tensor partitions cannot be larger than 2GB.\n" + "Requested Dimensions(%d, %d) of type %s (%d bytes total) over %d partitions.\n" + "Possible solutions:\n" + "- reduce the params.output_size_bits\n" + "- reduce the output_size of the sparse_layer\n" + "- specify a larger num_partitions argument\n" + "- reduce input_size_bits" + % ( + in_shape, + self.output_size, + dtype.name, + requested_size, + num_partitions, + ) + ) + + if self.num_partitions: + partition_axis = int(self.partition_axis) + partitioner = tf.fixed_size_partitioner( + self.num_partitions, axis=partition_axis + ) + else: + # Regular variables do not like it when you pass both constant tensors and shape + if not callable(self.weight_initializer): + shape = None + + self._make_weight_var(shape, partitioner) + + self.built = True + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError + + def call(self, inputs, **kwargs): # pylint: disable=unused-argument + """The logic of the layer lives here. + + Arguments: + inputs: + A SparseTensor or a list of SparseTensors. + If `inputs` is a list, all tensors must have same `dense_shape`. + + Returns: + - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. + - If `inputs` is a `list[SparseTensor`, then returns + `bias + add_n([sp_a * dense_b for sp_a in inputs])`. 
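+
+        For example (a sketch; ``sp_a`` and ``sp_b`` are hypothetical
+        SparseTensors with the same ``dense_shape``):
+
+        .. code-block:: python
+
+            out = layer([sp_a, sp_b])
+            # equivalent to bias + sp_a * weight + sp_b * weight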
+ + """ + if isinstance(inputs, (list, tuple)): + if isinstance(self.use_binary_values, (list, tuple)): + use_binary_values = self.use_binary_values + else: + use_binary_values = [self.use_binary_values] * len(inputs) + + num_inputs = len(inputs) + if num_inputs != len(use_binary_values): + raise ValueError( + "#inputs is %d while #use_binary_values is %d" + % (num_inputs, len(use_binary_values)) + ) + + outputs = [] + for n in range(num_inputs): + outputs.append( + sparse_dense_matmul( + inputs[n], + self.weight, + self.use_sparse_grads, + use_binary_values[n], + name="sparse_mm_" + str(n), + partition_axis=self.partition_axis, + num_partitions=self.num_partitions, + compress_ids=self._use_compression, + cast_indices_dtype=self._cast_indices_dtype, + use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul, + ) + ) + outputs = tf.accumulate_n(outputs) + else: + if isinstance(self.use_binary_values, (list, tuple)): + raise ValueError( + "use_binary_values can not be %s when inputs is %s" + % (type(self.use_binary_values), type(inputs)) + ) + + outputs = sparse_dense_matmul( + inputs, + self.weight, + self.use_sparse_grads, + self.use_binary_values, + name="sparse_mm", + partition_axis=self.partition_axis, + num_partitions=self.num_partitions, + compress_ids=self._use_compression, + cast_indices_dtype=self._cast_indices_dtype, + use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul, + ) + + if self.bias is not None: + outputs = tf.nn.bias_add(outputs, self.bias) + + if self.activation is not None: + return self.activation(outputs) # pylint: disable=not-callable + return outputs + + +def full_sparse( + inputs, + output_size, + input_size=None, + activation=None, + bias_regularizer=None, + weight_regularizer=None, + bias_initializer=None, + weight_initializer=None, + trainable=True, + name=None, + reuse=None, + use_sparse_grads=True, + num_partitions=None, + partition_axis=0, + use_binary_values=False, + use_compression=False, +): + """Functional interface for the sparsely-connected layer. + + Arguments: + inputs: + A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) + output_size: + Long or Integer, dimensionality of the output space. + weight_initializer: + Initializer function for the weight matrix. + activation: + Activation function (callable). Set it to None to maintain a linear activation. + bias_initializer: + Initializer function for the bias. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial + speed up at training time when input_size is large and optimizer handles sparse gradients + correctly (eg. with SGD or LazyAdamOptimizer). 
If weight matrix is small, it's recommended + to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will + be large, so it's better to set it to `True` + num_partitions: + Number of partitions to use for the weight variable. Defaults to 1. + partition_axis: + If num_partitions is specified, the partition axis for the weight variable + Defaults to 0 (partition by row). + Must be 0 (row) or 1 (column) + use_binary_values: + Assume all non zero values are 1. Defaults to False. + This can improve training if used in conjunction with MDL. + use_compression: + Default False. Set True to enable data compression techniques for + optimization of network traffic for distributed training. + Returns: + Outputs a ``tf.Tensor`` of size ``[batch_size x output_size]``. + """ + # TODO - remove input_size warning. + if input_size: + raise ValueError( + "input_size is deprecated - it is now \ + automatically inferred from your input." + ) + + dtype = None + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + dtype = inputs.dtype.base_dtype + + if isinstance(inputs, (list, tuple)): + inputs = [ + inp.to_tf() if isinstance(inp, twml.SparseTensor) else inp for inp in inputs + ] + dtype = inputs[0].dtype.base_dtype + + layer = FullSparse( + output_size=output_size, + activation=activation, + trainable=trainable, + name=name, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + weight_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + dtype=dtype, + _scope=name, + _reuse=reuse, + use_sparse_grads=use_sparse_grads, + num_partitions=num_partitions, + partition_axis=partition_axis, + use_compression=use_compression, + use_binary_values=use_binary_values, + ) + return layer(inputs) diff --git a/twml/twml/layers/isotonic.py b/twml/twml/layers/isotonic.py index 7113f7af4..867a9ea2f 100644 --- a/twml/twml/layers/isotonic.py +++ b/twml/twml/layers/isotonic.py @@ -3,74 +3,84 @@ Contains the Isotonic Layer """ -from .layer import Layer - import libtwml import numpy as np +from .layer import Layer + class Isotonic(Layer): - """ - This layer is created by the IsotonicCalibrator. - Typically it is used intead of sigmoid activation on the output unit. - - Arguments: - n_unit: - number of input units to the layer (same as number of output units). - n_bin: - number of bins used for isotonic calibration. - More bins means a more precise isotonic function. - Less bins means a more regularized isotonic function. - xs_input: - A tensor containing the boundaries of the bins. - ys_input: - A tensor containing calibrated values for the corresponding bins. - - Output: - output: - A layer containing calibrated probabilities with same shape and size as input. - Expected Sizes: - xs_input, ys_input: - [n_unit, n_bin]. - Expected Types: - xs_input, ys_input: - same as input. - """ - - def __init__(self, n_unit, n_bin, xs_input=None, ys_input=None, **kwargs): - super(Isotonic, self).__init__(**kwargs) - - self._n_unit = n_unit - self._n_bin = n_bin - - self.xs_input = np.empty([n_unit, n_bin], dtype=np.float32) if xs_input is None else xs_input - self.ys_input = np.empty([n_unit, n_bin], dtype=np.float32) if ys_input is None else ys_input - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. 
+ """ + This layer is created by the IsotonicCalibrator. + Typically it is used intead of sigmoid activation on the output unit. + Arguments: + n_unit: + number of input units to the layer (same as number of output units). + n_bin: + number of bins used for isotonic calibration. + More bins means a more precise isotonic function. + Less bins means a more regularized isotonic function. + xs_input: + A tensor containing the boundaries of the bins. + ys_input: + A tensor containing calibrated values for the corresponding bins. + + Output: + output: + A layer containing calibrated probabilities with same shape and size as input. + Expected Sizes: + xs_input, ys_input: + [n_unit, n_bin]. + Expected Types: + xs_input, ys_input: + same as input. """ - raise NotImplementedError - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the variables of the layer.""" + def __init__(self, n_unit, n_bin, xs_input=None, ys_input=None, **kwargs): + super(Isotonic, self).__init__(**kwargs) - self.built = True + self._n_unit = n_unit + self._n_bin = n_bin - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. + self.xs_input = ( + np.empty([n_unit, n_bin], dtype=np.float32) + if xs_input is None + else xs_input + ) + self.ys_input = ( + np.empty([n_unit, n_bin], dtype=np.float32) + if ys_input is None + else ys_input + ) - Arguments: - inputs: input tensor(s). + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. - Returns: - The output from the layer - """ - calibrate_op = libtwml.ops.isotonic_calibration(inputs, self.xs_input, self.ys_input) - return calibrate_op + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError + + def build(self, input_shape): # pylint: disable=unused-argument + """Creates the variables of the layer.""" + + self.built = True + + def call(self, inputs, **kwargs): # pylint: disable=unused-argument + """The logic of the layer lives here. + + Arguments: + inputs: input tensor(s). + + Returns: + The output from the layer + """ + calibrate_op = libtwml.ops.isotonic_calibration( + inputs, self.xs_input, self.ys_input + ) + return calibrate_op diff --git a/twml/twml/layers/layer.py b/twml/twml/layers/layer.py index c1b00eb13..fe994de88 100644 --- a/twml/twml/layers/layer.py +++ b/twml/twml/layers/layer.py @@ -7,44 +7,44 @@ class Layer(base.Layer): - """ - Base Layer implementation for twml. - Overloads `twml.layers.Layer - `_ - from tensorflow and adds a couple of custom methods. - """ - - @property - def init(self): """ - Return initializer ops. By default returns tf.no_op(). - This method is overwritten by classes like twml.layers.MDL, which - uses a HashTable internally, that must be initialized with its own op. + Base Layer implementation for twml. + Overloads `twml.layers.Layer + `_ + from tensorflow and adds a couple of custom methods. """ - return tf.no_op() - def call(self, inputs, **kwargs): - """The logic of the layer lives here. + @property + def init(self): + """ + Return initializer ops. By default returns tf.no_op(). + This method is overwritten by classes like twml.layers.MDL, which + uses a HashTable internally, that must be initialized with its own op. + """ + return tf.no_op() - Arguments: - inputs: - input tensor(s). - **kwargs: - additional keyword arguments. 
+ def call(self, inputs, **kwargs): + """The logic of the layer lives here. - Returns: - Output tensor(s). - """ - raise NotImplementedError + Arguments: + inputs: + input tensor(s). + **kwargs: + additional keyword arguments. - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + Returns: + Output tensor(s). + """ + raise NotImplementedError - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. - Raise NotImplementedError. + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). - """ - raise NotImplementedError + Raise NotImplementedError. + + """ + raise NotImplementedError diff --git a/twml/twml/layers/mdl.py b/twml/twml/layers/mdl.py index cf4018afa..5ba2c13bf 100644 --- a/twml/twml/layers/mdl.py +++ b/twml/twml/layers/mdl.py @@ -4,253 +4,278 @@ """ -from .layer import Layer -from .partition import Partition -from .stitch import Stitch - import libtwml import numpy as np import tensorflow.compat.v1 as tf + import twml +from .layer import Layer +from .partition import Partition +from .stitch import Stitch + class MDL(Layer): # noqa: T000 - """ - MDL layer is constructed by MDLCalibrator after accumulating data - and performing minimum description length (MDL) calibration. - - MDL takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an MDL bin. - Each MDL input feature is converted to n_bin bins. - Each MDL calibration tries to find bin delimiters such that the number of features values - per bin is roughly equal (for each given MDL feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, **kwargs): - """ - Creates a non-initialized `MDL` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during MDL calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of MDL bins used for MDL calibration. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that MDL discretizes and knows about. - The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: MDL vs non-MDL - 2. transate the MDL features into a hash_feature ID that MDL understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for MDL. 
- bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the MDL features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). - """ - super(MDL, self).__init__(**kwargs) - tf.logging.warning("MDL will be deprecated. Please use PercentileDiscretizer instead") - - max_mdl_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - self._hash_keys_initializer = tf.constant_initializer( - hash_keys if hash_keys is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._hash_values_initializer = tf.constant_initializer( - hash_values if hash_values is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_ids_initializer = tf.constant_initializer( - bin_ids if bin_ids is not None - else np.empty(max_mdl_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_values_initializer = tf.constant_initializer( - bin_values if bin_values is not None - else np.empty(max_mdl_feature, dtype=np.float32), - dtype=np.float32 - ) - self._feature_offsets_initializer = tf.constant_initializer( - feature_offsets if feature_offsets is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - - # note that calling build here is an exception as typically __call__ would call build(). - # We call it here because we need to initialize hash_map. - # Also note that the variable_scope is set by add_variable in build() - if not self.built: - self.build(input_shape=None) - - self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer: - hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. 
""" - - # build layers - self.partition = Partition() - self.stitch = Stitch() - - # build variables - - hash_keys = self.add_variable( - 'hash_keys', - initializer=self._hash_keys_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - hash_values = self.add_variable( - 'hash_values', - initializer=self._hash_values_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # hashmap converts known features into range [0, n_feature) - initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) - self.hash_map = tf.lookup.StaticHashTable(initializer, -1) - - self.bin_ids = self.add_variable( - 'bin_ids', - initializer=self._bin_ids_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.int64, - trainable=False) - - self.bin_values = self.add_variable( - 'bin_values', - initializer=self._bin_values_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.float32, - trainable=False) - - self.feature_offsets = self.add_variable( - 'feature_offsets', - initializer=self._feature_offsets_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # make sure this is last - self.built = True - - def call(self, inputs, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements MDL inference where inputs are intersected with a hash_map. - Part of the inputs are discretized using twml.mdl to produce a mdl_output SparseTensor. - This SparseTensor is then joined with the original inputs SparseTensor, - but only for the inputs keys that did not get discretized. - - Args: - inputs: A 2D SparseTensor that is input to MDL for discretization. - It has a dense_shape of [batch_size, input_size] - name: A name for the operation (optional). - Returns: - A `SparseTensor` of the same type as `inputs`. - Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. + MDL layer is constructed by MDLCalibrator after accumulating data + and performing minimum description length (MDL) calibration. + + MDL takes sparse continuous features and converts then to sparse + binary features. Each binary output feature is associated to an MDL bin. + Each MDL input feature is converted to n_bin bins. + Each MDL calibration tries to find bin delimiters such that the number of features values + per bin is roughly equal (for each given MDL feature). + Note that if an input feature is rarely used, so will its associated output bin/features. 
""" - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - # get intersect(keys, hash_map) - hashed_keys = self.hash_map.lookup(keys) - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_mdl_keys, mdl_in_keys = key - non_mdl_vals, mdl_in_vals = vals - - self.non_mdl_keys = non_mdl_keys - - # run MDL on the keys/values it knows about - mdl_keys, mdl_vals = libtwml.ops.mdl(mdl_in_keys, mdl_in_vals, self.bin_ids, self.bin_values, - self.feature_offsets) - - # handle output ID conflicts - mdl_size = tf.size(self.bin_ids, out_type=tf.int64) - non_mdl_size = tf.subtract(self.output_size, mdl_size) - non_mdl_keys = tf.add(tf.floormod(non_mdl_keys, non_mdl_size), mdl_size) - - # Stitch the keys and values from mdl and non mdl indices back, with help - # of the Stitch Layer - - # out for inference checking - self.mdl_out_keys = mdl_keys - - concat_data = self.stitch([non_mdl_vals, mdl_vals], - [non_mdl_keys, mdl_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError + def __init__( + self, + n_feature, + n_bin, + out_bits, + bin_values=None, + hash_keys=None, + hash_values=None, + bin_ids=None, + feature_offsets=None, + **kwargs + ): + """ + Creates a non-initialized `MDL` object. + Before using the table you will have to initialize it. After initialization + the table will be immutable. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + n_feature: + number of unique features accumulated during MDL calibration. + This is the number of features in the hash map. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + n_bin: + number of MDL bins used for MDL calibration. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + out_bits: + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + hash_keys: + contains the features ID that MDL discretizes and knows about. + The hash map (hash_keys->hash_values) is used for two reasons: + 1. divide inputs into two feature spaces: MDL vs non-MDL + 2. transate the MDL features into a hash_feature ID that MDL understands. + The hash_map is expected to contain n_feature items. + hash_values: + translates the feature IDs into hash_feature IDs for MDL. + bin_ids: + a 1D Tensor of size n_feature * n_bin + 1 which contains + unique IDs to which the MDL features will be translated to. 
+                For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
+                the most efficient output space.
+            bin_values:
+                a 1D Tensor aligned with bin_ids.
+                For a given hash_feature ID j, its value bins are indexed between
+                `j*n_bin` and `j*n_bin + n_bin-1`.
+                As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
+                and an input value between
+                `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
+            feature_offsets:
+                a 1D Tensor specifying the starting location of bins for a given feature id.
+                For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
+        """
+        super(MDL, self).__init__(**kwargs)
+        tf.logging.warning(
+            "MDL will be deprecated. Please use PercentileDiscretizer instead"
+        )
+
+        max_mdl_feature = n_feature * (n_bin + 1)
+        self._n_feature = n_feature
+        self._n_bin = n_bin
+
+        self._hash_keys_initializer = tf.constant_initializer(
+            hash_keys if hash_keys is not None else np.empty(n_feature, dtype=np.int64),
+            dtype=np.int64,
+        )
+        self._hash_values_initializer = tf.constant_initializer(
+            hash_values
+            if hash_values is not None
+            else np.empty(n_feature, dtype=np.int64),
+            dtype=np.int64,
+        )
+        self._bin_ids_initializer = tf.constant_initializer(
+            bin_ids
+            if bin_ids is not None
+            else np.empty(max_mdl_feature, dtype=np.int64),
+            dtype=np.int64,
+        )
+        self._bin_values_initializer = tf.constant_initializer(
+            bin_values
+            if bin_values is not None
+            else np.empty(max_mdl_feature, dtype=np.float32),
+            dtype=np.float32,
+        )
+        self._feature_offsets_initializer = tf.constant_initializer(
+            feature_offsets
+            if feature_offsets is not None
+            else np.empty(n_feature, dtype=np.int64),
+            dtype=np.int64,
+        )
+
+        # note that calling build here is an exception as typically __call__ would call build().
+        # We call it here because we need to initialize hash_map.
+        # Also note that the variable_scope is set by add_variable in build()
+        if not self.built:
+            self.build(input_shape=None)
+
+        self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
+
+    def build(self, input_shape):  # pylint: disable=unused-argument
+        """
+        Creates the variables of the layer:
+        hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size.
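+
+        For reference, the shapes created below are: hash_keys, hash_values
+        and feature_offsets are [n_feature]; bin_ids and bin_values are
+        [n_feature * (n_bin + 1)].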
+ """ + + # build layers + self.partition = Partition() + self.stitch = Stitch() + + # build variables + + hash_keys = self.add_variable( + "hash_keys", + initializer=self._hash_keys_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + hash_values = self.add_variable( + "hash_values", + initializer=self._hash_values_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + # hashmap converts known features into range [0, n_feature) + initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) + self.hash_map = tf.lookup.StaticHashTable(initializer, -1) + + self.bin_ids = self.add_variable( + "bin_ids", + initializer=self._bin_ids_initializer, + shape=[self._n_feature * (self._n_bin + 1)], + dtype=tf.int64, + trainable=False, + ) + + self.bin_values = self.add_variable( + "bin_values", + initializer=self._bin_values_initializer, + shape=[self._n_feature * (self._n_bin + 1)], + dtype=tf.float32, + trainable=False, + ) + + self.feature_offsets = self.add_variable( + "feature_offsets", + initializer=self._feature_offsets_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + # make sure this is last + self.built = True + + def call(self, inputs, **kwargs): + """Looks up `keys` in a table, outputs the corresponding values. + + Implements MDL inference where inputs are intersected with a hash_map. + Part of the inputs are discretized using twml.mdl to produce a mdl_output SparseTensor. + This SparseTensor is then joined with the original inputs SparseTensor, + but only for the inputs keys that did not get discretized. + + Args: + inputs: A 2D SparseTensor that is input to MDL for discretization. + It has a dense_shape of [batch_size, input_size] + name: A name for the operation (optional). + Returns: + A `SparseTensor` of the same type as `inputs`. + Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
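+
+        Example:
+            Illustrative shapes only (assumes a layer `mdl_layer` built with
+            out_bits=22 and a batch of 128 rows):
+
+            .. code-block:: python
+
+                output = mdl_layer(sparse_input)
+                # output is a tf.SparseTensor; its dense_shape
+                # evaluates to [128, 1 << 22]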
+ """ + if isinstance(inputs, tf.SparseTensor): + inputs = twml.SparseTensor.from_tf(inputs) + + assert isinstance(inputs, twml.SparseTensor) + + # sparse column indices + ids = inputs.ids + # sparse row indices + keys = inputs.indices + # sparse values + vals = inputs.values + + # get intersect(keys, hash_map) + hashed_keys = self.hash_map.lookup(keys) + + found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) + partition_ids = tf.cast(found, tf.int32) + + vals, key, indices = self.partition( + partition_ids, vals, tf.where(found, hashed_keys, keys) + ) + non_mdl_keys, mdl_in_keys = key + non_mdl_vals, mdl_in_vals = vals + + self.non_mdl_keys = non_mdl_keys + + # run MDL on the keys/values it knows about + mdl_keys, mdl_vals = libtwml.ops.mdl( + mdl_in_keys, + mdl_in_vals, + self.bin_ids, + self.bin_values, + self.feature_offsets, + ) + + # handle output ID conflicts + mdl_size = tf.size(self.bin_ids, out_type=tf.int64) + non_mdl_size = tf.subtract(self.output_size, mdl_size) + non_mdl_keys = tf.add(tf.floormod(non_mdl_keys, non_mdl_size), mdl_size) + + # Stitch the keys and values from mdl and non mdl indices back, with help + # of the Stitch Layer + + # out for inference checking + self.mdl_out_keys = mdl_keys + + concat_data = self.stitch( + [non_mdl_vals, mdl_vals], [non_mdl_keys, mdl_keys], indices + ) + + concat_vals, concat_keys = concat_data + + # Generate output shape using _compute_output_shape + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_shape = [batch_size, self.output_size] + return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError diff --git a/twml/twml/layers/partition.py b/twml/twml/layers/partition.py index 0e7c85f18..5c93d190b 100644 --- a/twml/twml/layers/partition.py +++ b/twml/twml/layers/partition.py @@ -3,72 +3,77 @@ """ -from .layer import Layer - import tensorflow.compat.v1 as tf +from .layer import Layer -class Partition(Layer): - """ - This layer implements: - - .. code-block:: python - - tf.dynamic_partition(input_vals, partition_ids, self.partitions) - - Input: - partitions: - the number of partitions which we will divide the hashmap keys/bvalues - - Output: - A layer that performs partitioning - """ - def __init__(self, partitions=2, **kwargs): - self.partitions = partitions - super(Partition, self).__init__(**kwargs) +class Partition(Layer): + """ + This layer implements: - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + .. code-block:: python - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + tf.dynamic_partition(input_vals, partition_ids, self.partitions) - Raises NotImplementedError. + Input: + partitions: + the number of partitions which we will divide the hashmap keys/bvalues + Output: + A layer that performs partitioning """ - raise NotImplementedError - - def call(self, partition_ids, input_vals, input_keys, **kwargs): - """This layer is responsible for partitioning the values/keys of a hashmap - - Arguments: - partition_ids: - Tensor that is equivalent to boolean (int32). 
- input_vals: - Tensor that represents the values of the hashmap(float). - input_keys: - Tensor that represents the keys of the hashmap(float) - - Returns: - The output of the partition layer, which is a list of lists which looks - something like: - - .. code-block:: python - - [[vals_0, vals_1], [keys_0, keys_1], [indices_0, indices_1]] - - where: - vals_x: - values of the hashmap for partition x - keys_x: - keys of the hashmap for partition x - indices_x: - indices of the hashmap for partition x - """ - partioned_val = tf.dynamic_partition(input_vals, partition_ids, self.partitions) - partioned_keys = tf.dynamic_partition(input_keys, partition_ids, self.partitions) - partioned_indices = tf.dynamic_partition(tf.range(tf.shape(partition_ids)[0]), - tf.cast(partition_ids, tf.int32), self.partitions) - return [partioned_val, partioned_keys, partioned_indices] + + def __init__(self, partitions=2, **kwargs): + self.partitions = partitions + super(Partition, self).__init__(**kwargs) + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError + + def call(self, partition_ids, input_vals, input_keys, **kwargs): + """This layer is responsible for partitioning the values/keys of a hashmap + + Arguments: + partition_ids: + Tensor that is equivalent to boolean (int32). + input_vals: + Tensor that represents the values of the hashmap(float). + input_keys: + Tensor that represents the keys of the hashmap(float) + + Returns: + The output of the partition layer, which is a list of lists which looks + something like: + + .. code-block:: python + + [[vals_0, vals_1], [keys_0, keys_1], [indices_0, indices_1]] + + where: + vals_x: + values of the hashmap for partition x + keys_x: + keys of the hashmap for partition x + indices_x: + indices of the hashmap for partition x + """ + partioned_val = tf.dynamic_partition(input_vals, partition_ids, self.partitions) + partioned_keys = tf.dynamic_partition( + input_keys, partition_ids, self.partitions + ) + partioned_indices = tf.dynamic_partition( + tf.range(tf.shape(partition_ids)[0]), + tf.cast(partition_ids, tf.int32), + self.partitions, + ) + return [partioned_val, partioned_keys, partioned_indices] diff --git a/twml/twml/layers/percentile_discretizer.py b/twml/twml/layers/percentile_discretizer.py index 55bb4de8c..edc6846f4 100644 --- a/twml/twml/layers/percentile_discretizer.py +++ b/twml/twml/layers/percentile_discretizer.py @@ -7,203 +7,235 @@ import libtwml import numpy as np import tensorflow.compat.v1 as tf + import twml from twml.layers import Layer class PercentileDiscretizer(Layer): - """ - PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator after - accumulating data and performing percentile bucket calibration. - - PercentileDiscretizer takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an PercentileDiscretizer bin. - Each PercentileDiscretizer input feature is converted to n_bin bins. - Each PercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values per bin is roughly equal (for - each given PercentileDiscretizer feature). In other words, bins are calibrated to be approx. - equiprobable, according to the given calibration data. 
-    Note that if an input feature is rarely used, so will its associated output bin/features.
-  """
-
-  def __init__(
-      self,
-      n_feature, n_bin, out_bits,
-      bin_values=None, hash_keys=None, hash_values=None,
-      bin_ids=None, feature_offsets=None, num_parts=1, cost_per_unit=100, **kwargs):
     """
-    Creates a non-initialized `PercentileDiscretizer` object.
-    Before using the table you will have to initialize it. After initialization
-    the table will be immutable.
-
-    If there are no calibrated features, then the discretizer will only apply
-    twml.util.limit_bits to the the feature keys (aka "feature_ids"). Essentially,
-    the discretizer will be a "no-operation", other than obeying `out_bits`
-
-    Parent class args:
-      see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer)
-      for documentation of parent class arguments.
-
-    Required args:
-      n_feature:
-        number of unique features accumulated during PercentileDiscretizer calibration.
-        This is the number of features in the hash map.
-        Used to initialize bin_values, hash_keys, hash_values,
-        bin_ids, bin_values and feature_offsets.
-      n_bin:
-        number of PercentileDiscretizer bins used for PercentileDiscretizer calibration.
-        Used to initialize bin_values, hash_keys, hash_values,
-        bin_ids, bin_values and feature_offsets.
-      out_bits:
-        Determines the maximum value for output feature IDs.
-        The dense_shape of the SparseTensor returned by lookup(x)
-        will be [x.shape[0], 1 << output_bits].
-
-    Optional args:
-      hash_keys:
-        contains the features ID that PercentileDiscretizer discretizes and knows about.
-        The hash map (hash_keys->hash_values) is used for two reasons:
-        1. divide inputs into two feature spaces:
-           PercentileDiscretizer vs non-PercentileDiscretizer
-        2. transate the PercentileDiscretizer features into a hash_feature ID that
-           PercentileDiscretizer understands.
-        The hash_map is expected to contain n_feature items.
-      hash_values:
-        translates the feature IDs into hash_feature IDs for PercentileDiscretizer.
-      bin_ids:
-        a 1D Tensor of size n_feature * n_bin + 1 which contains
-        unique IDs to which the PercentileDiscretizer features will be translated to.
-        For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
-        the most efficient output space.
-      bin_values:
-        a 1D Tensor aligned with bin_ids.
-        For a given hash_feature ID j, it's value bin's are indexed between
-        `j*n_bin` and `j*n_bin + n_bin-1`.
-        As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
-        and a inputs value between
-        `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
-      feature_offsets:
-        a 1D Tensor specifying the starting location of bins for a given feature id.
-        For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
+    PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator after
+    accumulating data and performing percentile bucket calibration.
+
+    PercentileDiscretizer takes sparse continuous features and converts them to sparse
+    binary features. Each binary output feature is associated with a PercentileDiscretizer bin.
+    Each PercentileDiscretizer input feature is converted to n_bin bins.
+    Each PercentileDiscretizer calibration tries to find bin delimiters such
+    that the number of feature values per bin is roughly equal (for
+    each given PercentileDiscretizer feature). In other words, bins are calibrated to be approx.
+    equiprobable, according to the given calibration data.
+    Note that if an input feature is rarely used, its associated output bins/features
+    will be rarely used as well.
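+
+    Example:
+        A minimal, hypothetical sketch (the calibration tensors normally come
+        from PercentileDiscretizerCalibrator; the arguments and `sparse_input`
+        below are placeholders):
+
+        .. code-block:: python
+
+            import twml
+
+            discretizer = twml.layers.PercentileDiscretizer(
+                n_feature=100,  # unique features seen during calibration
+                n_bin=4,        # bins per calibrated feature
+                out_bits=22,    # output feature IDs are in [0, 1 << 22)
+            )
+            # sparse_input: twml.SparseTensor of dense_shape [batch_size, input_size]
+            output = discretizer(sparse_input)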
""" - super(PercentileDiscretizer, self).__init__(**kwargs) - - if not self.built: - self.build(input_shape=None) - - max_discretizer_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - # build variables - self._out_bits = out_bits - self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - self._hash_keys = (hash_keys if hash_keys is not None else - np.empty(n_feature, dtype=np.int64)) - self._hash_values = (hash_values if hash_values is not None else - np.empty(n_feature, dtype=np.int64)) - self._bin_ids = (bin_ids if bin_ids is not None else - np.empty(max_discretizer_feature, dtype=np.int64)) - self._bin_values = (bin_values if bin_values is not None else - np.empty(max_discretizer_feature, dtype=np.float32)) - self._feature_offsets = (feature_offsets if feature_offsets is not None else - np.empty(n_feature, dtype=np.int64)) - self.num_parts = num_parts - self.cost_per_unit = cost_per_unit - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer - """ - self.built = True - - def call(self, inputs, keep_inputs=False, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements PercentileDiscretizer inference where inputs are intersected with a hash_map. - Input features that were not calibrated have their feature IDs truncated, so as - to be less than 1< 0: - discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2( - input_ids=keys, # inc key assigned to feature_id, or -1 - input_vals=vals, # the observed feature values - bin_ids=self._bin_ids, # n_feat X (n_bin+1) 2D arange - bin_vals=self._bin_values, # bin boundaries - feature_offsets=self._feature_offsets, # 0 : nbin_1 : max_feat - output_bits=self._out_bits, - feature_ids=tf.make_tensor_proto(self._hash_keys), # feature ids to build internal hash map - feature_indices=tf.make_tensor_proto(self._hash_values), # keys associated w/ feat. indices - start_compute=tf.constant(0, shape=[], dtype=tf.int64), - end_compute=tf.constant(-1, shape=[], dtype=tf.int64), - cost_per_unit=self.cost_per_unit - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - # don't 2x the input. - keep_inputs = False - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self._output_size] - - output = twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() - - if keep_inputs: - # Note the non-discretized features will end up doubled, - # since these are already in `output` - # handle output ID conflicts - mdl_size = self._n_feature * (self._n_bin + 1) - non_mdl_size = tf.subtract(self._output_size, mdl_size) - input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size) - - new_input = twml.SparseTensor( - ids=ids, indices=input_keys, values=vals, dense_shape=output_shape).to_tf() - - # concatenate discretizer output with original input - sparse_add = tf.sparse_add(new_input, output) - output = tf.SparseTensor(sparse_add.indices, sparse_add.values, output_shape) - - return output - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. 
-
-    """
-    raise NotImplementedError
+    def __init__(
+        self,
+        n_feature,
+        n_bin,
+        out_bits,
+        bin_values=None,
+        hash_keys=None,
+        hash_values=None,
+        bin_ids=None,
+        feature_offsets=None,
+        num_parts=1,
+        cost_per_unit=100,
+        **kwargs
+    ):
+        """
+        Creates a non-initialized `PercentileDiscretizer` object.
+        Before using the table you will have to initialize it. After initialization
+        the table will be immutable.
+
+        If there are no calibrated features, then the discretizer will only apply
+        twml.util.limit_bits to the feature keys (aka "feature_ids"). Essentially,
+        the discretizer will be a "no-operation", other than obeying `out_bits`.
+
+        Parent class args:
+            see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer)
+            for documentation of parent class arguments.
+
+        Required args:
+            n_feature:
+                number of unique features accumulated during PercentileDiscretizer calibration.
+                This is the number of features in the hash map.
+                Used to initialize bin_values, hash_keys, hash_values,
+                bin_ids, bin_values and feature_offsets.
+            n_bin:
+                number of PercentileDiscretizer bins used for PercentileDiscretizer calibration.
+                Used to initialize bin_values, hash_keys, hash_values,
+                bin_ids, bin_values and feature_offsets.
+            out_bits:
+                Determines the maximum value for output feature IDs.
+                The dense_shape of the SparseTensor returned by lookup(x)
+                will be [x.shape[0], 1 << output_bits].
+
+        Optional args:
+            hash_keys:
+                contains the feature IDs that PercentileDiscretizer discretizes and knows about.
+                The hash map (hash_keys->hash_values) is used for two reasons:
+                1. divide inputs into two feature spaces:
+                   PercentileDiscretizer vs non-PercentileDiscretizer
+                2. translate the PercentileDiscretizer features into a hash_feature ID that
+                   PercentileDiscretizer understands.
+                The hash_map is expected to contain n_feature items.
+            hash_values:
+                translates the feature IDs into hash_feature IDs for PercentileDiscretizer.
+            bin_ids:
+                a 1D Tensor of size n_feature * (n_bin + 1) which contains
+                unique IDs to which the PercentileDiscretizer features will be translated.
+                For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
+                the most efficient output space.
+            bin_values:
+                a 1D Tensor aligned with bin_ids.
+                For a given hash_feature ID j, its value bins are indexed between
+                `j*n_bin` and `j*n_bin + n_bin-1`.
+                As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
+                and an input value between
+                `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
+            feature_offsets:
+                a 1D Tensor specifying the starting location of bins for a given feature id.
+                For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
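+
+        Example:
+            How bin_ids and feature_offsets line up for a small, hypothetical
+            calibration (values are illustrative only):
+
+            .. code-block:: python
+
+                import numpy as np
+
+                n_feature, n_bin = 2, 3
+                # one contiguous run of bins per calibrated feature
+                bin_ids = np.arange(n_feature * n_bin, dtype=np.int64)
+                # bins for hash_feature ID j start at feature_offsets[j]
+                feature_offsets = np.arange(
+                    0, n_feature * n_bin, n_bin, dtype=np.int64
+                )  # -> [0, 3]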
+        """
+
+        super(PercentileDiscretizer, self).__init__(**kwargs)
+
+        if not self.built:
+            self.build(input_shape=None)
+
+        max_discretizer_feature = n_feature * (n_bin + 1)
+        self._n_feature = n_feature
+        self._n_bin = n_bin
+
+        # build variables
+        self._out_bits = out_bits
+        self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
+        self._hash_keys = (
+            hash_keys if hash_keys is not None else np.empty(n_feature, dtype=np.int64)
+        )
+        self._hash_values = (
+            hash_values
+            if hash_values is not None
+            else np.empty(n_feature, dtype=np.int64)
+        )
+        self._bin_ids = (
+            bin_ids
+            if bin_ids is not None
+            else np.empty(max_discretizer_feature, dtype=np.int64)
+        )
+        self._bin_values = (
+            bin_values
+            if bin_values is not None
+            else np.empty(max_discretizer_feature, dtype=np.float32)
+        )
+        self._feature_offsets = (
+            feature_offsets
+            if feature_offsets is not None
+            else np.empty(n_feature, dtype=np.int64)
+        )
+        self.num_parts = num_parts
+        self.cost_per_unit = cost_per_unit
+
+    def build(self, input_shape):  # pylint: disable=unused-argument
+        """
+        Creates the variables of the layer
+        """
+        self.built = True
+
+    def call(self, inputs, keep_inputs=False, **kwargs):
+        """Looks up `keys` in a table, outputs the corresponding values.
+
+        Implements PercentileDiscretizer inference where inputs are intersected with a hash_map.
+        Input features that were not calibrated have their feature IDs truncated, so as
+        to be less than 1<<output_bits, but their values remain untouched (not discretized).
+
+        Args:
+            inputs: A 2D SparseTensor that is input to PercentileDiscretizer for
+                discretization. It has a dense_shape of [batch_size, input_size].
+            keep_inputs:
+                Include the original inputs in the output; note that the
+                non-discretized features will then end up doubled in the output.
+            name: A name for the operation (optional).
+        Returns:
+            A `SparseTensor` of the same type as `inputs`.
+            Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits].
+        """
+        if isinstance(inputs, tf.SparseTensor):
+            inputs = twml.SparseTensor.from_tf(inputs)
+
+        assert isinstance(inputs, twml.SparseTensor)
+
+        # sparse column indices
+        ids = inputs.ids
+        # sparse row indices
+        keys = inputs.indices
+        # sparse values
+        vals = inputs.values
+
+        if self._n_feature > 0:
+            discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2(
+                input_ids=keys,  # inc key assigned to feature_id, or -1
+                input_vals=vals,  # the observed feature values
+                bin_ids=self._bin_ids,  # n_feat X (n_bin+1) 2D arange
+                bin_vals=self._bin_values,  # bin boundaries
+                feature_offsets=self._feature_offsets,  # 0 : nbin_1 : max_feat
+                output_bits=self._out_bits,
+                feature_ids=tf.make_tensor_proto(
+                    self._hash_keys
+                ),  # feature ids to build internal hash map
+                feature_indices=tf.make_tensor_proto(
+                    self._hash_values
+                ),  # keys associated w/ feat. indices
+                start_compute=tf.constant(0, shape=[], dtype=tf.int64),
+                end_compute=tf.constant(-1, shape=[], dtype=tf.int64),
+                cost_per_unit=self.cost_per_unit,
+            )
+        else:
+            discretizer_keys = twml.util.limit_bits(keys, self._out_bits)
+            discretizer_vals = vals
+            # don't 2x the input.
+            keep_inputs = False
+
+        batch_size = tf.to_int64(inputs.dense_shape[0])
+        output_shape = [batch_size, self._output_size]
+
+        output = twml.SparseTensor(
+            ids, discretizer_keys, discretizer_vals, output_shape
+        ).to_tf()
+
+        if keep_inputs:
+            # Note the non-discretized features will end up doubled,
+            # since these are already in `output`
+            # handle output ID conflicts
+            mdl_size = self._n_feature * (self._n_bin + 1)
+            non_mdl_size = tf.subtract(self._output_size, mdl_size)
+            input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size)
+
+            new_input = twml.SparseTensor(
+                ids=ids, indices=input_keys, values=vals, dense_shape=output_shape
+            ).to_tf()
+
+            # concatenate discretizer output with original input
+            sparse_add = tf.sparse_add(new_input, output)
+            output = tf.SparseTensor(
+                sparse_add.indices, sparse_add.values, output_shape
+            )
+
+        return output
+
+    def compute_output_shape(self, input_shape):
+        """Computes the output shape of the layer given the input shape.
+
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
+
+        Raises NotImplementedError.
+ + """ + raise NotImplementedError diff --git a/twml/twml/layers/sequential.py b/twml/twml/layers/sequential.py index c0d4b92cc..3fd43b32a 100644 --- a/twml/twml/layers/sequential.py +++ b/twml/twml/layers/sequential.py @@ -3,158 +3,164 @@ """ -from .layer import Layer - from tensorflow import keras from tensorflow.python.layers import base +from .layer import Layer -class Sequential(Layer): - """ - A sequential stack of layers. - - Arguments: - layers: list of layers to add to the model. - - Output: - the output of the sequential layers - """ - - def __init__(self, layers=None, **kwargs): - self._layers = [] # Stack of layers. - self._layer_names = [] # Stack of layers names - self._layer_outputs = [] - # Add to the model any layers passed to the constructor. - if layers: - for layer in layers: - self.add(layer) - super(Sequential, self).__init__(**kwargs) - - def add(self, layer): - """Adds a layer instance on top of the layer stack. - - Arguments: - layer: - layer instance. - - Raises: - TypeError: - if the layer argument is not instance of base.Layer - """ - if not isinstance(layer, base.Layer) and not isinstance(layer, keras.layers.Layer): - raise TypeError('The added layer must be an instance of class Layer') - - if layer.name in self._layer_names: - raise ValueError('Layer with name %s already exists in sequential layer' % layer.name) - - self._layers.append(layer) - self._layer_names.append(layer.name) - - def pop(self): - """Removes the last layer in the model. - - Raises: - TypeError: - if there are no layers in the model. - """ - if not self._layers or not self._layer_names: - raise TypeError('There are no layers in the model.') - self._layers.pop() - self._layer_names.pop() - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). - - Returns: - The output of the sequential layers - """ - self._layer_outputs = [] - for layer in self._layers: - # don't use layer.call because you want to build individual layers - inputs = layer(inputs) # overwrites the current input after it has been processed - self._layer_outputs.append(inputs) - return inputs - - @property - def layers(self): - """ Return the layers in the sequential layer """ - return self._layers - - @property - def layer_names(self): - """ Return the layer names in the sequential layer """ - return self._layer_names - - @property - def layer_outputs(self): - """ Return the layer outputs in the sequential layer """ - return self._layer_outputs - - def get(self, key): - """Retrieves the n-th layer. - - Arguments: - key: - index of the layer - - Output: - The n-th layer where n is equal to the key. - """ - return self._layers[key] - - def get_output(self, key): - """Retrieves the n-th layer output. - - Arguments: - key: - index of the layer - - Output: - The intermediary output equivalent to the nth layer, where n is equal to the key. - """ - return self._layer_outputs[key] - - def get_layer_by_name(self, name): - """Retrieves the layer corresponding to the name. - - Arguments: - name: - name of the layer - Output: - list of layers that have the name desired +class Sequential(Layer): """ - return self._layers[self._layer_names.index(name)] - - def get_layer_output_by_name(self, name): - """Retrieves the layer output corresponding to the name. + A sequential stack of layers. Arguments: - name: - name of the layer + layers: list of layers to add to the model. 
Output: - list of the output of the layers that have the desired name + the output of the sequential layers """ - return self._layer_outputs[self._layer_names.index(name)] - @property - def init(self): - """ returns a list of initialization ops (one per layer) """ - return [layer.init for layer in self._layers] - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError + def __init__(self, layers=None, **kwargs): + self._layers = [] # Stack of layers. + self._layer_names = [] # Stack of layers names + self._layer_outputs = [] + # Add to the model any layers passed to the constructor. + if layers: + for layer in layers: + self.add(layer) + super(Sequential, self).__init__(**kwargs) + + def add(self, layer): + """Adds a layer instance on top of the layer stack. + + Arguments: + layer: + layer instance. + + Raises: + TypeError: + if the layer argument is not instance of base.Layer + """ + if not isinstance(layer, base.Layer) and not isinstance( + layer, keras.layers.Layer + ): + raise TypeError("The added layer must be an instance of class Layer") + + if layer.name in self._layer_names: + raise ValueError( + "Layer with name %s already exists in sequential layer" % layer.name + ) + + self._layers.append(layer) + self._layer_names.append(layer.name) + + def pop(self): + """Removes the last layer in the model. + + Raises: + TypeError: + if there are no layers in the model. + """ + if not self._layers or not self._layer_names: + raise TypeError("There are no layers in the model.") + self._layers.pop() + self._layer_names.pop() + + def call(self, inputs, **kwargs): # pylint: disable=unused-argument + """The logic of the layer lives here. + + Arguments: + inputs: + input tensor(s). + + Returns: + The output of the sequential layers + """ + self._layer_outputs = [] + for layer in self._layers: + # don't use layer.call because you want to build individual layers + inputs = layer( + inputs + ) # overwrites the current input after it has been processed + self._layer_outputs.append(inputs) + return inputs + + @property + def layers(self): + """Return the layers in the sequential layer""" + return self._layers + + @property + def layer_names(self): + """Return the layer names in the sequential layer""" + return self._layer_names + + @property + def layer_outputs(self): + """Return the layer outputs in the sequential layer""" + return self._layer_outputs + + def get(self, key): + """Retrieves the n-th layer. + + Arguments: + key: + index of the layer + + Output: + The n-th layer where n is equal to the key. + """ + return self._layers[key] + + def get_output(self, key): + """Retrieves the n-th layer output. + + Arguments: + key: + index of the layer + + Output: + The intermediary output equivalent to the nth layer, where n is equal to the key. + """ + return self._layer_outputs[key] + + def get_layer_by_name(self, name): + """Retrieves the layer corresponding to the name. + + Arguments: + name: + name of the layer + + Output: + list of layers that have the name desired + """ + return self._layers[self._layer_names.index(name)] + + def get_layer_output_by_name(self, name): + """Retrieves the layer output corresponding to the name. 
+ + Arguments: + name: + name of the layer + + Output: + list of the output of the layers that have the desired name + """ + return self._layer_outputs[self._layer_names.index(name)] + + @property + def init(self): + """returns a list of initialization ops (one per layer)""" + return [layer.init for layer in self._layers] + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raise NotImplementedError. + + """ + raise NotImplementedError diff --git a/twml/twml/layers/sparse_max_norm.py b/twml/twml/layers/sparse_max_norm.py index e1f423fe0..ced1887dc 100644 --- a/twml/twml/layers/sparse_max_norm.py +++ b/twml/twml/layers/sparse_max_norm.py @@ -2,220 +2,232 @@ """ Contains the twml.layers.SparseMaxNorm layer. """ -from .layer import Layer - -from libtwml import OPLIB import tensorflow.compat.v1 as tf -import twml - +from libtwml import OPLIB -class SparseMaxNorm(Layer): - """ - Computes a max-normalization and adds bias to the sparse_input, - forwards that through a sparse affine transform followed - by an non-linear activation on the resulting dense representation. - - This layer has two parameters, one of which learns through gradient descent: - bias_x (optional): - vector of shape [input_size]. Learned through gradient descent. - max_x: - vector of shape [input_size]. Holds the maximas of input ``x`` for normalization. - Either calibrated through SparseMaxNorm calibrator, or calibrated online, or both. - - The pseudo-code for this layer looks like: - - .. code-block:: python - - abs_x = abs(x) - normed_x = clip_by_value(x / max_x, -1, 1) - biased_x = normed_x + bias_x - return biased - - - Args: - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - A layer representing the output of the sparse_max_norm transformation. 
-  """
-
-  def __init__(
-      self,
-      input_size=None,
-      max_x_initializer=None,
-      bias_x_initializer=None,
-      is_training=True,
-      epsilon=1E-5,
-      use_bias=True,
-      **kwargs):
-
-    super(SparseMaxNorm, self).__init__(**kwargs)
-    if input_size:
-      raise ValueError('input_size is deprecated - it is now automatically \
-                       inferred from your input.')
-    if max_x_initializer is None:
-      max_x_initializer = tf.zeros_initializer()
-    self.max_x_initializer = max_x_initializer
-
-    self._use_bias = use_bias
-    if use_bias:
-      if bias_x_initializer is None:
-        bias_x_initializer = tf.zeros_initializer()
-      self.bias_x_initializer = bias_x_initializer
-
-    self.epsilon = epsilon
-    self.is_training = is_training
-
-  def build(self, input_shape):  # pylint: disable=unused-argument
-    """Creates the max_x and bias_x tf.Variables of the layer."""
-
-    self.max_x = self.add_variable(
-      'max_x',
-      initializer=self.max_x_initializer,
-      shape=[input_shape[1]],
-      dtype=tf.float32,
-      trainable=False)
-
-    if self._use_bias:
-      self.bias_x = self.add_variable(
-        'bias_x',
-        initializer=self.bias_x_initializer,
-        shape=[input_shape[1]],
-        dtype=tf.float32,
-        trainable=True)
-
-    self.built = True
-
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
+import twml

-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`. It need not
-        be fully defined (e.g. the batch size may be unknown).
+from .layer import Layer

-    Raises NotImplementedError.

+class SparseMaxNorm(Layer):
    """
-    raise NotImplementedError
+    Computes a max-normalization and adds bias to the sparse_input,
+    forwards that through a sparse affine transform followed
+    by a non-linear activation on the resulting dense representation.

-  def _call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """
-    The forward propagation logic of the layer lives here.
+    This layer has two parameters, one of which learns through gradient descent:
+    bias_x (optional):
+        vector of shape [input_size]. Learned through gradient descent.
+    max_x:
+        vector of shape [input_size]. Holds the maxima of input ``x`` for normalization.
+        Either calibrated through SparseMaxNorm calibrator, or calibrated online, or both.

-    Arguments:
-      sparse_input:
-        A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]``
-    Returns:
-      A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can
-      be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``.
-    """
+    The pseudo-code for this layer looks like:

-    if isinstance(inputs, twml.SparseTensor):
-      inputs = inputs.to_tf()
-    elif not isinstance(inputs, tf.SparseTensor):
-      raise TypeError("The inputs must be of type tf.SparseTensor or twml.SparseTensor")
+    .. code-block:: python

-    indices_x = inputs.indices[:, 1]
-    values_x = inputs.values
+        abs_x = abs(x)
+        normed_x = clip_by_value(x / max_x, -1, 1)
+        biased_x = normed_x + bias_x
+        return biased_x

-    if self.is_training is False:
-      normalized_x = OPLIB.sparse_max_norm_inference(self.max_x,
-                                                     indices_x,
-                                                     values_x,
-                                                     self.epsilon)
-      update_op = tf.no_op()
-    else:
-      max_x, normalized_x = OPLIB.sparse_max_norm_training(self.max_x,
-                                                           indices_x,
-                                                           values_x,
-                                                           self.epsilon)
+    Args:
+        max_x_initializer:
+            initializer vector of shape [input_size] used by variable `max_x`
+        bias_x_initializer:
+            initializer vector of shape [input_size] used by parameter `bias_x`
+        is_training:
+            Are we training the layer to learn the normalization maxima.
+            If set to True, max_x will be able to learn.
This is independent of bias_x + epsilon: + The minimum value used for max_x. Defaults to 1E-5. + use_bias: + Default True. Set to False to not use a bias term. - update_op = tf.assign(self.max_x, max_x) + Returns: + A layer representing the output of the sparse_max_norm transformation. + """ - with tf.control_dependencies([update_op]): - normalized_x = tf.stop_gradient(normalized_x) + def __init__( + self, + input_size=None, + max_x_initializer=None, + bias_x_initializer=None, + is_training=True, + epsilon=1e-5, + use_bias=True, + **kwargs + ): + super(SparseMaxNorm, self).__init__(**kwargs) + if input_size: + raise ValueError( + "input_size is deprecated - it is now automatically \ + inferred from your input." + ) + if max_x_initializer is None: + max_x_initializer = tf.zeros_initializer() + self.max_x_initializer = max_x_initializer + + self._use_bias = use_bias + if use_bias: + if bias_x_initializer is None: + bias_x_initializer = tf.zeros_initializer() + self.bias_x_initializer = bias_x_initializer + + self.epsilon = epsilon + self.is_training = is_training + + def build(self, input_shape): # pylint: disable=unused-argument + """Creates the max_x and bias_x tf.Variables of the layer.""" + + self.max_x = self.add_variable( + "max_x", + initializer=self.max_x_initializer, + shape=[input_shape[1]], + dtype=tf.float32, + trainable=False, + ) + + if self._use_bias: + self.bias_x = self.add_variable( + "bias_x", + initializer=self.bias_x_initializer, + shape=[input_shape[1]], + dtype=tf.float32, + trainable=True, + ) + + self.built = True + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError + + def _call(self, inputs, **kwargs): # pylint: disable=unused-argument + """ + The forward propagation logic of the layer lives here. + + Arguments: + sparse_input: + A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` + Returns: + A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can + be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. + """ + + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + elif not isinstance(inputs, tf.SparseTensor): + raise TypeError( + "The inputs must be of type tf.SparseTensor or twml.SparseTensor" + ) + + indices_x = inputs.indices[:, 1] + values_x = inputs.values + + if self.is_training is False: + normalized_x = OPLIB.sparse_max_norm_inference( + self.max_x, indices_x, values_x, self.epsilon + ) + + update_op = tf.no_op() + else: + max_x, normalized_x = OPLIB.sparse_max_norm_training( + self.max_x, indices_x, values_x, self.epsilon + ) + + update_op = tf.assign(self.max_x, max_x) + + with tf.control_dependencies([update_op]): + normalized_x = tf.stop_gradient(normalized_x) + + # add input bias + if self._use_bias: + normalized_x = normalized_x + tf.gather(self.bias_x, indices_x) + + # convert back to sparse tensor + return tf.SparseTensor(inputs.indices, normalized_x, inputs.dense_shape) + + def call(self, inputs, **kwargs): # pylint: disable=unused-argument + """ + The forward propagation logic of the layer lives here. 
+
+        Arguments:
+            sparse_input:
+                A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]``
+        Returns:
+            A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can
+            be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``.
+        """
+        with tf.device(self.max_x.device):
+            return self._call(inputs, **kwargs)


-  # add input bias
-  if self._use_bias:
-    normalized_x = normalized_x + tf.gather(self.bias_x, indices_x)
+# For backwards compatibility and also because I don't want to change all the tests.
+MaxNorm = SparseMaxNorm

-  # convert back to sparse tensor
-  return tf.SparseTensor(inputs.indices, normalized_x, inputs.dense_shape)

-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """
-    The forward propagation logic of the layer lives here.
+def sparse_max_norm(
+    inputs,
+    input_size=None,
+    max_x_initializer=None,
+    bias_x_initializer=None,
+    is_training=True,
+    epsilon=1e-5,
+    use_bias=True,
+    name=None,
+    reuse=None,
+):
    """
-    Arguments:
-      sparse_input:
-        A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]``
-    Returns:
-      A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can
-      be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``.
+    Functional interface to SparseMaxNorm.

-    """
-    with tf.device(self.max_x.device):
-      return self._call(inputs, **kwargs)
+    Args:
+        inputs:
+            A sparse tensor (can be twml.SparseTensor or tf.SparseTensor)
+        input_size:
+            number of input units
+        max_x_initializer:
+            initializer vector of shape [input_size] used by variable `max_x`
+        bias_x_initializer:
+            initializer vector of shape [input_size] used by parameter `bias_x`
+        is_training:
+            Are we training the layer to learn the normalization maxima.
+            If set to True, max_x will be able to learn. This is independent of bias_x
+        epsilon:
+            The minimum value used for max_x. Defaults to 1E-5.
+        use_bias:
+            Default True. Set to False to not use a bias term.

-# For backwards compatiblity and also because I don't want to change all the tests.
-MaxNorm = SparseMaxNorm
+    Returns:
+        Output after normalizing with the max value.


-def sparse_max_norm(inputs,
-                    input_size=None,
-                    max_x_initializer=None,
-                    bias_x_initializer=None,
-                    is_training=True,
-                    epsilon=1E-5,
-                    use_bias=True,
-                    name=None,
-                    reuse=None):
-  """
-  Functional inteface to SparseMaxNorm.
-
-  Args:
-    inputs:
-      A sparse tensor (can be twml.SparseTensor or tf.SparseTensor)
-    input_size:
-      number of input units
-    max_x_initializer:
-      initializer vector of shape [input_size] used by variable `max_x`
-    bias_x_initializer:
-      initializer vector of shape [input_size] used by parameter `bias_x`
-    is_training:
-      Are we training the layer to learn the normalization maximas.
-      If set to True, max_x will be able to learn. This is independent of bias_x
-    epsilon:
-      The minimum value used for max_x. Defaults to 1E-5.
-    use_bias:
-      Default True. Set to False to not use a bias term.
-
-    Returns:
-      Output after normalizing with the max value.
- """ - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - - layer = SparseMaxNorm(max_x_initializer=max_x_initializer, - bias_x_initializer=bias_x_initializer, - is_training=is_training, - epsilon=epsilon, - use_bias=use_bias, - name=name, - _scope=name, - _reuse=reuse) - return layer(inputs) + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + + layer = SparseMaxNorm( + max_x_initializer=max_x_initializer, + bias_x_initializer=bias_x_initializer, + is_training=is_training, + epsilon=epsilon, + use_bias=use_bias, + name=name, + _scope=name, + _reuse=reuse, + ) + return layer(inputs) diff --git a/twml/twml/layers/stitch.py b/twml/twml/layers/stitch.py index 51dffdb8e..54187c724 100644 --- a/twml/twml/layers/stitch.py +++ b/twml/twml/layers/stitch.py @@ -4,51 +4,52 @@ """ -from .layer import Layer - import tensorflow.compat.v1 as tf +from .layer import Layer -class Stitch(Layer): - """ - This layer is responsible for stitching a partioned layer together. - - Output: - A layer that performs stitching - """ - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - def call(self, partioned_val, partioned_keys, - partioned_indices, **kwargs): # pylint: disable=unused-argument, arguments-differ +class Stitch(Layer): """ This layer is responsible for stitching a partioned layer together. - Input: - partioned_val: - a list of partioned Tensors which represent the vals of the hashmap - partioned_keys: - a list of partioned Tensors which represent the keys of the hashmap - partioned_indices: - a list of partioned Tensors which represent the indices of the hashmap Output: - List which contains: [output_vals, output_keys] - output_vals: - Values of the HashMap (float) - output_keys: - Keys of HashMap (float) + A layer that performs stitching """ - indices = [tf.to_int32(index) for index in partioned_indices] - concat_keys = tf.dynamic_stitch(indices, partioned_keys) - concat_vals = tf.dynamic_stitch(indices, partioned_val) - return [concat_vals, concat_keys] + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError + + def call( + self, partioned_val, partioned_keys, partioned_indices, **kwargs + ): # pylint: disable=unused-argument, arguments-differ + """ + This layer is responsible for stitching a partioned layer together. 
+ + Input: + partioned_val: + a list of partitioned Tensors which represent the values of the hashmap + partioned_keys: + a list of partitioned Tensors which represent the keys of the hashmap + partioned_indices: + a list of partitioned Tensors which represent the indices of the hashmap + Output: + List which contains: [output_vals, output_keys] + output_vals: + Values of the HashMap (float) + output_keys: + Keys of HashMap (float) + """ + indices = [tf.to_int32(index) for index in partioned_indices] + concat_keys = tf.dynamic_stitch(indices, partioned_keys) + concat_vals = tf.dynamic_stitch(indices, partioned_val) + return [concat_vals, concat_keys] diff --git a/twml/twml/learning_rate_decay.py b/twml/twml/learning_rate_decay.py index be522d75b..4667e760d 100644 --- a/twml/twml/learning_rate_decay.py +++ b/twml/twml/learning_rate_decay.py @@ -4,165 +4,208 @@ def get_learning_rate_decay_fn(params): - """ - Returns a learning rate decay function that takes the initial - learning_rate and global_step - as arguments and returns the current learning rate. - - Currently supports params.learning_rate_decay values of: - exponential | polynomial | piecewise_constant | cosine | cosine restarts. - See `Decaying the Leanring Rate - `_ for details. - - Arguments: - params: - a tensorflow.contrib.train.HParams object containing the relevant hyperparameters. - """ - paramsv = params.values() - if 'learning_rate_decay' not in paramsv or params.learning_rate_decay == 'no_learning_rate_decay': - return None - elif params.learning_rate_decay == 'exponential_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'exponential'") - if 'exponential_decay_rate' not in paramsv: - raise ValueError("Expecting params.exponential_decay_rate for " - "params.learning_rate_decay == 'exponential'") - - def exponential_decay_fn(learning_rate, global_step): - """ exponential decay function to be passed to optimize_loss """ - return tf.train.exponential_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.exponential_decay_rate - ) - return exponential_decay_fn - elif params.learning_rate_decay == 'piecewise_constant_learning_rate_decay': - if 'piecewise_constant_boundaries' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_boundaries for " - "params.learning_rate_decay == 'piecewise_constant'") - if 'piecewise_constant_values' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_values for " - "params.learning_rate_decay == 'piecewise_constant'") - # pylint: disable=unused-argument - - def piecewise_constant_fn(learning_rate, global_step): - """ piecewise_constant decay function to be passed to optimize_loss """ - return tf.train.piecewise_constant( - x=global_step, - boundaries=params.piecewise_constant_boundaries, - values=params.piecewise_constant_values - ) - return piecewise_constant_fn - elif params.learning_rate_decay == 'polynomial_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'polynomial'") - if 'end_learning_rate' not in paramsv: - raise ValueError("Expecting params.end_learning_rate for " - "params.learning_rate_decay == 'polynomial'") - - def polynomial_decay_fn(learning_rate, global_step): - """ polynomial decay function to be passed to optimize_loss """ - return tf.train.polynomial_decay( - learning_rate=learning_rate, -
global_step=global_step, - decay_steps=params.decay_steps, - end_learning_rate=params.end_learning_rate, - power=params.polynomial_power if 'polynomial_power' in paramsv else 1.0, - ) - return polynomial_decay_fn - - elif params.learning_rate_decay == 'inverse_learning_rate_decay': - if 'min_learning_rate' not in paramsv: - raise ValueError("Expecting params.min_learning_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_rate' not in paramsv: - raise ValueError("Expecting params.decay_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'inverse'") - - def bounded_inverse_time_decay_fn(learning_rate, global_step): - ''' - Returns the decayed learning_rate by applying the function: - decayed_lr = max(lr /(1 + decay_rate * floor(global_step /decay_step)), - min_learning_rate) - Arguments: - learning_rate: - A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: - A scalar `int32` or `int64` `Tensor` or a Python number. - Global step to use for the decay computation. Must not be negative. - min_learning_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Minimum possible learning_rate. The decayed learning_rate will not be - smaller than the min_learning_rate - decay_steps: - How often to apply decay. In dbv1, this should be 1. - decay_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Rate in which we decay the learning rate. - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - ''' - decayed_rate = tf.train.inverse_time_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.decay_rate) - # Getting dtype of returned Tensor - dtype = decayed_rate.dtype - # Casting the min_learning rate the same dtype as decayes rate - min_learning_rate = tf.cast(params.min_learning_rate, dtype) - # Returning the maximum between the two - return tf.maximum(decayed_rate, min_learning_rate) - - return bounded_inverse_time_decay_fn - - elif params.learning_rate_decay == 'cosine_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'cosine_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_decay'") - def cosine_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - alpha=params.alpha - ) - return cosine_decay_fn - elif params.learning_rate_decay == 'cosine_restarts_learning_rate_decay': - if 'first_decay_steps' not in paramsv: - raise ValueError("Expecting params.first_decay_steps for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 't_mul' not in paramsv: - raise ValueError("Expecting params.t_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 'm_mul' not in paramsv: - raise ValueError("Expecting params.m_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - def cosine_restart_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to 
optimize_loss """ - return tf.train.cosine_decay_restarts( - learning_rate=learning_rate, - global_step=global_step, - first_decay_steps=params.first_decay_steps, - t_mul=params.t_mul, - m_mul=params.m_mul, - alpha=params.alpha - ) - return cosine_restart_decay_fn - - raise ValueError("Unsupported params.learning_rate_decay: %s" % params.learning_rate_decay) + """ + Returns a learning rate decay function that takes the initial + learning_rate and global_step + as arguments and returns the current learning rate. + + Currently supports params.learning_rate_decay values of: + exponential | polynomial | piecewise_constant | cosine | cosine restarts. + See `Decaying the Leanring Rate + `_ for details. + + Arguments: + params: + a tensorflow.contrib.train.HParams object containing the relevant hyperparameters. + """ + paramsv = params.values() + if ( + "learning_rate_decay" not in paramsv + or params.learning_rate_decay == "no_learning_rate_decay" + ): + return None + elif params.learning_rate_decay == "exponential_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'exponential'" + ) + if "exponential_decay_rate" not in paramsv: + raise ValueError( + "Expecting params.exponential_decay_rate for " + "params.learning_rate_decay == 'exponential'" + ) + + def exponential_decay_fn(learning_rate, global_step): + """exponential decay function to be passed to optimize_loss""" + return tf.train.exponential_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + decay_rate=params.exponential_decay_rate, + ) + + return exponential_decay_fn + elif params.learning_rate_decay == "piecewise_constant_learning_rate_decay": + if "piecewise_constant_boundaries" not in paramsv: + raise ValueError( + "Expecting params.piecewise_constant_boundaries for " + "params.learning_rate_decay == 'piecewise_constant'" + ) + if "piecewise_constant_values" not in paramsv: + raise ValueError( + "Expecting params.piecewise_constant_values for " + "params.learning_rate_decay == 'piecewise_constant'" + ) + # pylint: disable=unused-argument + + def piecewise_constant_fn(learning_rate, global_step): + """piecewise_constant decay function to be passed to optimize_loss""" + return tf.train.piecewise_constant( + x=global_step, + boundaries=params.piecewise_constant_boundaries, + values=params.piecewise_constant_values, + ) + + return piecewise_constant_fn + elif params.learning_rate_decay == "polynomial_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'polynomial'" + ) + if "end_learning_rate" not in paramsv: + raise ValueError( + "Expecting params.end_learning_rate for " + "params.learning_rate_decay == 'polynomial'" + ) + + def polynomial_decay_fn(learning_rate, global_step): + """polynomial decay function to be passed to optimize_loss""" + return tf.train.polynomial_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + end_learning_rate=params.end_learning_rate, + power=params.polynomial_power if "polynomial_power" in paramsv else 1.0, + ) + + return polynomial_decay_fn + + elif params.learning_rate_decay == "inverse_learning_rate_decay": + if "min_learning_rate" not in paramsv: + raise ValueError( + "Expecting params.min_learning_rate for " + "params.learning_rate_decay == 'inverse'" + ) + if "decay_rate" not in paramsv: + raise ValueError( + 
"Expecting params.decay_rate for " + "params.learning_rate_decay == 'inverse'" + ) + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'inverse'" + ) + + def bounded_inverse_time_decay_fn(learning_rate, global_step): + """ + Returns the decayed learning_rate by applying the function: + decayed_lr = max(lr /(1 + decay_rate * floor(global_step /decay_step)), + min_learning_rate) + Arguments: + learning_rate: + A scalar `float32` or `float64` `Tensor` or a Python number. + The initial learning rate. + global_step: + A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. Must not be negative. + min_learning_rate: + A scalar `int32` or `int64` `Tensor` or a Python number. + Minimum possible learning_rate. The decayed learning_rate will not be + smaller than the min_learning_rate + decay_steps: + How often to apply decay. In dbv1, this should be 1. + decay_rate: + A scalar `int32` or `int64` `Tensor` or a Python number. + Rate in which we decay the learning rate. + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + """ + decayed_rate = tf.train.inverse_time_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + decay_rate=params.decay_rate, + ) + # Getting dtype of returned Tensor + dtype = decayed_rate.dtype + # Casting the min_learning rate the same dtype as decayes rate + min_learning_rate = tf.cast(params.min_learning_rate, dtype) + # Returning the maximum between the two + return tf.maximum(decayed_rate, min_learning_rate) + + return bounded_inverse_time_decay_fn + + elif params.learning_rate_decay == "cosine_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'cosine_decay'" + ) + if "alpha" not in paramsv: + raise ValueError( + "Expecting params.alpha for " + "params.learning_rate_decay == 'cosine_decay'" + ) + + def cosine_decay_fn(learning_rate, global_step): + """cosine decay function to be passed to optimize_loss""" + return tf.train.cosine_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + alpha=params.alpha, + ) + + return cosine_decay_fn + elif params.learning_rate_decay == "cosine_restarts_learning_rate_decay": + if "first_decay_steps" not in paramsv: + raise ValueError( + "Expecting params.first_decay_steps for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "t_mul" not in paramsv: + raise ValueError( + "Expecting params.t_mul for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "m_mul" not in paramsv: + raise ValueError( + "Expecting params.m_mul for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "alpha" not in paramsv: + raise ValueError( + "Expecting params.alpha for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + + def cosine_restart_decay_fn(learning_rate, global_step): + """cosine decay function to be passed to optimize_loss""" + return tf.train.cosine_decay_restarts( + learning_rate=learning_rate, + global_step=global_step, + first_decay_steps=params.first_decay_steps, + t_mul=params.t_mul, + m_mul=params.m_mul, + alpha=params.alpha, + ) + + return cosine_restart_decay_fn + + raise ValueError( + "Unsupported params.learning_rate_decay: %s" % params.learning_rate_decay + ) diff --git a/twml/twml/lookup/__init__.py 
diff --git a/twml/twml/lookup/__init__.py b/twml/twml/lookup/__init__.py index 87392d719..2695fa53c 100644 --- a/twml/twml/lookup/__init__.py +++ b/twml/twml/lookup/__init__.py @@ -1,9 +1,8 @@ -from tensorflow.python.ops.lookup_ops import ( - index_table_from_file, - index_table_from_tensor, - index_to_string_table_from_file -) # noqa: F401 - +from tensorflow.python.ops.lookup_ops import ( # noqa: F401 + index_table_from_file, + index_table_from_tensor, + index_to_string_table_from_file, +) """ NOTE: Using `from tensorflow.python.ops.lookup_ops import index_table_from_tensor` in the code works. diff --git a/twml/twml/metrics.py b/twml/twml/metrics.py index ee2f82b74..59cfa52b5 100644 --- a/twml/twml/metrics.py +++ b/twml/twml/metrics.py @@ -11,7 +11,6 @@ import tensorboard as tb import tensorflow.compat.v1 as tf - CLAMP_EPSILON = 0.00001 @@ -21,28 +20,33 @@ def total_weight_metric( weights=None, metrics_collections=None, updates_collections=None, - name=None): - with tf.variable_scope(name, 'total_weight', (labels, predictions, weights)): - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float64) - - if weights is None: - weights = tf.cast(tf.size(labels), total_weight.dtype, name="default_weight") - else: - weights = tf.cast(weights, total_weight.dtype) + name=None, +): + with tf.variable_scope(name, "total_weight", (labels, predictions, weights)): + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float64) + + if weights is None: + weights = tf.cast( + tf.size(labels), total_weight.dtype, name="default_weight" + ) + else: + weights = tf.cast(weights, total_weight.dtype) - # add up the weights to get total weight of the eval set - update_total_weight = tf.assign_add(total_weight, tf.reduce_sum(weights), name="update_op") + # add up the weights to get total weight of the eval set + update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="update_op" + ) - value_op = tf.identity(total_weight) - update_op = tf.identity(update_total_weight) + value_op = tf.identity(total_weight) + update_op = tf.identity(update_total_weight) - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) + if metrics_collections: + tf.add_to_collections(metrics_collections, value_op) - if updates_collections: - tf.add_to_collections(updates_collections, update_op) + if updates_collections: + tf.add_to_collections(updates_collections, update_op) - return value_op, update_op + return value_op, update_op def num_samples_metric( @@ -51,1330 +55,1561 @@ def num_samples_metric( weights=None, metrics_collections=None, updates_collections=None, - name=None): - with tf.variable_scope(name, 'num_samples', (labels, predictions, weights)): - num_samples = _metric_variable(name='num_samples', shape=[], dtype=tf.float64) - update_num_samples = tf.assign_add(num_samples, tf.cast(tf.size(labels), num_samples.dtype), name="update_op") - - value_op = tf.identity(num_samples) - update_op = tf.identity(update_num_samples) - - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return value_op, update_op - - -def ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive sample ratio based on labels - (i.e. weighted average percentage of positive labels). - The name `ctr` (click-through-rate) is from legacy.
- - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - ctr: A `Tensor` representing positive sample ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=labels, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def predicted_ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive ratio based on predictions, - (i.e. weighted averaged predicted positive probability). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - predicted_ctr: A `Tensor` representing the predicted positive ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=predictions, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def prediction_std_dev(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the weighted standard deviation of the predictions. - Note - this is not a confidence interval metric. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. - update_op: A update operation used to accumulate data into this metric. 
- """ - with tf.variable_scope(name, 'pred_std_dev', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - # State kept during streaming of examples - total_weighted_preds = _metric_variable( - name='total_weighted_preds', shape=[], dtype=tf.float64) - total_weighted_preds_sq = _metric_variable( - name='total_weighted_preds_sq', shape=[], dtype=tf.float64) - total_weights = _metric_variable( - name='total_weights', shape=[], dtype=tf.float64) - - # Update state - update_total_weighted_preds = tf.assign_add(total_weighted_preds, tf.reduce_sum(weights * predictions)) - update_total_weighted_preds_sq = tf.assign_add(total_weighted_preds_sq, tf.reduce_sum(weights * predictions * predictions)) - update_total_weights = tf.assign_add(total_weights, tf.reduce_sum(weights)) - - # Compute output - def compute_output(tot_w, tot_wp, tot_wpp): - return tf.math.sqrt(tot_wpp / tot_w - (tot_wp / tot_w) ** 2) - std_dev_est = compute_output(total_weights, total_weighted_preds, total_weighted_preds_sq) - update_std_dev_est = compute_output(update_total_weights, update_total_weighted_preds, update_total_weighted_preds_sq) - - if metrics_collections: - tf.add_to_collections(metrics_collections, std_dev_est) - - if updates_collections: - tf.add_to_collections(updates_collections, update_std_dev_est) - - return std_dev_est, update_std_dev_est - - -def _get_arce_predictions(predictions, weights, label_weighted, labels, - up_weight, deprecated_rce, - total_positive, update_total_positive): - """ - Returns the ARCE predictions, total_positive, update_total_positive and weights - used by the rest of the twml.metrics.rce metric computation. 
- """ - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") - label_weighted_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(label_weighted)) - pred_weight_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(predictions_weighted)) - normalizer_comp = label_weighted_comp / pred_weight_comp - - if up_weight is False: - total_positive_unweighted = _metric_variable( - name='total_positive_unweighted', shape=[], dtype=tf.float32) - - update_total_positive_unweighted = tf.assign_add( - total_positive_unweighted, tf.reduce_sum(labels), - name="total_positive_unweighted_update") - - if deprecated_rce: - normalizer = tf.reduce_sum(labels) / tf.reduce_sum(label_weighted) - else: - # sum of labels / sum of weighted labels - normalizer = update_total_positive_unweighted / update_total_positive - - label_comp = tf.subtract(tf.to_float(tf.size(labels)), tf.reduce_sum(labels)) - normalizer_comp = label_comp / label_weighted_comp - - # note that up_weight=True changes these for the rest of the twml.metric.rce computation - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - total_positive = total_positive_unweighted - update_total_positive = update_total_positive_unweighted - else: - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - # normalizer used for NRCE (and ARCE with up_weight=True) - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) - - # update the variable holding the sum of weighted predictions - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. - normalizer = update_total_positive / update_total_prediction - - pred_comp = tf.subtract(tf.ones(shape=tf.shape(labels), dtype=tf.float32), predictions) - pred_comp_norm = tf.multiply(pred_comp, normalizer_comp, name="normalized_predictions_comp") - pred_num = tf.multiply(predictions, normalizer, name="normalized_pred_numerator") - pred_denom = tf.add(pred_num, pred_comp_norm, name="normalized_pred_denominator") - predictions = pred_num / pred_denom - - return predictions, total_positive, update_total_positive, weights - - -def rce(labels, predictions, - weights=None, - normalize=False, - arce=False, - up_weight=True, - metrics_collections=None, - updates_collections=None, - name=None, - deprecated_rce=False): - """ - Compute the relative cross entropy (RCE). - The RCE is a relative measurement compared to the baseline model's performance. - The baseline model always predicts average click-through-rate (CTR). - The RCE measures, in percentage, how much better the predictions are, compared - to the baseline model, in terms of cross entropy loss. - - y = label; p = prediction; - binary cross entropy = y * log(p) + (1-y) * log(1-p) - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - normalize: - if set to true, produce NRCEs used at Twitter. (normalize preds by weights first) - NOTE: if you don't understand what NRCE is, please don't use it. - arce: - if set to true, produces `ARCE `_. - This can only be activated if `normalize=True`. 
- up_weight: - if set to true, produces arce in the up_weighted space (considers CTR after up_weighting - data), while False gives arce in the original space (only considers CTR before up_weighting). - In the actual version, this flag can only be activated if arce is True. - Notice that the actual version of NRCE corresponds to up_weight=True. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - deprecated_rce: - enables the previous NRCE/ARCE calculations which calculated some label metrics - on the batch instead of on all batches seen so far. Note that the older metric - calculation is less stable, especially for smaller batch sizes. You should probably - never have to set this to True. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - .. note:: Must have at least 1 positive and 1 negative sample accumulated, - or RCE will come out as NaN. - """ - with tf.variable_scope(name, 'rce', (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") + name=None, +): + with tf.variable_scope(name, "num_samples", (labels, predictions, weights)): + num_samples = _metric_variable(name="num_samples", shape=[], dtype=tf.float64) + update_num_samples = tf.assign_add( + num_samples, tf.cast(tf.size(labels), num_samples.dtype), name="update_op" + ) - total_positive = _metric_variable(name='total_positive', shape=[], dtype=tf.float32) - total_loss = _metric_variable(name='total_loss', shape=[], dtype=tf.float32) - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float32) + value_op = tf.identity(num_samples) + update_op = tf.identity(update_num_samples) - label_weighted = tf.multiply(labels, weights, name="weighted_label") + if metrics_collections: + tf.add_to_collections(metrics_collections, value_op) - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(label_weighted), name="total_pos_update") + if updates_collections: + tf.add_to_collections(updates_collections, update_op) - if arce: - if normalize is False: - raise ValueError('This configuration of parameters is not actually allowed') + return value_op, update_op - predictions, total_positive, update_total_positive, weights = _get_arce_predictions( - predictions=predictions, weights=weights, deprecated_rce=deprecated_rce, - label_weighted=label_weighted, labels=labels, up_weight=up_weight, - total_positive=total_positive, update_total_positive=update_total_positive) - elif normalize: - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") +def ctr( + labels, + predictions, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None, +): + # pylint: disable=unused-argument + """ + Compute the weighted average positive sample ratio based on labels + (i.e. weighted average percentage of positive labels). + The name `ctr` (click-through-rate) is from legacy. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. 
+ weights: optional weights, whose shape must match labels. Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + ctr: A `Tensor` representing positive sample ratio. + update_op: An update operation used to accumulate data into this metric. + """ + return tf.metrics.mean( + values=labels, + weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, + name=name, + ) - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) - # update the variable holding the sum of weighted predictions - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") +def predicted_ctr( + labels, + predictions, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None, +): + # pylint: disable=unused-argument + """ + Compute the weighted average positive ratio based on predictions, + (i.e. weighted average predicted positive probability). + The name `ctr` (click-through-rate) is from legacy. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. + weights: optional weights, whose shape must match labels. Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + predicted_ctr: A `Tensor` representing the predicted positive ratio. + update_op: An update operation used to accumulate data into this metric. + """ + return tf.metrics.mean( + values=predictions, + weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, + name=name, + ) - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. - normalizer = update_total_positive / update_total_prediction - # NRCE - predictions = tf.multiply(predictions, normalizer, name="normalized_predictions") +def prediction_std_dev( + labels, + predictions, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None, +): + """ + Compute the weighted standard deviation of the predictions. + Note - this is not a confidence interval metric. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. + weights: optional weights, whose shape must match labels. Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + metric value: A `Tensor` representing the value of the metric on the data accumulated so far. + update_op: An update operation used to accumulate data into this metric.
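+
+    Example (an illustrative sketch; the value op only reflects eval batches that
+    have already been pushed through the update op):
+
+    .. code-block:: python
+
+        labels = tf.constant([1.0, 0.0, 1.0])
+        preds = tf.constant([0.8, 0.4, 0.6])
+        std_val, std_update = prediction_std_dev(labels, preds)
+        # run std_update once per eval batch, then read std_val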
+ """ + with tf.variable_scope(name, "pred_std_dev", (labels, predictions, weights)): + labels = tf.cast(labels, tf.float64) + predictions = tf.cast(predictions, tf.float64) + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float64, name="default_weight" + ) + else: + weights = tf.cast(weights, tf.float64) + + # State kept during streaming of examples + total_weighted_preds = _metric_variable( + name="total_weighted_preds", shape=[], dtype=tf.float64 + ) + total_weighted_preds_sq = _metric_variable( + name="total_weighted_preds_sq", shape=[], dtype=tf.float64 + ) + total_weights = _metric_variable( + name="total_weights", shape=[], dtype=tf.float64 + ) + + # Update state + update_total_weighted_preds = tf.assign_add( + total_weighted_preds, tf.reduce_sum(weights * predictions) + ) + update_total_weighted_preds_sq = tf.assign_add( + total_weighted_preds_sq, tf.reduce_sum(weights * predictions * predictions) + ) + update_total_weights = tf.assign_add(total_weights, tf.reduce_sum(weights)) + + # Compute output + def compute_output(tot_w, tot_wp, tot_wpp): + return tf.math.sqrt(tot_wpp / tot_w - (tot_wp / tot_w) ** 2) + + std_dev_est = compute_output( + total_weights, total_weighted_preds, total_weighted_preds_sq + ) + update_std_dev_est = compute_output( + update_total_weights, + update_total_weighted_preds, + update_total_weighted_preds_sq, + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, std_dev_est) + + if updates_collections: + tf.add_to_collections(updates_collections, update_std_dev_est) + + return std_dev_est, update_std_dev_est + + +def _get_arce_predictions( + predictions, + weights, + label_weighted, + labels, + up_weight, + deprecated_rce, + total_positive, + update_total_positive, +): + """ + Returns the ARCE predictions, total_positive, update_total_positive and weights + used by the rest of the twml.metrics.rce metric computation. 
+ """ + predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") + label_weighted_comp = tf.subtract( + tf.reduce_sum(weights), tf.reduce_sum(label_weighted) + ) + pred_weight_comp = tf.subtract( + tf.reduce_sum(weights), tf.reduce_sum(predictions_weighted) + ) + normalizer_comp = label_weighted_comp / pred_weight_comp - # clamp predictions to keep log(p) stable - clip_p = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - logloss = _binary_cross_entropy(pred=clip_p, target=labels, name="logloss") + if up_weight is False: + total_positive_unweighted = _metric_variable( + name="total_positive_unweighted", shape=[], dtype=tf.float32 + ) - logloss_weighted = tf.multiply(logloss, weights, name="weighted_logloss") + update_total_positive_unweighted = tf.assign_add( + total_positive_unweighted, + tf.reduce_sum(labels), + name="total_positive_unweighted_update", + ) - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(logloss_weighted), name="total_loss_update") - update_total_weight = tf.assign_add( - total_weight, tf.reduce_sum(weights), name="total_weight_update") + if deprecated_rce: + normalizer = tf.reduce_sum(labels) / tf.reduce_sum(label_weighted) + else: + # sum of labels / sum of weighted labels + normalizer = update_total_positive_unweighted / update_total_positive + + label_comp = tf.subtract(tf.to_float(tf.size(labels)), tf.reduce_sum(labels)) + normalizer_comp = label_comp / label_weighted_comp + + # note that up_weight=True changes these for the rest of the twml.metric.rce computation + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + total_positive = total_positive_unweighted + update_total_positive = update_total_positive_unweighted + else: + if deprecated_rce: + normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum( + predictions_weighted + ) + else: + # normalizer used for NRCE (and ARCE with up_weight=True) + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + # update the variable holding the sum of weighted predictions + update_total_prediction = tf.assign_add( + total_prediction, + tf.reduce_sum(predictions_weighted), + name="total_prediction_update", + ) + + # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) + # but it measure normalizer over batch was too flawed an approximation. + normalizer = update_total_positive / update_total_prediction + + pred_comp = tf.subtract( + tf.ones(shape=tf.shape(labels), dtype=tf.float32), predictions + ) + pred_comp_norm = tf.multiply( + pred_comp, normalizer_comp, name="normalized_predictions_comp" + ) + pred_num = tf.multiply(predictions, normalizer, name="normalized_pred_numerator") + pred_denom = tf.add(pred_num, pred_comp_norm, name="normalized_pred_denominator") + predictions = pred_num / pred_denom - # metric value retrieval subgraph - ctr1 = tf.truediv(total_positive, total_weight, name="ctr") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. 
- baseline_ce = _binary_cross_entropy(pred=ctr1, target=ctr1, name="baseline_ce") - pred_ce = tf.truediv(total_loss, total_weight, name="pred_ce") + return predictions, total_positive, update_total_positive, weights - rce_t = tf.multiply( - 1.0 - tf.truediv(pred_ce, baseline_ce), - 100, - name="rce") - # metric update subgraph - ctr2 = tf.truediv(update_total_positive, update_total_weight, name="ctr_update") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_ce2 = _binary_cross_entropy(pred=ctr2, target=ctr2, name="baseline_ce_update") - pred_ce2 = tf.truediv(update_total_loss, update_total_weight, name="pred_ce_update") +def rce( + labels, + predictions, + weights=None, + normalize=False, + arce=False, + up_weight=True, + metrics_collections=None, + updates_collections=None, + name=None, + deprecated_rce=False, +): + """ + Compute the relative cross entropy (RCE). + The RCE is a relative measurement compared to the baseline model's performance. + The baseline model always predicts average click-through-rate (CTR). + The RCE measures, in percentage, how much better the predictions are, compared + to the baseline model, in terms of cross entropy loss. + + y = label; p = prediction; + binary cross entropy = - (y * log(p) + (1-y) * log(1-p)) + + Args: + labels: + the ground truth value. + predictions: + the predicted values, whose shape must match labels. + weights: + optional weights, whose shape must match labels. Weight is 1 if not set. + normalize: + if set to true, produce NRCEs used at Twitter. (normalize preds by weights first) + NOTE: if you don't understand what NRCE is, please don't use it. + arce: + if set to true, produces `ARCE `_. + This can only be activated if `normalize=True`. + up_weight: + if set to true, produces arce in the up_weighted space (considers CTR after up_weighting + data), while False gives arce in the original space (only considers CTR before up_weighting). + In the current version, this flag can only be activated if arce is True. + Notice that the current version of NRCE corresponds to up_weight=True. + metrics_collections: + optional list of collections to add this metric into. + updates_collections: + optional list of collections to add the associated update_op into. + name: + an optional variable_scope name. + deprecated_rce: + enables the previous NRCE/ARCE calculations which calculated some label metrics + on the batch instead of on all batches seen so far. Note that the older metric + calculation is less stable, especially for smaller batch sizes. You should probably + never have to set this to True. + + Return: + rce_value: + A ``Tensor`` representing the RCE. + update_op: + An update operation used to accumulate data into this metric. + + .. note:: Must have at least 1 positive and 1 negative sample accumulated, + or RCE will come out as NaN.
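+
+    Example (an illustrative sketch):
+
+    .. code-block:: python
+
+        labels = tf.constant([1.0, 0.0, 1.0, 0.0])
+        preds = tf.constant([0.9, 0.2, 0.8, 0.3])
+        rce_val, rce_update = rce(labels, preds)
+        # run rce_update over the whole eval set; rce_val is then the RCE in percent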
+ """ + with tf.variable_scope(name, "rce", (labels, predictions, weights)): + labels = tf.to_float(labels, name="label_to_float") + predictions = tf.to_float(predictions, name="predictions_to_float") + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + else: + weights = tf.to_float(weights, name="weight_to_float") + + total_positive = _metric_variable( + name="total_positive", shape=[], dtype=tf.float32 + ) + total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) + + label_weighted = tf.multiply(labels, weights, name="weighted_label") + + update_total_positive = tf.assign_add( + total_positive, tf.reduce_sum(label_weighted), name="total_pos_update" + ) + + if arce: + if normalize is False: + raise ValueError( + "This configuration of parameters is not actually allowed" + ) + + ( + predictions, + total_positive, + update_total_positive, + weights, + ) = _get_arce_predictions( + predictions=predictions, + weights=weights, + deprecated_rce=deprecated_rce, + label_weighted=label_weighted, + labels=labels, + up_weight=up_weight, + total_positive=total_positive, + update_total_positive=update_total_positive, + ) + + elif normalize: + predictions_weighted = tf.multiply( + predictions, weights, name="weighted_preds" + ) + + if deprecated_rce: + normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum( + predictions_weighted + ) + else: + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + # update the variable holding the sum of weighted predictions + update_total_prediction = tf.assign_add( + total_prediction, + tf.reduce_sum(predictions_weighted), + name="total_prediction_update", + ) + + # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) + # but it measure normalizer over batch was too flawed an approximation. + normalizer = update_total_positive / update_total_prediction + + # NRCE + predictions = tf.multiply( + predictions, normalizer, name="normalized_predictions" + ) + + # clamp predictions to keep log(p) stable + clip_p = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p" + ) + logloss = _binary_cross_entropy(pred=clip_p, target=labels, name="logloss") + + logloss_weighted = tf.multiply(logloss, weights, name="weighted_logloss") + + update_total_loss = tf.assign_add( + total_loss, tf.reduce_sum(logloss_weighted), name="total_loss_update" + ) + update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="total_weight_update" + ) + + # metric value retrieval subgraph + ctr1 = tf.truediv(total_positive, total_weight, name="ctr") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. + baseline_ce = _binary_cross_entropy(pred=ctr1, target=ctr1, name="baseline_ce") + pred_ce = tf.truediv(total_loss, total_weight, name="pred_ce") + + rce_t = tf.multiply(1.0 - tf.truediv(pred_ce, baseline_ce), 100, name="rce") + + # metric update subgraph + ctr2 = tf.truediv(update_total_positive, update_total_weight, name="ctr_update") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. 
+ baseline_ce2 = _binary_cross_entropy( + pred=ctr2, target=ctr2, name="baseline_ce_update" + ) + pred_ce2 = tf.truediv( + update_total_loss, update_total_weight, name="pred_ce_update" + ) + + update_op = tf.multiply( + 1.0 - tf.truediv(pred_ce2, baseline_ce2), 100, name="update_op" + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, rce_t) + + if updates_collections: + tf.add_to_collections(updates_collections, update_op) + + return rce_t, update_op - update_op = tf.multiply( - 1.0 - tf.truediv(pred_ce2, baseline_ce2), - 100, - name="update_op") - if metrics_collections: - tf.add_to_collections(metrics_collections, rce_t) +def ce(p_true, p_est=None): + if p_est is None: + p_est = p_true + return _binary_cross_entropy(pred=p_est, target=p_true, name=None) - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - return rce_t, update_op +def rce_transform(outputs, labels, weights): + """ + Construct an OrderedDict of quantities to aggregate over eval batches. + outputs, labels, weights are TensorFlow tensors, and are assumed to + be of shape [N] for batch_size = N + Each entry in the output OrderedDict should also be of shape [N] + """ + out_vals = OrderedDict() + out_vals["weighted_loss"] = weights * ce(p_true=labels, p_est=outputs) + out_vals["weighted_labels"] = labels * weights + out_vals["weight"] = weights + return out_vals -def ce(p_true, p_est=None): - if p_est is None: - p_est = p_true - return _binary_cross_entropy(pred=p_est, target=p_true, name=None) +def rce_metric(aggregates): + """ + input ``aggregates`` is an OrderedDict with the same keys as those created + by rce_transform(). The dict values are the aggregates (reduce_sum) + of the values produced by rce_transform(), and should be scalars. + output is the value of RCE + """ + # cumulative weighted loss of model predictions + total_weighted_loss = aggregates["weighted_loss"] + total_weighted_labels = aggregates["weighted_labels"] + total_weight = aggregates["weight"] + model_average_loss = total_weighted_loss / total_weight + baseline_average_loss = ce(total_weighted_labels / total_weight) + return 100.0 * (1 - model_average_loss / baseline_average_loss) -def rce_transform(outputs, labels, weights): - ''' - Construct an OrderedDict of quantities to aggregate over eval batches - outputs, labels, weights are TensorFlow tensors, and are assumed to - be of shape [N] for batch_size = N - Each entry in the output OrderedDict should also be of shape [N] - ''' - out_vals = OrderedDict() - out_vals['weighted_loss'] = weights * ce(p_true=labels, p_est=outputs) - out_vals['weighted_labels'] = labels * weights - out_vals['weight'] = weights - return out_vals +def metric_std_err( + labels, + predictions, + weights=None, + transform=rce_transform, + metric=rce_metric, + metrics_collections=None, + updates_collections=None, + name="rce_std_err", +): + """ + Compute the weighted standard error of the RCE metric on this eval set. + This can be used for confidence intervals and unpaired hypothesis tests. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. + weights: optional weights, whose shape must match labels. Weight is 1 if not set. + transform: a function of the following form: + + .. code-block:: python + + def transform(outputs, labels, weights): + out_vals = OrderedDict() + ... + return out_vals + + where outputs, labels, and weights are all tensors of shape [eval_batch_size].
+ The returned OrderedDict() should have values that are tensors of shape [eval_batch_size]. + These will be aggregated across many batches in the eval dataset, to produce + one scalar value per key of out_vals. + metric: a function of the following form: + + .. code-block:: python + + def metric(aggregates): + ... + return metric_value + + where aggregates is an OrderedDict() having the same keys created by transform(). + Each of the corresponding dict values is the reduce_sum of the values produced by + transform(), and is a TF scalar. The return value should be a scalar representing + the value of the desired metric. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + metric value: A `Tensor` representing the value of the metric on the data accumulated so far. + update_op: An update operation used to accumulate data into this metric. + """ + with tf.variable_scope(name, "metric_std_err", (labels, predictions, weights)): + labels = tf.cast(labels, tf.float64) + predictions = tf.cast(predictions, tf.float64) -def rce_metric(aggregates): - ''' - input ``aggregates`` is an OrderedDict with the same keys as those created - by rce_transform(). The dict values are the aggregates (reduce_sum) - of the values produced by rce_transform(), and should be scalars. - output is the value of RCE - ''' - # cummulative weighted loss of model predictions - total_weighted_loss = aggregates['weighted_loss'] - total_weighted_labels = aggregates['weighted_labels'] - total_weight = aggregates['weight'] - - model_average_loss = total_weighted_loss / total_weight - baseline_average_loss = ce(total_weighted_labels / total_weight) - return 100.0 * (1 - model_average_loss / baseline_average_loss) - - -def metric_std_err(labels, predictions, - weights=None, - transform=rce_transform, metric=rce_metric, - metrics_collections=None, - updates_collections=None, - name='rce_std_err'): - """ - Compute the weighted standard error of the RCE metric on this eval set. - This can be used for confidence intervals and unpaired hypothesis tests. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - transform: a function of the following form: - - .. code-block:: python - - def transform(outputs, labels, weights): - out_vals = OrderedDict() - ... - return out_vals - - where outputs, labels, and weights are all tensors of shape [eval_batch_size]. - The returned OrderedDict() should have values that are tensors of shape [eval_batch_size]. - These will be aggregated across many batches in the eval dataset, to produce - one scalar value per key of out_vals. - metric: a function of the following form - - .. code-block:: python - - def metric(aggregates): - ... - return metric_value - - where aggregates is an OrderedDict() having the same keys created by transform(). - Each of the corresponding dict values is the reduce_sum of the values produced by - transform(), and is a TF scalar. The return value should be a scalar representing - the value of the desired metric. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name.
- - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. - update_op: A update operation used to accumulate data into this metric. - """ - with tf.variable_scope(name, 'metric_std_err', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones_like(labels, dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - labels = tf.reshape(labels, [-1]) - predictions = tf.reshape(predictions, [-1]) - predictions = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - weights = tf.reshape(weights, [-1]) - - # first apply the supplied transform function to the output, label, weight data - # returns an OrderedDict of 1xN tensors for N input samples - # for each sample, compute f = transform(pred, l, w) - transformed = transform(predictions, labels, weights) - - # we track 3 types of aggregate information - # 1. total number of samples - # 2. aggregated transformed samples (moment1), i.e. sum(f) - # 3. aggregated crosses of transformed samples (moment2), i.e. sum(f*f^T) - - # count total number of samples - sample_count = _metric_variable( - name='sample_count', shape=[], dtype=tf.int64) - update_sample_count = tf.assign_add(sample_count, tf.size(labels, out_type=sample_count.dtype)) - - # compose the ordered dict into a single vector - # so f can be treated as a single column vector rather than a collection of scalars - N = len(transformed) - transformed_vec = tf.stack(list(transformed.values()), axis=1) - - # compute and update transformed samples (1st order statistics) - # i.e. accumulate f into F as F += sum(f) - aggregates_1 = _metric_variable( - name='aggregates_1', shape=[N], dtype=tf.float64) - update_aggregates_1 = tf.assign_add(aggregates_1, tf.reduce_sum(transformed_vec, axis=0)) - - # compute and update crossed transformed samples (2nd order statistics) - # i.e. accumulate f*f^T into F2 as F2 += sum(f*transpose(f)) - aggregates_2 = _metric_variable( - name='aggregates_2', shape=[N, N], dtype=tf.float64) - moment_2_temp = ( - tf.reshape(transformed_vec, shape=[-1, N, 1]) - * tf.reshape(transformed_vec, shape=[-1, 1, N]) - ) - update_aggregates_2 = tf.assign_add(aggregates_2, tf.reduce_sum(moment_2_temp, axis=0)) - - def compute_output(agg_1, agg_2, samp_cnt): - # decompose the aggregates back into a dict to pass to the user-supplied metric fn - aggregates_dict = OrderedDict() - for i, key in enumerate(transformed.keys()): - aggregates_dict[key] = agg_1[i] - - metric_value = metric(aggregates_dict) - - # derivative of metric with respect to the 1st order aggregates - # i.e. 
d M(agg1) / d agg1 - metric_prime = tf.gradients(metric_value, agg_1, stop_gradients=agg_1) - - # estimated covariance of agg_1 - # cov(F) = sum(f*f^T) - (sum(f) * sum(f)^T) / N - # = agg_2 - (agg_1 * agg_1^T) / N - N_covariance_estimate = agg_2 - ( - tf.reshape(agg_1, shape=[-1, 1]) - @ tf.reshape(agg_1, shape=[1, -1]) - / tf.cast(samp_cnt, dtype=tf.float64) - ) - - # push N_covariance_estimate through a linearization of metric around agg_1 - # metric var = transpose(d M(agg1) / d agg1) * cov(F) * (d M(agg1) / d agg1) - metric_variance = ( - tf.reshape(metric_prime, shape=[1, -1]) - @ N_covariance_estimate - @ tf.reshape(metric_prime, shape=[-1, 1]) - ) - # result should be a single element, but the matmul is 2D - metric_variance = metric_variance[0][0] - metric_stderr = tf.sqrt(metric_variance) - return metric_stderr - - metric_stderr = compute_output(aggregates_1, aggregates_2, sample_count) - update_metric_stderr = compute_output(update_aggregates_1, update_aggregates_2, update_sample_count) - - if metrics_collections: - tf.add_to_collections(metrics_collections, metric_stderr) - - if updates_collections: - tf.add_to_collections(updates_collections, update_metric_stderr) - - return metric_stderr, update_metric_stderr - - -def lolly_nrce(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the Lolly NRCE. - - Note: As this NRCE calculation uses Taylor expansion, it becomes inaccurate when the ctr is large, - especially when the adjusted ctr goes above 1.0. - - Calculation: - - :: - - NRCE: lolly NRCE - BCE: baseline cross entropy - NCE: normalized cross entropy - CE: cross entropy - y_i: label of example i - p_i: prediction of example i - y: ctr - p: average prediction - a: normalizer - - Assumes any p_i and a * p_i is within [0, 1) - NRCE = (1 - NCE / BCE) * 100 - BCE = - sum_i(y_i * log(y) + (1 - y_i) * log(1 - y)) - = - (y * log(y) + (1 - y) * log(1 - y)) - a = y / p - CE = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - NCE = - sum_i(y_i * log(a * p_i) + (1 - y_i) * log(1 - a * p_i)) - = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - - sum_i(y_i * log(a)) - + sum_i((1 - y_i) * log(1 - p_i)) - - sum_i((1 - y_i) * log(1 - a * p_i)) - ~= CE - sum_i(y_i) * log(a) - + sum_i((1 - y_i) * (- sum_{j=1~5}(p_i^j / j))) - - sum_i((1 - y_i) * (- sum_{j=1~5}(a^j * p_i^j / j))) - # Takes 5 items from the Taylor expansion, can be increased if needed - # Error for each example is O(p_i^6) - = CE - sum_i(y_i) * log(a) - - sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) / j) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * a^j / j) - = CE - sum_i(y_i) * log(a) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j) - - Thus we keep track of CE, sum_i(y_i), sum_i((1 - y_i) * p_i^j) for j=1~5. - We also keep track of p and y by sum_i(y_i), sum_i(p_i), sum_i(1) so that - we can get a at the end, which leads to this NRCE. - - NRCE uses ctr and average pctr to normalize the pctrs. - It removes the impact of prediction error from RCE. - Usually NRCE is higher as the prediction error impact on RCE is negative. - Removing prediction error in our model can make RCE closer to NRCE and thus improve RCE. - - In Lolly NRCE we use ctr and average pctr of the whole dataset. - We thus remove the dataset level error in NRCE calculation. - In this case, when we want to improve RCE to the level of NRCE, - it is achievable as dataset level prediction error is easy to remove by calibration. 
- Lolly NRCE is thus a good estimate about the potential gain by adding calibration. - - In DBv2 NRCE, we use per-batch ctr and average pctr. We remove the batch level error. - This error is difficult to remove by modeling improvement, - at least not by simple calibration. - It thus cannot indicate the same opportunity as the Lolly NRCE does. - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - Note: Must have at least 1 positive and 1 negative sample accumulated, - or NRCE will come out as NaN. - """ - with tf.variable_scope(name, "lolly_nrce", (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") - - positive_weights = tf.multiply(labels, weights, name="positive_weights") - - # clamp predictions to keep log(p) stable - clip_predictions = tf.clip_by_value( - predictions, - CLAMP_EPSILON, - 1.0 - CLAMP_EPSILON, - name="clip_predictions") - weighted_predictions = tf.multiply( - predictions, weights, - name="weighted_predictions") - - logloss = _binary_cross_entropy(pred=clip_predictions, target=labels, name="logloss") - weighted_logloss = tf.multiply(logloss, weights, name="weighted_logloss") - - negatives = tf.subtract( - tf.ones(shape=tf.shape(labels), dtype=tf.float32), - labels, - name="negatives") - negative_predictions = tf.multiply( - predictions, - negatives, - name="negative_predictions") - weighted_negative_predictions = tf.multiply( - negative_predictions, weights, - name="weighted_negative_predictions") - negative_squared_predictions = tf.multiply( - negative_predictions, - negative_predictions, - name="negative_squared_predictions") - weighted_negative_squared_predictions = tf.multiply( - negative_squared_predictions, weights, - name="weighted_negative_squared_predictions") - negative_cubed_predictions = tf.multiply( - negative_squared_predictions, - negative_predictions, - name="negative_cubed_predictions") - weighted_negative_cubed_predictions = tf.multiply( - negative_cubed_predictions, weights, - name="weighted_negative_cubed_predictions") - negative_quartic_predictions = tf.multiply( - negative_cubed_predictions, - negative_predictions, - name="negative_quartic_predictions") - weighted_negative_quartic_predictions = tf.multiply( - negative_quartic_predictions, weights, - name="weighted_negative_quartic_predictions") - negative_quintic_predictions = tf.multiply( - negative_quartic_predictions, - negative_predictions, - name="negative_quintic_predictions") - weighted_negative_quintic_predictions = tf.multiply( - negative_quintic_predictions, weights, - name="weighted_negative_quintic_predictions") - - # Tracked stats - total_positive = _metric_variable(name="total_positive", shape=[], dtype=tf.float32) - total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) - - total_prediction = 
_metric_variable(name="total_prediction", shape=[], dtype=tf.float32) - - total_negative_prediction = _metric_variable( - name="total_negative_prediction", - shape=[], dtype=tf.float32) - total_negative_squared_prediction = _metric_variable( - name="total_negative_squared_prediction", - shape=[], dtype=tf.float32) - total_negative_cubed_prediction = _metric_variable( - name="total_negative_cubed_prediction", - shape=[], dtype=tf.float32) - total_negative_quartic_prediction = _metric_variable( - name="total_negative_quartic_prediction", - shape=[], dtype=tf.float32) - total_negative_quintic_prediction = _metric_variable( - name="total_negative_quintic_prediction", - shape=[], dtype=tf.float32) - - total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) - - # Update tracked stats - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(positive_weights), name="total_positive_update") - update_total_weight = tf.assign_add( - total_weight, tf.reduce_sum(weights), name="total_weight_update") - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(weighted_predictions), name="total_prediction_update") - update_total_negative_prediction = tf.assign_add( - total_negative_prediction, - tf.reduce_sum(weighted_negative_predictions), name="total_negative_prediction_update") - update_total_negative_squared_prediction = tf.assign_add( - total_negative_squared_prediction, - tf.reduce_sum(weighted_negative_squared_predictions), - name="total_negative_squared_prediction_update") - update_total_negative_cubed_prediction = tf.assign_add( - total_negative_cubed_prediction, - tf.reduce_sum(weighted_negative_cubed_predictions), - name="total_negative_cubed_prediction_update") - update_total_negative_quartic_prediction = tf.assign_add( - total_negative_quartic_prediction, - tf.reduce_sum(weighted_negative_quartic_predictions), - name="total_negative_quartic_prediction_update") - update_total_negative_quintic_prediction = tf.assign_add( - total_negative_quintic_prediction, - tf.reduce_sum(weighted_negative_quintic_predictions), - name="total_negative_quintic_prediction_update") - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(weighted_logloss), name="total_loss_update") - - # metric value retrieval subgraph - # ctr of this batch - positive_rate = tf.truediv(total_positive, total_weight, name="positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. 
- baseline_loss = _binary_cross_entropy( - pred=positive_rate, - target=positive_rate, - name="baseline_loss") - - # normalizing ratio for nrce - # calculated using total ctr and pctr so the last batch has the dataset ctr and pctr - normalizer = tf.truediv(total_positive, total_prediction, name="normalizer") - # Taylor expansion to calculate nl = - sum(y * log(p * a) + (1 - y) * log (1 - p * a)) - # log(1 - p * a) = -sum_{i=1~+inf}(a^i * x^i / i) - # log(1 - p) = -sum_{i=1~+inf}(a^i * x^i / i) - normalized_loss = ( - total_loss - - total_positive * tf.log(normalizer) + - total_negative_prediction * (normalizer - 1) + - total_negative_squared_prediction * (normalizer * normalizer - 1) / 2 + - total_negative_cubed_prediction * - (normalizer * normalizer * normalizer - 1) / 3 + - total_negative_quartic_prediction * - (normalizer * normalizer * normalizer * normalizer - 1) / 4 + - total_negative_quintic_prediction * - (normalizer * normalizer * normalizer * normalizer * normalizer - 1) / 5) - - # average normalized loss - avg_loss = tf.truediv(normalized_loss, total_weight, name="avg_loss") - - nrce_t = tf.multiply( - 1.0 - tf.truediv(avg_loss, baseline_loss), - 100, - name="lolly_nrce") - - # metric update subgraph - update_positive_rate = tf.truediv( - update_total_positive, - update_total_weight, - name="update_positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - update_baseline_loss = _binary_cross_entropy( - pred=update_positive_rate, - target=update_positive_rate, - name="update_baseline_loss") - - update_normalizer = tf.truediv( - update_total_positive, - update_total_prediction, - name="update_normalizer") - update_normalized_loss = ( - update_total_loss - - update_total_positive * tf.log(update_normalizer) + - update_total_negative_prediction * - (update_normalizer - 1) + - update_total_negative_squared_prediction * - (update_normalizer * update_normalizer - 1) / 2 + - update_total_negative_cubed_prediction * - (update_normalizer * update_normalizer * update_normalizer - 1) / 3 + - update_total_negative_quartic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer - 1) / 4 + - update_total_negative_quintic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer * update_normalizer - 1) / 5) - - update_avg_loss = tf.truediv( - update_normalized_loss, - update_total_weight, - name="update_avg_loss") - - update_op = tf.multiply( - 1.0 - tf.truediv(update_avg_loss, update_baseline_loss), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, nrce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return nrce_t, update_op + if weights is None: + weights = tf.ones_like(labels, dtype=tf.float64, name="default_weight") + else: + weights = tf.cast(weights, tf.float64) + + labels = tf.reshape(labels, [-1]) + predictions = tf.reshape(predictions, [-1]) + predictions = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p" + ) + weights = tf.reshape(weights, [-1]) + + # first apply the supplied transform function to the output, label, weight data + # returns an OrderedDict of 1xN tensors for N input samples + # for each sample, compute f = transform(pred, l, w) + transformed = transform(predictions, labels, weights) + + # we track 3 types of aggregate information + # 
1. total number of samples + # 2. aggregated transformed samples (moment1), i.e. sum(f) + # 3. aggregated crosses of transformed samples (moment2), i.e. sum(f*f^T) + + # count total number of samples + sample_count = _metric_variable(name="sample_count", shape=[], dtype=tf.int64) + update_sample_count = tf.assign_add( + sample_count, tf.size(labels, out_type=sample_count.dtype) + ) + + # compose the ordered dict into a single vector + # so f can be treated as a single column vector rather than a collection of scalars + N = len(transformed) + transformed_vec = tf.stack(list(transformed.values()), axis=1) + + # compute and update transformed samples (1st order statistics) + # i.e. accumulate f into F as F += sum(f) + aggregates_1 = _metric_variable( + name="aggregates_1", shape=[N], dtype=tf.float64 + ) + update_aggregates_1 = tf.assign_add( + aggregates_1, tf.reduce_sum(transformed_vec, axis=0) + ) + + # compute and update crossed transformed samples (2nd order statistics) + # i.e. accumulate f*f^T into F2 as F2 += sum(f*transpose(f)) + aggregates_2 = _metric_variable( + name="aggregates_2", shape=[N, N], dtype=tf.float64 + ) + moment_2_temp = tf.reshape(transformed_vec, shape=[-1, N, 1]) * tf.reshape( + transformed_vec, shape=[-1, 1, N] + ) + update_aggregates_2 = tf.assign_add( + aggregates_2, tf.reduce_sum(moment_2_temp, axis=0) + ) + + def compute_output(agg_1, agg_2, samp_cnt): + # decompose the aggregates back into a dict to pass to the user-supplied metric fn + aggregates_dict = OrderedDict() + for i, key in enumerate(transformed.keys()): + aggregates_dict[key] = agg_1[i] + + metric_value = metric(aggregates_dict) + + # derivative of metric with respect to the 1st order aggregates + # i.e. d M(agg1) / d agg1 + metric_prime = tf.gradients(metric_value, agg_1, stop_gradients=agg_1) + + # estimated covariance of agg_1 + # cov(F) = sum(f*f^T) - (sum(f) * sum(f)^T) / N + # = agg_2 - (agg_1 * agg_1^T) / N + N_covariance_estimate = agg_2 - ( + tf.reshape(agg_1, shape=[-1, 1]) + @ tf.reshape(agg_1, shape=[1, -1]) + / tf.cast(samp_cnt, dtype=tf.float64) + ) + + # push N_covariance_estimate through a linearization of metric around agg_1 + # metric var = transpose(d M(agg1) / d agg1) * cov(F) * (d M(agg1) / d agg1) + metric_variance = ( + tf.reshape(metric_prime, shape=[1, -1]) + @ N_covariance_estimate + @ tf.reshape(metric_prime, shape=[-1, 1]) + ) + # result should be a single element, but the matmul is 2D + metric_variance = metric_variance[0][0] + metric_stderr = tf.sqrt(metric_variance) + return metric_stderr + + metric_stderr = compute_output(aggregates_1, aggregates_2, sample_count) + update_metric_stderr = compute_output( + update_aggregates_1, update_aggregates_2, update_sample_count + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, metric_stderr) + + if updates_collections: + tf.add_to_collections(updates_collections, update_metric_stderr) + + return metric_stderr, update_metric_stderr + + +def lolly_nrce( + labels, + predictions, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None, +): + """ + Compute the Lolly NRCE. + + Note: As this NRCE calculation uses Taylor expansion, it becomes inaccurate when the ctr is large, + especially when the adjusted ctr goes above 1.0. 
+
+    Calculation:
+
+    ::
+
+        NRCE: lolly NRCE
+        BCE: baseline cross entropy
+        NCE: normalized cross entropy
+        CE: cross entropy
+        y_i: label of example i
+        p_i: prediction of example i
+        y: ctr
+        p: average prediction
+        a: normalizer
+
+        Assumes any p_i and a * p_i are within [0, 1)
+        NRCE = (1 - NCE / BCE) * 100
+        BCE = - sum_i(y_i * log(y) + (1 - y_i) * log(1 - y))
+            = - (y * log(y) + (1 - y) * log(1 - y))
+        a = y / p
+        CE = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i))
+        NCE = - sum_i(y_i * log(a * p_i) + (1 - y_i) * log(1 - a * p_i))
+            = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i))
+              - sum_i(y_i * log(a))
+              + sum_i((1 - y_i) * log(1 - p_i))
+              - sum_i((1 - y_i) * log(1 - a * p_i))
+           ~= CE - sum_i(y_i) * log(a)
+              + sum_i((1 - y_i) * (- sum_{j=1~5}(p_i^j / j)))
+              - sum_i((1 - y_i) * (- sum_{j=1~5}(a^j * p_i^j / j)))
+              # Takes 5 items from the Taylor expansion, can be increased if needed
+              # Error for each example is O(p_i^6)
+            = CE - sum_i(y_i) * log(a)
+              - sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) / j)
+              + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * a^j / j)
+            = CE - sum_i(y_i) * log(a)
+              + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j)
+
+    Thus we keep track of CE, sum_i(y_i), sum_i((1 - y_i) * p_i^j) for j=1~5.
+    We also keep track of p and y by sum_i(y_i), sum_i(p_i), sum_i(1) so that
+    we can get a at the end, which leads to this NRCE.
+
+    NRCE uses ctr and average pctr to normalize the pctrs.
+    It removes the impact of prediction error from RCE.
+    Usually NRCE is higher, as the impact of prediction error on RCE is negative.
+    Removing prediction error in our model can make RCE closer to NRCE and thus improve RCE.
+
+    In Lolly NRCE we use the ctr and average pctr of the whole dataset.
+    We thus remove the dataset-level error in the NRCE calculation.
+    In this case, when we want to improve RCE to the level of NRCE,
+    it is achievable, as dataset-level prediction error is easy to remove by calibration.
+    Lolly NRCE is thus a good estimate of the potential gain from adding calibration.
+
+    In DBv2 NRCE, we use per-batch ctr and average pctr. We remove the batch-level error.
+    This error is difficult to remove by modeling improvement,
+    at least not by simple calibration.
+    It thus cannot indicate the same opportunity as the Lolly NRCE does.
+
+    Args:
+      labels:
+        the ground truth values.
+      predictions:
+        the predicted values, whose shape must match labels.
+      weights:
+        optional weights, whose shape must match labels. Weight is 1 if not set.
+      metrics_collections:
+        optional list of collections to add this metric into.
+      updates_collections:
+        optional list of collections to add the associated update_op into.
+      name:
+        an optional variable_scope name.
+
+    Return:
+      nrce_value:
+        A ``Tensor`` representing the NRCE.
+      update_op:
+        An update operation used to accumulate data into this metric.
+
+    Note: Must have at least 1 positive and 1 negative sample accumulated,
+    or NRCE will come out as NaN.
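# A plain-NumPy sketch (illustrative, not part of this diff) of the Taylor-expanded
# calculation in the docstring above, assuming unit weights. The label/prediction
# arrays are made-up values for illustration.
import numpy as np

y = np.array([1.0, 0.0, 0.0, 1.0, 0.0])  # labels
p = np.array([0.6, 0.2, 0.1, 0.7, 0.3])  # predictions

ce = -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))  # CE
a = y.sum() / p.sum()                                  # normalizer a = y / p

# NCE ~= CE - sum_i(y_i) * log(a) + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j)
nce = ce - y.sum() * np.log(a) + sum(
    np.sum((1 - y) * p ** j) * (a ** j - 1) / j for j in range(1, 6)
)

ctr = y.mean()
bce = -len(y) * (ctr * np.log(ctr) + (1 - ctr) * np.log(1 - ctr))  # baseline CE
print((1 - nce / bce) * 100)  # NRCE = (1 - NCE / BCE) * 100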
+ """ + with tf.variable_scope(name, "lolly_nrce", (labels, predictions, weights)): + labels = tf.to_float(labels, name="label_to_float") + predictions = tf.to_float(predictions, name="predictions_to_float") + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + else: + weights = tf.to_float(weights, name="weight_to_float") + + positive_weights = tf.multiply(labels, weights, name="positive_weights") + + # clamp predictions to keep log(p) stable + clip_predictions = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_predictions" + ) + weighted_predictions = tf.multiply( + predictions, weights, name="weighted_predictions" + ) + + logloss = _binary_cross_entropy( + pred=clip_predictions, target=labels, name="logloss" + ) + weighted_logloss = tf.multiply(logloss, weights, name="weighted_logloss") + + negatives = tf.subtract( + tf.ones(shape=tf.shape(labels), dtype=tf.float32), labels, name="negatives" + ) + negative_predictions = tf.multiply( + predictions, negatives, name="negative_predictions" + ) + weighted_negative_predictions = tf.multiply( + negative_predictions, weights, name="weighted_negative_predictions" + ) + negative_squared_predictions = tf.multiply( + negative_predictions, + negative_predictions, + name="negative_squared_predictions", + ) + weighted_negative_squared_predictions = tf.multiply( + negative_squared_predictions, + weights, + name="weighted_negative_squared_predictions", + ) + negative_cubed_predictions = tf.multiply( + negative_squared_predictions, + negative_predictions, + name="negative_cubed_predictions", + ) + weighted_negative_cubed_predictions = tf.multiply( + negative_cubed_predictions, + weights, + name="weighted_negative_cubed_predictions", + ) + negative_quartic_predictions = tf.multiply( + negative_cubed_predictions, + negative_predictions, + name="negative_quartic_predictions", + ) + weighted_negative_quartic_predictions = tf.multiply( + negative_quartic_predictions, + weights, + name="weighted_negative_quartic_predictions", + ) + negative_quintic_predictions = tf.multiply( + negative_quartic_predictions, + negative_predictions, + name="negative_quintic_predictions", + ) + weighted_negative_quintic_predictions = tf.multiply( + negative_quintic_predictions, + weights, + name="weighted_negative_quintic_predictions", + ) + + # Tracked stats + total_positive = _metric_variable( + name="total_positive", shape=[], dtype=tf.float32 + ) + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) + + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + total_negative_prediction = _metric_variable( + name="total_negative_prediction", shape=[], dtype=tf.float32 + ) + total_negative_squared_prediction = _metric_variable( + name="total_negative_squared_prediction", shape=[], dtype=tf.float32 + ) + total_negative_cubed_prediction = _metric_variable( + name="total_negative_cubed_prediction", shape=[], dtype=tf.float32 + ) + total_negative_quartic_prediction = _metric_variable( + name="total_negative_quartic_prediction", shape=[], dtype=tf.float32 + ) + total_negative_quintic_prediction = _metric_variable( + name="total_negative_quintic_prediction", shape=[], dtype=tf.float32 + ) + + total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) + + # Update tracked stats + update_total_positive = tf.assign_add( + total_positive, + tf.reduce_sum(positive_weights), + name="total_positive_update", + ) + 
update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="total_weight_update" + ) + update_total_prediction = tf.assign_add( + total_prediction, + tf.reduce_sum(weighted_predictions), + name="total_prediction_update", + ) + update_total_negative_prediction = tf.assign_add( + total_negative_prediction, + tf.reduce_sum(weighted_negative_predictions), + name="total_negative_prediction_update", + ) + update_total_negative_squared_prediction = tf.assign_add( + total_negative_squared_prediction, + tf.reduce_sum(weighted_negative_squared_predictions), + name="total_negative_squared_prediction_update", + ) + update_total_negative_cubed_prediction = tf.assign_add( + total_negative_cubed_prediction, + tf.reduce_sum(weighted_negative_cubed_predictions), + name="total_negative_cubed_prediction_update", + ) + update_total_negative_quartic_prediction = tf.assign_add( + total_negative_quartic_prediction, + tf.reduce_sum(weighted_negative_quartic_predictions), + name="total_negative_quartic_prediction_update", + ) + update_total_negative_quintic_prediction = tf.assign_add( + total_negative_quintic_prediction, + tf.reduce_sum(weighted_negative_quintic_predictions), + name="total_negative_quintic_prediction_update", + ) + update_total_loss = tf.assign_add( + total_loss, tf.reduce_sum(weighted_logloss), name="total_loss_update" + ) + + # metric value retrieval subgraph + # ctr of this batch + positive_rate = tf.truediv(total_positive, total_weight, name="positive_rate") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. + baseline_loss = _binary_cross_entropy( + pred=positive_rate, target=positive_rate, name="baseline_loss" + ) + + # normalizing ratio for nrce + # calculated using total ctr and pctr so the last batch has the dataset ctr and pctr + normalizer = tf.truediv(total_positive, total_prediction, name="normalizer") + # Taylor expansion to calculate nl = - sum(y * log(p * a) + (1 - y) * log (1 - p * a)) + # log(1 - p * a) = -sum_{i=1~+inf}(a^i * x^i / i) + # log(1 - p) = -sum_{i=1~+inf}(a^i * x^i / i) + normalized_loss = ( + total_loss + - total_positive * tf.log(normalizer) + + total_negative_prediction * (normalizer - 1) + + total_negative_squared_prediction * (normalizer * normalizer - 1) / 2 + + total_negative_cubed_prediction + * (normalizer * normalizer * normalizer - 1) + / 3 + + total_negative_quartic_prediction + * (normalizer * normalizer * normalizer * normalizer - 1) + / 4 + + total_negative_quintic_prediction + * (normalizer * normalizer * normalizer * normalizer * normalizer - 1) + / 5 + ) + + # average normalized loss + avg_loss = tf.truediv(normalized_loss, total_weight, name="avg_loss") + + nrce_t = tf.multiply( + 1.0 - tf.truediv(avg_loss, baseline_loss), 100, name="lolly_nrce" + ) + + # metric update subgraph + update_positive_rate = tf.truediv( + update_total_positive, update_total_weight, name="update_positive_rate" + ) + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. 
+ update_baseline_loss = _binary_cross_entropy( + pred=update_positive_rate, + target=update_positive_rate, + name="update_baseline_loss", + ) + + update_normalizer = tf.truediv( + update_total_positive, update_total_prediction, name="update_normalizer" + ) + update_normalized_loss = ( + update_total_loss + - update_total_positive * tf.log(update_normalizer) + + update_total_negative_prediction * (update_normalizer - 1) + + update_total_negative_squared_prediction + * (update_normalizer * update_normalizer - 1) + / 2 + + update_total_negative_cubed_prediction + * (update_normalizer * update_normalizer * update_normalizer - 1) + / 3 + + update_total_negative_quartic_prediction + * ( + update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + - 1 + ) + / 4 + + update_total_negative_quintic_prediction + * ( + update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + - 1 + ) + / 5 + ) + + update_avg_loss = tf.truediv( + update_normalized_loss, update_total_weight, name="update_avg_loss" + ) + + update_op = tf.multiply( + 1.0 - tf.truediv(update_avg_loss, update_baseline_loss), + 100, + name="update_op", + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, nrce_t) + + if updates_collections: + tf.add_to_collections(updates_collections, update_op) + + return nrce_t, update_op def _binary_cross_entropy(pred, target, name): - return - tf.add( - target * tf.log(pred), - (1.0 - target) * tf.log(1.0 - pred), - name=name) + return -tf.add( + target * tf.log(pred), (1.0 - target) * tf.log(1.0 - pred), name=name + ) # Copied from metrics_impl.py with minor modifications. # https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39 def _metric_variable(shape, dtype, validate_shape=True, name=None): - """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" + """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" + + return tf.Variable( + lambda: tf.zeros(shape, dtype), + trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], + validate_shape=validate_shape, + name=name, + ) - return tf.Variable( - lambda: tf.zeros(shape, dtype), - trainable=False, - collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], - validate_shape=validate_shape, - name=name) PERCENTILES = np.linspace(0, 1, 101, dtype=np.float32) # metric_name: (metric, requires thresholded output) SUPPORTED_BINARY_CLASS_METRICS = { - # TWML metrics - 'total_weight': (total_weight_metric, False), - 'num_samples': (num_samples_metric, False), - 'rce': (rce, False), - 'rce_std_err': (partial(metric_std_err, transform=rce_transform, metric=rce_metric, name='rce_std_err'), False), - 'nrce': (partial(rce, normalize=True), False), - 'lolly_nrce': (lolly_nrce, False), - 'arce': (partial(rce, normalize=True, arce=True), False), - 'arce_original': (partial(rce, normalize=True, arce=True, up_weight=False), False), - # CTR measures positive sample ratio. This terminology is inherited from Ads. - 'ctr': (ctr, False), - # predicted CTR measures predicted positive ratio. 
- 'predicted_ctr': (predicted_ctr, False), - 'pred_std_dev': (prediction_std_dev, False), - # thresholded metrics - 'accuracy': (tf.metrics.accuracy, True), - 'precision': (tf.metrics.precision, True), - 'recall': (tf.metrics.recall, True), - - 'false_positives': (tf.metrics.false_positives, True), - 'false_negatives': (tf.metrics.false_negatives, True), - 'true_positives': (tf.metrics.true_positives, True), - 'true_negatives': (tf.metrics.true_negatives, True), - - 'precision_at_percentiles': (partial(tf.metrics.precision_at_thresholds, thresholds=PERCENTILES), False), - 'recall_at_percentiles': (partial(tf.metrics.recall_at_thresholds, thresholds=PERCENTILES), False), - 'false_positives_at_percentiles': (partial(tf.metrics.false_positives_at_thresholds, thresholds=PERCENTILES), False), - 'false_negatives_at_percentiles': (partial(tf.metrics.false_negatives_at_thresholds, thresholds=PERCENTILES), False), - 'true_positives_at_percentiles': (partial(tf.metrics.true_positives_at_thresholds, thresholds=PERCENTILES), False), - 'true_negatives_at_percentiles': (partial(tf.metrics.true_negatives_at_thresholds, thresholds=PERCENTILES), False), - - # tensorflow metrics - 'roc_auc': (partial(tf.metrics.auc, curve='ROC', - summation_method='careful_interpolation'), False), - 'pr_auc': (partial(tf.metrics.auc, curve='PR', - summation_method='careful_interpolation'), False), - - # tensorboard curves - 'pr_curve': (tb.summary.v1.pr_curve_streaming_op, False), - - # deprecated metrics - 'deprecated_nrce': (partial(rce, normalize=True, deprecated_rce=True), False), - 'deprecated_arce': (partial(rce, normalize=True, arce=True, deprecated_rce=True), False), - 'deprecated_arce_original': (partial(rce, normalize=True, arce=True, - up_weight=False, deprecated_rce=True), False) + # TWML metrics + "total_weight": (total_weight_metric, False), + "num_samples": (num_samples_metric, False), + "rce": (rce, False), + "rce_std_err": ( + partial( + metric_std_err, + transform=rce_transform, + metric=rce_metric, + name="rce_std_err", + ), + False, + ), + "nrce": (partial(rce, normalize=True), False), + "lolly_nrce": (lolly_nrce, False), + "arce": (partial(rce, normalize=True, arce=True), False), + "arce_original": (partial(rce, normalize=True, arce=True, up_weight=False), False), + # CTR measures positive sample ratio. This terminology is inherited from Ads. + "ctr": (ctr, False), + # predicted CTR measures predicted positive ratio. 
+ "predicted_ctr": (predicted_ctr, False), + "pred_std_dev": (prediction_std_dev, False), + # thresholded metrics + "accuracy": (tf.metrics.accuracy, True), + "precision": (tf.metrics.precision, True), + "recall": (tf.metrics.recall, True), + "false_positives": (tf.metrics.false_positives, True), + "false_negatives": (tf.metrics.false_negatives, True), + "true_positives": (tf.metrics.true_positives, True), + "true_negatives": (tf.metrics.true_negatives, True), + "precision_at_percentiles": ( + partial(tf.metrics.precision_at_thresholds, thresholds=PERCENTILES), + False, + ), + "recall_at_percentiles": ( + partial(tf.metrics.recall_at_thresholds, thresholds=PERCENTILES), + False, + ), + "false_positives_at_percentiles": ( + partial(tf.metrics.false_positives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "false_negatives_at_percentiles": ( + partial(tf.metrics.false_negatives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "true_positives_at_percentiles": ( + partial(tf.metrics.true_positives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "true_negatives_at_percentiles": ( + partial(tf.metrics.true_negatives_at_thresholds, thresholds=PERCENTILES), + False, + ), + # tensorflow metrics + "roc_auc": ( + partial(tf.metrics.auc, curve="ROC", summation_method="careful_interpolation"), + False, + ), + "pr_auc": ( + partial(tf.metrics.auc, curve="PR", summation_method="careful_interpolation"), + False, + ), + # tensorboard curves + "pr_curve": (tb.summary.v1.pr_curve_streaming_op, False), + # deprecated metrics + "deprecated_nrce": (partial(rce, normalize=True, deprecated_rce=True), False), + "deprecated_arce": ( + partial(rce, normalize=True, arce=True, deprecated_rce=True), + False, + ), + "deprecated_arce_original": ( + partial(rce, normalize=True, arce=True, up_weight=False, deprecated_rce=True), + False, + ), } # default metrics provided by get_binary_class_metric_fn -DEFAULT_BINARY_CLASS_METRICS = ['total_weight', 'num_samples', 'rce', 'rce_std_err', - 'nrce', 'arce', 'ctr', 'predicted_ctr', 'pred_std_dev', - 'accuracy', 'precision', 'recall', 'roc_auc', 'pr_auc'] +DEFAULT_BINARY_CLASS_METRICS = [ + "total_weight", + "num_samples", + "rce", + "rce_std_err", + "nrce", + "arce", + "ctr", + "predicted_ctr", + "pred_std_dev", + "accuracy", + "precision", + "recall", + "roc_auc", + "pr_auc", +] def get_binary_class_metric_fn(metrics=None): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for binary classification. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. 
-
-  These metrics are evaluated and reported to tensorboard *during the eval phases only*.
-  Supported metrics:
-
-    - ctr (same as positive sample ratio.)
-    - rce (cross entropy loss compared to the baseline model of always predicting ctr)
-    - nrce (normalized rce, do not use this one if you do not understand what it is)
-    - `arce `_ (a more recent proposed improvment over NRCE)
-    - arce_original
-    - lolly_nrce (NRCE as it is computed in Lolly, with Taylor expansion)
-    - pr_auc
-    - roc_auc
-    - accuracy (percentage of predictions that are correct)
-    - precision (true positives) / (true positives + false positives)
-    - recall (true positives) / (true positives + false negatives)
-    - pr_curve (precision-recall curve)
-    - deprecated_arce (ARCE as it was calculated before a stability fix)
-    - deprecated_nrce (NRCE as it was calculated before a stability fix)
-
-  Example of metrics list with mixture of string and tuple:
-    metrics = [
-      'rce','nrce',
-      'roc_auc',  # default roc_auc metric
-      (
-        'roc_auc_500',  # give this metric a name
-        partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500),  # the metric fn
-        False,  # whether the metric requires thresholded output
-      )]
-
-  NOTE: When predicting rare events roc_auc can be underestimated. Increasing num_threshold
-  can reduce the underestimation. See go/roc-auc-pitfall for more details.
-
-  NOTE: accuracy / precision / recall apply to binary classification problems only.
-  I.e. a prediction is only considered correct if it matches the label. E.g. if the label
-  is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
-  precision / recall / accuracy metrics with soft predictions, you'll need to threshold
-  your predictions into hard 0/1 labels.
-
-  When metrics is None (the default), it defaults to:
-  [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
-  """
-  # pylint: disable=dict-keys-not-iterating
-  if metrics is None:
-    # remove expensive metrics by default for faster eval
-    metrics = list(DEFAULT_BINARY_CLASS_METRICS)
-
-  def get_eval_metric_ops(graph_output, labels, weights):
    """
-    graph_output:
-      dict that is returned by build_graph given input features.
-    labels:
-      target labels associated to batch.
-    weights:
-      weights of the samples..
+    Returns a function having signature:
+
+    .. code-block:: python
+
+      def get_eval_metric_ops(graph_output, labels, weights):
+        ...
+        return eval_metric_ops
+
+    where the returned eval_metric_ops is a dict of common evaluation metric
+    Ops for binary classification. See `tf.estimator.EstimatorSpec
+    `_
+    for a description of eval_metric_ops. The graph_output is the result
+    dict returned by build_graph. Labels and weights are tf.Tensors.
+
+    The following graph_output keys are recognized:
+      output:
+        the raw predictions between 0 and 1. Required.
+      threshold:
+        A value between 0 and 1 used to threshold the output into a hard_output.
+        Defaults to 0.5 when threshold and hard_output are missing.
+        Either threshold or hard_output can be provided, but not both.
+      hard_output:
+        A thresholded output. Either threshold or hard_output can be provided, but not both.
+
+    Args:
+      metrics (list of String):
+        a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+        Element in the list can be a string from the following supported metrics, or can be a tuple
+        with three items: metric name, metric function, bool for thresholded output.
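# A hedged usage sketch of the factory described above: build the metric fn once,
# then call it with the graph output, labels and weights to get eval_metric_ops
# for a TF1 EstimatorSpec. The placeholder tensors and the chosen metric names are
# assumptions for illustration, not part of this diff.
import tensorflow as tf

labels = tf.placeholder(tf.float32, shape=[None, 1])
weights = tf.ones_like(labels)
preds = tf.placeholder(tf.float32, shape=[None, 1])

metric_fn = get_binary_class_metric_fn(metrics=["ctr", "accuracy", "roc_auc"])
# 'output' is the only required graph_output key; 'accuracy' is thresholded at 0.5
eval_metric_ops = metric_fn({"output": preds}, labels, weights)
# eval_metric_ops maps each metric name to a (value_op, update_op) pair, suitable
# for tf.estimator.EstimatorSpec(..., eval_metric_ops=eval_metric_ops)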
+
+    These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+    Supported metrics:
+
+      - ctr (same as positive sample ratio.)
+      - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+      - nrce (normalized rce, do not use this one if you do not understand what it is)
+      - `arce `_ (a more recent proposed improvement over NRCE)
+      - arce_original
+      - lolly_nrce (NRCE as it is computed in Lolly, with Taylor expansion)
+      - pr_auc
+      - roc_auc
+      - accuracy (percentage of predictions that are correct)
+      - precision (true positives) / (true positives + false positives)
+      - recall (true positives) / (true positives + false negatives)
+      - pr_curve (precision-recall curve)
+      - deprecated_arce (ARCE as it was calculated before a stability fix)
+      - deprecated_nrce (NRCE as it was calculated before a stability fix)
+
+    Example of metrics list with mixture of string and tuple:
+      metrics = [
+        'rce','nrce',
+        'roc_auc',  # default roc_auc metric
+        (
+          'roc_auc_500',  # give this metric a name
+          partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500),  # the metric fn
+          False,  # whether the metric requires thresholded output
+        )]
+
+    NOTE: When predicting rare events roc_auc can be underestimated. Increasing num_thresholds
+    can reduce the underestimation. See go/roc-auc-pitfall for more details.
+
+    NOTE: accuracy / precision / recall apply to binary classification problems only.
+    I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+    is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+    precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+    your predictions into hard 0/1 labels.
+
+    When metrics is None (the default), it defaults to:
+    [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
    """
+    # pylint: disable=dict-keys-not-iterating
+    if metrics is None:
+        # remove expensive metrics by default for faster eval
+        metrics = list(DEFAULT_BINARY_CLASS_METRICS)

-    eval_metric_ops = OrderedDict()
-
-    preds = graph_output['output']
-
-    threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5
-
-    hard_preds = graph_output.get('hard_output')
-    if hard_preds is None:
-      hard_preds = tf.greater_equal(preds, threshold)
-
-    # add metrics to eval_metric_ops dict
-    for metric in metrics:
-      if isinstance(metric, tuple) and len(metric) == 3:
-        metric_name, metric_factory, requires_threshold = metric
-        metric_name = metric_name.lower()
-      elif isinstance(metric, str):
-        metric_name = metric.lower()  # metric name are case insensitive.
-        metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
-      else:
-        raise ValueError("Metric should be either string or tuple of length 3.")
-
-      if metric_name in eval_metric_ops:
-        # avoid adding duplicate metrics.
-        continue
-
-      if metric_factory:
-        value_op, update_op = metric_factory(
-          labels=labels,
-          predictions=(hard_preds if requires_threshold else preds),
-          weights=weights, name=metric_name)
-        eval_metric_ops[metric_name] = (value_op, update_op)
-      else:
-        raise ValueError('Cannot find the metric named ' + metric_name)
-
-    return eval_metric_ops
-
-  return get_eval_metric_ops
+    def get_eval_metric_ops(graph_output, labels, weights):
+        """
+        graph_output:
+            dict that is returned by build_graph given input features.
+        labels:
+            target labels associated with the batch.
+        weights:
+            weights of the samples.
+ """ + + eval_metric_ops = OrderedDict() + + preds = graph_output["output"] + + threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5 + + hard_preds = graph_output.get("hard_output") + if hard_preds is None: + hard_preds = tf.greater_equal(preds, threshold) + + # add metrics to eval_metric_ops dict + for metric in metrics: + if isinstance(metric, tuple) and len(metric) == 3: + metric_name, metric_factory, requires_threshold = metric + metric_name = metric_name.lower() + elif isinstance(metric, str): + metric_name = metric.lower() # metric name are case insensitive. + metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get( + metric_name + ) + else: + raise ValueError("Metric should be either string or tuple of length 3.") + + if metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + if metric_factory: + value_op, update_op = metric_factory( + labels=labels, + predictions=(hard_preds if requires_threshold else preds), + weights=weights, + name=metric_name, + ) + eval_metric_ops[metric_name] = (value_op, update_op) + else: + raise ValueError("Cannot find the metric named " + metric_name) + + return eval_metric_ops + + return get_eval_metric_ops def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for concatenated binary classifications. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - In multiple binary classification problems, the - ``predictions`` (that is, ``graph_output['output']``) - are expected to have shape ``batch_size x n_classes``, - where ``n_classes`` is the number of binary classification. - Binary classification at output[i] is expected to discriminate between ``classes[i]`` (1) - and NOT ``classes[i]`` (0). The labels should be of the same shape as ``graph_output`` - with binary values (0 or 1). The weights can be of size ``batch_size`` or - ``batch_size x n_classes``. The ``class_dim`` contain separate probabilities, - and need to have separate metrics. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of Metrics): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) 
-    - rce (cross entropy loss compared to the baseline model of always predicting ctr)
-    - nrce (normalized rce, do not use this one if you do not understand what it is)
-    - pr_auc
-    - roc_auc
-    - accuracy (percentage of predictions that are correct)
-    - precision (true positives) / (true positives + false positives)
-    - recall (true positives) / (true positives + false negatives)
-    - pr_curve (precision-recall curve)
-
-  Example of metrics list with mixture of string and tuple:
-    metrics = [
-      'rce','nrce',
-      'roc_auc',  # default roc_auc metric
-      (
-        'roc_auc_500',  # give this metric a name
-        partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500),  # the metric fn
-        False,  # whether the metric requires thresholded output
-      )]
-
-  NOTE: When prediction on rare events, roc_auc can be underestimated. Increase num_threshold
-  can reduce the underestimation. See go/roc-auc-pitfall for more details.
-
-  NOTE: accuracy / precision / recall apply to binary classification problems only.
-  I.e. a prediction is only considered correct if it matches the label. E.g. if the label
-  is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
-  precision / recall / accuracy metrics with soft predictions, you'll need to threshold
-  your predictions into hard 0/1 labels.
-
-  When metrics is None (the default), it defaults to:
-  [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
-
-  classes (list of strings):
-    In case of multiple binary class models, the names for each class or label.
-    These are used to display metrics on tensorboard.
-    If these are not specified, the index in the class or label dimension is used, and you'll
-    get metrics on tensorboard named like: accuracy_0, accuracy_1, etc.
-
-  class_dim (number):
-    Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes.
-  """
-  # pylint: disable=invalid-name,dict-keys-not-iterating
-  if metrics is None:
-    # remove expensive metrics by default for faster eval
-    metrics = list(DEFAULT_BINARY_CLASS_METRICS)
-
-  def get_eval_metric_ops(graph_output, labels, weights):
    """
-    graph_output:
-      dict that is returned by build_graph given input features.
-    labels:
-      target labels associated to batch.
-    weights:
-      weights of the samples..
+    Returns a function having signature:
+
+    .. code-block:: python
+
+      def get_eval_metric_ops(graph_output, labels, weights):
+        ...
+        return eval_metric_ops
+
+    where the returned eval_metric_ops is a dict of common evaluation metric
+    Ops for concatenated binary classifications. See `tf.estimator.EstimatorSpec
+    `_
+    for a description of eval_metric_ops. The graph_output is the result
+    dict returned by build_graph. Labels and weights are tf.Tensors.
+
+    In multiple binary classification problems, the
+    ``predictions`` (that is, ``graph_output['output']``)
+    are expected to have shape ``batch_size x n_classes``,
+    where ``n_classes`` is the number of binary classifications.
+    Binary classification at output[i] is expected to discriminate between ``classes[i]`` (1)
+    and NOT ``classes[i]`` (0). The labels should be of the same shape as ``graph_output``
+    with binary values (0 or 1). The weights can be of size ``batch_size`` or
+    ``batch_size x n_classes``. The slices along ``class_dim`` contain separate probabilities,
+    and need to have separate metrics.
+
+    The following graph_output keys are recognized:
+      output:
+        the raw predictions between 0 and 1. Required.
+      threshold:
+        A value between 0 and 1 used to threshold the output into a hard_output.
+        Defaults to 0.5 when threshold and hard_output are missing.
+        Either threshold or hard_output can be provided, but not both.
+      hard_output:
+        A thresholded output. Either threshold or hard_output can be provided, but not both.
+
+    Args:
+      metrics (list of Metrics):
+        a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+        Element in the list can be a string from the following supported metrics, or can be a tuple
+        with three items: metric name, metric function, bool for thresholded output.
+
+    These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+    Supported metrics:
+
+      - ctr (same as positive sample ratio.)
+      - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+      - nrce (normalized rce, do not use this one if you do not understand what it is)
+      - pr_auc
+      - roc_auc
+      - accuracy (percentage of predictions that are correct)
+      - precision (true positives) / (true positives + false positives)
+      - recall (true positives) / (true positives + false negatives)
+      - pr_curve (precision-recall curve)
+
+    Example of metrics list with mixture of string and tuple:
+      metrics = [
+        'rce','nrce',
+        'roc_auc',  # default roc_auc metric
+        (
+          'roc_auc_500',  # give this metric a name
+          partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500),  # the metric fn
+          False,  # whether the metric requires thresholded output
+        )]
+
+    NOTE: When predicting on rare events, roc_auc can be underestimated. Increasing num_thresholds
+    can reduce the underestimation. See go/roc-auc-pitfall for more details.
+
+    NOTE: accuracy / precision / recall apply to binary classification problems only.
+    I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+    is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+    precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+    your predictions into hard 0/1 labels.
+
+    When metrics is None (the default), it defaults to:
+    [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
+
+      classes (list of strings):
+        In case of multiple binary class models, the names for each class or label.
+        These are used to display metrics on tensorboard.
+        If these are not specified, the index in the class or label dimension is used, and you'll
+        get metrics on tensorboard named like: accuracy_0, accuracy_1, etc.
+
+      class_dim (number):
+        Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes.
    """
+    # pylint: disable=invalid-name,dict-keys-not-iterating
+    if metrics is None:
+        # remove expensive metrics by default for faster eval
+        metrics = list(DEFAULT_BINARY_CLASS_METRICS)

-    eval_metric_ops = OrderedDict()
-
-    preds = graph_output['output']
-
-    threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5
-
-    hard_preds = graph_output.get('hard_output')
-    if hard_preds is None:
-      hard_preds = tf.greater_equal(preds, threshold)
-
-    shape = labels.get_shape()
-    # basic sanity check: multi_metric dimension must exist
-    assert len(shape) > class_dim, "Dimension specified by class_dim does not exist."
-
-    num_labels = shape[class_dim]
-    # If we are doing multi-class / multi-label metric, the number of classes / labels must
-    # be know at graph construction time. This dimension cannot have size None.
-    assert num_labels is not None, "The multi-metric dimension cannot be None."
-    assert classes is None or len(classes) == num_labels, (
-      "Number of classes must match the number of labels")
-
-    weights_shape = weights.get_shape() if weights is not None else None
-    if weights_shape is None:
-      num_weights = None
-    elif len(weights_shape) > 1:
-      num_weights = weights_shape[class_dim]
-    else:
-      num_weights = 1
-
-    for i in range(num_labels):
-
-      # add metrics to eval_metric_ops dict
-      for metric in metrics:
-        if isinstance(metric, tuple) and len(metric) == 3:
-          metric_name, metric_factory, requires_threshold = metric
-          metric_name = metric_name.lower()
-        elif isinstance(metric, str):
-          metric_name = metric.lower()  # metric name are case insensitive.
-          metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
-        else:
-          raise ValueError("Metric should be either string or tuple of length 3.")
-
-        class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i))
-
-        if class_metric_name in eval_metric_ops:
-          # avoid adding duplicate metrics.
-          continue
-
-        class_labels = tf.gather(labels, indices=[i], axis=class_dim)
-        class_preds = tf.gather(preds, indices=[i], axis=class_dim)
-        class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim)
-
-        if num_weights is None:
-          class_weights = None
-        elif num_weights == num_labels:
-          class_weights = tf.gather(weights, indices=[i], axis=class_dim)
-        elif num_weights == 1:
-          class_weights = weights
-        else:
-          raise ValueError("num_weights (%d) and num_labels (%d) do not match"
-                           % (num_weights, num_labels))
-
-        if metric_factory:
-          value_op, update_op = metric_factory(
-            labels=class_labels,
-            predictions=(class_hard_preds if requires_threshold else class_preds),
-            weights=class_weights, name=class_metric_name)
-          eval_metric_ops[class_metric_name] = (value_op, update_op)
+    def get_eval_metric_ops(graph_output, labels, weights):
+        """
+        graph_output:
+            dict that is returned by build_graph given input features.
+        labels:
+            target labels associated with the batch.
+        weights:
+            weights of the samples.
+        """
+
+        eval_metric_ops = OrderedDict()
+
+        preds = graph_output["output"]
+
+        threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5
+
+        hard_preds = graph_output.get("hard_output")
+        if hard_preds is None:
+            hard_preds = tf.greater_equal(preds, threshold)
+
+        shape = labels.get_shape()
+        # basic sanity check: multi_metric dimension must exist
+        assert (
+            len(shape) > class_dim
+        ), "Dimension specified by class_dim does not exist."
+
+        num_labels = shape[class_dim]
+        # If we are doing multi-class / multi-label metric, the number of classes / labels must
+        # be known at graph construction time. This dimension cannot have size None.
+        assert num_labels is not None, "The multi-metric dimension cannot be None."
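# A NumPy sketch (illustrative, not part of this diff) of the per-class slicing
# this function performs, assuming class_dim=1 and made-up shapes. tf.gather with
# indices=[i] keeps the class dimension with size 1, as np.take does here.
import numpy as np

batch_size, n_classes = 4, 3
labels = np.random.randint(0, 2, size=(batch_size, n_classes)).astype(np.float32)
preds = np.random.uniform(size=(batch_size, n_classes))
weights = np.random.uniform(size=(batch_size, n_classes))

for i in range(n_classes):
    class_labels = np.take(labels, indices=[i], axis=1)  # shape (batch_size, 1)
    class_preds = np.take(preds, indices=[i], axis=1)
    # weights with a class dimension are sliced per class; a rank-1 weights
    # vector (the num_weights == 1 case) is shared across classes unchanged
    class_weights = np.take(weights, indices=[i], axis=1) if weights.ndim > 1 else weights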
+ assert ( + classes is None or len(classes) == num_labels + ), "Number of classes must match the number of labels" + + weights_shape = weights.get_shape() if weights is not None else None + if weights_shape is None: + num_weights = None + elif len(weights_shape) > 1: + num_weights = weights_shape[class_dim] else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops + num_weights = 1 + + for i in range(num_labels): + # add metrics to eval_metric_ops dict + for metric in metrics: + if isinstance(metric, tuple) and len(metric) == 3: + metric_name, metric_factory, requires_threshold = metric + metric_name = metric_name.lower() + elif isinstance(metric, str): + metric_name = metric.lower() # metric name are case insensitive. + ( + metric_factory, + requires_threshold, + ) = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) + else: + raise ValueError( + "Metric should be either string or tuple of length 3." + ) + + class_metric_name = ( + metric_name + "_" + (classes[i] if classes is not None else str(i)) + ) + + if class_metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + class_labels = tf.gather(labels, indices=[i], axis=class_dim) + class_preds = tf.gather(preds, indices=[i], axis=class_dim) + class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) + + if num_weights is None: + class_weights = None + elif num_weights == num_labels: + class_weights = tf.gather(weights, indices=[i], axis=class_dim) + elif num_weights == 1: + class_weights = weights + else: + raise ValueError( + "num_weights (%d) and num_labels (%d) do not match" + % (num_weights, num_labels) + ) + + if metric_factory: + value_op, update_op = metric_factory( + labels=class_labels, + predictions=( + class_hard_preds if requires_threshold else class_preds + ), + weights=class_weights, + name=class_metric_name, + ) + eval_metric_ops[class_metric_name] = (value_op, update_op) + else: + raise ValueError("Cannot find the metric named " + metric_name) + + return eval_metric_ops + + return get_eval_metric_ops def _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=True): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops with uncalibrated output. - - The following graph_output keys are recognized: - uncalibrated_output: - the uncalibrated raw predictions between 0 and 1. Required. - output: - the calibrated predictions between 0 and 1. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - calibrated_metric_fn: metrics function with calibration and weight. - keep_weight: Bool indicating whether we keep weight. - """ - metric_scope = 'uncalibrated' if keep_weight else 'unweighted' - - def get_eval_metric_ops(graph_output, labels, weights): """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + Returns a function having signature: + + .. code-block:: python + + def get_eval_metric_ops(graph_output, labels, weights): + ... 
+ return eval_metric_ops + + where the returned eval_metric_ops is a dict of common evaluation metric + Ops with uncalibrated output. + + The following graph_output keys are recognized: + uncalibrated_output: + the uncalibrated raw predictions between 0 and 1. Required. + output: + the calibrated predictions between 0 and 1. + threshold: + A value between 0 and 1 used to threshold the output into a hard_output. + Defaults to 0.5 when threshold and hard_output are missing. + Either threshold or hard_output can be provided, but not both. + hard_output: + A thresholded output. Either threshold or hard_output can be provided, but not both. + + Args: + calibrated_metric_fn: metrics function with calibration and weight. + keep_weight: Bool indicating whether we keep weight. """ - with tf.variable_scope(metric_scope): - if 'uncalibrated_output' not in graph_output: - raise Exception("Missing uncalibrated_output in graph_output!") - un_calibrated_weights = weights if keep_weight else tf.ones_like(weights) - uncalibrated_output = { - 'output': graph_output['uncalibrated_output'], - 'threshold': graph_output.get('threshold', 0.5), - 'hard_output': graph_output.get('hard_output'), - **{k: v for k, v in graph_output.items() if k not in ['output', 'threshold', 'hard_output']} - } - - eval_metrics_ops = calibrated_metric_fn(uncalibrated_output, labels, un_calibrated_weights) + metric_scope = "uncalibrated" if keep_weight else "unweighted" - renamed_metrics_ops = {f'{metric_scope}_{k}': v for k, v in eval_metrics_ops.items()} - return renamed_metrics_ops - - return get_eval_metric_ops + def get_eval_metric_ops(graph_output, labels, weights): + """ + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples.. + """ + with tf.variable_scope(metric_scope): + if "uncalibrated_output" not in graph_output: + raise Exception("Missing uncalibrated_output in graph_output!") + un_calibrated_weights = weights if keep_weight else tf.ones_like(weights) + uncalibrated_output = { + "output": graph_output["uncalibrated_output"], + "threshold": graph_output.get("threshold", 0.5), + "hard_output": graph_output.get("hard_output"), + **{ + k: v + for k, v in graph_output.items() + if k not in ["output", "threshold", "hard_output"] + }, + } + + eval_metrics_ops = calibrated_metric_fn( + uncalibrated_output, labels, un_calibrated_weights + ) + + renamed_metrics_ops = { + f"{metric_scope}_{k}": v for k, v in eval_metrics_ops.items() + } + return renamed_metrics_ops + + return get_eval_metric_ops def get_multi_binary_class_uncalibrated_metric_fn( - metrics, classes=None, class_dim=1, keep_weight=True): - """ - Returns a function having signature: + metrics, classes=None, class_dim=1, keep_weight=True +): + """ + Returns a function having signature: - .. code-block:: python + .. code-block:: python - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops + def get_eval_metric_ops(graph_output, labels, weights): + ... + return eval_metric_ops - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for concatenated binary classifications without calibration. + where the returned eval_metric_ops is a dict of common evaluation metric + Ops for concatenated binary classifications without calibration. - Note: 'uncalibrated_output' is required key in graph_output. + Note: 'uncalibrated_output' is required key in graph_output. 
-  The main use case for this function is:
+    The main use case for this function is:
 
-  1) To calculated roc-auc for rare event.
-  Calibrated prediction score for rare events will be concentrated near zero. As a result,
-  the roc-auc can be seriously underestimated with current implementation in tf.metric.auc.
-  Since roc-auc is invariant against calibration, we can directly use uncalibrated score for roc-auc.
-  For more details, please refer to: go/roc-auc-invariance.
+    1) To calculate roc-auc for rare events.
+    Calibrated prediction scores for rare events will be concentrated near zero. As a result,
+    the roc-auc can be seriously underestimated with the current implementation in tf.metric.auc.
+    Since roc-auc is invariant against calibration, we can directly use the uncalibrated score for roc-auc.
+    For more details, please refer to: go/roc-auc-invariance.
 
-  2) To set keep_weight=False and get unweighted and uncalibrated metrics.
-  This is useful to eval how the model is fitted to its actual training data, since
-  often time the model is trained without weight.
+    2) To set keep_weight=False and get unweighted and uncalibrated metrics.
+    This is useful to evaluate how well the model fits its actual training data, since
+    often the model is trained without weights.
 
-  Args:
-    metrics (list of String):
-      a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
-      Element in the list can be a string from supported metrics, or can be a tuple
-      with three items: metric name, metric function, bool for thresholded output.
-      These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+    Args:
+        metrics (list of String):
+            a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+            Element in the list can be a string from supported metrics, or can be a tuple
+            with three items: metric name, metric function, bool for thresholded output.
+            These metrics are evaluated and reported to tensorboard *during the eval phases only*.
 
-    When metrics is None (the default), it defaults to:
-    [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
+        When metrics is None (the default), it defaults to:
+        [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
 
-    classes (list of strings):
-      In case of multiple binary class models, the names for each class or label.
-      These are used to display metrics on tensorboard.
-      If these are not specified, the index in the class or label dimension is used, and you'll
-      get metrics on tensorboard named like: accuracy_0, accuracy_1, etc.
+        classes (list of strings):
+            In case of multiple binary class models, the names for each class or label.
+            These are used to display metrics on tensorboard.
+            If these are not specified, the index in the class or label dimension is used, and you'll
+            get metrics on tensorboard named like: accuracy_0, accuracy_1, etc.
 
-    class_dim (number):
-      Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes.
+        class_dim (number):
+            Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes.
 
-    keep_weight (bool):
-      Whether to keep weights for the metric.
-  """
+        keep_weight (bool):
+            Whether to keep weights for the metric.
+    """
 
-  calibrated_metric_fn = get_multi_binary_class_metric_fn(
-    metrics, classes=classes, class_dim=class_dim)
-  return _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=keep_weight)
+    calibrated_metric_fn = get_multi_binary_class_metric_fn(
+        metrics, classes=classes, class_dim=class_dim
+    )
+    return _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=keep_weight)
 
 
 def combine_metric_fns(*fn_list):
-  """
-  Combine multiple metric functions.
-  For example, we can combine metrics function generated by
-  get_multi_binary_class_metric_fn and get_multi_binary_class_uncalibrated_metric_fn.
-
-  Args:
-    *fn_list: Multiple metric functions to be combined
-
-  Returns:
-    Combined metric function.
-  """
-  def combined_metric_ops(*args, **kwargs):
-    eval_metric_ops = OrderedDict()
-    for fn in fn_list:
-      eval_metric_ops.update(fn(*args, **kwargs))
-    return eval_metric_ops
-  return combined_metric_ops
+    """
+    Combine multiple metric functions.
+    For example, we can combine metric functions generated by
+    get_multi_binary_class_metric_fn and get_multi_binary_class_uncalibrated_metric_fn.
+
+    Args:
+        *fn_list: Multiple metric functions to be combined
+
+    Returns:
+        Combined metric function.
+    """
+
+    def combined_metric_ops(*args, **kwargs):
+        eval_metric_ops = OrderedDict()
+        for fn in fn_list:
+            eval_metric_ops.update(fn(*args, **kwargs))
+        return eval_metric_ops
+
+    return combined_metric_ops
diff --git a/twml/twml/optimizers/__init__.py b/twml/twml/optimizers/__init__.py
index eaa29883c..d4f7ae7d3 100644
--- a/twml/twml/optimizers/__init__.py
+++ b/twml/twml/optimizers/__init__.py
@@ -1,4 +1,5 @@
-from twitter.deepbird.compat.v1.optimizers import (
-  LazyAdamOptimizer,
-  optimize_loss,
-  OPTIMIZER_SUMMARIES)  # noqa: F401
+from twitter.deepbird.compat.v1.optimizers import (  # noqa: F401
+    OPTIMIZER_SUMMARIES,
+    LazyAdamOptimizer,
+    optimize_loss,
+)
diff --git a/twml/twml/parsers.py b/twml/twml/parsers.py
index eac60083a..50de1c7ad 100644
--- a/twml/twml/parsers.py
+++ b/twml/twml/parsers.py
@@ -1,20 +1,26 @@
-'''
+"""
 Contains implementations of functions to parse training and evaluation data.
 
 Modelers can use the functions in this module as the the train/eval_parse_fn of
 the DataRecordTrainer constructor to customize how to parse their datasets.
 
 Modelers may also provide custom implementations of train/eval_parse_fn using
 these as reference.
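
Tying the two factories together, `combine_metric_fns` above merges their
outputs in order; a hedged example (the class names "fav" and "reply" are
hypothetical):

.. code-block:: python

    metric_fn = combine_metric_fns(
        get_multi_binary_class_metric_fn(
            metrics=["rce", "roc_auc"], classes=["fav", "reply"]
        ),
        get_multi_binary_class_uncalibrated_metric_fn(
            metrics=["roc_auc"], classes=["fav", "reply"], keep_weight=False
        ),
    )
    # Later functions win on duplicate keys; here the names stay distinct
    # ("roc_auc_fav" vs. "unweighted_roc_auc_fav").
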
-''' +""" from twitter.deepbird.io.legacy.parsers import ( - convert_to_supervised_input_receiver_fn, # noqa: F401 - get_continuous_parse_fn, # noqa: F401 - get_default_parse_fn, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_labels_in_features_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_sparse_parse_fn, # noqa: F401 - get_sparse_serving_input_receiver_fn, # noqa: F401 - get_tensor_parse_fn, # noqa: F401 -) + convert_to_supervised_input_receiver_fn, +) # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_continuous_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_default_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_features_as_tensor_dict # noqa: F401 +from twitter.deepbird.io.legacy.parsers import ( + get_labels_in_features_parse_fn, +) # noqa: F401 +from twitter.deepbird.io.legacy.parsers import ( + get_serving_input_receiver_fn_feature_dict, +) # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_sparse_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import ( + get_sparse_serving_input_receiver_fn, +) # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_tensor_parse_fn # noqa: F401 diff --git a/twml/twml/readers/__init__.py b/twml/twml/readers/__init__.py index 06a6d79f5..eea7c28e7 100644 --- a/twml/twml/readers/__init__.py +++ b/twml/twml/readers/__init__.py @@ -2,6 +2,6 @@ """ This module contains data readers """ from .batch_prediction_request import BatchPredictionRequest # noqa: F401 -from .data_record import DataRecord, SPARSE_DATA_RECORD_FEATURE_FIELDS # noqa: F401 +from .data_record import SPARSE_DATA_RECORD_FEATURE_FIELDS, DataRecord # noqa: F401 from .hashed_batch_prediction_request import HashedBatchPredictionRequest # noqa: F401 -from .hashed_data_record import HashedDataRecord # noqa: F401 \ No newline at end of file +from .hashed_data_record import HashedDataRecord # noqa: F401 diff --git a/twml/twml/readers/batch_prediction_request.py b/twml/twml/readers/batch_prediction_request.py index 512a8c514..f0c233d35 100644 --- a/twml/twml/readers/batch_prediction_request.py +++ b/twml/twml/readers/batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) + BatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/readers/data_record.py b/twml/twml/readers/data_record.py index d1c377afd..6fd69bb6d 100644 --- a/twml/twml/readers/data_record.py +++ b/twml/twml/readers/data_record.py @@ -3,13 +3,15 @@ This module includes facilities for manipulating data records. 
""" +from twitter.deepbird.io.legacy.readers.data_record import _SPEC_TO_TF # noqa: F401 from twitter.deepbird.io.legacy.readers.data_record import ( - _SPEC_TO_TF, # noqa: F401 - SPARSE_DATA_RECORD_FEATURE_FIELDS, # noqa: F401 - _FeaturesBase, # noqa: F401 - _Features, # noqa: F401 - _DiscreteFeatures, # noqa: F401 - _StringFeatures, # noqa: F401 - _BaseDataRecord, # noqa: F401 - DataRecord, # noqa: F401 -) + SPARSE_DATA_RECORD_FEATURE_FIELDS, +) # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import DataRecord # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _BaseDataRecord # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import ( + _DiscreteFeatures, +) # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _Features # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _FeaturesBase # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _StringFeatures # noqa: F401 diff --git a/twml/twml/readers/hashed_batch_prediction_request.py b/twml/twml/readers/hashed_batch_prediction_request.py index 5850c4497..213dee734 100644 --- a/twml/twml/readers/hashed_batch_prediction_request.py +++ b/twml/twml/readers/hashed_batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) + HashedBatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/readers/hashed_data_record.py b/twml/twml/readers/hashed_data_record.py index 1ff9ce816..9f8c5bd8f 100644 --- a/twml/twml/readers/hashed_data_record.py +++ b/twml/twml/readers/hashed_data_record.py @@ -5,8 +5,8 @@ """ from twitter.deepbird.io.legacy.readers.hashed_data_record import ( - _HASHED_FIELDS, - _FEATURE_NAMES, - _FEATURE_TYPES, - HashedDataRecord, + _FEATURE_NAMES, + _FEATURE_TYPES, + _HASHED_FIELDS, + HashedDataRecord, ) diff --git a/twml/twml/saved_model_cli/__main__.py b/twml/twml/saved_model_cli/__main__.py index ad5326431..96d4409e0 100644 --- a/twml/twml/saved_model_cli/__main__.py +++ b/twml/twml/saved_model_cli/__main__.py @@ -5,5 +5,5 @@ from tensorflow.python.tools import saved_model_cli -if __name__ == '__main__': - sys.exit(saved_model_cli.main()) +if __name__ == "__main__": + sys.exit(saved_model_cli.main()) diff --git a/twml/twml/tensorboard/__main__.py b/twml/twml/tensorboard/__main__.py index c426060d1..75557b5f0 100644 --- a/twml/twml/tensorboard/__main__.py +++ b/twml/twml/tensorboard/__main__.py @@ -7,10 +7,9 @@ from tensorboard.main import run_main - -if __name__ == '__main__': - # Tensorboard relies on werkzeug for its HTTP server which logs at info level - # by default - logging.getLogger('werkzeug').setLevel(logging.WARNING) - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(run_main()) +if __name__ == "__main__": + # Tensorboard relies on werkzeug for its HTTP server which logs at info level + # by default + logging.getLogger("werkzeug").setLevel(logging.WARNING) + sys.argv[0] = re.sub(r"(-script\.pyw?|\.exe)?$", "", sys.argv[0]) + sys.exit(run_main()) diff --git a/twml/twml/tensorio.py b/twml/twml/tensorio.py index bc551ac56..de7bd26b4 100644 --- a/twml/twml/tensorio.py +++ b/twml/twml/tensorio.py @@ -8,7 +8,6 @@ import numpy as np import yaml - """ Utility to load tensors serialized by Deepbird V1. @@ -19,143 +18,151 @@ # helper class used to assist hierarchical key access by remembering intermediate keys. 
class _KeyRecorder(object): - def __init__(self, tensorio, keys=[]): - self.tensorio = tensorio - self.keys = keys + def __init__(self, tensorio, keys=[]): + self.tensorio = tensorio + self.keys = keys - def __getitem__(self, k): - new_keys = self.keys + [str(k)] - prefix = ".".join(new_keys) + def __getitem__(self, k): + new_keys = self.keys + [str(k)] + prefix = ".".join(new_keys) - key_list = self.tensorio.list_tensors() + key_list = self.tensorio.list_tensors() - # if we have a complete key, load the tensor. - if prefix in key_list: - return self.tensorio._load(prefix) + # if we have a complete key, load the tensor. + if prefix in key_list: + return self.tensorio._load(prefix) - # we don't have a complete key yet, but at least one tensor should start with this prefix. - for k_value in key_list: - if k_value.startswith(prefix): - return _KeyRecorder(self.tensorio, new_keys) + # we don't have a complete key yet, but at least one tensor should start with this prefix. + for k_value in key_list: + if k_value.startswith(prefix): + return _KeyRecorder(self.tensorio, new_keys) - # if no key starts with the prefix, this _key_recorder is not valid. - raise ValueError("Key not found: " + prefix) + # if no key starts with the prefix, this _key_recorder is not valid. + raise ValueError("Key not found: " + prefix) # convert tensorio tensor type to numpy data type. # also returns element size in bytes. def _get_data_type(data_type): - if data_type == 'Double': - return (np.float64, 8) + if data_type == "Double": + return (np.float64, 8) - if data_type == 'Float': - return (np.float32, 4) + if data_type == "Float": + return (np.float32, 4) - if data_type == 'Int': - return (np.int32, 4) + if data_type == "Int": + return (np.int32, 4) - if data_type == 'Long': - return (np.int64, 8) + if data_type == "Long": + return (np.int64, 8) - if data_type == 'Byte': - return (np.int8, 1) + if data_type == "Byte": + return (np.int8, 1) - raise ValueError('Unexpected tensorio data type: ' + data_type) + raise ValueError("Unexpected tensorio data type: " + data_type) class TensorIO(object): - """ - Construct a TensorIO class. - tensorio_path: a directory containing tensors serialized using tensorio. tar file not supported. - mmap_tensor: - By default, loaded tensors use mmap storage. - Set this to false to not use mmap. Useful when loading multiple tensors. - """ - - def __init__(self, tensorio_path, mmap_tensor=True): - self._tensorio_path = tensorio_path - self._mmap_tensor = mmap_tensor - - # Make sure we can locate spec.yaml. - yaml_file = os.path.join(tensorio_path, 'spec.yaml') - if not os.path.exists(yaml_file): - raise ValueError('Invalid tensorio path: no spec.yaml found.') - - # load spec.yaml. - with open(yaml_file, 'r') as file_open: - # Note that tensor names in the yaml are like this: \"weight\".\'1\' - # For user-friendliness, we remove the quotes. - _spec = yaml.safe_load(file_open) - self._spec = {k.replace("'", '').replace('"', ''): v for (k, v) in _spec.items()} - - def list_tensors(self): - """ - Returns a list of tensors saved in the given path. - """ - return self._spec.keys() - - def _load_tensor(self, name): """ - Load Tensor with the given name. - Raise value error if the named tensor is not found. - Returns a numpy array if the named tensor is found. + Construct a TensorIO class. + tensorio_path: a directory containing tensors serialized using tensorio. tar file not supported. + mmap_tensor: + By default, loaded tensors use mmap storage. + Set this to false to not use mmap. 
Useful when loading multiple tensors. """ - tensor_info = self._spec[name] - if tensor_info['type'] != 'tensor': - raise ValueError('Trying to load a tensor of unknown type: ' + tensor_info['type']) - - filename = os.path.join(self._tensorio_path, tensor_info['filename']) - (data_type, element_size) = _get_data_type(tensor_info['tensorType']) - - np_array = np.memmap( - filename, - dtype=data_type, - mode='r', - # -1 because lua offset is 1 based. - offset=(tensor_info['offset'] - 1) * element_size, - shape=tuple(tensor_info['size']), - order='C', - ) - - return np_array if self._mmap_tensor else np_array[:].copy() - - def _load_nontensor_data(self, name): - """ - Load non-tensor data with the given name. - Returns a python string. - """ - tensor_info = self._spec[name] - return tensor_info['data'] - def _load(self, name): - """ - Load data serialized under the given name, it could be a tensor or regular data. - """ - if name not in self._spec: - raise ValueError('The specified key {} is not found in {}'.format(name, self._tensorio_path)) - - data_type = self._spec[name]['type'] - if data_type == 'tensor': - return self._load_tensor(name) - else: - return self._load_nontensor_data(name) - - def load_all(self): - """ - Load all tensors stored in the tensorio directory. - Returns a dictionary from tensor name to numpy arrays. - """ - return {k: self._load(k) for k in self._spec} - - ########################################### - # The below are utilities for convenience # - ########################################### - def __getitem__(self, k): - """ - Shorthand for _load_tensor, but also supports hierarchical access like: tensorio['a']['b']['1'] - """ - if k in self._spec: - # We have a full tensor name, directly load it. - return self._load_tensor(k) - else: - return _KeyRecorder(self)[k] + def __init__(self, tensorio_path, mmap_tensor=True): + self._tensorio_path = tensorio_path + self._mmap_tensor = mmap_tensor + + # Make sure we can locate spec.yaml. + yaml_file = os.path.join(tensorio_path, "spec.yaml") + if not os.path.exists(yaml_file): + raise ValueError("Invalid tensorio path: no spec.yaml found.") + + # load spec.yaml. + with open(yaml_file, "r") as file_open: + # Note that tensor names in the yaml are like this: \"weight\".\'1\' + # For user-friendliness, we remove the quotes. + _spec = yaml.safe_load(file_open) + self._spec = { + k.replace("'", "").replace('"', ""): v for (k, v) in _spec.items() + } + + def list_tensors(self): + """ + Returns a list of tensors saved in the given path. + """ + return self._spec.keys() + + def _load_tensor(self, name): + """ + Load Tensor with the given name. + Raise value error if the named tensor is not found. + Returns a numpy array if the named tensor is found. + """ + tensor_info = self._spec[name] + if tensor_info["type"] != "tensor": + raise ValueError( + "Trying to load a tensor of unknown type: " + tensor_info["type"] + ) + + filename = os.path.join(self._tensorio_path, tensor_info["filename"]) + (data_type, element_size) = _get_data_type(tensor_info["tensorType"]) + + np_array = np.memmap( + filename, + dtype=data_type, + mode="r", + # -1 because lua offset is 1 based. + offset=(tensor_info["offset"] - 1) * element_size, + shape=tuple(tensor_info["size"]), + order="C", + ) + + return np_array if self._mmap_tensor else np_array[:].copy() + + def _load_nontensor_data(self, name): + """ + Load non-tensor data with the given name. + Returns a python string. 
+ """ + tensor_info = self._spec[name] + return tensor_info["data"] + + def _load(self, name): + """ + Load data serialized under the given name, it could be a tensor or regular data. + """ + if name not in self._spec: + raise ValueError( + "The specified key {} is not found in {}".format( + name, self._tensorio_path + ) + ) + + data_type = self._spec[name]["type"] + if data_type == "tensor": + return self._load_tensor(name) + else: + return self._load_nontensor_data(name) + + def load_all(self): + """ + Load all tensors stored in the tensorio directory. + Returns a dictionary from tensor name to numpy arrays. + """ + return {k: self._load(k) for k in self._spec} + + ########################################### + # The below are utilities for convenience # + ########################################### + def __getitem__(self, k): + """ + Shorthand for _load_tensor, but also supports hierarchical access like: tensorio['a']['b']['1'] + """ + if k in self._spec: + # We have a full tensor name, directly load it. + return self._load_tensor(k) + else: + return _KeyRecorder(self)[k] diff --git a/twml/twml/tracking/experiment_tracker.py b/twml/twml/tracking/experiment_tracker.py index 4f275ba4b..0d314f170 100644 --- a/twml/twml/tracking/experiment_tracker.py +++ b/twml/twml/tracking/experiment_tracker.py @@ -1,543 +1,651 @@ """ This module contains the experiment tracker for tracking training in ML Metastore """ -from contextlib import contextmanager -from datetime import datetime import getpass import hashlib import os import re import sys import time +from contextlib import contextmanager +from datetime import datetime -from absl import logging import tensorflow.compat.v1 as tf -from twml.hooks import MetricsUpdateHook +from absl import logging +from twml.hooks import MetricsUpdateHook try: - from urllib import quote as encode_url + from urllib import quote as encode_url except ImportError: - from urllib.parse import quote as encode_url + from urllib.parse import quote as encode_url try: - # ML Metastore packages might not be available on GCP. - # If they are not found, tracking is disabled - import requests - from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient - from com.twitter.mlmetastore.modelrepo.core.path import ( - check_valid_id, get_components_from_id, generate_id) - from com.twitter.mlmetastore.modelrepo.core import ( - DeepbirdRun, Experiment, FeatureConfig, FeatureConfigFeature, Model, ProgressReport, Project, StatusUpdate) + # ML Metastore packages might not be available on GCP. + # If they are not found, tracking is disabled + import requests + from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient + from com.twitter.mlmetastore.modelrepo.core import ( + DeepbirdRun, + Experiment, + FeatureConfig, + FeatureConfigFeature, + Model, + ProgressReport, + Project, + StatusUpdate, + ) + from com.twitter.mlmetastore.modelrepo.core.path import ( + check_valid_id, + generate_id, + get_components_from_id, + ) except ImportError: - ModelRepoClient = None + ModelRepoClient = None class ExperimentTracker(object): - """ - A tracker that records twml runs in ML Metastore. - """ - - def __init__(self, params, run_config, save_dir): """ + A tracker that records twml runs in ML Metastore. + """ + + def __init__(self, params, run_config, save_dir): + """ + + Args: + params (python dict): + The trainer params. ExperimentTracker uses `params.experiment_tracking_path` (String) and + `params.disable_experiment_tracking`. 
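
Stepping back to the `TensorIO` class completed above, a brief usage sketch
(the directory path is a placeholder; it must contain the `spec.yaml` written
by the serializer):

.. code-block:: python

    tio = TensorIO("/path/to/tensorio_dir", mmap_tensor=False)
    print(list(tio.list_tensors()))  # tensor names parsed from spec.yaml
    weight = tio["weight"]["1"]      # hierarchical access via _KeyRecorder ("weight.1")
    everything = tio.load_all()      # dict: tensor name -> numpy array
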
+ If `experiment_tracking_path` is set to None, the tracker tries to guess a path with + save_dir. + If `disable_experiment_tracking` is True, the tracker is disabled. + run_config (tf.estimator.RunConfig): + The run config used by the estimator. + save_dir (str): + save_dir of the trainer + """ + if isinstance(params, dict): + self._params = params + else: + # preserving backward compatibility for people still using HParams + logging.warning( + "Please stop using HParams and use python dicts. HParams are removed in TF 2" + ) + self._params = dict( + (k, v) for k, v in params.values().items() if v != "null" + ) + self._run_config = run_config + self._graceful_shutdown_port = self._params.get("health_port") + + self.tracking_path = self._params.get("experiment_tracking_path") + is_tracking_path_too_long = ( + self.tracking_path is not None and len(self.tracking_path) > 256 + ) - Args: - params (python dict): - The trainer params. ExperimentTracker uses `params.experiment_tracking_path` (String) and - `params.disable_experiment_tracking`. - If `experiment_tracking_path` is set to None, the tracker tries to guess a path with - save_dir. - If `disable_experiment_tracking` is True, the tracker is disabled. - run_config (tf.estimator.RunConfig): - The run config used by the estimator. - save_dir (str): - save_dir of the trainer - """ - if isinstance(params, dict): - self._params = params - else: - # preserving backward compatibility for people still using HParams - logging.warning("Please stop using HParams and use python dicts. HParams are removed in TF 2") - self._params = dict((k, v) for k, v in params.values().items() if v != 'null') - self._run_config = run_config - self._graceful_shutdown_port = self._params.get('health_port') - - self.tracking_path = self._params.get('experiment_tracking_path') - is_tracking_path_too_long = self.tracking_path is not None and len(self.tracking_path) > 256 - - if is_tracking_path_too_long: - raise ValueError("Experiment Tracking Path longer than 256 characters") - - self.disabled = ( - self._params.get('disable_experiment_tracking', False) or - not self._is_env_eligible_for_tracking() or - ModelRepoClient is None - ) + if is_tracking_path_too_long: + raise ValueError("Experiment Tracking Path longer than 256 characters") - self._is_hogwild = bool(os.environ.get('TWML_HOGWILD_PORTS')) + self.disabled = ( + self._params.get("disable_experiment_tracking", False) + or not self._is_env_eligible_for_tracking() + or ModelRepoClient is None + ) - self._is_distributed = bool(os.environ.get('TF_CONFIG')) + self._is_hogwild = bool(os.environ.get("TWML_HOGWILD_PORTS")) - self._client = None if self.disabled else ModelRepoClient() + self._is_distributed = bool(os.environ.get("TF_CONFIG")) - run_name_from_environ = self.run_name_from_environ() - run_name_can_be_inferred = ( - self.tracking_path is not None or run_name_from_environ is not None) + self._client = None if self.disabled else ModelRepoClient() - # Turn the flags off as needed in hogwild / distributed - if self._is_hogwild or self._is_distributed: - self._env_eligible_for_recording_experiment = ( - self._run_config.task_type == "evaluator") - if run_name_can_be_inferred: - self._env_eligible_for_recording_export_metadata = ( - self._run_config.task_type == "chief") - else: - logging.info( - 'experiment_tracking_path is not set and can not be inferred. 
' - 'Recording export metadata is disabled because the chief node and eval node ' - 'are setting different experiment tracking paths.') - self._env_eligible_for_recording_export_metadata = False - else: - # Defaults to True - self._env_eligible_for_recording_experiment = True - self._env_eligible_for_recording_export_metadata = True - - if not self.disabled: - # Sanitize passed in experiment tracking paths. e.g. own:proJ:exp:Run.Name - # -> own:proj:exp:Run_Name - if self.tracking_path: - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error(f'Invalid experiment tracking path provided. Sanitizing: {self.tracking_path}\nError: {err}') - self.tracking_path = generate_id( - owner=self.path['owner'], - project_name=self.path['project_name'], - experiment_name=self.path['experiment_name'], - run_name=self.path['run_name'] - ) - logging.error(f'Generated sanitized experiment tracking path: {self.tracking_path}') - else: - logging.info( - 'No experiment_tracking_path set. Experiment Tracker will try to guess a path') - self.tracking_path = self.guess_path(save_dir, run_name_from_environ) - logging.info('Guessed path: %s', self.tracking_path) - - # additional check to see if generated path is valid - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error( - 'Could not generate valid experiment tracking path. Disabling tracking. ' + - 'Error:\n{}'.format(err) + run_name_from_environ = self.run_name_from_environ() + run_name_can_be_inferred = ( + self.tracking_path is not None or run_name_from_environ is not None ) - self.disabled = True - self.project_id = None if self.disabled else '{}:{}'.format( - self.path['owner'], self.path['project_name']) - self.base_run_id = None if self.disabled else self.tracking_path - self._current_run_name_suffix = None - - self._current_tracker_hook = None - - if self.disabled: - logging.info('Experiment Tracker is disabled') - else: - logging.info('Experiment Tracker initialized with base run id: %s', self.base_run_id) - - @contextmanager - def track_experiment(self, eval_hooks, get_estimator_spec_fn, name=None): - """ - A context manager for tracking experiment. It should wrap the training loop. - An experiment tracker eval hook is appended to eval_hooks to collect metrics. - - Args: - eval_hooks (list): - The list of eval_hooks to be used. When it's not None, and does not contain any , - MetricsUpdateHook an experiment tracker eval hook is appended to it. When it contains - any MetricsUpdateHook, this tracker is disabled to avoid conflict with legacy Model Repo - tracker (`TrackRun`). - get_estimator_spec_fn (func): - A function to get the current EstimatorSpec of the trainer, used by the eval hook. - name (str); - Name of this training or evaluation. Used as a suffix of the run_id. - - Returns: - The tracker's eval hook which is appended to eval_hooks. - """ + # Turn the flags off as needed in hogwild / distributed + if self._is_hogwild or self._is_distributed: + self._env_eligible_for_recording_experiment = ( + self._run_config.task_type == "evaluator" + ) + if run_name_can_be_inferred: + self._env_eligible_for_recording_export_metadata = ( + self._run_config.task_type == "chief" + ) + else: + logging.info( + "experiment_tracking_path is not set and can not be inferred. " + "Recording export metadata is disabled because the chief node and eval node " + "are setting different experiment tracking paths." 
+ ) + self._env_eligible_for_recording_export_metadata = False + else: + # Defaults to True + self._env_eligible_for_recording_experiment = True + self._env_eligible_for_recording_export_metadata = True + + if not self.disabled: + # Sanitize passed in experiment tracking paths. e.g. own:proJ:exp:Run.Name + # -> own:proj:exp:Run_Name + if self.tracking_path: + try: + check_valid_id(self.tracking_path) + except ValueError as err: + logging.error( + f"Invalid experiment tracking path provided. Sanitizing: {self.tracking_path}\nError: {err}" + ) + self.tracking_path = generate_id( + owner=self.path["owner"], + project_name=self.path["project_name"], + experiment_name=self.path["experiment_name"], + run_name=self.path["run_name"], + ) + logging.error( + f"Generated sanitized experiment tracking path: {self.tracking_path}" + ) + else: + logging.info( + "No experiment_tracking_path set. Experiment Tracker will try to guess a path" + ) + self.tracking_path = self.guess_path(save_dir, run_name_from_environ) + logging.info("Guessed path: %s", self.tracking_path) + + # additional check to see if generated path is valid + try: + check_valid_id(self.tracking_path) + except ValueError as err: + logging.error( + "Could not generate valid experiment tracking path. Disabling tracking. " + + "Error:\n{}".format(err) + ) + self.disabled = True + + self.project_id = ( + None + if self.disabled + else "{}:{}".format(self.path["owner"], self.path["project_name"]) + ) + self.base_run_id = None if self.disabled else self.tracking_path + self._current_run_name_suffix = None - # disable this tracker if legacy TrackRun hook is present - # TODO: remove this once we completely deprecate the old TrackRun interface - if eval_hooks is not None: - self.disabled = self.disabled or any(isinstance(x, MetricsUpdateHook) for x in eval_hooks) - - logging.info('Is environment eligible for recording experiment: %s', - self._env_eligible_for_recording_experiment) - - if self._env_eligible_for_recording_experiment and self._graceful_shutdown_port: - requests.post('http://localhost:{}/track_training_start'.format( - self._graceful_shutdown_port - )) - - if self.disabled or eval_hooks is None: - yield None - else: - assert self._current_tracker_hook is None, 'experiment tracking has been started already' - - if name is not None: - self._current_run_name_suffix = '_' + name - - logging.info('Starting experiment tracking. Path: %s', self._current_run_id) - logging.info('Is environment eligible for recording export metadata: %s', - self._env_eligible_for_recording_export_metadata) - logging.info('This run will be available at: http://go/mldash/experiments/%s', - encode_url(self.experiment_id)) - - try: - self._record_run() - self._add_run_status(StatusUpdate(self._current_run_id, status='RUNNING')) - self._register_for_graceful_shutdown() - - self._current_tracker_hook = self.create_eval_hook(get_estimator_spec_fn) - except Exception as err: - logging.error( - 'Failed to record run. This experiment will not be tracked. Error: %s', str(err)) self._current_tracker_hook = None - if self._current_tracker_hook is None: - yield None - else: - try: - eval_hooks.append(self._current_tracker_hook) - yield self._current_tracker_hook - except Exception as err: - self._add_run_status( - StatusUpdate(self._current_run_id, status='FAILED', description=str(err))) - self._deregister_for_graceful_shutdown() - self._current_tracker_hook = None - self._current_run_name_suffix = None - logging.error('Experiment tracking done. 
Experiment failed.')
-          raise
-
-        try:
-          if self._current_tracker_hook.metric_values:
-            self._record_update(self._current_tracker_hook.metric_values)
-          self._add_run_status(StatusUpdate(self._current_run_id, status='SUCCESS'))
-          logging.info('Experiment tracking done. Experiment succeeded.')
-        except Exception as err:
-          logging.error(
-            'Failed to update mark run as successful. Error: %s', str(err))
-        finally:
-          self._deregister_for_graceful_shutdown()
-          self._current_tracker_hook = None
-          self._current_run_name_suffix = None
-
-  def create_eval_hook(self, get_estimator_spec_fn):
-    """
-    Create an eval_hook to track eval metrics
+        if self.disabled:
+            logging.info("Experiment Tracker is disabled")
+        else:
+            logging.info(
+                "Experiment Tracker initialized with base run id: %s", self.base_run_id
+            )
+
+    @contextmanager
+    def track_experiment(self, eval_hooks, get_estimator_spec_fn, name=None):
+        """
+        A context manager for tracking an experiment. It should wrap the training loop.
+        An experiment tracker eval hook is appended to eval_hooks to collect metrics.
+
+        Args:
+            eval_hooks (list):
+                The list of eval_hooks to be used. When it's not None and does not contain any
+                MetricsUpdateHook, an experiment tracker eval hook is appended to it. When it contains
+                any MetricsUpdateHook, this tracker is disabled to avoid conflict with legacy Model Repo
+                tracker (`TrackRun`).
+            get_estimator_spec_fn (func):
+                A function to get the current EstimatorSpec of the trainer, used by the eval hook.
+            name (str):
+                Name of this training or evaluation. Used as a suffix of the run_id.
+
+        Returns:
+            The tracker's eval hook which is appended to eval_hooks.
+        """
+
+        # disable this tracker if legacy TrackRun hook is present
+        # TODO: remove this once we completely deprecate the old TrackRun interface
+        if eval_hooks is not None:
+            self.disabled = self.disabled or any(
+                isinstance(x, MetricsUpdateHook) for x in eval_hooks
+            )
 
-    Args:
-      get_estimator_spec_fn (func):
-        A function that returns the current EstimatorSpec of the trainer.
-    """
-    return MetricsUpdateHook(
-      get_estimator_spec_fn=get_estimator_spec_fn,
-      add_metrics_fn=self._record_update)
-
-  def register_model(self, export_path):
-    """
-    Record the exported model.
-
-    Args:
-      export_path (str):
-        The path to the exported model.
-    """
-    if self.disabled:
-      return None
-
-    try:
-      logging.info('Model is exported to %s. Computing hash of the model.', export_path)
-      model_hash = self.compute_model_hash(export_path)
-      logging.info('Model hash: %s. Registering it in ML Metastore.', model_hash)
-      self._client.register_model(Model(model_hash, self.path['owner'], self.base_run_id))
-    except Exception as err:
-      logging.error('Failed to register model. Error: %s', str(err))
-
-  def export_feature_spec(self, feature_spec_dict):
-    """
-    Export feature spec to ML Metastore (go/ml-metastore).
-
-    Please note that the feature list in FeatureConfig only keeps the list of feature hash ids due
-    to the 1mb upper limit for values in manhattan, and more specific information (feature type,
-    feature name) for each feature config feature is stored separately in FeatureConfigFeature dataset.
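
A sketch of the `track_experiment` contract shown in this hunk (editor's
illustration; `tracker` and `estimator_spec_fn` are assumed to exist, and
`run_evaluation` is a hypothetical eval loop):

.. code-block:: python

    eval_hooks = []
    with tracker.track_experiment(eval_hooks, estimator_spec_fn, name="eval") as hook:
        if hook is not None:
            run_evaluation(hooks=eval_hooks)
    # On success the run is marked SUCCESS in ML Metastore; if the body
    # raises, the run is marked FAILED and the exception is re-raised.
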
- - Args: - feature_spec_dict (dict): A dictionary obtained from FeatureConfig.get_feature_spec() - """ - if self.disabled or not self._env_eligible_for_recording_export_metadata: - return None - - try: - logging.info('Exporting feature spec to ML Metastore.') - feature_list = feature_spec_dict['features'] - label_list = feature_spec_dict['labels'] - weight_list = feature_spec_dict['weight'] - self._client.add_feature_config(FeatureConfig(self._current_run_id, list(feature_list.keys()), - list(label_list.keys()), list(weight_list.keys()))) - - feature_config_features = [ - FeatureConfigFeature( - hash_id=_feature_hash_id, - feature_name=_feature['featureName'], - feature_type=_feature['featureType'] - ) - for _feature_hash_id, _feature in zip(feature_list.keys(), feature_list.values()) - ] - self._client.add_feature_config_features(list(feature_list.keys()), feature_config_features) - - feature_config_labels = [ - FeatureConfigFeature( - hash_id=_label_hash_id, - feature_name=_label['featureName'] - ) - for _label_hash_id, _label in zip(label_list.keys(), label_list.values()) - ] - self._client.add_feature_config_features(list(label_list.keys()), feature_config_labels) - - feature_config_weights = [ - FeatureConfigFeature( - hash_id=_weight_hash_id, - feature_name=_weight['featureName'], - feature_type=_weight['featureType'] + logging.info( + "Is environment eligible for recording experiment: %s", + self._env_eligible_for_recording_experiment, ) - for _weight_hash_id, _weight in zip(weight_list.keys(), weight_list.values()) - ] - self._client.add_feature_config_features(list(weight_list.keys()), feature_config_weights) - - except Exception as err: - logging.error('Failed to export feature spec. Error: %s', str(err)) - - @property - def path(self): - if self.disabled: - return None - return get_components_from_id(self.tracking_path, ensure_valid_id=False) - - @property - def experiment_id(self): - if self.disabled: - return None - return '%s:%s:%s' % (self.path['owner'], self.path['project_name'], - self.path['experiment_name']) - - @property - def _current_run_name(self): - """ - Return the current run name. - """ - if self._current_run_name_suffix is not None: - return self.path['run_name'] + self._current_run_name_suffix - else: - return self.path['run_name'] - - @property - def _current_run_id(self): - """ - Return the current run id. - """ - if self._current_run_name_suffix is not None: - return self.base_run_id + self._current_run_name_suffix - else: - return self.base_run_id - def get_run_status(self) -> str: - if not self.disabled: - return self._client.get_latest_dbv2_status(self._current_run_id) - - def _add_run_status(self, status): - """ - Add run status with underlying client. - - Args: - status (StatusUpdate): - The status update to add. - """ - if not self.disabled and self._env_eligible_for_recording_experiment: - self._client.add_run_status(status) - - def _record_run(self): - """ - Record the run in ML Metastore. 
-    """
-    if self.disabled or not self._env_eligible_for_recording_experiment:
-      return None
-
-    if not self._client.project_exists(self.project_id):
-      self._client.add_project(Project(self.path['project_name'], self.path['owner']))
-      time.sleep(1)
-
-    if not self._client.experiment_exists(self.experiment_id):
-      self._client.add_experiment(Experiment(
-        self.path['experiment_name'], self.path['owner'], self.project_id, ''))
-      time.sleep(1)
-
-    run = DeepbirdRun(self.experiment_id, self._current_run_name, '',
-                      {'raw_command': ' '.join(sys.argv)}, self._params)
-    self._client.add_deepbird_run(run, force=True)
-    time.sleep(1)
-
-  def _record_update(self, metrics):
-    """
-    Record metrics update in ML Metastore.
-
-    Args:
-      metrics (dict):
-        The dict of the metrics and their values.
-    """
-
-    if self.disabled or not self._env_eligible_for_recording_experiment:
-      return None
-
-    reported_metrics = {}
-    for k, v in metrics.items():
-
-      if hasattr(v, 'item'):
-        reported_metrics[k] = v.item() if v.size == 1 else str(v.tolist())
-      else:
-        logging.warning("Ignoring %s because the value (%s) is not valid" % (k, str(v)))
-
-    report = ProgressReport(self._current_run_id, reported_metrics)
+        if self._env_eligible_for_recording_experiment and self._graceful_shutdown_port:
+            requests.post(
+                "http://localhost:{}/track_training_start".format(
+                    self._graceful_shutdown_port
+                )
+            )
+
+        if self.disabled or eval_hooks is None:
+            yield None
+        else:
+            assert (
+                self._current_tracker_hook is None
+            ), "experiment tracking has been started already"
+
+            if name is not None:
+                self._current_run_name_suffix = "_" + name
+
+            logging.info("Starting experiment tracking. Path: %s", self._current_run_id)
+            logging.info(
+                "Is environment eligible for recording export metadata: %s",
+                self._env_eligible_for_recording_export_metadata,
+            )
+            logging.info(
+                "This run will be available at: http://go/mldash/experiments/%s",
+                encode_url(self.experiment_id),
+            )
+
+            try:
+                self._record_run()
+                self._add_run_status(
+                    StatusUpdate(self._current_run_id, status="RUNNING")
+                )
+                self._register_for_graceful_shutdown()
+
+                self._current_tracker_hook = self.create_eval_hook(
+                    get_estimator_spec_fn
+                )
+            except Exception as err:
+                logging.error(
+                    "Failed to record run. This experiment will not be tracked. Error: %s",
+                    str(err),
+                )
+                self._current_tracker_hook = None
+
+            if self._current_tracker_hook is None:
+                yield None
+            else:
+                try:
+                    eval_hooks.append(self._current_tracker_hook)
+                    yield self._current_tracker_hook
+                except Exception as err:
+                    self._add_run_status(
+                        StatusUpdate(
+                            self._current_run_id, status="FAILED", description=str(err)
+                        )
+                    )
+                    self._deregister_for_graceful_shutdown()
+                    self._current_tracker_hook = None
+                    self._current_run_name_suffix = None
+                    logging.error("Experiment tracking done. Experiment failed.")
+                    raise
+
+                try:
+                    if self._current_tracker_hook.metric_values:
+                        self._record_update(self._current_tracker_hook.metric_values)
+                    self._add_run_status(
+                        StatusUpdate(self._current_run_id, status="SUCCESS")
+                    )
+                    logging.info("Experiment tracking done. Experiment succeeded.")
+                except Exception as err:
+                    logging.error(
+                        "Failed to mark run as successful. 
Error: %s", str(err) + ) + finally: + self._deregister_for_graceful_shutdown() + self._current_tracker_hook = None + self._current_run_name_suffix = None + + def create_eval_hook(self, get_estimator_spec_fn): + """ + Create an eval_hook to track eval metrics + + Args: + get_estimator_spec_fn (func): + A function that returns the current EstimatorSpec of the trainer. + """ + return MetricsUpdateHook( + get_estimator_spec_fn=get_estimator_spec_fn, + add_metrics_fn=self._record_update, + ) - try: - self._client.add_progress_report(report) - except Exception as err: - logging.error('Failed to record metrics in ML Metastore. Error: {}'.format(err)) - logging.error('Run ID: {}'.format(self._current_run_id)) - logging.error('Progress Report: {}'.format(report.to_json_string())) + def register_model(self, export_path): + """ + Record the exported model. - def _register_for_graceful_shutdown(self): - """ - Register the tracker with the health server, enabling graceful shutdown. + Args: + export_path (str): + The path to the exported model. + """ + if self.disabled: + return None - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/register_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) + try: + logging.info( + "Model is exported to %s. Computing hash of the model.", export_path + ) + model_hash = self.compute_model_hash(export_path) + logging.info("Model hash: %s. Registering it in ML Metastore.", model_hash) + self._client.register_model( + Model(model_hash, self.path["owner"], self.base_run_id) + ) + except Exception as err: + logging.error("Failed to register model. Error: %s", str(err)) - def _deregister_for_graceful_shutdown(self): - """ - Deregister the tracker with the health server, disabling graceful shutdown. + def export_feature_spec(self, feature_spec_dict): + """ + Export feature spec to ML Metastore (go/ml-metastore). - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/deregister_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) + Please note that the feature list in FeatureConfig only keeps the list of feature hash ids due + to the 1mb upper limit for values in manhattan, and more specific information (feature type, + feature name) for each feature config feature is stored separately in FeatureConfigFeature dataset. - def _is_env_eligible_for_tracking(self): - """ - Determine if experiment tracking should run in the env. 
- """ - is_unit_test = ( - os.environ.get('PYTEST_CURRENT_TEST') is not None and - os.environ.get('TEST_EXP_TRACKER') is None - ) + Args: + feature_spec_dict (dict): A dictionary obtained from FeatureConfig.get_feature_spec() + """ + if self.disabled or not self._env_eligible_for_recording_export_metadata: + return None - is_running_on_ci = ( - getpass.getuser() == 'scoot-service' and - os.environ.get('TEST_EXP_TRACKER') is None - ) + try: + logging.info("Exporting feature spec to ML Metastore.") + feature_list = feature_spec_dict["features"] + label_list = feature_spec_dict["labels"] + weight_list = feature_spec_dict["weight"] + self._client.add_feature_config( + FeatureConfig( + self._current_run_id, + list(feature_list.keys()), + list(label_list.keys()), + list(weight_list.keys()), + ) + ) + + feature_config_features = [ + FeatureConfigFeature( + hash_id=_feature_hash_id, + feature_name=_feature["featureName"], + feature_type=_feature["featureType"], + ) + for _feature_hash_id, _feature in zip( + feature_list.keys(), feature_list.values() + ) + ] + self._client.add_feature_config_features( + list(feature_list.keys()), feature_config_features + ) + + feature_config_labels = [ + FeatureConfigFeature( + hash_id=_label_hash_id, feature_name=_label["featureName"] + ) + for _label_hash_id, _label in zip( + label_list.keys(), label_list.values() + ) + ] + self._client.add_feature_config_features( + list(label_list.keys()), feature_config_labels + ) + + feature_config_weights = [ + FeatureConfigFeature( + hash_id=_weight_hash_id, + feature_name=_weight["featureName"], + feature_type=_weight["featureType"], + ) + for _weight_hash_id, _weight in zip( + weight_list.keys(), weight_list.values() + ) + ] + self._client.add_feature_config_features( + list(weight_list.keys()), feature_config_weights + ) - return ( - not is_unit_test and - not is_running_on_ci - ) + except Exception as err: + logging.error("Failed to export feature spec. Error: %s", str(err)) + + @property + def path(self): + if self.disabled: + return None + return get_components_from_id(self.tracking_path, ensure_valid_id=False) + + @property + def experiment_id(self): + if self.disabled: + return None + return "%s:%s:%s" % ( + self.path["owner"], + self.path["project_name"], + self.path["experiment_name"], + ) - @classmethod - def run_name_from_environ(cls): - """ - Create run id from environment if possible. - """ - job_name = os.environ.get("TWML_JOB_NAME") - job_launch_time = os.environ.get("TWML_JOB_LAUNCH_TIME") - - if not job_name or not job_launch_time: - return None - - try: - # job_launch_time should be in isoformat - # python2 doesnt support datetime.fromisoformat, so use hardcoded format string. - job_launch_time_formatted = datetime.strptime(job_launch_time, - "%Y-%m-%dT%H:%M:%S.%f") - except ValueError: - # Fallback in case aurora config is generating datetime in a different format. - job_launch_time_formatted = (job_launch_time - .replace("-", "_").replace("T", "_") - .replace(":", "_").replace(".", "_")) - - return '{}_{}'.format( - job_name, job_launch_time_formatted.strftime('%m_%d_%Y_%I_%M_%p')) - - @classmethod - def guess_path(cls, save_dir, run_name=None): - """ - Guess an experiment tracking path based on save_dir. + @property + def _current_run_name(self): + """ + Return the current run name. 
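
To make the id scheme above concrete (a hypothetical tracking path; the real
components come from `get_components_from_id`):

.. code-block:: python

    # tracking_path "jdoe:demo:my_exp:run_1" decomposes into:
    path = {
        "owner": "jdoe",
        "project_name": "demo",
        "experiment_name": "my_exp",
        "run_name": "run_1",
    }
    project_id = "jdoe:demo"            # "{owner}:{project_name}"
    experiment_id = "jdoe:demo:my_exp"  # adds ":{experiment_name}"
    run_id = "jdoe:demo:my_exp:run_1"   # base_run_id, plus an optional "_<name>" suffix
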
+ """ + if self._current_run_name_suffix is not None: + return self.path["run_name"] + self._current_run_name_suffix + else: + return self.path["run_name"] + + @property + def _current_run_id(self): + """ + Return the current run id. + """ + if self._current_run_name_suffix is not None: + return self.base_run_id + self._current_run_name_suffix + else: + return self.base_run_id + + def get_run_status(self) -> str: + if not self.disabled: + return self._client.get_latest_dbv2_status(self._current_run_id) + + def _add_run_status(self, status): + """ + Add run status with underlying client. + + Args: + status (StatusUpdate): + The status update to add. + """ + if not self.disabled and self._env_eligible_for_recording_experiment: + self._client.add_run_status(status) + + def _record_run(self): + """ + Record the run in ML Metastore. + """ + if self.disabled or not self._env_eligible_for_recording_experiment: + return None + + if not self._client.project_exists(self.project_id): + self._client.add_project( + Project(self.path["project_name"], self.path["owner"]) + ) + time.sleep(1) + + if not self._client.experiment_exists(self.experiment_id): + self._client.add_experiment( + Experiment( + self.path["experiment_name"], + self.path["owner"], + self.project_id, + "", + ) + ) + time.sleep(1) + + run = DeepbirdRun( + self.experiment_id, + self._current_run_name, + "", + {"raw_command": " ".join(sys.argv)}, + self._params, + ) + self._client.add_deepbird_run(run, force=True) + time.sleep(1) - Returns: - (str) guessed path - """ - if not run_name: - run_name = 'Unnamed_{}'.format(datetime.now().strftime('%m_%d_%Y_%I_%M_%p')) + def _record_update(self, metrics): + """ + Record metrics update in ML Metastore. - if save_dir.startswith('hdfs://'): - path_match = re.search(r'/user/([a-z0-9\-_]+)/([a-z0-9\-_]+)', save_dir) + Args: + metrics (dict): + The dict of the metrics and their values. + """ - if path_match: - groups = path_match.groups() - user = groups[0] - project_name = groups[1] + if self.disabled or not self._env_eligible_for_recording_experiment: + return None - return generate_id(user, 'default', project_name, run_name) + reported_metrics = {} + for k, v in metrics.items(): + if hasattr(v, "item"): + reported_metrics[k] = v.item() if v.size == 1 else str(v.tolist()) + else: + logging.warning( + "Ignoring %s because the value (%s) is not valid" % (k, str(v)) + ) - user = getpass.getuser() - project_name = re.sub(r'^[a-z0-9\-_]', os.path.basename(save_dir), '') - if not project_name: - project_name = 'unnamed' + report = ProgressReport(self._current_run_id, reported_metrics) - return generate_id(user, 'default', project_name, run_name) + try: + self._client.add_progress_report(report) + except Exception as err: + logging.error( + "Failed to record metrics in ML Metastore. Error: {}".format(err) + ) + logging.error("Run ID: {}".format(self._current_run_id)) + logging.error("Progress Report: {}".format(report.to_json_string())) + + def _register_for_graceful_shutdown(self): + """ + Register the tracker with the health server, enabling graceful shutdown. + + Returns: + (Response) health server response + """ + if ( + self._graceful_shutdown_port + and not self.disabled + and self._env_eligible_for_recording_experiment + ): + return requests.post( + "http://localhost:{}/register_id/{}".format( + self._graceful_shutdown_port, self._current_run_id + ) + ) + + def _deregister_for_graceful_shutdown(self): + """ + Deregister the tracker with the health server, disabling graceful shutdown. 
+ + Returns: + (Response) health server response + """ + if ( + self._graceful_shutdown_port + and not self.disabled + and self._env_eligible_for_recording_experiment + ): + return requests.post( + "http://localhost:{}/deregister_id/{}".format( + self._graceful_shutdown_port, self._current_run_id + ) + ) + + def _is_env_eligible_for_tracking(self): + """ + Determine if experiment tracking should run in the env. + """ + is_unit_test = ( + os.environ.get("PYTEST_CURRENT_TEST") is not None + and os.environ.get("TEST_EXP_TRACKER") is None + ) - @classmethod - def compute_model_hash(cls, export_path): - """ - Computes the hash of an exported model. This is a gfile version of - twitter.mlmetastore.common.versioning.compute_hash. The two functions should generate - the same hash when given the same model. + is_running_on_ci = ( + getpass.getuser() == "scoot-service" + and os.environ.get("TEST_EXP_TRACKER") is None + ) - Args: - export_path (str): - The path to the exported model. + return not is_unit_test and not is_running_on_ci - Returns: - (str) hash of the exported model - """ - paths = [] - for path, subdirs, files in tf.io.gfile.walk(export_path): - for name in sorted(files): - paths.append(os.path.join(path, name)) + @classmethod + def run_name_from_environ(cls): + """ + Create run id from environment if possible. + """ + job_name = os.environ.get("TWML_JOB_NAME") + job_launch_time = os.environ.get("TWML_JOB_LAUNCH_TIME") - paths.sort() - hash_object = hashlib.new('sha1') + if not job_name or not job_launch_time: + return None - for path in paths: - with tf.io.gfile.GFile(path, "rb") as file: - hash_object.update(file.read()) + try: + # job_launch_time should be in isoformat + # python2 doesnt support datetime.fromisoformat, so use hardcoded format string. + job_launch_time_formatted = datetime.strptime( + job_launch_time, "%Y-%m-%dT%H:%M:%S.%f" + ) + except ValueError: + # Fallback in case aurora config is generating datetime in a different format. + job_launch_time_formatted = ( + job_launch_time.replace("-", "_") + .replace("T", "_") + .replace(":", "_") + .replace(".", "_") + ) + + return "{}_{}".format( + job_name, job_launch_time_formatted.strftime("%m_%d_%Y_%I_%M_%p") + ) - return hash_object.hexdigest() + @classmethod + def guess_path(cls, save_dir, run_name=None): + """ + Guess an experiment tracking path based on save_dir. + + Returns: + (str) guessed path + """ + if not run_name: + run_name = "Unnamed_{}".format(datetime.now().strftime("%m_%d_%Y_%I_%M_%p")) + + if save_dir.startswith("hdfs://"): + path_match = re.search(r"/user/([a-z0-9\-_]+)/([a-z0-9\-_]+)", save_dir) + + if path_match: + groups = path_match.groups() + user = groups[0] + project_name = groups[1] + + return generate_id(user, "default", project_name, run_name) + + user = getpass.getuser() + project_name = re.sub(r"^[a-z0-9\-_]", os.path.basename(save_dir), "") + if not project_name: + project_name = "unnamed" + + return generate_id(user, "default", project_name, run_name) + + @classmethod + def compute_model_hash(cls, export_path): + """ + Computes the hash of an exported model. This is a gfile version of + twitter.mlmetastore.common.versioning.compute_hash. The two functions should generate + the same hash when given the same model. + + Args: + export_path (str): + The path to the exported model. 
+ + Returns: + (str) hash of the exported model + """ + paths = [] + for path, subdirs, files in tf.io.gfile.walk(export_path): + for name in sorted(files): + paths.append(os.path.join(path, name)) + + paths.sort() + hash_object = hashlib.new("sha1") + + for path in paths: + with tf.io.gfile.GFile(path, "rb") as file: + hash_object.update(file.read()) + + return hash_object.hexdigest() diff --git a/twml/twml/trainers/__init__.py b/twml/twml/trainers/__init__.py index e6664d9a6..9dbaf3cf4 100644 --- a/twml/twml/trainers/__init__.py +++ b/twml/twml/trainers/__init__.py @@ -6,5 +6,5 @@ `_. """ -from .trainer import Trainer # noqa: F401 from .data_record_trainer import DataRecordTrainer # noqa: F401 +from .trainer import Trainer # noqa: F401 diff --git a/twml/twml/trainers/data_record_trainer.py b/twml/twml/trainers/data_record_trainer.py index 76dd16f80..370938743 100644 --- a/twml/twml/trainers/data_record_trainer.py +++ b/twml/twml/trainers/data_record_trainer.py @@ -58,764 +58,915 @@ import datetime import tensorflow.compat.v1 as tf +from absl import logging from twitter.deepbird.io.dal import dal_to_hdfs_path, is_dal_path + import twml -from twml.trainers import Trainer from twml.contrib.feature_importances.feature_importances import ( - compute_feature_importances, - TREE, - write_feature_importances_to_hdfs, - write_feature_importances_to_ml_dash) -from absl import logging + TREE, + compute_feature_importances, + write_feature_importances_to_hdfs, + write_feature_importances_to_ml_dash, +) +from twml.trainers import Trainer class DataRecordTrainer(Trainer): # pylint: disable=abstract-method - """ - The ``DataRecordTrainer`` implementation is intended to satisfy the most common use cases - at Twitter where only the build_graph methods needs to be overridden. - For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): """ - The DataRecordTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, DataRecordTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. + The ``DataRecordTrainer`` implementation is intended to satisfy the most common use cases + at Twitter where only the build_graph methods needs to be overridden. 
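
The hash in `compute_model_hash` above is stable because file paths are
collected and sorted before digesting; a local-filesystem equivalent
(editor's sketch, stdlib only, `model_dir` is a placeholder) would be:

.. code-block:: python

    import hashlib
    import os

    def local_model_hash(model_dir):
        paths = []
        for root, _, files in os.walk(model_dir):
            paths.extend(os.path.join(root, name) for name in files)
        digest = hashlib.new("sha1")
        for p in sorted(paths):  # fixed order -> reproducible hash
            with open(p, "rb") as f:
                digest.update(f.read())
        return digest.hexdigest()
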
+    For this reason, ``Trainer.[train,eval]_input_fn`` methods
+    assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format.
+
+    For use-cases that differ from this common Twitter use-case,
+    further Trainer methods can be overridden.
+    If that still doesn't provide enough flexibility, the user can always
+    use the tf.estimator.Estimator or tf.session.run directly.
     """
-    # NOTE: DO NOT MODIFY `params` BEFORE THIS CALL.
-    super(DataRecordTrainer, self).__init__(
-      name=name, params=params, build_graph_fn=build_graph_fn, **kwargs)
-
-    self._feature_config = feature_config
-
-    # date range parameters common to both training and evaluation data:
-    hour_resolution = self.params.get("hour_resolution", 1)
-    data_threads = self.params.get("data_threads", 4)
-    datetime_format = self.params.get("datetime_format", "%Y/%m/%d")
-
-    # retrieve the desired training dataset files
-    self._train_files = self.build_files_list(
-      files_list_path=self.params.get("train_files_list", None),
-      data_dir=self.params.get("train_data_dir", None),
-      start_datetime=self.params.get("train_start_datetime", None),
-      end_datetime=self.params.get("train_end_datetime", None),
-      datetime_format=datetime_format, data_threads=data_threads,
-      hour_resolution=hour_resolution, maybe_save=self.is_chief(),
-      overwrite=self.params.get("train_overwrite_files_list", False),
-    )
-
-    # retrieve the desired evaluation dataset files
-    eval_name = self.params.get("eval_name", None)
-
-    if eval_name == "train":
-      self._eval_files = self._train_files
-    else:
-      self._eval_files = self.build_files_list(
-        files_list_path=self.params.get("eval_files_list", None),
-        data_dir=self.params.get("eval_data_dir", None),
-        start_datetime=self.params.get("eval_start_datetime", None),
-        end_datetime=self.params.get("eval_end_datetime", None),
-        datetime_format=datetime_format, data_threads=data_threads,
-        hour_resolution=hour_resolution, maybe_save=self.is_chief(),
-        overwrite=self.params.get("eval_overwrite_files_list", False),
-      )
-
-    if not self.params.get("allow_train_eval_overlap"):
-      # if there is overlap between train and eval, error out!
-      if self._train_files and self._eval_files:
-        overlap_files = set(self._train_files) & set(self._eval_files)
+    def __init__(self, name, params, build_graph_fn, feature_config=None, **kwargs):
+        """
+        The DataRecordTrainer constructor builds a
+        ``tf.estimator.Estimator`` and stores it in self.estimator.
+        For this reason, DataRecordTrainer accepts the same Estimator constructor arguments.
+        It also accepts additional arguments to facilitate metric evaluation and multi-phase training
+        (init_from_dir, init_map).
+
+        Args:
+            parent arguments:
+                See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation
+                for a full list of arguments accepted by the parent class.
+            name, params, build_graph_fn (and other parent class args):
+                see documentation for twml.Trainer doc.
+            feature_config:
+                An object of type FeatureConfig describing what features to decode.
+                Defaults to None. But it is needed in the following cases:
+                - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn`
+                - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`.
+
+            **kwargs:
+                further kwargs can be specified and passed to the Estimator constructor.
+        """
+
+        # NOTE: DO NOT MODIFY `params` BEFORE THIS CALL. 
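+        # The parent constructor consumes `params` as-is to build the
+        # tf.estimator.Estimator stored in self.estimator (see the docstring above).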
+ super(DataRecordTrainer, self).__init__( + name=name, params=params, build_graph_fn=build_graph_fn, **kwargs + ) + + self._feature_config = feature_config + + # date range parameters common to both training and evaluation data: + hour_resolution = self.params.get("hour_resolution", 1) + data_threads = self.params.get("data_threads", 4) + datetime_format = self.params.get("datetime_format", "%Y/%m/%d") + + # retrieve the desired training dataset files + self._train_files = self.build_files_list( + files_list_path=self.params.get("train_files_list", None), + data_dir=self.params.get("train_data_dir", None), + start_datetime=self.params.get("train_start_datetime", None), + end_datetime=self.params.get("train_end_datetime", None), + datetime_format=datetime_format, + data_threads=data_threads, + hour_resolution=hour_resolution, + maybe_save=self.is_chief(), + overwrite=self.params.get("train_overwrite_files_list", False), + ) + + # retrieve the desired evaluation dataset files + eval_name = self.params.get("eval_name", None) + + if eval_name == "train": + self._eval_files = self._train_files else: - overlap_files = set() - if overlap_files: - raise ValueError("There is an overlap between train and eval files:\n %s" % - (overlap_files)) - - @staticmethod - def build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - if files_list_path: - files_list_path = twml.util.preprocess_path(files_list_path) - - if isinstance(start_datetime, datetime.datetime): - start_datetime = start_datetime.strftime(datetime_format) - if isinstance(end_datetime, datetime.datetime): - end_datetime = end_datetime.strftime(datetime_format) - - list_files_by_datetime_args = { - "base_path": data_dir, - "start_datetime": start_datetime, - "end_datetime": end_datetime, - "datetime_prefix_format": datetime_format, - "extension": "lzo", - "parallelism": data_threads, - "hour_resolution": hour_resolution, - "sort": True, - } - - # no cache of data file paths, just get the list by scraping the directory - if not files_list_path or not tf.io.gfile.exists(files_list_path): - # twml.util.list_files_by_datetime returns None if data_dir is None. - # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # the cached data file paths file exists. - files_info = twml.util.read_file(files_list_path, decode="json") - # use the cached list if data params match current params, - # or if current params are None - # Not including None checks for datetime_format and hour_resolution, - # since those are shared between eval and training. 
- if (all(param is None for param in [data_dir, start_datetime, end_datetime]) or - (files_info["data_dir"] == data_dir and - files_info["start_datetime"] == start_datetime and - files_info["end_datetime"] == end_datetime and - files_info["datetime_format"] == datetime_format and - files_info["hour_resolution"] == hour_resolution)): - files_list = files_info["files"] - elif overwrite: - # current params are not none and don't match saved params - # `overwrite` indicates we should thus update the list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # dont update the cached list - raise ValueError("Information in files_list is inconsistent with provided args.\n" - "Did you intend to overwrite files_list using " - "--train.overwrite_files_list or --eval.overwrite_files_list?\n" - "If you instead want to use the paths in files_list, ensure that " - "data_dir, start_datetime, and end_datetime are None.") - - if maybe_save and files_list_path and (overwrite or not tf.io.gfile.exists(files_list_path)): - save_dict = {} - save_dict["files"] = files_list - save_dict["data_dir"] = data_dir - save_dict["start_datetime"] = start_datetime - save_dict["end_datetime"] = end_datetime - save_dict["datetime_format"] = datetime_format - save_dict["hour_resolution"] = hour_resolution - twml.util.write_file(files_list_path, save_dict, encode="json") - - return files_list - - @staticmethod - def build_files_list(files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - ''' - When specifying DAL datasets, only data_dir, start_dateime, and end_datetime - should be given with the format: - - dal://{cluster}/{role}/{dataset_name}/{env} - - ''' - if not data_dir or not is_dal_path(data_dir): - logging.warn(f"Please consider specifying a dal:// dataset rather than passing a physical hdfs path.") - return DataRecordTrainer.build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite) - - del datetime_format - del data_threads - del hour_resolution - del maybe_save - del overwrite - - return dal_to_hdfs_path( - path=data_dir, - start_datetime=start_datetime, - end_datetime=end_datetime, - ) - - @property - def train_files(self): - return self._train_files - - @property - def eval_files(self): - return self._eval_files - - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `Trainer code <_modules/twml/trainers/trainer.html#Trainer.add_parser_arguments>`_ - and `DataRecordTrainer code - <_modules/twml/trainers/trainer.html#DataRecordTrainer.add_parser_arguments>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. 
- """ - parser = super(DataRecordTrainer, DataRecordTrainer).add_parser_arguments() - parser.add_argument( - "--train.files_list", "--train_files_list", type=str, default=None, - dest="train_files_list", - help="Path for a json file storing information on training data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "training files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if train_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--train.overwrite_files_list", "--train_overwrite_files_list", action="store_true", default=False, - dest="train_overwrite_files_list", - help="When the --train.files_list param is used, indicates whether to " - "overwrite the existing --train.files_list when there are differences " - "between the current and saved dataset args. Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.files_list", "--eval_files_list", type=str, default=None, - dest="eval_files_list", - help="Path for a json file storing information on evaluation data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "evaluation files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if eval_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--eval.overwrite_files_list", "--eval_overwrite_files_list", action="store_true", default=False, - dest="eval_overwrite_files_list", - help="When the --eval.files_list param is used, indicates whether to " - "overwrite the existing --eval.files_list when there are differences " - "between the current and saved dataset args. 
Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--eval.data_dir", "--eval_data_dir", type=str, default=None, - dest="eval_data_dir", - help="Path to the cross-validation data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--eval.start_date", "--eval_start_datetime", - type=str, default=None, - dest="eval_start_datetime", - help="Starting date for evaluating inside the eval data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.end_date", "--eval_end_datetime", type=str, default=None, - dest="eval_end_datetime", - help="Ending date for evaluating inside the eval data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %%Y/%%m/%%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") - parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") - parser.add_argument( - "--data_spec", type=str, required=True, - help="Path to data specification JSON file. This file is used to decode DataRecords") - parser.add_argument( - "--train.keep_rate", "--train_keep_rate", type=float, default=None, - dest="train_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--eval.keep_rate", "--eval_keep_rate", type=float, default=None, - dest="eval_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--train.parts_downsampling_rate", "--train_parts_downsampling_rate", - dest="train_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ + self._eval_files = self.build_files_list( + files_list_path=self.params.get("eval_files_list", None), + data_dir=self.params.get("eval_data_dir", None), + start_datetime=self.params.get("eval_start_datetime", None), + end_datetime=self.params.get("eval_end_datetime", None), + datetime_format=datetime_format, + data_threads=data_threads, + hour_resolution=hour_resolution, + maybe_save=self.is_chief(), + overwrite=self.params.get("eval_overwrite_files_list", False), + ) + + if not self.params.get("allow_train_eval_overlap"): + # if there is overlap between train and eval, error out! 
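+            # build_files_list may return None or an empty list (e.g. when no data
+            # args were provided), so only intersect when both lists are non-empty.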
+            if self._train_files and self._eval_files:
+                overlap_files = set(self._train_files) & set(self._eval_files)
+            else:
+                overlap_files = set()
+            if overlap_files:
+                raise ValueError(
+                    "There is an overlap between train and eval files:\n %s"
+                    % (overlap_files)
+                )
+
+    @staticmethod
+    def build_hdfs_files_list(
+        files_list_path,
+        data_dir,
+        start_datetime,
+        end_datetime,
+        datetime_format,
+        data_threads,
+        hour_resolution,
+        maybe_save,
+        overwrite,
+    ):
+        if files_list_path:
+            files_list_path = twml.util.preprocess_path(files_list_path)
+
+        if isinstance(start_datetime, datetime.datetime):
+            start_datetime = start_datetime.strftime(datetime_format)
+        if isinstance(end_datetime, datetime.datetime):
+            end_datetime = end_datetime.strftime(datetime_format)
+
+        list_files_by_datetime_args = {
+            "base_path": data_dir,
+            "start_datetime": start_datetime,
+            "end_datetime": end_datetime,
+            "datetime_prefix_format": datetime_format,
+            "extension": "lzo",
+            "parallelism": data_threads,
+            "hour_resolution": hour_resolution,
+            "sort": True,
+        }
+
+        # no cache of data file paths, just get the list by scraping the directory
+        if not files_list_path or not tf.io.gfile.exists(files_list_path):
+            # twml.util.list_files_by_datetime returns None if data_dir is None.
+            # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list
+            files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args)
+        else:
+            # the cached data file paths file exists.
+            files_info = twml.util.read_file(files_list_path, decode="json")
+            # use the cached list if data params match current params,
+            # or if current params are None
+            # Not including None checks for datetime_format and hour_resolution,
+            # since those are shared between eval and training.
+            if all(
+                param is None for param in [data_dir, start_datetime, end_datetime]
+            ) or (
+                files_info["data_dir"] == data_dir
+                and files_info["start_datetime"] == start_datetime
+                and files_info["end_datetime"] == end_datetime
+                and files_info["datetime_format"] == datetime_format
+                and files_info["hour_resolution"] == hour_resolution
+            ):
+                files_list = files_info["files"]
+            elif overwrite:
+                # current params are not None and don't match saved params
+                # `overwrite` indicates we should thus update the list
+                files_list = twml.util.list_files_by_datetime(
+                    **list_files_by_datetime_args
+                )
+            else:
+                # don't update the cached list
+                raise ValueError(
+                    "Information in files_list is inconsistent with provided args.\n"
+                    "Did you intend to overwrite files_list using "
+                    "--train.overwrite_files_list or --eval.overwrite_files_list?\n"
+                    "If you instead want to use the paths in files_list, ensure that "
+                    "data_dir, start_datetime, and end_datetime are None."
+                )
+
+        if (
+            maybe_save
+            and files_list_path
+            and (overwrite or not tf.io.gfile.exists(files_list_path))
+        ):
+            save_dict = {}
+            save_dict["files"] = files_list
+            save_dict["data_dir"] = data_dir
+            save_dict["start_datetime"] = start_datetime
+            save_dict["end_datetime"] = end_datetime
+            save_dict["datetime_format"] = datetime_format
+            save_dict["hour_resolution"] = hour_resolution
+            twml.util.write_file(files_list_path, save_dict, encode="json")
+
+        return files_list
+
+    @staticmethod
+    def build_files_list(
+        files_list_path,
+        data_dir,
+        start_datetime,
+        end_datetime,
+        datetime_format,
+        data_threads,
+        hour_resolution,
+        maybe_save,
+        overwrite,
+    ):
+        """
+        When specifying DAL datasets, only data_dir, start_datetime, and end_datetime
+        should be given with the format:
+
+        dal://{cluster}/{role}/{dataset_name}/{env}
+
+        """
+        if not data_dir or not is_dal_path(data_dir):
+            logging.warning(
+                "Please consider specifying a dal:// dataset rather than passing a physical HDFS path."
+            )
+            return DataRecordTrainer.build_hdfs_files_list(
+                files_list_path,
+                data_dir,
+                start_datetime,
+                end_datetime,
+                datetime_format,
+                data_threads,
+                hour_resolution,
+                maybe_save,
+                overwrite,
+            )
+
+        del datetime_format
+        del data_threads
+        del hour_resolution
+        del maybe_save
+        del overwrite
+
+        return dal_to_hdfs_path(
+            path=data_dir,
+            start_datetime=start_datetime,
+            end_datetime=end_datetime,
+        )
+
+    @property
+    def train_files(self):
+        return self._train_files
+
+    @property
+    def eval_files(self):
+        return self._eval_files
+
+    @staticmethod
+    def add_parser_arguments():
+        """
+        Add common commandline args to parse for the Trainer class.
+        Typically, the user calls this function and then parses cmd-line arguments
+        into an argparse.Namespace object which is then passed to the Trainer constructor
+        via the params argument.
+
+        See the `Trainer code <_modules/twml/trainers/trainer.html#Trainer.add_parser_arguments>`_
+        and `DataRecordTrainer code
+        <_modules/twml/trainers/trainer.html#DataRecordTrainer.add_parser_arguments>`_
+        for a list and description of all cmd-line arguments.
+
+        Args:
+            learning_rate_decay:
+                Defaults to False. When True, parses learning rate decay arguments.
+
+        Returns:
+            argparse.ArgumentParser instance with some useful args already added. 
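+
+        Example (a minimal sketch of the flow described above; ``my_build_graph``
+        is a hypothetical callable)::
+
+            parser = DataRecordTrainer.add_parser_arguments()
+            params = parser.parse_args()
+            trainer = DataRecordTrainer(
+                name="my_model", params=params, build_graph_fn=my_build_graph
+            )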
+ """ + parser = super(DataRecordTrainer, DataRecordTrainer).add_parser_arguments() + parser.add_argument( + "--train.files_list", + "--train_files_list", + type=str, + default=None, + dest="train_files_list", + help="Path for a json file storing information on training data.\n" + "Specifically, the file at files_list should contain the dataset parameters " + "for constructing the list of data files, and the list of data file paths.\n" + "If the json file does not exist, other args are used to construct the " + "training files list, and that list will be saved to the indicated json file.\n" + "If the json file does exist, and current args are consistent with " + "saved args, or are all None, then the saved files list will be used.\n" + "If current args are not consistent with the saved args, then error out " + "if train_overwrite_files_list==False, else overwrite files_list with " + "a newly constructed list.", + ) + parser.add_argument( + "--train.overwrite_files_list", + "--train_overwrite_files_list", + action="store_true", + default=False, + dest="train_overwrite_files_list", + help="When the --train.files_list param is used, indicates whether to " + "overwrite the existing --train.files_list when there are differences " + "between the current and saved dataset args. Default (False) is to " + "error out if files_list exists and differs from current params.", + ) + parser.add_argument( + "--train.data_dir", + "--train_data_dir", + type=str, + default=None, + dest="train_data_dir", + help="Path to the training data directory." + "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " + "and HDFS (hdfs://default/ ) paths.", + ) + parser.add_argument( + "--train.start_date", + "--train_start_datetime", + type=str, + default=None, + dest="train_start_datetime", + help="Starting date for training inside the train data dir." + "The start datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--train.end_date", + "--train_end_datetime", + type=str, + default=None, + dest="train_end_datetime", + help="Ending date for training inside the train data dir." + "The end datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--eval.files_list", + "--eval_files_list", + type=str, + default=None, + dest="eval_files_list", + help="Path for a json file storing information on evaluation data.\n" + "Specifically, the file at files_list should contain the dataset parameters " + "for constructing the list of data files, and the list of data file paths.\n" + "If the json file does not exist, other args are used to construct the " + "evaluation files list, and that list will be saved to the indicated json file.\n" + "If the json file does exist, and current args are consistent with " + "saved args, or are all None, then the saved files list will be used.\n" + "If current args are not consistent with the saved args, then error out " + "if eval_overwrite_files_list==False, else overwrite files_list with " + "a newly constructed list.", + ) + parser.add_argument( + "--eval.overwrite_files_list", + "--eval_overwrite_files_list", + action="store_true", + default=False, + dest="eval_overwrite_files_list", + help="When the --eval.files_list param is used, indicates whether to " + "overwrite the existing --eval.files_list when there are differences " + "between the current and saved dataset args. 
Default (False) is to " + "error out if files_list exists and differs from current params.", + ) + parser.add_argument( + "--eval.data_dir", + "--eval_data_dir", + type=str, + default=None, + dest="eval_data_dir", + help="Path to the cross-validation data directory." + "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " + "and HDFS (hdfs://default/ ) paths.", + ) + parser.add_argument( + "--eval.start_date", + "--eval_start_datetime", + type=str, + default=None, + dest="eval_start_datetime", + help="Starting date for evaluating inside the eval data dir." + "The start datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--eval.end_date", + "--eval_end_datetime", + type=str, + default=None, + dest="eval_end_datetime", + help="Ending date for evaluating inside the eval data dir." + "The end datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--datetime_format", + type=str, + default="%Y/%m/%d", + help="Date format for training and evaluation datasets." + "Has to be a format that is understood by python datetime." + "e.g. %%Y/%%m/%%d for 2019/01/15." + "Used only if {train/eval}.{start/end}_date are provided.", + ) + parser.add_argument( + "--hour_resolution", + type=int, + default=None, + help="Specify the hourly resolution of the stored data.", + ) + parser.add_argument( + "--data_spec", + type=str, + required=True, + help="Path to data specification JSON file. This file is used to decode DataRecords", + ) + parser.add_argument( + "--train.keep_rate", + "--train_keep_rate", + type=float, + default=None, + dest="train_keep_rate", + help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ + distribution with p = 1 - keep_rate.", + ) + parser.add_argument( + "--eval.keep_rate", + "--eval_keep_rate", + type=float, + default=None, + dest="eval_keep_rate", + help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ + distribution with p = 1 - keep_rate.", + ) + parser.add_argument( + "--train.parts_downsampling_rate", + "--train_parts_downsampling_rate", + dest="train_parts_downsampling_rate", + type=float, + default=None, + help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--eval.parts_downsampling_rate", "--eval_parts_downsampling_rate", - dest="eval_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ + dataset.", + ) + parser.add_argument( + "--eval.parts_downsampling_rate", + "--eval_parts_downsampling_rate", + dest="eval_parts_downsampling_rate", + type=float, + default=None, + help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--allow_train_eval_overlap", - dest="allow_train_eval_overlap", - action="store_true", - help="Allow overlap between train and eval datasets." - ) - parser.add_argument( - "--eval_name", type=str, default=None, - help="String denoting what we want to name the eval. If this is `train`, then we eval on \ - the training dataset." 
- ) - return parser - - def contrib_run_feature_importances(self, feature_importances_parse_fn=None, write_to_hdfs=True, extra_groups=None, datarecord_filter_fn=None, datarecord_filter_run_name=None): - """Compute feature importances on a trained model (this is a contrib feature) - Args: - feature_importances_parse_fn (fn): The same parse_fn that we use for training/evaluation. - Defaults to feature_config.get_parse_fn() - write_to_hdfs (bool): Setting this to True writes the feature importance metrics to HDFS - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - logging.info("Computing feature importance") - algorithm = self._params.feature_importance_algorithm - - kwargs = {} - if algorithm == TREE: - kwargs["split_feature_group_on_period"] = self._params.split_feature_group_on_period - kwargs["stopping_metric"] = self._params.feature_importance_metric - kwargs["sensitivity"] = self._params.feature_importance_sensitivity - kwargs["dont_build_tree"] = self._params.dont_build_tree - kwargs["extra_groups"] = extra_groups - if self._params.feature_importance_is_metric_larger_the_better: - # The user has specified that the stopping metric is one where larger values are better (e.g. ROC_AUC) - kwargs["is_metric_larger_the_better"] = True - elif self._params.feature_importance_is_metric_smaller_the_better: - # The user has specified that the stopping metric is one where smaller values are better (e.g. LOSS) - kwargs["is_metric_larger_the_better"] = False - else: - # The user has not specified which direction is better for the stopping metric - kwargs["is_metric_larger_the_better"] = None - logging.info("Using the tree algorithm with kwargs {}".format(kwargs)) - - feature_importances = compute_feature_importances( - trainer=self, - data_dir=self._params.get('feature_importance_data_dir'), - feature_config=self._feature_config, - algorithm=algorithm, - record_count=self._params.feature_importance_example_count, - parse_fn=feature_importances_parse_fn, - datarecord_filter_fn=datarecord_filter_fn, - **kwargs) - - if not feature_importances: - logging.info("Feature importances returned None") - else: - if write_to_hdfs: - logging.info("Writing feature importance to HDFS") - write_feature_importances_to_hdfs( - trainer=self, - feature_importances=feature_importances, - output_path=datarecord_filter_run_name, - metric=self._params.get('feature_importance_metric')) - else: - logging.info("Not writing feature importance to HDFS") - - logging.info("Writing feature importance to ML Metastore") - write_feature_importances_to_ml_dash( - trainer=self, feature_importances=feature_importances) - return feature_importances - - def export_model(self, serving_input_receiver_fn=None, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICT graph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Args: - serving_input_receiver_fn (Function): - function preparing the model for inference requests. 
- If not set; defaults to the the serving input receiver fn set by the FeatureConfig. - export_output_fn (Function): - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to ``twml.export_output_fns.batch_prediction_continuous_output_fn``. - export_dir: - directory to export a SavedModel for prediction servers. - Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory ``save_dir`` is chosen. - - Returns: - The export directory where the PREDICT graph is saved. - """ - if serving_input_receiver_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - serving_input_receiver_fn = self._feature_config.get_serving_input_receiver_fn() - - if feature_spec is None: - if self._feature_config is None: - raise ValueError("feature_spec can not be inferred." - "Please pass feature_spec=feature_config.get_feature_spec() to the trainer.export_model method") - else: - feature_spec = self._feature_config.get_feature_spec() - - if isinstance(serving_input_receiver_fn, twml.feature_config.FeatureConfig): - raise ValueError("Cannot pass FeatureConfig as a parameter to serving_input_receiver_fn") - elif not callable(serving_input_receiver_fn): - raise ValueError("Expecting Function for serving_input_receiver_fn") - - if export_output_fn is None: - export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - - return super(DataRecordTrainer, self).export_model( - export_dir=export_dir, - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path, - export_output_fn=export_output_fn, - feature_spec=feature_spec, - ) - - def get_train_input_fn( - self, parse_fn=None, repeat=None, shuffle=True, interleave=True, shuffle_files=None, - initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.train(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to the parser returned by the FeatureConfig selected - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.train_steps > 0`. - This ensures the training is run for atleast `params.train_steps`. - Toggling this to `False` results in training finishing when one of the following happens: - - The entire dataset has been trained upon once. - - `params.train_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `True`, files are shuffled, and records of each files are shuffled. - When `False`, files are read in alpha-numerical order. Also when `False` - the dataset is sharded among workers for Hogwild and distributed training - if no sharding configuration is provided in `params.train_dataset_shards`. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffle the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. 
- log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.train()`. - """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.train_steps > 0 or self.params.get('distributed', False) - - if not shuffle and self.num_workers > 1 and self.params.train_dataset_shards is None: - num_shards = self.num_workers - shard_index = self.worker_index - else: - num_shards = self.params.train_dataset_shards - shard_index = self.params.train_dataset_shard_index - - return lambda: twml.input_fns.default_input_fn( - files=self._train_files, - batch_size=self.params.train_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.train_keep_rate, - parts_downsampling_rate=self.params.train_parts_downsampling_rate, - shards=num_shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs) - - def get_eval_input_fn( - self, parse_fn=None, repeat=None, - shuffle=True, interleave=True, - shuffle_files=None, initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.eval(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to twml.parsers.get_sparse_parse_fn(feature_config). - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.eval_steps > 0`. - This ensures the evaluation is run for atleast `params.eval_steps`. - Toggling this to `False` results in evaluation finishing when one of the following happens: - - The entire dataset has been evaled upon once. - - `params.eval_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `False`, files are read in alpha-numerical order. - When `True`, files are shuffled, and records of each files are shuffled. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffles the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.eval()`. 
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not self._eval_files: - raise ValueError("`eval_files` was not present in `params` passed to `DataRecordTrainer`") - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.eval_steps > 0 - - return lambda: twml.input_fns.default_input_fn( - files=self._eval_files, - batch_size=self.params.eval_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.eval_keep_rate, - parts_downsampling_rate=self.params.eval_parts_downsampling_rate, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs - ) - - def _assert_train_files(self): - if not self._train_files: - raise ValueError("train.data_dir was not set in params passed to DataRecordTrainer.") - - def _assert_eval_files(self): - if not self._eval_files: - raise ValueError("eval.data_dir was not set in params passed to DataRecordTrainer.") - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Makes input functions optional. input_fn defaults to self.get_train_input_fn(). - See Trainer for more detailed documentation documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).train(input_fn=input_fn, steps=steps, hooks=hooks) - - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Makes input functions optional. input_fn defaults to self.get_eval_input_fn(). - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_eval_files() - input_fn = input_fn if input_fn else self.get_eval_input_fn(repeat=False) - return super(DataRecordTrainer, self).evaluate( - input_fn=input_fn, - steps=steps, - hooks=hooks, - name=name - ) - - def learn(self, train_input_fn=None, eval_input_fn=None, **kwargs): - """ - Overrides ``Trainer.learn`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.learn`` for more detailed documentation. - """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).learn( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def train_and_evaluate(self, - train_input_fn=None, eval_input_fn=None, - **kwargs): - """ - Overrides ``Trainer.train_and_evaluate`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.train_and_evaluate`` for detailed documentation. 
- """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).train_and_evaluate( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def _model_fn(self, features, labels, mode, params, config=None): - """ - Overrides the _model_fn to correct for the features shape of the sparse features - extracted with the contrib.FeatureConfig - """ - if isinstance(self._feature_config, twml.contrib.feature_config.FeatureConfig): - # Fix the shape of the features. The features dictionary will be modified to - # contain the shape changes. - twml.util.fix_shape_sparse(features, self._feature_config) - return super(DataRecordTrainer, self)._model_fn( - features=features, - labels=labels, - mode=mode, - params=params, - config=config - ) - - def calibrate(self, - calibrator, - input_fn=None, - steps=None, - save_calibrator=True, - hooks=None): - """ - Makes input functions optional. input_fn defaults to self.train_input_fn. - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=steps, - save_calibrator=save_calibrator, - hooks=hooks) - - def save_checkpoints_and_export_model(self, - serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, - checkpoint_path=None, - input_fn=None): - """ - Exports saved module after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See export_model for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.export_model(serving_input_receiver_fn, export_output_fn, export_dir, checkpoint_path) - - def save_checkpoints_and_evaluate(self, - input_fn=None, - steps=None, - hooks=None, - name=None): - """ - Evaluates model after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See evaluate for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.evaluate(input_fn, steps, hooks, name) + dataset.", + ) + parser.add_argument( + "--allow_train_eval_overlap", + dest="allow_train_eval_overlap", + action="store_true", + help="Allow overlap between train and eval datasets.", + ) + parser.add_argument( + "--eval_name", + type=str, + default=None, + help="String denoting what we want to name the eval. If this is `train`, then we eval on \ + the training dataset.", + ) + return parser + + def contrib_run_feature_importances( + self, + feature_importances_parse_fn=None, + write_to_hdfs=True, + extra_groups=None, + datarecord_filter_fn=None, + datarecord_filter_run_name=None, + ): + """Compute feature importances on a trained model (this is a contrib feature) + Args: + feature_importances_parse_fn (fn): The same parse_fn that we use for training/evaluation. 
+                Defaults to feature_config.get_parse_fn()
+            write_to_hdfs (bool): Setting this to True writes the feature importance metrics to HDFS
+            extra_groups (dict<string, list<string>>): A dictionary mapping the name of extra feature groups to the list of
+                the names of the features in the group
+            datarecord_filter_fn (function): a function that takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format
+                and returns a boolean value, indicating whether this data record should be kept in the feature importance module or not.
+        """
+        logging.info("Computing feature importance")
+        algorithm = self._params.feature_importance_algorithm
+
+        kwargs = {}
+        if algorithm == TREE:
+            kwargs[
+                "split_feature_group_on_period"
+            ] = self._params.split_feature_group_on_period
+            kwargs["stopping_metric"] = self._params.feature_importance_metric
+            kwargs["sensitivity"] = self._params.feature_importance_sensitivity
+            kwargs["dont_build_tree"] = self._params.dont_build_tree
+            kwargs["extra_groups"] = extra_groups
+            if self._params.feature_importance_is_metric_larger_the_better:
+                # The user has specified that the stopping metric is one where larger values are better (e.g. ROC_AUC)
+                kwargs["is_metric_larger_the_better"] = True
+            elif self._params.feature_importance_is_metric_smaller_the_better:
+                # The user has specified that the stopping metric is one where smaller values are better (e.g. LOSS)
+                kwargs["is_metric_larger_the_better"] = False
+            else:
+                # The user has not specified which direction is better for the stopping metric
+                kwargs["is_metric_larger_the_better"] = None
+            logging.info("Using the tree algorithm with kwargs {}".format(kwargs))
+
+        feature_importances = compute_feature_importances(
+            trainer=self,
+            data_dir=self._params.get("feature_importance_data_dir"),
+            feature_config=self._feature_config,
+            algorithm=algorithm,
+            record_count=self._params.feature_importance_example_count,
+            parse_fn=feature_importances_parse_fn,
+            datarecord_filter_fn=datarecord_filter_fn,
+            **kwargs,
+        )
+
+        if not feature_importances:
+            logging.info("Feature importances returned None")
+        else:
+            if write_to_hdfs:
+                logging.info("Writing feature importance to HDFS")
+                write_feature_importances_to_hdfs(
+                    trainer=self,
+                    feature_importances=feature_importances,
+                    output_path=datarecord_filter_run_name,
+                    metric=self._params.get("feature_importance_metric"),
+                )
+            else:
+                logging.info("Not writing feature importance to HDFS")
+
+            logging.info("Writing feature importance to ML Metastore")
+            write_feature_importances_to_ml_dash(
+                trainer=self, feature_importances=feature_importances
+            )
+        return feature_importances
+
+    def export_model(
+        self,
+        serving_input_receiver_fn=None,
+        export_output_fn=None,
+        export_dir=None,
+        checkpoint_path=None,
+        feature_spec=None,
+    ):
+        """
+        Export the model for prediction. Typically, the exported model
+        will later be run in production servers. This method is called
+        by the user to export the PREDICT graph to disk.
+
+        Internally, this method calls `tf.estimator.Estimator.export_savedmodel
+        `_.
+
+        Args:
+            serving_input_receiver_fn (Function):
+                function preparing the model for inference requests.
+                If not set, defaults to the serving input receiver fn set by the FeatureConfig.
+            export_output_fn (Function):
+                Function to export the graph_output (output of build_graph) for
+                prediction. Takes a graph_output dict as sole argument and returns
+                the export_output_fns dict.
+                Defaults to ``twml.export_output_fns.batch_prediction_continuous_output_fn``. 
+            export_dir:
+                directory to export a SavedModel for prediction servers.
+                Defaults to ``[save_dir]/exported_models``.
+            checkpoint_path:
+                the checkpoint path to export. If None (the default), the most recent checkpoint
+                found within the model directory ``save_dir`` is chosen.
+
+        Returns:
+            The export directory where the PREDICT graph is saved.
+        """
+        if serving_input_receiver_fn is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "`feature_config` was not passed to `DataRecordTrainer`"
+                )
+            serving_input_receiver_fn = (
+                self._feature_config.get_serving_input_receiver_fn()
+            )
+
+        if feature_spec is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "feature_spec can not be inferred."
+                    "Please pass feature_spec=feature_config.get_feature_spec() to the trainer.export_model method"
+                )
+            else:
+                feature_spec = self._feature_config.get_feature_spec()
+
+        if isinstance(serving_input_receiver_fn, twml.feature_config.FeatureConfig):
+            raise ValueError(
+                "Cannot pass FeatureConfig as a parameter to serving_input_receiver_fn"
+            )
+        elif not callable(serving_input_receiver_fn):
+            raise ValueError("Expecting Function for serving_input_receiver_fn")
+
+        if export_output_fn is None:
+            export_output_fn = (
+                twml.export_output_fns.batch_prediction_continuous_output_fn
+            )
+
+        return super(DataRecordTrainer, self).export_model(
+            export_dir=export_dir,
+            serving_input_receiver_fn=serving_input_receiver_fn,
+            checkpoint_path=checkpoint_path,
+            export_output_fn=export_output_fn,
+            feature_spec=feature_spec,
+        )
+
+    def get_train_input_fn(
+        self,
+        parse_fn=None,
+        repeat=None,
+        shuffle=True,
+        interleave=True,
+        shuffle_files=None,
+        initializable=False,
+        log_tf_data_summaries=False,
+        **kwargs,
+    ):
+        """
+        This method is used to create the input function used by estimator.train().
+
+        Args:
+            parse_fn:
+                Function to parse a data record into a set of features.
+                Defaults to the parser returned by the FeatureConfig selected.
+            repeat (optional):
+                Specifies if the dataset is to be repeated. Defaults to `params.train_steps > 0`.
+                This ensures the training is run for at least `params.train_steps`.
+                Toggling this to `False` results in training finishing when one of the following happens:
+                - The entire dataset has been trained upon once.
+                - `params.train_steps` has been reached.
+            shuffle (optional):
+                Specifies if the files and records in the files need to be shuffled.
+                When `True`, files are shuffled, and records of each file are shuffled.
+                When `False`, files are read in alpha-numerical order. Also when `False`
+                the dataset is sharded among workers for Hogwild and distributed training
+                if no sharding configuration is provided in `params.train_dataset_shards`.
+                Defaults to `True`.
+            interleave (optional):
+                Specifies if records from multiple files need to be interleaved in parallel.
+                Defaults to `True`.
+            shuffle_files (optional):
+                Shuffle the list of files. Defaults to the value of `shuffle` if not provided.
+            initializable (optional):
+                A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or
+                a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value
+                (false) is used for most plain iterators.
+            log_tf_data_summaries (optional):
+                A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the
+                tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output
+                events files. This requires that `initializable` is `True` above. 
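+
+        Example (illustrative; ``trainer`` and ``params`` are assumed to exist)::
+
+            train_input_fn = trainer.get_train_input_fn(shuffle=True)
+            trainer.train(input_fn=train_input_fn, steps=params.train_steps)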
+
+        Returns:
+            An input_fn that can be consumed by `estimator.train()`.
+        """
+        if parse_fn is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "`feature_config` was not passed to `DataRecordTrainer`"
+                )
+            parse_fn = self._feature_config.get_parse_fn()
+
+        if not callable(parse_fn):
+            raise ValueError("Expecting parse_fn to be a function.")
+
+        if log_tf_data_summaries and not initializable:
+            raise ValueError("Require `initializable` if `log_tf_data_summaries`.")
+
+        if repeat is None:
+            repeat = self.params.train_steps > 0 or self.params.get(
+                "distributed", False
+            )
+
+        if (
+            not shuffle
+            and self.num_workers > 1
+            and self.params.train_dataset_shards is None
+        ):
+            num_shards = self.num_workers
+            shard_index = self.worker_index
+        else:
+            num_shards = self.params.train_dataset_shards
+            shard_index = self.params.train_dataset_shard_index
+
+        return lambda: twml.input_fns.default_input_fn(
+            files=self._train_files,
+            batch_size=self.params.train_batch_size,
+            parse_fn=parse_fn,
+            num_threads=self.params.num_threads,
+            repeat=repeat,
+            keep_rate=self.params.train_keep_rate,
+            parts_downsampling_rate=self.params.train_parts_downsampling_rate,
+            shards=num_shards,
+            shard_index=shard_index,
+            shuffle=shuffle,
+            shuffle_files=(shuffle if shuffle_files is None else shuffle_files),
+            interleave=interleave,
+            initializable=initializable,
+            log_tf_data_summaries=log_tf_data_summaries,
+            **kwargs,
+        )
+
+    def get_eval_input_fn(
+        self,
+        parse_fn=None,
+        repeat=None,
+        shuffle=True,
+        interleave=True,
+        shuffle_files=None,
+        initializable=False,
+        log_tf_data_summaries=False,
+        **kwargs,
+    ):
+        """
+        This method is used to create the input function used by estimator.eval().
+
+        Args:
+            parse_fn:
+                Function to parse a data record into a set of features.
+                Defaults to twml.parsers.get_sparse_parse_fn(feature_config).
+            repeat (optional):
+                Specifies if the dataset is to be repeated. Defaults to `params.eval_steps > 0`.
+                This ensures the evaluation is run for at least `params.eval_steps`.
+                Toggling this to `False` results in evaluation finishing when one of the following happens:
+                - The entire dataset has been evaluated once.
+                - `params.eval_steps` has been reached.
+            shuffle (optional):
+                Specifies if the files and records in the files need to be shuffled.
+                When `False`, files are read in alpha-numerical order.
+                When `True`, files are shuffled, and records of each file are shuffled.
+                Defaults to `True`.
+            interleave (optional):
+                Specifies if records from multiple files need to be interleaved in parallel.
+                Defaults to `True`.
+            shuffle_files (optional):
+                Shuffles the list of files. Defaults to the value of `shuffle` if not provided.
+            initializable (optional):
+                A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or
+                a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value
+                (false) is used for most plain iterators.
+            log_tf_data_summaries (optional):
+                A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the
+                tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output
+                events files. This requires that `initializable` is `True` above.
+
+        Returns:
+            An input_fn that can be consumed by `estimator.eval()`. 
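+
+        Example (a minimal sketch; ``trainer`` is assumed to be a constructed
+        ``DataRecordTrainer`` with eval data params set)::
+
+            eval_input_fn = trainer.get_eval_input_fn(repeat=False)
+            trainer.evaluate(input_fn=eval_input_fn)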
+ """ + if parse_fn is None: + if self._feature_config is None: + raise ValueError( + "`feature_config` was not passed to `DataRecordTrainer`" + ) + parse_fn = self._feature_config.get_parse_fn() + + if not self._eval_files: + raise ValueError( + "`eval_files` was not present in `params` passed to `DataRecordTrainer`" + ) + + if not callable(parse_fn): + raise ValueError("Expecting parse_fn to be a function.") + + if log_tf_data_summaries and not initializable: + raise ValueError("Require `initializable` if `log_tf_data_summaries`.") + + if repeat is None: + repeat = self.params.eval_steps > 0 + + return lambda: twml.input_fns.default_input_fn( + files=self._eval_files, + batch_size=self.params.eval_batch_size, + parse_fn=parse_fn, + num_threads=self.params.num_threads, + repeat=repeat, + keep_rate=self.params.eval_keep_rate, + parts_downsampling_rate=self.params.eval_parts_downsampling_rate, + shuffle=shuffle, + shuffle_files=(shuffle if shuffle_files is None else shuffle_files), + interleave=interleave, + initializable=initializable, + log_tf_data_summaries=log_tf_data_summaries, + **kwargs, + ) + + def _assert_train_files(self): + if not self._train_files: + raise ValueError( + "train.data_dir was not set in params passed to DataRecordTrainer." + ) + + def _assert_eval_files(self): + if not self._eval_files: + raise ValueError( + "eval.data_dir was not set in params passed to DataRecordTrainer." + ) + + def train(self, input_fn=None, steps=None, hooks=None): + """ + Makes input functions optional. input_fn defaults to self.get_train_input_fn(). + See Trainer for more detailed documentation documentation. + """ + if input_fn is None: + self._assert_train_files() + input_fn = input_fn if input_fn else self.get_train_input_fn() + super(DataRecordTrainer, self).train( + input_fn=input_fn, steps=steps, hooks=hooks + ) + + def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): + """ + Makes input functions optional. input_fn defaults to self.get_eval_input_fn(). + See Trainer for more detailed documentation. + """ + if input_fn is None: + self._assert_eval_files() + input_fn = input_fn if input_fn else self.get_eval_input_fn(repeat=False) + return super(DataRecordTrainer, self).evaluate( + input_fn=input_fn, steps=steps, hooks=hooks, name=name + ) + + def learn(self, train_input_fn=None, eval_input_fn=None, **kwargs): + """ + Overrides ``Trainer.learn`` to make ``input_fn`` functions optional. + Respectively, ``train_input_fn`` and ``eval_input_fn`` default to + ``self.train_input_fn`` and ``self.eval_input_fn``. + See ``Trainer.learn`` for more detailed documentation. + """ + if train_input_fn is None: + self._assert_train_files() + if eval_input_fn is None: + self._assert_eval_files() + train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() + eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() + + super(DataRecordTrainer, self).learn( + train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, **kwargs + ) + + def train_and_evaluate(self, train_input_fn=None, eval_input_fn=None, **kwargs): + """ + Overrides ``Trainer.train_and_evaluate`` to make ``input_fn`` functions optional. + Respectively, ``train_input_fn`` and ``eval_input_fn`` default to + ``self.train_input_fn`` and ``self.eval_input_fn``. + See ``Trainer.train_and_evaluate`` for detailed documentation. 
+        """
+        if train_input_fn is None:
+            self._assert_train_files()
+        if eval_input_fn is None:
+            self._assert_eval_files()
+        train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn()
+        eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn()
+
+        super(DataRecordTrainer, self).train_and_evaluate(
+            train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, **kwargs
+        )
+
+    def _model_fn(self, features, labels, mode, params, config=None):
+        """
+        Overrides _model_fn to correct the shape of the sparse features
+        extracted with the contrib.FeatureConfig.
+        """
+        if isinstance(self._feature_config, twml.contrib.feature_config.FeatureConfig):
+            # Fix the shape of the features. The features dictionary will be modified to
+            # contain the shape changes.
+            twml.util.fix_shape_sparse(features, self._feature_config)
+        return super(DataRecordTrainer, self)._model_fn(
+            features=features, labels=labels, mode=mode, params=params, config=config
+        )
+
+    def calibrate(
+        self, calibrator, input_fn=None, steps=None, save_calibrator=True, hooks=None
+    ):
+        """
+        Makes input functions optional. input_fn defaults to self.get_train_input_fn().
+        See Trainer for more detailed documentation.
+        """
+        if input_fn is None:
+            self._assert_train_files()
+        input_fn = input_fn if input_fn else self.get_train_input_fn()
+        super(DataRecordTrainer, self).calibrate(
+            calibrator=calibrator,
+            input_fn=input_fn,
+            steps=steps,
+            save_calibrator=save_calibrator,
+            hooks=hooks,
+        )
+
+    def save_checkpoints_and_export_model(
+        self,
+        serving_input_receiver_fn,
+        export_output_fn=None,
+        export_dir=None,
+        checkpoint_path=None,
+        input_fn=None,
+    ):
+        """
+        Exports the SavedModel after saving a checkpoint to save_dir.
+        Please note that to use this method, you need to assign a loss to the output
+        of the build_graph (for the train mode).
+        See export_model for more detailed information.
+        """
+        self.train(input_fn=input_fn, steps=1)
+        self.export_model(
+            serving_input_receiver_fn, export_output_fn, export_dir, checkpoint_path
+        )
+
+    def save_checkpoints_and_evaluate(
+        self, input_fn=None, steps=None, hooks=None, name=None
+    ):
+        """
+        Evaluates the model after saving a checkpoint to save_dir.
+        Please note that to use this method, you need to assign a loss to the output
+        of the build_graph (for the train mode).
+        See evaluate for more detailed information.
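+
+        Example (a minimal sketch; when ``input_fn`` is left as None, the
+        training input function is used for the single checkpointing step
+        and the evaluation input function for the evaluation itself):
+
+        .. code-block:: python
+
+            trainer.save_checkpoints_and_evaluate(steps=trainer.params.eval_steps)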
+ """ + self.train(input_fn=input_fn, steps=1) + self.evaluate(input_fn, steps, hooks, name) diff --git a/twml/twml/trainers/trainer.py b/twml/twml/trainers/trainer.py index e51b4e0fd..7e96d0d0b 100644 --- a/twml/twml/trainers/trainer.py +++ b/twml/twml/trainers/trainer.py @@ -69,46 +69,53 @@ import datetime import functools import math -from operator import itemgetter import os import pprint as pp import random -from string import Template import subprocess import sys import time +from operator import itemgetter +from string import Template from threading import Thread +from absl import logging from twitter.common.metrics import AtomicGauge from twitter.deepbird.stats_server import utils as stats_server_utils from twitter.deepbird.stats_server.stats_exporter import StatsExporter from twitter.ml.common import metrics -from twitter.ml.common.kubernetes import kubectl_delete_by_name, Resource -from twitter.ml.twml.status import get_distributed_training_job_status, TrainingJobStatus +from twitter.ml.common.kubernetes import Resource, kubectl_delete_by_name +from twitter.ml.twml.status import ( + TrainingJobStatus, + get_distributed_training_job_status, +) -from absl import logging -from twml.optimizers import LazyAdamOptimizer, optimize_loss, OPTIMIZER_SUMMARIES from twml.contrib.optimizers import DeepGradientCompressionOptimizer +from twml.optimizers import OPTIMIZER_SUMMARIES, LazyAdamOptimizer, optimize_loss from twml.tracking import ExperimentTracker -from twml.util import (delete_file_or_dir, - get_distributed_training_job_path, - sanitize_hdfs_path) +from twml.util import ( + delete_file_or_dir, + get_distributed_training_job_path, + sanitize_hdfs_path, +) + try: - from urllib import quote as encode_url + from urllib import quote as encode_url except ImportError: - from urllib.parse import quote as encode_url -import tensorflow.compat.v1 as tf + from urllib.parse import quote as encode_url + import tensorflow +import tensorflow.compat.v1 as tf import tensorflow_hub as hub - import twitter.ml.twml.kubernetes.status as k8s_status + import twml import twml.export_output_fns import twml.learning_rate_decay import twml.metrics - -_CLUSTER_TEMPLATE = Template('''{ +_CLUSTER_TEMPLATE = Template( + """{ "cluster": { "ps": [$PS], "chief": [$CHIEF], @@ -116,1662 +123,1836 @@ }, "task": {"type": "$TYPE", "index": $INDEX} } -''') +""" +) def init_from_checkpoint(init_dir, init_map): - """ - Wrapper around tf.train.init_from_checkpoint - """ - if init_dir: - init_dir = sanitize_hdfs_path(init_dir) - tf.train.init_from_checkpoint(init_dir, init_map) + """ + Wrapper around tf.train.init_from_checkpoint + """ + if init_dir: + init_dir = sanitize_hdfs_path(init_dir) + tf.train.init_from_checkpoint(init_dir, init_map) class Trainer(object): - """ - This class wraps ``tf.estimator.Estimator`` to make construction, saving, and loading easier. - Supports multi-phase training (for example, use a Trainer for MDL calibration, then - another for training the rest of the model, then another for isotonic calibration). - The Trainer also implements a training and evaluation loop via the ``learn()`` method. - Each Trainer is associated to a fixed set of hyper parameters (params), and a single model - specified by ``build_graph``. Given these constraints, a single Trainer can be called - multiple times for training and evaluation over multiple epochs. - - However, if you intend to try different sets of hyper-parameters, we recommend you instantiate - a different Trainer for each such experiment. 
That way, each experiment can be tracked
-  in a different ``save_dir``. Indeed, after calling ``learn``, a Trainer's save_dir will contain
-  checkpoints of the model (its graph, and variables), and the history of metrics (for example,
-  evaluation accuracy at each epoch), and other store observations like the average time per step.
-  The latter metrics can be viewed by pointing
-  TensorBoard to the save_dir and accessing TensorBoard via your browser.
-  """
-
-  def __init__(self, name, params, build_graph_fn,
-               metric_fn=None,
-               optimize_loss_fn=None,
-               run_config=None,
-               save_dir=None,
-               init_from_dir=None,
-               init_map=None,
-               warm_start_from=None,
-               profiler_steps=None,
-               **kwargs):
+    """
+    This class wraps ``tf.estimator.Estimator`` to make construction, saving, and loading easier.
+    Supports multi-phase training (for example, use a Trainer for MDL calibration, then
+    another for training the rest of the model, then another for isotonic calibration).
+    The Trainer also implements a training and evaluation loop via the ``learn()`` method.
+    Each Trainer is associated with a fixed set of hyper parameters (params), and a single model
+    specified by ``build_graph``. Given these constraints, a single Trainer can be called
+    multiple times for training and evaluation over multiple epochs.
+
+    However, if you intend to try different sets of hyper-parameters, we recommend you instantiate
+    a different Trainer for each such experiment. That way, each experiment can be tracked
+    in a different ``save_dir``. Indeed, after calling ``learn``, a Trainer's save_dir will contain
+    checkpoints of the model (its graph, and variables), and the history of metrics (for example,
+    evaluation accuracy at each epoch), and other stored observations like the average time per step.
+    The latter metrics can be viewed by pointing
+    TensorBoard to the save_dir and accessing TensorBoard via your browser.
     """

-    Args:
-      name (String):
-        string name of this estimator; used as scope names for variables and tensors.
-      params (HParams, Namespace, or Dict):
-        hyper-parameters to be passed to Estimator constructor.
-        Must include params.train_batch_size and params.eval_batch_size.
-        Note that params is passed to twml.util.convert_to_hparams() to produce an HParams.
-      build_graph_fn:
-        A function for building tensorflow graphs.
-        This matches TensorFlow Estimator's model_fn signature.
-        For example,
-
-        .. code-block:: python
+    def __init__(
+        self,
+        name,
+        params,
+        build_graph_fn,
+        metric_fn=None,
+        optimize_loss_fn=None,
+        run_config=None,
+        save_dir=None,
+        init_from_dir=None,
+        init_map=None,
+        warm_start_from=None,
+        profiler_steps=None,
+        **kwargs,
+    ):
+        """
-          def build_graph(features, label, mode, params, config=None):
-            # Implements a simple binary logistic regression model
-            sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits)
+
+        Args:
+            name (String):
+                string name of this estimator; used as scope names for variables and tensors.
+            params (HParams, Namespace, or Dict):
+                hyper-parameters to be passed to Estimator constructor.
+                Must include params.train_batch_size and params.eval_batch_size.
+                Note that params is passed to twml.util.convert_to_hparams() to produce an HParams.
+            build_graph_fn:
+                A function for building tensorflow graphs.
+                This matches TensorFlow Estimator's model_fn signature.
+                For example,
+
+                .. code-block:: python
+
+                    def build_graph(features, label, mode, params, config=None):
+                        # Implements a simple binary logistic regression model
+                        sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits)
+
+                        logits = twml.layers.full_sparse(sparse_tf, 1 << params.input_size_bits, 1)
+
+                        if mode == 'infer':
+                            loss = None
+                        else:
+                            loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits)
+                            loss = twml.util.weighted_average(loss, features['weights'])
+
+                        output = tf.nn.sigmoid(logits)
+
+                        return {'output': output, 'loss': loss}
+
+                Args:
+                    features (dict of Tensor keyed by a string name):
+                        input tensors.
+                    mode (tf.estimator.ModeKeys / String):
+                        one of 'train', 'eval', 'infer'.
+                    label (Tensor):
+                        if in ``mode == 'train'`` mode, these contain the corresponding labels for input.
+                    params (HParams):
+                        hyper parameters that control how to build a graph.
+                    config:
+                        the RunConfig object passed to Estimator constructor.
+
+                This function is expected to return a dictionary containing the following keys:
+
+                * 'output': a node representing model output; required.
+                * 'loss': a loss node used for optimization; required for training and
+                  evaluation.
+                * 'train_op': (optional) an operation that minimizes the loss (as output by
+                  `tf.train.Optimizer.minimize`). If train_op is specified, train_op is used
+                  for optimization as opposed to loss. Loss is always logged to tensorboard.
+
+                Notes:
+
+                * any tf.summary ops written inside the build graph are logged to tensorboard during training.
+                * the ``build_graph_fn`` is called once or twice per epoch (once per training,
+                  once per evaluation). All data loading (and preprocessing) logic not required
+                  for serving should be in the ``input_fn`` passed to ``learn``, ``train``,
+                  ``evaluate``, etc.
+
+            optimize_loss_fn:
+                Defaults to Trainer.get_train_op. A function that takes params and loss as arguments
+                and returns a training op. The training op is used to update parameters (that is, to learn).
+            metric_fn:
+                A function that returns the eval_metric_ops dict given graph_output, labels and weights.
+                Defaults to None.
+                Use ``twml.metrics.get_binary_class_metric_fn()`` to return a ``metric_fn``
+                which implements many binary classification metrics.
+            run_config (RunConfig):
+                optional configuration to be passed to Estimator constructor. Defaults to None.
+            save_dir (String):
+                optional directory in which to save model checkpoints,
+                tensorboard event files and trained parameters.
+                Overwrites and defaults to run_config.model_dir.
+            init_from_dir (String):
+                optional directory to load weights from.
+                if set to None (the default), do not init from any directory.
+            init_map (map from String to String):
+                Must be specified if init_from_dir is specified.
+                Defines which scopes and variables to load.
+                Keys are the variables and scopes to load from the directory.
+                Values are the destinations (in the current graph) to load into.
+                See tf.init_from_checkpoint for more information.
+                Note that the trainer prepends name_scope of the form `name`/model/ to the name_scope
+                of any variable defined inside `build_graph_fn` and this should be taken into account when
+                defining the values.
+            warm_start_from:
+                Optional string filepath to a checkpoint to warm-start from,
+                or a tf.estimator.WarmStartSettings object to fully configure warm-starting.
+                If the string filepath is provided instead of a WarmStartSettings,
+                then all variables are warm-started, and it is assumed that
+                vocabularies and Tensor names are unchanged.
+            profiler_steps (Integer):
+                Defaults to None. If set, defines the number of steps in the
+                `tf.train.ProfileHook `_.
+                Captures CPU/GPU profiling information every ``profiler_steps`` steps or seconds.
+                When executing ``learn``, ``train`` or ``predict`` methods,
+                with ``profiler_steps`` set to a number,
+                a ``timeline_X.json`` file is created in the save_dir. This file contains profiling data
+                stored in Chrome trace format. To view stored data, use the Chrome browser to follow
+                these steps:
+
+                1) Go to the page chrome://tracing.
+                2) In the upper left corner, you will find the Load button.
+                3) Press it and load our JSON file, which can be found in the ``save_dir``
+
+                *Warning*: This could create too many of these JSON files, which can be a potential problem,
+                e.g. for HDFS there is normally a quota for file count, so use with caution.
+
+                Note: this argument is ignored when a non-None ``hooks`` argument is passed to
+                ``train``, ``learn``, or ``predict`` methods. The hook can be added manually by passing
+                ``trainer.train(..., hooks=myhooks.extend(trainer.get_train_hooks()))``, for example.
+        """
+
+        if tensorflow.__version__ >= "2.0":
+            raise RuntimeError("Trainer not yet supported for Tensorflow >= 2.0")
+
+        self._name = name
+        self._build_graph_fn = build_graph_fn
+        self._metric_fn = metric_fn
+        self._tensorboard_handle = None
+        self._current_estimator_spec = None  # holds the current estimator spec
+        self._profiler_steps = profiler_steps
+        self._export_output_fn = None
+        self._is_early_stopping = False
+
+        # NOTE: Sanitize all HDFS paths first.
+        save_dir = sanitize_hdfs_path(save_dir)
+        init_from_dir = sanitize_hdfs_path(init_from_dir)
+
+        # warm_start_from can be of type tf.estimator.WarmStartSettings.
+        if isinstance(warm_start_from, str):
+            warm_start_from = sanitize_hdfs_path(warm_start_from)
+
+        # convert to twitter.deepbird.hparam.hparam.HParams object
+        params = twml.util.convert_to_hparams(params)
+
+        # keep a copy of the params because calling self._estimator.params creates a deepcopy
+        self._params = params
+        self.check_params()
+
+        self._using_hogwild = True if os.environ.get("TWML_HOGWILD_PORTS") else False
+        # configure Hogwild (needs to be called before RunConfig is created)
+        self._hogwild_setup()
+
+        if not run_config:
+            session_config = tf.ConfigProto()
+            # By default each process tries to allocate (almost) all of the memory.
+            # This option ensures the gpu memory grows dynamically instead.
+            session_config.gpu_options.allow_growth = True  # pylint: disable=no-member
+
+            if "TWML_NUM_CPUS" in os.environ:
+                num_available_cpus = int(os.environ.get("TWML_MESOS_CPU", "8"))
+                if params.num_mkl_threads > 1:
+                    os.environ["OMP_NUM_THREADS"] = str(params.num_mkl_threads)
+                    os.environ["MKL_NUM_THREADS"] = str(params.num_mkl_threads)
+                    session_config.inter_op_parallelism_threads = (
+                        num_available_cpus // params.num_mkl_threads
+                    )
+                    session_config.intra_op_parallelism_threads = params.num_mkl_threads
+
+            run_config = tf.estimator.RunConfig(
+                session_config=session_config,
+                keep_checkpoint_max=self._params.get("keep_checkpoint_max", 20),
+                log_step_count_steps=10000,
+                save_checkpoints_secs=self._params.get("save_checkpoints_secs", 600),
+                tf_random_seed=self._tf_random_seed(),
+            )
+        elif not isinstance(run_config, tf.estimator.RunConfig):
+            raise ValueError(
+                "Expecting run_config argument of type None or tf.estimator.RunConfig. "
+                "Got %s instead." % type(run_config).__name__
+            )
+        elif os.environ.get("TWML_HOGWILD_PORTS"):
+            raise ValueError("Custom RunConfig not supported with Hogwild")
+
+        if run_config.model_dir is None and save_dir is None:
+            raise ValueError(
+                "Expecting either save_dir or run_config.model_dir to be specified. Got None for each."
+            )
+        elif run_config.model_dir is None:
+            run_config = run_config.replace(model_dir=save_dir)
+        elif save_dir is None:
+            save_dir = run_config.model_dir
+
+        self._save_dir = save_dir
+        self.experiment_tracker = ExperimentTracker(
+            self._params, run_config, self._save_dir
+        )

-            logits = twml.layers.full_sparse(sparse_tf, 1 << params.input_size_bits, 1)
+        # Check if we should delete the TSD running this training job. In certain use cases where
+        # there are other tf operations following trainer.train_and_evaluate (or trainer.learn),
+        # additional state files need to be specified to ensure those steps are executed after job restart.
+        kwargs["gke_state_files"] = kwargs.get("gke_state_files", ["_SUCCESS"])
+        self._maybe_del_tsd_exit(kwargs["gke_state_files"])
+        logging.info(
+            "Checkpoint and event files will be saved at save_dir=%s", save_dir
+        )
+        self._optimize_loss_fn = (
+            self.get_train_op if optimize_loss_fn is None else optimize_loss_fn
+        )

-            if mode == 'infer':
-              loss = None
+        # overwrite the current save_dir
+        if self._params.get("overwrite_save_dir") and tf.io.gfile.exists(
+            self._save_dir
+        ):
+            logging.info(
+                "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)"
+                % self._save_dir
+            )
+            # if distributed or hogwild:
+            if self._params.get("distributed", False):
+                # sleep for 30 seconds to allow each worker to get to this point.
+                time.sleep(30)
+                if run_config.is_chief:
+                    logging.info("Chief deleting the save_dir now")
+                    delete_file_or_dir(self._save_dir)
+                # sleep for 30 seconds to allow each worker to get to this point.
+                time.sleep(30)
            else:
-              loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits)
-              loss = twml.util.weighted_average(loss, features['weights'])
-
-            output = tf.nn.sigmoid(logits)
-
-            return {'output': output, 'loss': loss}
+                delete_file_or_dir(self._save_dir)
+
+        # Exposing stats to a /vars.json endpoint that will be collected
+        # by the absorber
+        if self._params.get("stats_port"):
+            try:
+                stats_server_utils.start_stats_server(
+                    self._params.get("stats_port"), self._save_dir
+                )
+            except Exception as err:
+                logging.error("Failed to start the stats server. Error: %s", str(err))
+
+        checkpoint = os.path.join(self._save_dir, "checkpoint")
+        if tf.io.gfile.exists(checkpoint):
+            logging.info(
+                "The provided save_dir directory %s already exists."
+                " Training will be resumed." % checkpoint
+            )
+
+        self._maybe_restore_checkpoint = lambda: init_from_checkpoint(
+            init_from_dir, init_map
+        )

-        Args:
-          features (dict of Tensor keyed by a string name):
-            input tensors.
-          mode (tf.estimator.ModeKeys / String):
-            one of 'train', 'eval', 'infer'.
-          label (Tensor):
-            if in ``mode == 'train'`` mode, these contain the corresponding labels for input.
-          params (HParams):
-            hyper parameters that control how to build a graph.
-          config:
-            the RunConfig object passed to Estimator constructor.
-
-        This function is expected to return a dictionary containing the following keys:
-
-        * 'output': a node representing model output; required.
-        * 'loss': (required) a loss node used for optimization; required for training and
-          evaluation.
- * 'train_op': (optional) an operation that minimizes the loss (as output by - `tf.train.Optimizer.minimize`). If train_op is specified, train_op is used - for optimization as opposed to loss. Loss is always logged to tensorboard. - - Notes: - - * any tf.summary written inside build graph are logged to tensorboard during training. - * the ``build_graph_fn`` is called once or twice per epoch (once per training, - once per evaluation). All data loading (and preprocessing) logic not required - for serving should be in the ``input_fn`` passed to ``learn``, ``train``, - ``evalulate``, etc. - - optimize_loss_fn: - Defaults to Trainer.get_train_op. A function that takes params and loss as arguments - and returns a training op. The training op is used to update parameters (that is, to learn). - metric_fn: - A function that returns the eval_metric_ops dict given graph_output, labels and weights. - Defaults to None. - Use ``twml.metrics.get_binary_class_metric_fn()`` to return a ``metric_fn`` - which implements many binary classification metrics. - run_config (RunConfig): - optional configuration to be passed to Estimator constructor. Defaults to None. - save_dir (String): - optional directory where to save model checkpoints, - tensorboard event files and trained parameters. - Overwrites and defaults to run_config.model_dir. - init_from_dir (String): - optional directory to load weights from. - if set to None (the default), do not init from any directory. - init_map (map from String to String): - Must be specified if init_from_dir is specified. - Defines which scopes and variables to load. - Keys are the variables and scopes to load from the directory. - Values are the destinations (in the current graph) to load into. - See tf.init_from_checkpoint for more information. - Note that the the trainer prepends name_scope of the form `name`/model/ to the name_scope - of any variable defined inside `build_graph_fn` and this should be taken into account when - defining the values. - warm_start_from: - Optional string filepath to a checkpoint to warm-start from, - or a tf.estimator.WarmStartSettings object to fully configure warm-starting. - If the string filepath is provided instead of a WarmStartSettings, - then all variables are warm-started, and it is assumed that - vocabularies and Tensor names are unchanged. - profiler_steps (Integer): - Defaults to None. If set defines the number of steps in the - `tf.train.ProfileHook `_. - Captures CPU/GPU profiling information every ``profiler_steps`` steps or seconds. - When executing ``learn``, ``train`` or ``predict`` methods, - with ``profiler_steps`` set to a number, - a ``timeline_X.json`` file is created in the save_dir. This file contains profiling data - storedin Chrome trace format. To view stored data, use the Chrome browser to follow - these steps: - - 1) Go to the page chrome://tracing. - 2) In the upper left corner, you will find Load button. - 3) Press it and load our JSON file, which can be found in the ``save_dir`` - - *Warning*: This could create too many these json files which can be a potential problem, - e.g. for HDFS there is normally quota forfile count, so use with caution. - - Note: this argument is ignored when a non-None ``hooks`` argument is pasesd to - ``train``, ``learn``, or ``predict`` methods. The hook can be added manually by passing - ``trainer.train(..., hooks=myhooks.extend(trainer.get_train_hooks()))``, for example. 
- """ + if init_from_dir is not None and init_map is None: + raise ValueError("Need to provide init_map when init_from_dir is provided.") - if tensorflow.__version__ >= "2.0": - RuntimeError("Trainer not yet supported for Tensorflow >= 2.0") - - self._name = name - self._build_graph_fn = build_graph_fn - self._metric_fn = metric_fn - self._tensorboard_handle = None - self._current_estimator_spec = None # holds the current estimator spec - self._profiler_steps = profiler_steps - self._export_output_fn = None - self._is_early_stopping = False - - # NOTE: Sanitize all HDFS paths first. - save_dir = sanitize_hdfs_path(save_dir) - init_from_dir = sanitize_hdfs_path(init_from_dir) - - # warm_start_from can be of type tf.estimator.WarmStartSettings. - if isinstance(warm_start_from, str): - warm_start_from = sanitize_hdfs_path(warm_start_from) - - # convert to twitter.deepbird.hparam.hparam.HParams object - params = twml.util.convert_to_hparams(params) - - # keep a copy of the params because calling self._estimator.params creates a deepcopy - self._params = params - self.check_params() - - self._using_hogwild = True if os.environ.get('TWML_HOGWILD_PORTS') else False - # configure Hogwild (needs to be called before RunConfig is created) - self._hogwild_setup() - - if not run_config: - session_config = tf.ConfigProto() - # By default each process tries to allocate (almost) all of the memory. - # This option ensures the gpu memory grows dynamically instead. - session_config.gpu_options.allow_growth = True # pylint: disable=no-member - - if 'TWML_NUM_CPUS' in os.environ: - num_available_cpus = int(os.environ.get("TWML_MESOS_CPU", "8")) - if params.num_mkl_threads > 1: - os.environ["OMP_NUM_THREADS"] = str(params.num_mkl_threads) - os.environ["MKL_NUM_THREADS"] = str(params.num_mkl_threads) - session_config.inter_op_parallelism_threads = num_available_cpus // params.num_mkl_threads - session_config.intra_op_parallelism_threads = params.num_mkl_threads - - run_config = tf.estimator.RunConfig( - session_config=session_config, - keep_checkpoint_max=self._params.get('keep_checkpoint_max', 20), - log_step_count_steps=10000, - save_checkpoints_secs=self._params.get('save_checkpoints_secs', 600), - tf_random_seed=self._tf_random_seed()) - elif not isinstance(run_config, tf.estimator.RunConfig): - raise ValueError("Expecting run_config argument of type None or tf.estimator.RunConfig" - "Got %s instead." % type(run_config).__name__) - elif os.environ.get('TWML_HOGWILD_PORTS'): - raise ValueError("Custom RunConfig not supported with Hogwild") - - if run_config.model_dir is None and save_dir is None: - raise ValueError( - "Expecting either save_dir or run_config.model_dir to be specified. Got None for each.") - elif run_config.model_dir is None: - run_config = run_config.replace(model_dir=save_dir) - elif save_dir is None: - save_dir = run_config.model_dir - - self._save_dir = save_dir - self.experiment_tracker = ExperimentTracker(self._params, run_config, self._save_dir) - - # Check if should delete the tsd running this training job. In certain use case when - # there are other tf operations following trainer.train_and_evaluate (or trainer.learn), - # additional state files need to be specified to ensure those steps are executed after job restart. 
- kwargs['gke_state_files'] = kwargs.get('gke_state_files', ['_SUCCESS']) - self._maybe_del_tsd_exit(kwargs['gke_state_files']) - logging.info("Checkpoint and event files will be saved at save_dir=%s", save_dir) - self._optimize_loss_fn = self.get_train_op if optimize_loss_fn is None else optimize_loss_fn - - # overwrite the current save_dir - if self._params.get('overwrite_save_dir') and tf.io.gfile.exists(self._save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % self._save_dir) - # if distributed or hogwild: - if self._params.get('distributed', False): - # sleep for 30 seconds to allow each worker to get to this point. - time.sleep(30) - if run_config.is_chief: - logging.info("Chief deleting the save_dir now") - delete_file_or_dir(self._save_dir) - # sleep for 30 seconds to allow each worker to get to this point. - time.sleep(30) - else: - delete_file_or_dir(self._save_dir) - - # Exposing stats to a /vars.json endpoint that will be collected - # by the absorber - if self._params.get('stats_port'): - try: - stats_server_utils.start_stats_server(self._params.get('stats_port'), self._save_dir) - except Exception as err: - logging.error('Failed to start the stats server. Error: %s', str(err)) - - checkpoint = os.path.join(self._save_dir, 'checkpoint') - if tf.io.gfile.exists(checkpoint): - logging.info("The provided save_dir directory %s already exists." - " Training will be resumed." - % checkpoint) - - self._maybe_restore_checkpoint = lambda: init_from_checkpoint(init_from_dir, init_map) - - if init_from_dir is not None and init_map is None: - raise ValueError("Need to provide init_map when init_from_dir is provided.") - - if not tf.io.gfile.exists(self._save_dir): - # so tensorboard can point to a directory that exists - tf.io.gfile.mkdir(self._save_dir) - - self._estimator = tf.estimator.Estimator( - model_fn=self._model_fn, - params=self._params, # HParams - config=run_config, # RunConfig - warm_start_from=warm_start_from, - model_dir=self._save_dir, # By this point it is same as run_config.model_dir - ) - - # Log parameters that are used to construct trainer. This allows people to see default values. - logging.info("Trainer constructed using the following parameters: ") - pp_params = pp.pformat(self._params.values()) - logging.info(pp_params) - - # Start TensorBoard - if self._params.get('disable_tensorboard', False): - logging.info("Skipping launching TensorBoard [--disable_tensorboard is set]") - elif "tensorboard_port" in self._params.values() and self._params.tensorboard_port is not None: - self.start_tensorboard(self._params.tensorboard_port) - - # Export gauge that will track whether a model was exported - self.stats_exporter = StatsExporter("twml.trainer") - self.export_gauge = AtomicGauge('export_model') - self.stats_exporter.register_metrics(self.export_gauge) - - def _hogwild_setup(self): - """ - Setup the parameters required for hogwild. 
- """ - self._num_workers = self._params.get('num_workers') or 1 - logging.info("NUM_WORKERS: %d", self._num_workers) - if self._num_workers <= 1: - self._ports = None - return - - # a hogwild job is considered distributed - if 'distributed' in self._params: - self._params.set_hparam('distributed', True) - else: - self._params.add_hparam('distributed', True) - - ports = os.environ.get('TWML_HOGWILD_PORTS') - if ports: - self._ports = [int(port) for port in ports.strip().split(",")] - if (self._num_workers + 1!= len(self._ports)): - raise ValueError("Number of (workers + PS) and ports need to match") - else: - if self._num_workers > 1: - raise ValueError("TWML_HOGWILD_PORTS needs to be set to use hogwild training") - - # Split the number of data threads across multiple workers - num_threads = self._params.get('num_threads') - num_threads_per_worker = int(math.ceil(float(num_threads) / self._num_workers)) - self._params.set_hparam('num_threads', num_threads_per_worker) - - hogwild_task_type = os.environ.get('TWML_HOGWILD_TASK_TYPE') - hogwild_task_id = int(os.environ.get('TWML_HOGWILD_TASK_ID')) - os.environ['TF_CONFIG'] = self._get_cluster_config(hogwild_task_type, hogwild_task_id) - - def _tf_random_seed(self): - """ Returns user set seed and deal with Hogwild multiple seeds """ - tf_random_seed = self._params.get('tf_random_seed', None) - if tf_random_seed is None: - return None - elif self.using_hogwild and os.environ.get('TWML_HOGWILD_TASK_TYPE') == 'worker': - # chief (tf_random_seed), worker_0 (tf_random_seed + 1), worker_1 (tf_random_seed + 2)... - return tf_random_seed + 1 + int(os.environ.get('TWML_HOGWILD_TASK_ID')) - else: - return tf_random_seed - - def check_params(self): - """ Verify that params has the correct key,values """ - param_values = self._params.values() - - if 'train_batch_size' in param_values: - if not isinstance(self._params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if self._params.train_batch_size <= 0: - raise ValueError("train_batch_size needs to be positive") - else: - raise ValueError("train_batch_size needs to be present in params") - - if 'eval_batch_size' in param_values: - if not isinstance(self._params.eval_batch_size, int): - raise ValueError("Expecting params.eval_batch_size to be an integer.") - if self._params.eval_batch_size <= 0: - raise ValueError("eval_batch_size needs to be positive.") - else: - self._params.add_hparam('eval_batch_size', self._params.train_batch_size) - - if (self._params.get('distributed_training_cleanup') and - not self._params.get('distributed')): - # we only need to support training discontinuation for distributed training - # bc we are still using TSDs on GKE for distributed training - raise ValueError( - "Expecting params.distributed to be set if " - "params.distributed_training_cleanup is set." 
- ) - - def _get_cluster_config(self, name, index): - """Create a tensorflow cluster config from ports, name and index""" - host = '"localhost:%d"' - ps = host % self._ports[0] - chief = host % self._ports[1] - workers = ", ".join([host % port for port in self._ports[2:]]) - config = _CLUSTER_TEMPLATE.substitute( - PS=ps, - CHIEF=chief, - WORKER=workers, - TYPE=name, - INDEX=index, - ) - return config - - @property - def current_estimator_spec(self): - """ - returns the current estimator (warning: often reset) - """ - return self._current_estimator_spec + if not tf.io.gfile.exists(self._save_dir): + # so tensorboard can point to a directory that exists + tf.io.gfile.mkdir(self._save_dir) - @property - def estimator(self): - """ returns estimator encapsulated by Trainer """ - return self._estimator + self._estimator = tf.estimator.Estimator( + model_fn=self._model_fn, + params=self._params, # HParams + config=run_config, # RunConfig + warm_start_from=warm_start_from, + model_dir=self._save_dir, # By this point it is same as run_config.model_dir + ) - @property - def num_workers(self): - """ returns number of workers """ - return self._estimator.config.num_worker_replicas + # Log parameters that are used to construct trainer. This allows people to see default values. + logging.info("Trainer constructed using the following parameters: ") + pp_params = pp.pformat(self._params.values()) + logging.info(pp_params) + + # Start TensorBoard + if self._params.get("disable_tensorboard", False): + logging.info( + "Skipping launching TensorBoard [--disable_tensorboard is set]" + ) + elif ( + "tensorboard_port" in self._params.values() + and self._params.tensorboard_port is not None + ): + self.start_tensorboard(self._params.tensorboard_port) + + # Export gauge that will track whether a model was exported + self.stats_exporter = StatsExporter("twml.trainer") + self.export_gauge = AtomicGauge("export_model") + self.stats_exporter.register_metrics(self.export_gauge) + + def _hogwild_setup(self): + """ + Setup the parameters required for hogwild. 
+        """
+        self._num_workers = self._params.get("num_workers") or 1
+        logging.info("NUM_WORKERS: %d", self._num_workers)
+        if self._num_workers <= 1:
+            self._ports = None
+            return
+
+        # a hogwild job is considered distributed
+        if "distributed" in self._params:
+            self._params.set_hparam("distributed", True)
+        else:
+            self._params.add_hparam("distributed", True)
+
+        ports = os.environ.get("TWML_HOGWILD_PORTS")
+        if ports:
+            self._ports = [int(port) for port in ports.strip().split(",")]
+            if self._num_workers + 1 != len(self._ports):
+                raise ValueError("Number of (workers + PS) and ports need to match")
+        else:
+            if self._num_workers > 1:
+                raise ValueError(
+                    "TWML_HOGWILD_PORTS needs to be set to use hogwild training"
+                )
+
+        # Split the number of data threads across multiple workers
+        num_threads = self._params.get("num_threads")
+        num_threads_per_worker = int(math.ceil(float(num_threads) / self._num_workers))
+        self._params.set_hparam("num_threads", num_threads_per_worker)
+
+        hogwild_task_type = os.environ.get("TWML_HOGWILD_TASK_TYPE")
+        hogwild_task_id = int(os.environ.get("TWML_HOGWILD_TASK_ID"))
+        os.environ["TF_CONFIG"] = self._get_cluster_config(
+            hogwild_task_type, hogwild_task_id
+        )

-  @property
-  def worker_index(self):
-    """
-    returns index of worker in the cluster
-    chief has index 0
-    non-chief workers have indices 1 through (num_workers - 1)
-    """
-    return self._estimator.config.global_id_in_cluster
-
-  @property
-  def using_hogwild(self):
-    """ returns a bool indicating whether hogwild is being used """
-    return self._using_hogwild
-
-  def set_estimator(self, estimator):
-    """ sets the estimator used internally by Trainer """
-    if not isinstance(estimator, tf.estimator.Estimator):
-      raise ValueError("Expecting tf.estimator.Estimator")
-    self._estimator = estimator
-    self._params = self.estimator.params
-
-  @property
-  def params(self):
-    """
-    returns the hyper-parameters passed to the constructor.
-    """
-    return self._params

+    def _tf_random_seed(self):
+        """Returns the user-set seed and handles Hogwild's multiple seeds"""
+        tf_random_seed = self._params.get("tf_random_seed", None)
+        if tf_random_seed is None:
+            return None
+        elif (
+            self.using_hogwild and os.environ.get("TWML_HOGWILD_TASK_TYPE") == "worker"
+        ):
+            # chief (tf_random_seed), worker_0 (tf_random_seed + 1), worker_1 (tf_random_seed + 2)...
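+            # TWML_HOGWILD_TASK_ID is the zero-based worker index, so each worker
+            # derives a distinct, deterministic seed offset from the chief's seed.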
+ return tf_random_seed + 1 + int(os.environ.get("TWML_HOGWILD_TASK_ID")) + else: + return tf_random_seed + + def check_params(self): + """Verify that params has the correct key,values""" + param_values = self._params.values() + + if "train_batch_size" in param_values: + if not isinstance(self._params.train_batch_size, int): + raise ValueError("Expecting params.train_batch_size to be an integer.") + if self._params.train_batch_size <= 0: + raise ValueError("train_batch_size needs to be positive") + else: + raise ValueError("train_batch_size needs to be present in params") + + if "eval_batch_size" in param_values: + if not isinstance(self._params.eval_batch_size, int): + raise ValueError("Expecting params.eval_batch_size to be an integer.") + if self._params.eval_batch_size <= 0: + raise ValueError("eval_batch_size needs to be positive.") + else: + self._params.add_hparam("eval_batch_size", self._params.train_batch_size) + + if self._params.get("distributed_training_cleanup") and not self._params.get( + "distributed" + ): + # we only need to support training discontinuation for distributed training + # bc we are still using TSDs on GKE for distributed training + raise ValueError( + "Expecting params.distributed to be set if " + "params.distributed_training_cleanup is set." + ) + + def _get_cluster_config(self, name, index): + """Create a tensorflow cluster config from ports, name and index""" + host = '"localhost:%d"' + ps = host % self._ports[0] + chief = host % self._ports[1] + workers = ", ".join([host % port for port in self._ports[2:]]) + config = _CLUSTER_TEMPLATE.substitute( + PS=ps, + CHIEF=chief, + WORKER=workers, + TYPE=name, + INDEX=index, + ) + return config + + @property + def current_estimator_spec(self): + """ + returns the current estimator (warning: often reset) + """ + return self._current_estimator_spec + + @property + def estimator(self): + """returns estimator encapsulated by Trainer""" + return self._estimator + + @property + def num_workers(self): + """returns number of workers""" + return self._estimator.config.num_worker_replicas + + @property + def worker_index(self): + """ + returns index of worker in the cluster + chief has index 0 + non-chief workers have indices 1 through (num_workers - 1) + """ + return self._estimator.config.global_id_in_cluster + + @property + def using_hogwild(self): + """returns a bool indicating whether hogwild is being used""" + return self._using_hogwild + + def set_estimator(self, estimator): + """sets the estimator used internally by Trainer""" + if not isinstance(estimator, tf.estimator.Estimator): + raise ValueError("Expecting tf.estimator.Estimator") + self._estimator = estimator + self._params = self.estimator.params + + @property + def params(self): + """ + returns the hyper-parameters passed to the constructor. + """ + return self._params + + @staticmethod + def add_parser_arguments(): + """ + Add common commandline args to parse for the Trainer class. + Typically, the user calls this function and then parses cmd-line arguments + into an argparse.Namespace object which is then passed to the Trainer constructor + via the params argument. + + See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ + for a list and description of all cmd-line arguments. + + Returns: + argparse.ArgumentParser instance with some useful args already added. 
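+
+        Example (a minimal sketch of the typical flow; ``build_graph`` is a
+        user-supplied function as described in the constructor docs):
+
+        .. code-block:: python
+
+            parser = Trainer.add_parser_arguments()
+            params = parser.parse_args()
+            trainer = Trainer("my_model", params, build_graph_fn=build_graph)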
+ """ + return twml.argument_parser.get_trainer_parser() + + @staticmethod + def get_train_op(params, loss): + """ + Return a training Op, that is, a `twml.optimizers.optimize_loss + `_ + instance given params and loss. + This method can be overwritten by passing the optimize_loss_fn to the Trainer + constructor. - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. + Args: + params: + tensorflow.contrib.training.HParams instance. Recognizes the optimizer, optimizer_summaries, + gradient_noise_scale, clip_gradients and learning_rate_decay (including + other learning rate decay arguments). + loss: + scalar Op returned by the build_graph that specifies the training loss to + be minimized. + """ + optimizer = params.get("optimizer") + + if not optimizer: + optimizer = "SGD" + + if optimizer == "LazyAdam": + optimizer = LazyAdamOptimizer + + if optimizer == "DGC": + optimizer = DeepGradientCompressionOptimizer( + learning_rate=params.learning_rate, + use_locking=False, + name="Sparse", + density=params.get("dgc_density"), + density_decay=params.get("dgc_density_decay"), + density_decay_steps=params.get("dgc_density_decay_steps"), + density_decay_rate=params.get("dgc_density_decay_rate"), + min_density=params.get("dgc_min_density"), + accumulation=params.get("dgc_accumulation"), + ) + + summaries = ["loss"] + if params.get("show_optimizer_summaries"): + summaries = OPTIMIZER_SUMMARIES + + train_op = optimize_loss( + loss=loss, + global_step=tf.train.get_global_step(), + optimizer=optimizer, + learning_rate=params.learning_rate, + summaries=summaries, + colocate_gradients_with_ops=True, + gradient_noise_scale=params.get("gradient_noise_scale"), + clip_gradients=params.get("clip_gradients"), + learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn( + params + ), + ) + return train_op + + def export_model_effects(self, export_path, feature_spec=None, log_features=True): + # DO NOT CHANGE THE ORDER. + # This needs to be done before registering the model. + if feature_spec: + if log_features: + features = feature_spec["features"] + feature_names = [ + ".".join(features[fid]["featureName"].split(".")[1:]) + for fid in features.keys() + ] + features_to_log = ",".join(feature_names) + try: + model_hash = self.experiment_tracker.compute_model_hash(export_path) + metrics.log_usage( + "dbv2", + "export_model_effects", + "v1", + custom_attrs=[ + model_hash, + "feature config present", + features_to_log, + ], + ) + except: # noqa: T803 + logging.info("Failed to log Feature Config features") + + twml.contrib.export.export_fn.export_feature_spec(export_path, feature_spec) + export_start_time = time.time() + self.experiment_tracker.export_feature_spec(feature_spec) + logging.info( + "Exported feature spec to ML Metastore in %s seconds.", + time.time() - export_start_time, + ) + + self.experiment_tracker.register_model(str(export_path)) + self.export_gauge.increment() + + @property + def best_or_latest_checkpoint(self): + if self._is_early_stopping: + best_checkpoint_path = os.path.join(self._save_dir, "best_checkpoint") + checkpoint_path = tf.train.latest_checkpoint(best_checkpoint_path) + # Return best checkpoint if necessary + if checkpoint_path: + return checkpoint_path + else: + raise ValueError( + "Best checkpoint not found at %s." 
% best_checkpoint_path
+                )
+        else:  # Fallback to latest checkpoint from save directory
+            return self.latest_checkpoint
+
+    @property
+    def latest_checkpoint(self):
+        return self.estimator.latest_checkpoint()
+
+    def export_model(
+        self,
+        serving_input_receiver_fn,
+        export_output_fn=None,
+        export_dir=None,
+        checkpoint_path=None,
+        feature_spec=None,
+        log_features=True,
+    ):
+        """
+        Export the model for prediction. Typically, the exported model
+        will later be run in production servers. This method is called
+        by the user to export the PREDICT graph to disk.
+
+        Internally, this method calls `tf.estimator.Estimator.export_savedmodel
+        `_.
+
+        Note that a valid self._export_output_fn is required.
+        If export_output_fn is provided, it is used to set the self._export_output_fn.

-    See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_
-    for a list and description of all cmd-line arguments.

+        Args:
+            serving_input_receiver_fn:
+                function preparing the model for inference requests.
+                This function returns the ``features`` dict passed to ``build_graph``.
+            export_dir:
+                directory to export a SavedModel for prediction servers.
+                Defaults to ``[save_dir]/exported_models``.
+            checkpoint_path:
+                the checkpoint path to export. If None (the default), the most recent checkpoint
+                found within the model directory is chosen.
+            export_output_fn:
+                Function to export the graph_output (output of build_graph) for
+                prediction. Takes a graph_output dict as sole argument and returns
+                the export_output_fns dict.
+                Defaults to `twml.export_output_fns.default_output_fn`.
+
+        Returns:
+            a string path to the exported directory.
+        """
+        if not self.is_chief():
+            logging.info(
+                "Trainer.export_model ignored due to the process not being chief."
+            )
+            return

-    Returns:
-      argparse.ArgumentParser instance with some useful args already added.
-    """
-    return twml.argument_parser.get_trainer_parser()

+        # set the export output function
+        self._export_output_fn = (
+            export_output_fn or twml.export_output_fns.default_output_fn
+        )

-  @staticmethod
-  def get_train_op(params, loss):
-    """
-    Return a training Op, that is, a `twml.optimizers.optimize_loss
-    `_
-    instance given params and loss.
-    This method can be overwritten by passing the optimize_loss_fn to the Trainer
-    constructor.
-
-    Args:
-      params:
-        tensorflow.contrib.training.HParams instance. Recognizes the optimizer, optimizer_summaries,
-        gradient_noise_scale, clip_gradients and learning_rate_decay (including
-        other learning rate decay arguments).
-      loss:
-        scalar Op returned by the build_graph that specifies the training loss to
-        be minimized.
- """ - optimizer = params.get('optimizer') - - if not optimizer: - optimizer = 'SGD' - - if optimizer == 'LazyAdam': - optimizer = LazyAdamOptimizer - - if optimizer == 'DGC': - optimizer = DeepGradientCompressionOptimizer( - learning_rate=params.learning_rate, - use_locking=False, - name="Sparse", - density=params.get('dgc_density'), - density_decay=params.get('dgc_density_decay'), - density_decay_steps=params.get('dgc_density_decay_steps'), - density_decay_rate=params.get('dgc_density_decay_rate'), - min_density=params.get('dgc_min_density'), - accumulation=params.get('dgc_accumulation') - ) - - summaries = ['loss'] - if params.get('show_optimizer_summaries'): - summaries = OPTIMIZER_SUMMARIES - - train_op = optimize_loss( - loss=loss, - global_step=tf.train.get_global_step(), - optimizer=optimizer, - learning_rate=params.learning_rate, - summaries=summaries, - colocate_gradients_with_ops=True, - gradient_noise_scale=params.get('gradient_noise_scale'), - clip_gradients=params.get('clip_gradients'), - learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn(params) - ) - return train_op - - def export_model_effects(self, export_path, feature_spec=None, log_features=True): - - # DO NOT CHANGE THE ORDER. - # This needs to be done before registering the model. - if feature_spec: - if log_features: - features = feature_spec['features'] - feature_names = ['.'.join(features[fid]['featureName'].split('.')[1:]) for fid in features.keys()] - features_to_log = ','.join(feature_names) - try: - model_hash = self.experiment_tracker.compute_model_hash(export_path) - metrics.log_usage('dbv2', 'export_model_effects', 'v1', custom_attrs=[model_hash, "feature config present", features_to_log]) - except: # noqa: T803 - logging.info("Failed to log Feature Config features") - - twml.contrib.export.export_fn.export_feature_spec(export_path, feature_spec) - export_start_time = time.time() - self.experiment_tracker.export_feature_spec(feature_spec) - logging.info("Exported feature spec to ML Metastore in %s seconds.", time.time() - export_start_time) - - self.experiment_tracker.register_model(str(export_path)) - self.export_gauge.increment() - - @property - def best_or_latest_checkpoint(self): - if self._is_early_stopping: - best_checkpoint_path = os.path.join(self._save_dir, "best_checkpoint") - checkpoint_path = tf.train.latest_checkpoint(best_checkpoint_path) - # Return best checkpoint if necessary - if checkpoint_path: - return checkpoint_path - else: - raise ValueError("Best checkpoint not found at %s." % best_checkpoint_path) - else: # Fallback to latest checkpoint from save directory - return self.latest_checkpoint - - @property - def latest_checkpoint(self): - return self.estimator.latest_checkpoint() - - def export_model(self, serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None, - log_features=True): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICTgraph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Note that a valid self._export_output_fn is required. - If export_ouput_fn is provided, it is used to set the self._export_output_fn. - - Args: - serving_input_receiver_fn: - function preparing the model for inference requests. - This funtion returns the ``features`` dict passed to ``build_graph``. 
- export_dir: - directory to export a SavedModel for prediction servers. - Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory is chosen. - export_output_fn: - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to `twml.export_output_fns.default_output_fn`. - - Return: - returns a string path to exported directory. - - # set the export output function - """ - if not self.is_chief(): - logging.info("Trainer.export_model ignored due to the process not being chief.") - return + if not callable(self._export_output_fn): + raise RuntimeError( + "Expecting export_output_fn function. Got %s." + % type(self._export_output_fn).__name__ + ) + + if export_dir: + export_dir = sanitize_hdfs_path(export_dir) + + if checkpoint_path: + checkpoint_path = sanitize_hdfs_path(checkpoint_path) + else: + checkpoint_path = self.best_or_latest_checkpoint + + # actually export the model using the Estimator API + export_path = self._estimator.export_savedmodel( + export_dir_base=export_dir + or os.path.join(self._save_dir, "exported_models"), + serving_input_receiver_fn=serving_input_receiver_fn, + checkpoint_path=checkpoint_path, + ) - self._export_output_fn = export_output_fn or twml.export_output_fns.default_output_fn + # export_path is bytes, need to convert to string for python3 to work. + logging.info("The exported model path is: " + str(export_path)) - if not callable(self._export_output_fn): - raise RuntimeError( - "Expecting export_output_fn function. Got %s." - % type(self._export_output_fn).__name__) + self.export_model_effects(export_path, feature_spec, log_features) - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) + return export_path - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint + def _model_fn(self, features, labels, mode, params, config=None): + """ + returns tf.estimator.EstimatorSpec that can be used with tf.estimator.Estimators. + You would probably never need to modify this method. + Instead, you should override build_graph, which this method calls. - # actually export the model using the Estimator API - export_path = self._estimator.export_savedmodel( - export_dir_base=export_dir or os.path.join(self._save_dir, 'exported_models'), - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path) + Args: + features: + Dict of input tensors. + labels: + Tensor of target labels. + mode: + an instance of tf.estimator.ModeKeys. + Typically used to toggle TRAINing or EVALuation. + params: + HParams object containing hyper-parameters. 
+        """
+        # pylint: disable=too-many-branches
+        if isinstance(features, dict):
+            weights = features.get("weights", None)
+        else:
+            weights = None
+
+        with tf.variable_scope(self._name + "/model"):
+            graph_output = self._build_graph_fn(features, labels, mode, params, config)
+            loss = graph_output["loss"] if "loss" in graph_output else None
+
+            self._maybe_restore_checkpoint()
+
+        with tf.variable_scope(self._name + "/optim"):
+            train_op = None
+            if mode == tf.estimator.ModeKeys.TRAIN:
+                if "train_op" in graph_output:
+                    train_op = graph_output["train_op"]
+                    graph_output[
+                        "train_op"
+                    ] = None  # remove from preds to prevent error
+                elif loss is not None:
+                    train_op = self._optimize_loss_fn(params, loss)
+
+                if params.get("train_log_metrics") and self._metric_fn:
+                    metric_ops = self._metric_fn(
+                        graph_output=graph_output, labels=labels, weights=weights
+                    )
+                    for metric_name in metric_ops:
+                        tf.summary.scalar(
+                            name="training_metric_" + metric_name,
+                            tensor=metric_ops[metric_name][1],
+                        )  # index 0 contains value_op, 1 contains update_op
+
+        if mode == tf.estimator.ModeKeys.PREDICT and self._export_output_fn is not None:
+            # note that this is ignored by the predict method.
+            # Estimator only uses export_output_fn for export_model.
+            export_outputs = self._export_output_fn(graph_output)
+        else:
+            export_outputs = None
+
+        if mode == tf.estimator.ModeKeys.EVAL and self._metric_fn:
+            eval_metric_ops = self._metric_fn(
+                graph_output=graph_output, labels=labels, weights=weights
+            )
+        else:
+            eval_metric_ops = None
+
+        # None and loss (scalar, not sliceable by TFMA) should be removed from the graph_output
+        preds = {
+            key: graph_output[key]
+            for key in graph_output
+            if (graph_output[key] is not None) and (key != "loss")
+        }
+
+        init_feed_dict = twml.contrib.initializers.get_init_feed_dict()
+        scaffold = tf.train.Scaffold(init_feed_dict=init_feed_dict)
+
+        # Clear the init feed collection to avoid serializing the initializers.
+        twml.contrib.initializers.clear_init_feed_collection()
+
+        # save estimator for use by later methods and hooks (warning: often reset)
+        self._current_estimator_spec = tf.estimator.EstimatorSpec(
+            mode=mode,
+            predictions=preds,
+            export_outputs=export_outputs,
+            loss=loss,
+            train_op=train_op,
+            eval_metric_ops=eval_metric_ops,
+            scaffold=scaffold,
+        )

+        return self._current_estimator_spec

+    def get_train_hooks(self):
+        """Return SessionRunHooks used during training.

+        By default training uses one hook, `tf.train.StepCounterHook`, for monitoring step speed.

+        If self._profiler_steps is set then we also use the ProfilerHook `tf.train.ProfilerHook`
+        for monitoring the profile.

+        """
+        # Instead of having every_n_steps be a constant number,
+        # change it dynamically based on batch size.
- """ - # pylint: disable=too-many-branches - if isinstance(features, dict): - weights = features.get('weights', None) - else: - weights = None - - with tf.variable_scope(self._name + '/model'): - graph_output = self._build_graph_fn(features, labels, mode, params, config) - loss = graph_output['loss'] if 'loss' in graph_output else None - - self._maybe_restore_checkpoint() - - with tf.variable_scope(self._name + '/optim'): - train_op = None - if mode == tf.estimator.ModeKeys.TRAIN: - if 'train_op' in graph_output: - train_op = graph_output['train_op'] - graph_output['train_op'] = None # remove from preds to prevent error - elif loss is not None: - train_op = self._optimize_loss_fn(params, loss) - - if params.get('train_log_metrics') and self._metric_fn: - metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - for metric_name in metric_ops: - tf.summary.scalar( - name="training_metric_" + metric_name, - tensor=metric_ops[metric_name][1]) # index 0 contains value_op, 1 contains update_op - - if mode == tf.estimator.ModeKeys.PREDICT and self._export_output_fn is not None: - # note that this is ignored by the predict method. - # Estimator only uses export_output_fn for export_model. - export_outputs = self._export_output_fn(graph_output) - else: - export_outputs = None - - if mode == tf.estimator.ModeKeys.EVAL and self._metric_fn: - eval_metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - else: - eval_metric_ops = None - - # None and loss (scalar, not sliceable by TFMA) should be removed from the graph_output - preds = {key: graph_output[key] for key in graph_output if (graph_output[key] is not None) and (key is not 'loss')} - - init_feed_dict = twml.contrib.initializers.get_init_feed_dict() - scaffold = tf.train.Scaffold(init_feed_dict=init_feed_dict) - - # Clear the init feed collection to avoid serializing the initializers. - twml.contrib.initializers.clear_init_feed_collection() - - # save estimator for use by later methods and hooks (warning: often reset) - self._current_estimator_spec = tf.estimator.EstimatorSpec( - mode=mode, - predictions=preds, - export_outputs=export_outputs, - loss=loss, - train_op=train_op, - eval_metric_ops=eval_metric_ops, - scaffold=scaffold, - ) - - return self._current_estimator_spec - - def get_train_hooks(self): - """Return SessionRunHooks used during training. - - By default training uses one hooks `tf.train.StepCounterHook` for monitoring step speed. - - If self._profiler_steps is set then we also use the ProfilerHook `tf.train.ProfilerHook` - for monitoring the profile. + If self._profiler_steps is set then we also use the ProfilerHook `tf.train.ProfilerHook` + for monitoring the profile. - """ - # Instead of having every_n_steps be a constant number, - # change it dynamically based on batch size. - # Ideally we should be using every_n_secs, but that seems buggy as of 1.7. - # The every_n_steps = 20K / batch_size - every_n_steps = ((2048 * 100) // self._params.train_batch_size) - step_counter = tf.train.StepCounterHook( - every_n_steps=every_n_steps, output_dir=self._save_dir - ) - train_hooks = [step_counter] - - if self._profiler_steps is not None: - if not self._params.get('distributed') or self._estimator.config.is_chief: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir + """ + # Instead of having every_n_steps be a constant number, + # change it dynamically based on batch size. 
+        # Ideally we should be using every_n_secs, but that seems buggy as of 1.7.
+        # The every_n_steps = 200K / batch_size (2048 * 100 = 204,800;
+        # e.g. a train_batch_size of 128 gives every_n_steps = 1600)
+        every_n_steps = (2048 * 100) // self._params.train_batch_size
+        step_counter = tf.train.StepCounterHook(
+            every_n_steps=every_n_steps, output_dir=self._save_dir
+        )
+        train_hooks = [step_counter]
+
+        if self._profiler_steps is not None:
+            if not self._params.get("distributed") or self._estimator.config.is_chief:
+                profiler = tf.train.ProfilerHook(
+                    save_steps=self._profiler_steps, output_dir=self._save_dir
+                )
+                train_hooks.append(profiler)
+
+        return train_hooks
+
+    def is_task_type(self, name):
+        """
+        Helper function to specify if the current process is of the given worker type.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        if os.environ.get("TF_CONFIG"):
+            if self._estimator.config.task_type == name:
+                return True
+            else:
+                return False
+        return True
-      else:
-        return False
-    return True

-  def is_evaluator(self):
-    """
-    Helper function to let you know if the worker is evaluator.
-    Note: This an only be called *after* self._hogwild_setup() is called in __init__()
-    """
-    return self.is_task_type("evaluator")
+    def is_evaluator(self):
+        """
+        Helper function to let you know if the worker is evaluator.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        return self.is_task_type("evaluator")
+
+    def is_chief(self):
+        """
+        Helper function to let you know if the worker is chief.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        return self.is_task_type("chief") or self.is_task_type("master")
+
+    def is_ps(self):
+        """
+        Helper function to let you know if the task is parameter server.
+        """
+        if os.environ.get("TF_CONFIG") and self._estimator.config.task_type == "ps":
+            return True
+        return False

-  def is_chief(self):
-    """
-    Helper function to let you know if the worker is chief.
-    Note: This an only be called *after* self._hogwild_setup() is called in __init__()
-    """
-    return self.is_task_type("chief") or self.is_task_type("master")

+    def _exit_ps_after_training_complete(self):
+        """
+        Helper function to shut down the parameter server after the training job completes (either succeeded or failed).
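+
+        As implemented below, this spawns a watcher ``Thread`` that periodically
+        polls the training job status and calls ``os._exit(0)`` once the job has
+        reached a terminal state; setting ``disable_auto_ps_shutdown`` in params
+        skips this behavior entirely.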
+ """ + if not self.is_ps(): + return + + # No need to exit ps if on the same machine + if os.environ.get("TWML_HOGWILD_PORTS"): + return + + if self._params.get("disable_auto_ps_shutdown", False): + logging.info( + "Skip shutting down parameter server after training complete [--disable_auto_ps_shutdown is set]" + ) + return + + # checking job status is different on gke vs aurora + if self._is_on_gke(): + get_job_status = functools.partial( + k8s_status.get_training_job_status, + cluster=None, + namespace=os.environ["TWML_JOB_ROLE"], + environment=os.environ["TWML_JOB_ENV"], + job_name=os.environ["TWML_JOB_NAME"], + using_tsd=True, + ) + else: + get_job_status = functools.partial( + get_distributed_training_job_path, + base_job_path=get_distributed_training_job_path(), + ) + + def wait_complete_then_exit(): + retry_max = 60 + retry = 0 + while True: + try: + training_status = get_job_status() + if training_status == TrainingJobStatus.FINISHED: + logging.info( + "Distributed training job succeed, shutting down parameter server." + ) + os._exit(0) + elif training_status == TrainingJobStatus.FAILED: + logging.info( + "Distributed training job failed, shutting down parameter server." + ) + os._exit(0) + elif training_status == TrainingJobStatus.NOT_FOUND: + raise Exception("Distributed training job status not found.") + else: + poke_interval = random.randrange( + 60, 90 + ) # prevent spike QPS to aurora endpoint + time.sleep(poke_interval) + retry = 0 + except Exception as e: + if retry >= retry_max: + raise e # only exception in this thread, won't fail parameter server thread + retry += 1 + poke_interval = random.randrange(60, 90) + retry * 10 + logging.warn( + "Error getting distributed training job status, will retry after %s seconds." + % poke_interval + ) + time.sleep(poke_interval) + + Thread(target=wait_complete_then_exit).start() + + def get_eval_hooks(self): # pylint: disable=no-self-use + """Return SessionRunHooks used during evaluation.""" + return None + + def get_predict_hooks(self): + """Return hooks used during prediction. + If profiler_steps is set in the constructor to the Trainer, + we pass a tf.Train.ProfilerHook to the estimator's predict function. + """ + hooks = [] + if self._profiler_steps is not None: + profiler = tf.train.ProfilerHook( + save_steps=self._profiler_steps, output_dir=self._save_dir + ) + hooks.append(profiler) + return hooks + + def learn( + self, + train_input_fn=None, + eval_input_fn=None, + train_max_steps=None, + train_steps=None, + eval_steps=None, + train_hooks=None, + eval_hooks=None, + early_stop_metric=None, + early_stop_patience=-1, + early_stop_minimize=True, + early_stop_tolerance=0, + start_epoch=0, + exporters=None, + export_output_fn=None, + max_duration=None, + ): + """ + Train and evaluate the estimator for ``train_max_steps`` steps. + Each epoch involves ``train_steps`` training steps followed + by ``eval_steps`` evaluation steps. Note that each step + is a ``session.run()``, that is, each batch is a step. - def is_ps(self): - """ - Helper function to let you know if the task is parameter server. - """ - if os.environ.get('TF_CONFIG') and self._estimator.config.task_type == 'ps': - return True - return False + Args: + train_max_steps: + maximum number of global steps of training to run. + Defaults to params.train_max_steps. 
+                None-values cause learn() to terminate after *one* call to train() and evaluate(),
+                which is usually useful when using train_steps=-1.
+                Non-positive values train indefinitely in a loop (use with caution),
+                which is usually useful when used with early stopping.
+            train_steps:
+                number of training steps per epoch. For example, 100 means each
+                training epoch will end after processing 100 batches.
+                Defaults to params.train_steps.
+                Non-positive values and None-values go through the entire training set each epoch.
+            eval_steps:
+                number of evaluation steps per epoch.
+                Defaults to params.eval_steps.
+                Non-positive values and None-values go through the entire evaluation set each epoch.
+            train_input_fn:
+                Function to iterate through training set. It is passed to estimator.train.
+            eval_input_fn:
+                Function to iterate through evaluation set. It is passed to estimator.evaluate.
+            train_hooks:
+                List of SessionRunHooks used for training. Defaults to self.get_train_hooks().
+            eval_hooks:
+                List of SessionRunHooks used for evaluation. Defaults to self.get_eval_hooks().
+            start_epoch:
+                The epoch from which to start learning. If you want to do training and evaluation
+                for N epochs, you can call ``learn()`` in a loop (see the code block below):
+            exporters:
+                List of exporters called at the end of each evaluation run.
+                Defaults to None.
+            export_output_fn:
+                The output format to use for exported models.
+                Only used if exporters is not None.
+
+        .. code-block:: python
+
+            for epoch in range(1, max_epoch):
+                trainer.learn(start_epoch=epoch)
+
+        Early-stopping arguments:
+            early_stop_metric:
+                String specifying the metric to early-stop on. Required with positive
+                ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc.
+                The string is used to extract the relevant tensor Op from the dict returned by
+                the get_eval_metric_ops method. For ``metrics`` passed to the constructor,
+                the string is one of those. For multi-class (that is, multi-metric)
+                metrics, the string may be appended with a ``_0``, ``_1``, etc. or one
+                of the ``multi_metric_names`` (one per class).
+            early_stop_patience:
+                Maximum number of epochs to wait for an improvement in the early_stop_metric
+                before breaking off training. For example, a patience of 10 means that
+                training will have 10 epochs to improve the metric before it is killed.
+                Whenever the metric is improved before running out of patience,
+                patience is reset to ``early_stop_patience``.
+                Defaults to -1 (that is, no early-stopping).
+            early_stop_minimize:
+                Set this to True (the default) for metrics that need to be minimized
+                (like ``loss``). Metrics like ``accuracy`` that need to be maximized
+                should set this to False.
+            early_stop_tolerance:
+                A non-negative tolerance for comparing early_stop_metric.
+                E.g. when maximizing, the condition is current_metric > best_metric + tolerance.
+                Defaults to 0.
+            max_duration:
+                A float. When this argument is defined, the job will automatically terminate after
+                `max_duration` seconds if it has not already completed.
+
+        Returns:
+            The directory where the checkpoints were saved.
+            That is, save_dir.
+            You can point TensorBoard to this directory to get metrics,
+            or pass it to another Trainer via ``init_from_dir`` when doing
+            multi-phase training.
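+
+        For example, assuming ``trainer`` is an already-constructed Trainer and the
+        input functions are defined elsewhere (hypothetical names, shown here only
+        for illustration), a run with early stopping on the eval loss might look like:
+
+        .. code-block:: python
+
+            save_dir = trainer.learn(
+                train_input_fn=train_input_fn,
+                eval_input_fn=eval_input_fn,
+                early_stop_metric="loss",
+                early_stop_patience=5,
+                early_stop_minimize=True,
+            )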
+ """ + # pylint: disable=too-many-branches + + if not callable(train_input_fn): + raise ValueError("Expecting callable train_input_fn function") + if not callable(eval_input_fn): + raise ValueError("Expecting callable eval_input_fn function") + + if os.environ.get("TF_CONFIG"): + raise ValueError( + "trainer.learn() can not be used with distributed / hogwild setups" + ) + + if exporters and export_output_fn: + self._export_output_fn = export_output_fn + + train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks + eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks + eval_hooks = [] if eval_hooks is None else eval_hooks - def _exit_ps_after_training_complete(self): - """ - Helper function to shutdown parameter server after training job complete (either succeed or failed). - """ - if not self.is_ps(): - return - - # No need to exit ps if on the same machine - if os.environ.get('TWML_HOGWILD_PORTS'): - return - - if self._params.get('disable_auto_ps_shutdown', False): - logging.info("Skip shutting down parameter server after training complete [--disable_auto_ps_shutdown is set]") - return - - # checking job status is different on gke vs aurora - if self._is_on_gke(): - get_job_status = functools.partial( - k8s_status.get_training_job_status, - cluster=None, - namespace=os.environ['TWML_JOB_ROLE'], - environment=os.environ['TWML_JOB_ENV'], - job_name=os.environ['TWML_JOB_NAME'], - using_tsd=True) - else: - get_job_status = functools.partial( - get_distributed_training_job_path, - base_job_path=get_distributed_training_job_path() - ) - - def wait_complete_then_exit(): - retry_max = 60 - retry = 0 - while True: - try: - training_status = get_job_status() - if training_status == TrainingJobStatus.FINISHED: - logging.info("Distributed training job succeed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.FAILED: - logging.info("Distributed training job failed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.NOT_FOUND: - raise Exception("Distributed training job status not found.") - else: - poke_interval = random.randrange(60, 90) # prevent spike QPS to aurora endpoint - time.sleep(poke_interval) - retry = 0 - except Exception as e: - if retry >= retry_max: - raise e # only exception in this thread, won't fail parameter server thread - retry += 1 - poke_interval = random.randrange(60, 90) + retry * 10 - logging.warn("Error getting distributed training job status, will retry after %s seconds." % poke_interval) - time.sleep(poke_interval) - Thread(target=wait_complete_then_exit).start() - - def get_eval_hooks(self): # pylint: disable=no-self-use - """ Return SessionRunHooks used during evaluation.""" - return None - - def get_predict_hooks(self): - """ Return hooks used during prediction. - If profiler_steps is set in the constructor to the Trainer, - we pass a tf.Train.ProfilerHook to the estimator's predict function. 
- """ - hooks = [] - if self._profiler_steps is not None: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir - ) - hooks.append(profiler) - return hooks - - def learn(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, - train_steps=None, eval_steps=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, start_epoch=0, - exporters=None, export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` steps. - Each epoch involves ``train_steps`` training steps followed - by ``eval_steps`` evaluation steps. Note that each step - is a ``session.run()``, that is, each batch is a step. - - Args: - train_max_steps: - maximum number of global steps of training to run. - Defaults to params.train_max_steps. - None-values cause learn() to terminate after *one* call to train() and evaluate(), - which is usually useful when using train_steps=-1 - Non-positive values trains indefinitely in a loop (use with caution), - which is usually useful when used with early stopping. - train_steps: - number of training steps per epoch. For example, 100 means each - training epoch will end after processing 100 batches. - Defaults to params.train_steps. - Non-positive values and None-values go through the entire training set each epoch. - eval_steps: - number of evaluation steps per epoch. - Defaults to params.eval_steps. - Non-positive values and None-values go through the entire evaluation set each epoch. - train_input_fn: - Function to iterate through training set. It is passed to estimator.train. - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - train_hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - eval_hooks: - List of SessionRunHooks uses for evaluation. Defaults to self.get_eval_hooks() - start_epoch: - The epoch from which to start learn. If you want to do training and evaluation - for N epochs, you can call ``learn()`` in a loop as follows: - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - .. code-block:: python - - for epoch in range(1,max_epoch): - trainer.learn(start_epoch=epoch) - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). 
Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - That is, save_dir. - You can point TensorBoard to this directory to get metrics, - or pass it to another Trainer via ``init_from_dir`` when doing - multi-phase training. - """ - # pylint: disable=too-many-branches - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - if os.environ.get('TF_CONFIG'): - raise ValueError("trainer.learn() can not be used with distributed / hogwild setups") - - if exporters and export_output_fn: - self._export_output_fn = export_output_fn - - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if train_steps is None: - train_steps = self.params.train_steps - if train_steps <= 0: - train_steps = None - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if early_stop_patience > 0: - assert train_max_steps is not None, "Early stopping and max_steps=None are not compatible." - # prepare early stopping hook (which also handles logic here) - self._is_early_stopping = True - early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - start_epoch=start_epoch) - # add early stop hook to eval hooks - eval_hooks.append(early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - train_hooks.append(train_early_stop_duration_hook) - - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - eval_hooks.append(eval_early_stop_duration_hook) - - if not self._is_early_stopping: - if (train_max_steps is not None) and (train_max_steps <= 0): - if ((max_duration is not None) and (max_duration < 0)) or (max_duration is None): - logging.warn("train.max_steps is non-positive, and no early or duration stopping is configured. " - "Training job will loop forever.") - - if train_max_steps is not None and train_max_steps > 0: - # we can't pass max_steps AND steps to estimator.train. - # so we pass steps to estimator.train and max_steps to this hook instead... 
-      stop_at_step_hook = twml.hooks.StopAtStepHook(last_step=train_max_steps)
-      train_hooks.append(stop_at_step_hook)
-
-    with self.experiment_tracker.track_experiment(eval_hooks,
-                                                  lambda: self.current_estimator_spec):
-      # alternate training and evaluation epochs
-      epoch = start_epoch
-      while True:
-        logging.info("Training epoch %d", epoch)
-        self._estimator.train(train_input_fn, steps=train_steps, hooks=train_hooks)
-
-        logging.info("Evaluating epoch %d", epoch)
-        eval_result = self._estimator.evaluate(
-          eval_input_fn, steps=eval_steps, hooks=eval_hooks)
-
-        if exporters:
-          checkpoint_path = self.estimator.latest_checkpoint()
-          for exporter in exporters:
-            export_path = os.path.join(self._save_dir, "export", exporter.name)
-            exporter.export(
-              estimator=self.estimator, export_path=export_path,
-              checkpoint_path=checkpoint_path, eval_result=eval_result,
-              is_the_final_export=False)
-
-        # If train_max_step is none. Terminate after one loop.
         if train_max_steps is None:
-          break
+            train_max_steps = self.params.get("train_max_steps")
+
+        if train_steps is None:
+            train_steps = self.params.train_steps
+        if train_steps <= 0:
+            train_steps = None
+
+        if eval_steps is None:
+            eval_steps = self.params.eval_steps
+        if eval_steps <= 0:
+            eval_steps = None
+
+        if early_stop_patience > 0:
+            assert (
+                train_max_steps is not None
+            ), "Early stopping and max_steps=None are not compatible."
+            # prepare early stopping hook (which also handles logic here)
+            self._is_early_stopping = True
+            early_stop_hook = twml.hooks.EarlyStopHook(
+                metric=early_stop_metric,
+                checkpoint_dir=self._save_dir,
+                patience=early_stop_patience,
+                minimize=early_stop_minimize,
+                tolerance=early_stop_tolerance,
+                get_estimator_spec_fn=lambda: self.current_estimator_spec,
+                start_epoch=start_epoch,
+            )
+            # add early stop hook to eval hooks
+            eval_hooks.append(early_stop_hook)
+
+        if max_duration is not None:
+            train_early_stop_duration_hook = twml.hooks.EarlyStopDuration(
+                max_duration=max_duration,
+                exit_on_end=False,
+                save_dir=self._save_dir,
+                overwrite=True,
+            )
+            train_hooks.append(train_early_stop_duration_hook)
+
+            eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration(
+                max_duration=max_duration,
+                exit_on_end=False,
+                save_dir=self._save_dir,
+                overwrite=True,
+            )
+            eval_hooks.append(eval_early_stop_duration_hook)
+
+        if not self._is_early_stopping:
+            if (train_max_steps is not None) and (train_max_steps <= 0):
+                if ((max_duration is not None) and (max_duration < 0)) or (
+                    max_duration is None
+                ):
+                    logging.warn(
+                        "train.max_steps is non-positive, and no early or duration stopping is configured. "
+                        "Training job will loop forever."
+                    )
+
+        if train_max_steps is not None and train_max_steps > 0:
+            # we can't pass max_steps AND steps to estimator.train.
+            # so we pass steps to estimator.train and max_steps to this hook instead...
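+            # (twml.hooks.StopAtStepHook presumably mirrors tf.train.StopAtStepHook:
+            # it requests a stop once the global step reaches last_step, and the
+            # loop below checks stop_at_step_hook.stop_requested after each epoch.)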
+ stop_at_step_hook = twml.hooks.StopAtStepHook(last_step=train_max_steps) + train_hooks.append(stop_at_step_hook) + + with self.experiment_tracker.track_experiment( + eval_hooks, lambda: self.current_estimator_spec + ): + # alternate training and evaluation epochs + epoch = start_epoch + while True: + logging.info("Training epoch %d", epoch) + self._estimator.train( + train_input_fn, steps=train_steps, hooks=train_hooks + ) + + logging.info("Evaluating epoch %d", epoch) + eval_result = self._estimator.evaluate( + eval_input_fn, steps=eval_steps, hooks=eval_hooks + ) + + if exporters: + checkpoint_path = self.estimator.latest_checkpoint() + for exporter in exporters: + export_path = os.path.join( + self._save_dir, "export", exporter.name + ) + exporter.export( + estimator=self.estimator, + export_path=export_path, + checkpoint_path=checkpoint_path, + eval_result=eval_result, + is_the_final_export=False, + ) + + # If train_max_step is none. Terminate after one loop. + if train_max_steps is None: + break + + # If stop_at_step_hook requested a stop, break + if train_max_steps > 0 and stop_at_step_hook.stop_requested: + break + + # early-stopping logic is handled internally by the hook + if early_stop_patience > 0 and early_stop_hook.should_stop: + # but we still need to break here + break + epoch += 1 + + self.write_state_to_disk(save_dir=self._save_dir, filename="_SUCCESS") + + return self._save_dir + + def get_train_spec(self, input_fn, max_steps=None, hooks=None): + """Get the TrainSpec used by ``tf.train.train_and_evaluate``.""" + if not callable(input_fn): + raise ValueError("Expecting callable train_input_fn") + + if max_steps is None: + max_steps = self.params.train_max_steps + + if max_steps is not None and max_steps <= 0: + max_steps = None + + hooks = self.get_train_hooks() if hooks is None else hooks + + return tf.estimator.TrainSpec( + input_fn=input_fn, max_steps=max_steps, hooks=hooks + ) - # If stop_at_step_hook requested a stop, break - if train_max_steps > 0 and stop_at_step_hook.stop_requested: - break + def get_eval_spec( + self, input_fn, steps=None, delay=None, period=None, hooks=None, exporters=None + ): + """Get the EvalSpec used by ``tf.train.train_and_evaluate``.""" + if not callable(input_fn): + raise ValueError("Expecting callable eval_input_fn") - # early-stopping logic is handled internally by the hook - if early_stop_patience > 0 and early_stop_hook.should_stop: - # but we still need to break here - break - epoch += 1 + if steps is None: + steps = self.params.eval_steps - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') + if steps <= 0: + steps = None - return self._save_dir + if delay is None: + delay = self.params.eval_delay - def get_train_spec(self, input_fn, max_steps=None, hooks=None): - """Get the TrainSpec used by ``tf.train.train_and_evaluate``.""" - if not callable(input_fn): - raise ValueError("Expecting callable train_input_fn") + if period is None: + period = self.params.eval_period - if max_steps is None: - max_steps = self.params.train_max_steps + hooks = self.get_eval_hooks() if hooks is None else hooks - if max_steps is not None and max_steps <= 0: - max_steps = None + eval_name = self.params.get("eval_name", None) - hooks = self.get_train_hooks() if hooks is None else hooks + return tf.estimator.EvalSpec( + input_fn=input_fn, + steps=steps, + name=eval_name, + start_delay_secs=delay, + throttle_secs=period, + hooks=hooks, + exporters=exporters, + ) - return tf.estimator.TrainSpec(input_fn=input_fn, - max_steps=max_steps, - 
hooks=hooks)

+    def train_and_evaluate(
+        self,
+        train_input_fn=None,
+        eval_input_fn=None,
+        train_max_steps=None,
+        eval_steps=None,
+        eval_delay=None,
+        eval_period=None,
+        train_hooks=None,
+        eval_hooks=None,
+        early_stop_metric=None,
+        early_stop_patience=-1,
+        early_stop_minimize=True,
+        early_stop_tolerance=0,
+        exporters=None,
+        export_output_fn=None,
+        max_duration=None,
+    ):
+        """
+        Train and evaluate the estimator for ``train_max_steps`` steps
+        using ``tf.estimator.train_and_evaluate``.
+        With a cluster configuration provided in the ``TF_CONFIG`` environment variable, this method
+        can be used for distributed training (multi-node or multi-process).
+        Unlike the ``learn`` method, training is continuous with ``train_max_steps``.
+        For the distributed use case, evaluation happens periodically.
+        That is, after ``eval_delay`` seconds, an evaluation epoch of ``eval_steps`` steps
+        occurs every ``eval_period`` seconds. Evaluation happens on the most recent checkpoint.
+        TF defaults to saving checkpoints every 10 mins.
+        For the local use case, training occurs for train_max_steps epochs followed by a
+        single evaluation. For the local use case we therefore recommend using learn() instead,
+        as it provides early-stopping and multiple evaluations.
+
+        ``train_and_evaluate`` will evaluate for ``eval_steps`` every ``eval_period`` seconds.
+        It will stop once ``train_max_steps`` is reached.
+
+        You must ensure that all workers/servers are assigned the same `save_dir`.
+
+        .. Note::
+
+            If the TF_CONFIG environment variable is set, this function assumes it is running a distributed job.

-  def get_eval_spec(self, input_fn, steps=None, delay=None, period=None,
-                    hooks=None, exporters=None):
-    """Get the EvalSpec used by ``tf.train.train_and_evaluate``."""
-    if not callable(input_fn):
-      raise ValueError("Expecting callable eval_input_fn")

+        Args:
+            train_input_fn:
+                Function to iterate through training set. It is passed to estimator.train_and_evaluate.
+            eval_input_fn:
+                Function to iterate through evaluation set. It is passed to estimator.train_and_evaluate.
+            train_max_steps:
+                maximum number of global steps of training to run.
+                Defaults to params.train_max_steps.
+                Non-positive values and None-values train indefinitely (use with caution).
+            eval_steps:
+                number of steps per evaluation.
+                Defaults to params.eval_steps.
+                Non-positive values and None-values go through
+                the entire evaluation set for each evaluation.
+                Note that the number of eval_steps should be high enough to minimize noise.
+                This is especially true for early-stopping.
+            eval_delay:
+                Start the first evaluation after eval_delay. Defaults to params.eval_delay or 2*60s.
+            eval_period:
+                Run an evaluation every eval_period seconds. Defaults to params.eval_period or 10*60s.
+            exporters:
+                List of exporters called at the end of each evaluation run.
+                Defaults to None.
+            export_output_fn:
+                The output format to use for exported models.
+                Only used if exporters is not None.
+
+        Early-stopping arguments:
+            early_stop_metric:
+                String specifying the metric to early-stop on. Required with positive
+                ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc.
+                The string is used to extract the relevant tensor Op from the dict returned by
+                the get_eval_metric_ops method. For ``metrics`` passed to the constructor,
+                the string is one of those. For multi-class (that is, multi-metric)
+                metrics, the string may be appended with a ``_0``, ``_1``, etc. or one
+                of the ``multi_metric_names`` (one per class).
+            early_stop_patience:
+                Maximum number of epochs to wait for an improvement in the early_stop_metric
+                before breaking off training. For example, a patience of 10 means that
+                training will have 10 epochs to improve the metric before it is killed.
+                Whenever the metric is improved before running out of patience,
+                patience is reset to ``early_stop_patience``.
+                Defaults to -1 (that is, no early-stopping).
+            early_stop_minimize:
+                Set this to True (the default) for metrics that need to be minimized
+                (like ``loss``). Metrics like ``accuracy`` that need to be maximized
+                should set this to False.
+            early_stop_tolerance:
+                A non-negative tolerance for comparing early_stop_metric.
+                E.g. when maximizing, the condition is current_metric > best_metric + tolerance.
+                Defaults to 0.
+            max_duration:
+                A float. When this argument is defined, the job will automatically terminate after
+                `max_duration` seconds if it has not already completed.
+
+        Returns:
+            The directory where the checkpoints were saved.
+        """
+
+        logging.info("WARNING: Trainer.train_and_evaluate is an EXPERIMENTAL API.")
+        logging.info(
+            "Trainer.train_and_evaluate may change or be removed in future versions."
+        )

-    if steps is None:
-      steps = self.params.eval_steps

+        if not callable(train_input_fn):
+            raise ValueError("Expecting callable train_input_fn function")
+        if not callable(eval_input_fn):
+            raise ValueError("Expecting callable eval_input_fn function")
+
+        self._exit_ps_after_training_complete()
+
+        # Maybe export in eval processes.
+        if self.is_evaluator():
+            if self.params.get("eval_name") is not None:
+                # Do not export if running special eval.
+                exporters = None
+                export_output_fn = None
+            elif exporters and export_output_fn:
+                self._export_output_fn = export_output_fn
+            else:
+                # Default option.
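+                # (no export_output_fn was supplied; _export_output_fn is cleared so
+                # exports presumably fall back to the default output signature)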
+                self._export_output_fn = None

-    if steps <= 0:
-      steps = None

+        train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks
+        train_hooks = [] if train_hooks is None else train_hooks

-    if delay is None:
-      delay = self.params.eval_delay

+        eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks
+        eval_hooks = [] if eval_hooks is None else eval_hooks

-    if period is None:
-      period = self.params.eval_period

+        if train_max_steps is None:
+            train_max_steps = self.params.get("train_max_steps")
+
+        if eval_steps is None:
+            eval_steps = self.params.eval_steps
+        if eval_steps <= 0:
+            eval_steps = None
+
+        if eval_delay is None:
+            eval_delay = self.params.eval_delay
+        if eval_period is None:
+            eval_period = self.params.eval_period
+
+        if early_stop_patience > 0:
+            # when training hooks detect this file, they request a stop to training
+            early_stop_path = os.path.join(self._save_dir, "earlystop_now.txt")
+            # prepare early stopping hook (which also handles logic here)
+
+            self._is_early_stopping = True
+
+            eval_early_stop_hook = twml.hooks.EarlyStopHook(
+                metric=early_stop_metric,
+                checkpoint_dir=self._save_dir,
+                patience=early_stop_patience,
+                minimize=early_stop_minimize,
+                tolerance=early_stop_tolerance,
+                get_estimator_spec_fn=lambda: self.current_estimator_spec,
+                file_path=early_stop_path,
+                exit_on_end=os.environ.get("TF_CONFIG") is not None,
+            )  # only exit for distributed jobs
+            # add early stop hook to eval hooks
+            eval_hooks.append(eval_early_stop_hook)
+
+            # prepare the commensurate training hook
+            train_early_stop_hook = twml.hooks.StopIfExistsHook(early_stop_path)
+            train_hooks.append(train_early_stop_hook)
+
+        if max_duration is not None:
+            train_early_stop_duration_hook = twml.hooks.EarlyStopDuration(
+                max_duration=max_duration,
+                exit_on_end=False,
+                save_dir=self._save_dir,
+                overwrite=self.is_chief(),
+            )
+            eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration(
+                max_duration=max_duration,
+                exit_on_end=os.environ.get("TF_CONFIG") is not None,
+                save_dir=self._save_dir,
+                overwrite=False,
+            )  # only exit for distributed jobs
+
+            train_hooks.append(train_early_stop_duration_hook)
+            eval_hooks.append(eval_early_stop_duration_hook)
+
+        with self.experiment_tracker.track_experiment(
+            eval_hooks, lambda: self.current_estimator_spec
+        ):
+            train_spec = self.get_train_spec(
+                train_input_fn, train_max_steps, train_hooks
+            )
+            eval_spec = self.get_eval_spec(
+                eval_input_fn,
+                eval_steps,
+                eval_delay,
+                eval_period,
+                eval_hooks,
+                exporters,
+            )
+            self._train_and_evaluate(train_spec, eval_spec)
+
+        if self.is_chief():
+            self.write_state_to_disk(save_dir=self._save_dir, filename="_SUCCESS")
+
+        return self._save_dir
+
+    def _train_and_evaluate(self, train_spec, eval_spec):
+        """
+        Private method that calls
+        ``tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)``.
+        """
+        try:
+            tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)
+        except twml.errors.EarlyStopError:
+            # Ignore the exception if on evaluator.
+            if self.is_evaluator():
+                pass
+            else:
+                raise

-    hooks = self.get_eval_hooks() if hooks is None else hooks

+    def train(self, input_fn=None, steps=None, hooks=None):
+        """
+        Train the estimator for `steps` training steps.

-    eval_name = self.params.get("eval_name", None)

+        Args:
+            steps:
+                number of steps for which to perform training. For example, 100 means
+                training will end after processing 100 batches.
+                Defaults to None. i.e. trains on the entire dataset a single time.
+                Non-positive values and None-values go through the entire training set each epoch.
+            input_fn:
+                Function to iterate through training set. It is passed to estimator.train.
+            hooks:
+                List of SessionRunHooks used for training. Defaults to self.get_train_hooks().
+        """
+        if os.environ.get("TF_CONFIG") and "is_calibrating" not in self.params:
+            raise ValueError(
+                "trainer.train() can not be used with distributed / hogwild setups"
+            )
+
+        if not callable(input_fn):
+            raise ValueError("Expecting callable input_fn function")
+
+        if self._is_early_stopping:
+            raise ValueError(
+                "Can not call train() after learn() when using early stopping."
+            )
+
+        hooks = self.get_train_hooks() if hooks is None else hooks
+        self._estimator.train(input_fn, steps=steps, hooks=hooks)
+        return self
+
+    def evaluate(self, input_fn=None, steps=None, hooks=None, name=None):
+        """
+        Evaluate the estimator for `steps` evaluation steps.

-    return tf.estimator.EvalSpec(input_fn=input_fn,
-                                 steps=steps,
-                                 name=eval_name,
-                                 start_delay_secs=delay,
-                                 throttle_secs=period,
-                                 hooks=hooks,
-                                 exporters=exporters)

+        Args:
+            steps:
+                number of steps for which to perform evaluation. For example, 100 means each
+                evaluation will end after processing 100 batches.
+                Defaults to None. i.e. evaluates on the entire dataset a single time.
+                Negative values and None-values go through the entire evaluation set.
+            input_fn:
+                Function to iterate through evaluation set. It is passed to estimator.evaluate.
+            hooks:
+                List of SessionRunHooks used for evaluation. Defaults to None.
+                Note that, unlike learn(), hooks defaults to None instead of self.get_eval_hooks()
+                as the latter may implement early-stopping, which isn't necessarily the desired
+                behavior when calling evaluate() on its own.
+            name:
+                Name of the evaluation if the user needs to run multiple evaluations on different data sets.
+                Metrics for different evaluations are saved in separate folders,
+                and appear separately in tensorboard.
+
+        Returns:
+            If `is_evaluator()`, returns a dict containing the evaluation metrics specified
+            in `metric_fn` keyed by name, as well as an entry `global_step` that contains
+            the value of the global step for which this evaluation was performed.
+            Otherwise (i.e. `is_evaluator() == False`), returns None.
+        """
+        if not self.is_evaluator():
+            return None
+
+        if not callable(input_fn):
+            raise ValueError("Expecting callable input_fn function")
+
+        hooks = self.get_eval_hooks() if hooks is None else hooks
+        hooks = [] if hooks is None else hooks
+
+        # for consistency with train/learn
+        eval_steps = None if steps is not None and steps < 0 else steps
+
+        with self.experiment_tracker.track_experiment(
+            hooks, lambda: self.current_estimator_spec, name=name
+        ):
+            checkpoint = self.best_or_latest_checkpoint
+            computed_metrics = self._estimator.evaluate(
+                input_fn,
+                steps=eval_steps,
+                hooks=hooks,
+                checkpoint_path=checkpoint,
+                name=name,
+            )
+
+        return computed_metrics
+
+    def start_tensorboard(self, port=None):
+        """
+        Start tensorboard process to visualize logs in save_dir.
+        """
+        logging.info("Starting tensorboard.")
+        if self._tensorboard_handle:
+            logging.warn("Tensorboard already running. 
Nothing done.") + return + + if port is None: + if "tensorboard_port" not in self.params.values(): + raise ValueError("You must specify a port for tensorboard to run on.") + elif self.params.tensorboard_port is None: + return + else: + port = self.params.tensorboard_port - def train_and_evaluate(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, eval_steps=None, - eval_delay=None, eval_period=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, exporters=None, - export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` - using ``tf.estimator.train_and_evaluate``. - With a cluster configuration provided in the ``TF_CONFIG`` environment variable, this method - can be used for distributed training (multi-node or multi-process). - Unlike the ``learn`` method, training is continuous with ``train_max_steps``. - For distributed use case, evaluation happens periodically. - That is, after ``eval_delay`` seconds, an evaluation epoch of ``eval_step`` steps - occurs every ``eval_period`` seconds. Evaluation happens on the most recent checkpoint. - TF defaults to saving checkpoints every 10 mins. - For local use case, training occurs for train_max_steps epochs followed by a - single evaluation. For local use case we therefore recommend using learn() instead - as it provides early-stopping and multiple evaluations. - - ``train_and_evaluate`` will evaluate for ``eval_steps`` every ``eval_period`` seconds. - It will stop after ``train_steps`` is reached. - - You must ensure that all workers/servers are assigned the same `save_dir`. - - .. Note:: - - If the TF_CONFIG environment variable is set, this function assumes its running a distribute job. - - Args: - train_input_fn: - Function to iterate through training set. It is passed to estimator.train_and_evalute - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.train_and_evalute. - train_max_steps: - maximum number of global steps of training to run. - Defaults to params.train_max_steps. - Non-positive values and None-values train indefinitely (use with caution). - eval_steps: - number of steps per evaluation. - Defaults to params.eval_steps. - Non-positive values and None-values go through - the entire evaluation set for each evaluation. - Note that the number of eval_steps should be high enough to minimize noise. - This is especially true for early-stopping. - eval_delay: - Start the first evaluation after eval_delay. Defaults to params.eval_delay or 2*60s. - eval_period: - Run an evaluation every eval_period seconds. Defaults to params.eval_period or 10*60s. - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). 
- early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - """ + mldash_path = "experiments" + if self.experiment_tracker.path: + mldash_path += "/%s" % encode_url(self.experiment_tracker.experiment_id) + tensorboard_args = ["--logdir=%s" % self._save_dir, "--port=%d" % port] - logging.info("WARNING: Trainer.train_and_evaluate is an EXPERIMENTAL API.") - logging.info("Trainer.train_and_evaluate may change or be removed in future versions.") - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - self._exit_ps_after_training_complete() - - # Maybe export in eval processes. - if self.is_evaluator(): - if self.params.get("eval_name") is not None: - # Do not export if running special eval. - exporters = None - export_output_fn = None - elif exporters and export_output_fn: - self._export_output_fn = export_output_fn - else: - # Default option. - self._export_output_fn = None + try: + args = [ + "email_and_launch_tensorboard", + mldash_path, + "--", + ] + tensorboard_args + self._tensorboard_handle = subprocess.Popen(args) + except OSError: + try: + self._tensorboard_handle = subprocess.Popen( + ["tensorboard"] + tensorboard_args + ) + except OSError: + try: + # this will work with Twitter internal pants build when run locally + args = [ + "./pants", + "run", + "twml:tensorboard", + "--", + ] + tensorboard_args + self._tensorboard_handle = subprocess.Popen(args) + except OSError: + logging.error( + "No tensorboard installed, won't able to visualize training in tensorboard." + ) + + def stop_tensorboard(self): + """ + Shutdown this Trainer's associated Tensorboard. + """ + if self._tensorboard_handle: + logging.info("Shutting down tensorboard.") + self._tensorboard_handle.kill() + else: + logging.warn("No known tensorboard process. Nothing done.") + + def calibrate( + self, calibrator, steps=None, input_fn=None, save_calibrator=True, hooks=None + ): + """ + Calibrate the calibrator for `steps` calibration steps using the estimator.train method. + The build_graph passed to the Trainer constructor should + call calibrator.accumulate using something like tf.py_func. + That way, when this method calls estimator.train the calibrator will + accumulate one epoch of samples. After which, this method calls calibrator.calibrate(). + It is up to the user to then call calibrator.save() to save the calibrated Layer + and other information to disk for multi-phase training. 
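+
+        For example, assuming the Trainer was built with a graph that accumulates into
+        ``my_calibrator`` (a hypothetical twml.Calibrator instance), a calibration
+        phase might look like:
+
+        .. code-block:: python
+
+            trainer.calibrate(
+                calibrator=my_calibrator,
+                input_fn=train_input_fn,
+                steps=1000,
+                save_calibrator=True,
+            )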
- train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - train_hooks = [] if train_hooks is None else train_hooks - - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if eval_delay is None: - eval_delay = self.params.eval_delay - if eval_period is None: - eval_period = self.params.eval_period - - if early_stop_patience > 0: - # when training hooks detect this file, they request a stop to training - early_stop_path = os.path.join(self._save_dir, 'earlystop_now.txt') - # prepare early stopping hook (which also handles logic here) - - self._is_early_stopping = True - - eval_early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - file_path=early_stop_path, - exit_on_end=os.environ.get('TF_CONFIG') is not None) # only exit for distributed jobs - # add early stop hook to eval hooks - eval_hooks.append(eval_early_stop_hook) - - # prepare the commensurate training hook - train_early_stop_hook = twml.hooks.StopIfExistsHook(early_stop_path) - train_hooks.append(train_early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=self.is_chief() - ) - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=os.environ.get('TF_CONFIG') is not None, - save_dir=self._save_dir, - overwrite=False - ) # only exit for distributed jobs - - train_hooks.append(train_early_stop_duration_hook) - eval_hooks.append(eval_early_stop_duration_hook) - - with self.experiment_tracker.track_experiment(eval_hooks, lambda: self.current_estimator_spec): - train_spec = self.get_train_spec(train_input_fn, train_max_steps, train_hooks) - eval_spec = self.get_eval_spec(eval_input_fn, eval_steps, - eval_delay, eval_period, - eval_hooks, exporters) - self._train_and_evaluate(train_spec, eval_spec) - - if self.is_chief(): - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir - - def _train_and_evaluate(self, train_spec, eval_spec): - """ - Private method that calls - ``tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)``. - """ - try: - tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec) - except twml.errors.EarlyStopError: - # Ignore the exception if on evaluator. - if self.is_evaluator(): - pass - else: - raise - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Train the estimator for `steps` training steps. - - Args: - steps: - number of steps for which to perform training. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. trains on the entire dataset a single time. - Non-positive values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through training set. It is passed to estimator.train. - hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). 
-    """
-    if os.environ.get('TF_CONFIG') and "is_calibrating" not in self.params:
-      raise ValueError("trainer.train() can not be used with distributed / hogwild setups")

+        Args:
+            calibrator:
+                a twml.Calibrator instance or a dict of the form {name(str): twml.Calibrator}.
+            steps:
+                Maximum steps to accumulate examples for calibration. Optional.
+                If not specified, examples will be accumulated until all downsampled parts are processed.
+            input_fn:
+                Function to iterate through training set. It is passed to estimator.train.
+            hooks:
+                List of SessionRunHooks used for training. Defaults to self.get_train_hooks().
+            save_calibrator:
+                Boolean (default: True). If set to True it will save the calibrator layer.
+        """
+
+        if not callable(input_fn):
+            raise ValueError("Expecting callable input_fn function")
+
+        # making everything a dict to avoid multiple ifs
+        if isinstance(calibrator, twml.contrib.calibrators.Calibrator):
+            calibrator = {"default": calibrator}
+
+        # This is a dummy call to train, since we cannot predict without training
+        # from the Estimator API
+        self._estimator.train(input_fn, steps=1)
+        max_steps = steps if steps is not None else -1
+        for name, clbrt in sorted(calibrator.items(), key=itemgetter(0)):
+            count = 0
+            for out in self._estimator.predict(
+                input_fn, hooks=hooks, yield_single_examples=False
+            ):
+                if max_steps > 0 and count > max_steps:
+                    break
+                clbrt.accumulate_feature(out)
+                count += 1
+            clbrt.calibrate()
+
+        # this step is done to allow us to keep the current phases event file for
+        # visualization on Tensorboard. It removes all files that
+        # are not event files. This piece of code should be deprecated when
+        # we deprecate the MDL calibrator (CX-12329)
+        for fname in tf.io.gfile.listdir(self._save_dir):
+            if not fname.startswith("events"):
+                tf.io.gfile.remove(os.path.join(self._save_dir, fname))
+
+        if save_calibrator:
+            # If we only have one calibrator, the calibrator signature
+            # will be set to default
+            if len(calibrator) == 1:
+                calibrator = calibrator["default"]
+                calibrator.save(
+                    self.params.save_dir, name=calibrator.name, verbose=True
+                )
+            else:
+                for name, clbrt in calibrator.items():
+                    clbrt.save(
+                        self.params.save_dir, name=clbrt.name + str(name), verbose=True
+                    )
+
+    def predict(self, *args, **kwargs):
+        """
+        Wrapper over the tensorflow `Estimator.predict
+        `_.
+        method. See that documentation for description of arguments accepted.
+
+        If hooks is passed as an argument, the specified hooks are used.
+        Else when profiler_steps is specified in the constructor of the Trainer, a
+        tf.train.ProfilerHook is passed to the predict interface.
+        Otherwise, hooks is set to an empty list.
+        """
+        if "hooks" not in kwargs and len(args) < 3:
+            # If hooks is not specified as a keyword argument, nor as a positional argument,
+            # add hooks as a keyword argument.
+            kwargs["hooks"] = self.get_predict_hooks()
+
+        return self.estimator.predict(*args, **kwargs)
+
+    def hub_export(
+        self,
+        name,
+        serving_input_receiver_fn,
+        export_dir=None,
+        checkpoint_path=None,
+        export_task_type_overrider=None,
+    ):
+        """
+        Exports registered modules into a save directory.
+
+        This method creates a directory under export_path with the saved TF Hub modules.
+        One sub-directory (named export_name) per module registered via register_module_for_export.
+
+        Arguments:
+            name:
+                unique name of the module to export.
+            serving_input_receiver_fn:
+                A function with no arguments that returns a ServingInputReceiver.
+ This is used with the estimator passed to export() to build the graph (in PREDICT mode) + that registers the modules for export. The model in that graph is never run, + so the actual data provided by this input fn does not matter. + export_dir: + A string containing a directory where to write the export directories. + Defaults to the save_dir. + checkpoint_path: + The checkpoint path to export. Defaults to the latest. + export_task_type_overrider: + Specifies the task type that will override the default task type used for export + (hogwild training defaults to evaluator, otherwise, defaults to chief) + """ + if export_task_type_overrider: + if not self.is_task_type(export_task_type_overrider): + logging.info( + f"Trainer.hub_export ignored due to process not being {export_task_type_overrider}" + ) + return + else: + if self._using_hogwild: + if not self.is_evaluator(): + logging.info( + "Trainer.hub_export ignored due to the process not being evaluator." + ) + return + else: + if not self.is_chief(): + logging.info( + "Trainer.hub_export ignored due to the process not being chief." + ) + return + + if export_dir: + export_dir = sanitize_hdfs_path(export_dir) + + if checkpoint_path: + checkpoint_path = sanitize_hdfs_path(checkpoint_path) + else: + checkpoint_path = self.best_or_latest_checkpoint + + export_dir = export_dir if export_dir is not None else self._save_dir + exporter = hub.LatestModuleExporter(name, serving_input_receiver_fn) + # The path_exporter by default contains a timestamp directory in its path. + path_exporter = exporter.export( + estimator=self.estimator, + export_path=export_dir, + checkpoint_path=checkpoint_path, + ) - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") + # LatestModuleExporter.export() returns a binary string on Cloud ML Engine + # but tf.io.gfile.listdir() does not; this is an issue when joining paths + if isinstance(path_exporter, bytes): + path_exporter = path_exporter.decode() - if self._is_early_stopping: - raise ValueError("Can not call train() after learn() when using early stopping.") + # Copying the saved hub module to export_dir so we don't need to specify + # the timestamp when loading the module. + # This is a workaround due to the current implementation of hub.LatestModuleExporter. + # This works for multiple hub modules. + hub_exported_modules = tf.io.gfile.listdir(path_exporter) - hooks = self.get_train_hooks() if hooks is None else hooks - self._estimator.train(input_fn, steps=steps, hooks=hooks) - return self + backup_dir = os.path.join( + export_dir, "backups", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ) - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Evaluate the estimator for `steps` evaluation steps. - - Args: - steps: - number of steps for which to perform evaluation. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. evaluates on the entire dataset a single time. - Negative values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - hooks: - List of SessionRunHooks used for evaluation. Defaults to None. - Note that, unlike learn(), hooks defaults to None instead of self.get_eval_hooks() - as the latter may implement early-stopping, which isn't necessarilty the desired - behavior when calling evaluate() on its own. 
- name: - Name of the evaluation if user needs to run multiple evaluations on different data sets. - Metrics for different evaluations are saved in separate folders, - and appear separately in tensorboard. - - Returns: - If `is_evaluator()`, returns a dict containing the evaluation metrics specified - in `metric_fn` keyed by name, as well as an entry `global_step` that contains - the value of the global step for which this evaluation was performed. - Otherwise (i.e. `is_evaluator() == False`), returns None. - """ - if not self.is_evaluator(): - return None + for folder in hub_exported_modules: + hub_module_oldpath = os.path.join(path_exporter, folder) + hub_module_newpath = os.path.join(export_dir, folder) - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") + # If the destination already exists, move to backup + if tf.io.gfile.exists(hub_module_newpath): + # Ensure backup_dir exists + tf.io.gfile.makedirs(backup_dir) + hub_module_backup = os.path.join(backup_dir, folder) + tf.io.gfile.rename(hub_module_newpath, hub_module_backup) - hooks = self.get_eval_hooks() if hooks is None else hooks - hooks = [] if hooks is None else hooks + tf.io.gfile.rename(hub_module_oldpath, hub_module_newpath) - # for consistency with train/learn - eval_steps = None if steps is not None and steps < 0 else steps + # Since the timestamped folder exists but is empty, we can delete it. + tf.io.gfile.rmtree(path_exporter) - with self.experiment_tracker.track_experiment(hooks, lambda: self.current_estimator_spec, name=name): - checkpoint = self.best_or_latest_checkpoint - computed_metrics = self._estimator.evaluate( - input_fn, - steps=eval_steps, - hooks=hooks, - checkpoint_path=checkpoint, - name=name - ) + def _is_on_gke(self) -> bool: + """Returns True if running on gke.""" + cluster = os.environ.get("TWML_JOB_CLUSTER") + if not cluster or cluster in {"smf1", "atla"}: + return False + return True - return computed_metrics + def _maybe_del_tsd_exit(self, state_files) -> None: + """Handle potential early exit and TwitterSetDeployment deletion. - def start_tensorboard(self, port=None): - """ - Start tensorboard process to visualize logs in save_dir. - """ - logging.info("Starting tensorboard.") - if self._tensorboard_handle: - logging.warn("Tensorboard already running. 
Nothing done.")
-      return
-
-    if port is None:
-      if 'tensorboard_port' not in self.params.values():
-        raise ValueError('You must specify a port for tensorboard to run on.')
-      elif self.params.tensorboard_port is None:
-        return
-      else:
-        port = self.params.tensorboard_port
-
-    mldash_path = 'experiments'
-    if self.experiment_tracker.path:
-      mldash_path += '/%s' % encode_url(self.experiment_tracker.experiment_id)
-    tensorboard_args = ['--logdir=%s' % self._save_dir, '--port=%d' % port]
-
-    try:
-      args = ['email_and_launch_tensorboard', mldash_path, '--'] + tensorboard_args
-      self._tensorboard_handle = subprocess.Popen(args)
-    except OSError:
-      try:
-        self._tensorboard_handle = subprocess.Popen(['tensorboard'] + tensorboard_args)
-      except OSError:
-        try:
-          # this will work with Twitter internal pants build when run locally
-          args = ['./pants', 'run', 'twml:tensorboard', '--'] + tensorboard_args
-          self._tensorboard_handle = subprocess.Popen(args)
-        except OSError:
-          logging.error("No tensorboard installed, won't able to visualize training in tensorboard.")
+        If:
+        - distributed training
+        - running on GKE
+        - training is finished (all state_files exist)
+        we will exit early and not restart the work.
-  def stop_tensorboard(self):
-    """
-    Shutdown this Trainer's associated Tensorboard.
-    """
-    if self._tensorboard_handle:
-      logging.info("Shutting down tensorboard.")
-      self._tensorboard_handle.kill()
-    else:
-      logging.warn("No known tensorboard process. Nothing done.")
-
-  def calibrate(self,
-                calibrator,
-                steps=None,
-                input_fn=None,
-                save_calibrator=True,
-                hooks=None):
-    """
-    Calibrate the calibrator for `steps` calibration steps using the estimator.train method.
-    The build_graph passed to the Trainer constructor should
-    call calibrator.accumulate using something like tf.py_func.
-    That way, when this method calls estimator.train the calibrator will
-    accumulate one epoch of samples. After which, this method calls calibrator.calibrate().
-    It is up to the user to then call calibrator.save() to save the calibrated Layer
-    and other information to disk for multi-phase training.
-
-    Args:
-      calibrator:
-        a twml.Calibrator instance or a dict of the form {name(str): twml.Calibrator}.
-      steps:
-        Maximum steps to accumulate examples for calibration. Optional.
-        If not specified, examples will be accumulated until all downsampled parts are processed.
-      input_fn:
-        Function to iterate through training set. It is passed to estimator.train.
-      hooks:
-        List of SessionRunHooks uses for training. Defaults to self.get_train_hooks().
-      save_calibrator:
-        Boolean (default: True). If set to True it will save the calibrator layer.
-    """
+        If --distributed_training_cleanup = True then we will also handle
+        cleaning up the TwitterSetDeployments.
+
+        Args:
+          state_files: A Python list of state files that determine the finished
+            state of the job.
+        """
+        # job type that is responsible for experiment tracking will remain alive
+        # until it marks the experiment as finished.
+        if self.experiment_tracker._env_eligible_for_recording_experiment:
+            exp_status = self.experiment_tracker.get_run_status()
+            if exp_status and exp_status not in {"Success", "Failed"}:
+                logging.info(
+                    f"Not exiting early because experiment is still {exp_status}."
+ ) + return + + # do not bother if we are on prem + if not self._is_on_gke(): + logging.info("No need to exit early because running on prem.") + return + + states = [ + twml.util.file_exist_in_dir(self._save_dir, state_file) + for state_file in state_files + ] + do_not_restart = self._params.get("distributed") and all(states) + if not do_not_restart: + return - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - # making everything a dict to avoid multiple ifs - if isinstance(calibrator, twml.contrib.calibrators.Calibrator): - calibrator = {"default": calibrator} - - # This is a dummy call to train, since we cannot predict without training - # from the Estimator API - self._estimator.train(input_fn, steps=1) - max_steps = steps if steps is not None else -1 - for name, clbrt in sorted(calibrator.items(), key=itemgetter(0)): - count = 0 - for out in self._estimator.predict(input_fn, hooks=hooks, yield_single_examples=False): - if max_steps > 0 and count > max_steps: - break - clbrt.accumulate_feature(out) - count += 1 - clbrt.calibrate() - - # this step is done to allow us to keep the current phases event file for - # visualization on Tensorboard. It removes all files that - # are not event files. This piece of code should be deprecated when - # we deprecate the MDL calibrator (CX-12329) - for fname in tf.io.gfile.listdir(self._save_dir): - if not fname.startswith("events"): - tf.io.gfile.remove(os.path.join(self._save_dir, fname)) - - if save_calibrator: - # If we only have one calibrator, the calibrator signature - # will be set to default - if len(calibrator) == 1: - calibrator = calibrator['default'] - calibrator.save( - self.params.save_dir, - name=calibrator.name, - verbose=True - ) - else: - for name, clbrt in calibrator.items(): - clbrt.save( - self.params.save_dir, - name=clbrt.name + str(name), - verbose=True - ) - - def predict(self, *args, **kwargs): - """ - Wrapper over the tensorflow `Estimator.predict - `_. - method. See that documentation for description of arguments accepted. - - If hooks is passed as an argument, the specified hooks are used. - Else when profiler_steps is specified in the constructor of the Trainer, a - tf.train.ProfilerHook is passed to the predict interface. - Otherwise, hooks is set to an empty list. - """ - if 'hooks' not in kwargs and len(args) < 3: - # If hooks is not specified as a keyword argument, nor as a positional argument - # add hooks as a keyword argument. - kwargs['hooks'] = self.get_predict_hooks() - - return self.estimator.predict(*args, **kwargs) - - def hub_export(self, - name, - serving_input_receiver_fn, - export_dir=None, - checkpoint_path=None, - export_task_type_overrider=None): - """ - Exports registered modules into a save directory. - - This method creates a directory under export_path with the save TF Hub. - One sub-directory (named export_name) per module registered via register_module_for_export. - - Arguments: - name: - unique name of the module to export. - serving_input_receiver_fn: - A function with no arguments that returns a ServingInputReceiver. - This is used with the estimator passed to export() to build the graph (in PREDICT mode) - that registers the modules for export. The model in that graph is never run, - so the actual data provided by this input fn does not matter. - export_dir: - A string containing a directory where to write the export directories. - Defaults to the save_dir. - checkpoint_path: - The checkpoint path to export. Defaults to the latest. 
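
# Editor's aside: the early-exit condition above, restated standalone with a
# hypothetical stand-in for twml.util.file_exist_in_dir and toy parameters.
import os

def file_exist_in_dir(dir_path, file_name):  # stand-in
    return os.path.exists(os.path.join(dir_path, file_name))

params = {"distributed": True}   # hypothetical
save_dir = "/tmp/save_dir"       # hypothetical
state_files = ["_SUCCESS"]
states = [file_exist_in_dir(save_dir, f) for f in state_files]
do_not_restart = params.get("distributed") and all(states)
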
- export_task_type_overrider: - Specifies the task type that will override the default task type used for export - (hogwild training defaults to evaluator, otherwise, defaults to chief) - """ - if export_task_type_overrider: - if not self.is_task_type(export_task_type_overrider): - logging.info( - f"Trainer.hub_export ignored due to process not being {export_task_type_overrider}") - return - else: - if self._using_hogwild: - if not self.is_evaluator(): - logging.info("Trainer.hub_export ignored due to the process not being evaluator.") - return - else: - if not self.is_chief(): - logging.info("Trainer.hub_export ignored due to the process not being chief.") - return - - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) - - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint - - export_dir = export_dir if export_dir is not None else self._save_dir - exporter = hub.LatestModuleExporter(name, serving_input_receiver_fn) - # The path_exporter by default contains a timestamp directory in its path. - path_exporter = exporter.export(estimator=self.estimator, - export_path=export_dir, - checkpoint_path=checkpoint_path) - - # LatestModuleExporter.export() returns a binary string on Cloud ML Engine - # but tf.io.gfile.listdir() does not; this is an issue when joining paths - if isinstance(path_exporter, bytes): - path_exporter = path_exporter.decode() - - # Copying the saved hub module to export_dir so we don't need to specify - # the timestamp when loading the module. - # This is a workaround due to the current implementation of hub.LatestModuleExporter. - # This works for multiple hub modules. - hub_exported_modules = tf.io.gfile.listdir(path_exporter) - - backup_dir = os.path.join(export_dir, "backups", - datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - - for folder in hub_exported_modules: - hub_module_oldpath = os.path.join(path_exporter, folder) - hub_module_newpath = os.path.join(export_dir, folder) - - # If the destination already exists, move to backup - if tf.io.gfile.exists(hub_module_newpath): - # Ensure backup_dir exists - tf.io.gfile.makedirs(backup_dir) - hub_module_backup = os.path.join(backup_dir, folder) - tf.io.gfile.rename(hub_module_newpath, hub_module_backup) - - tf.io.gfile.rename(hub_module_oldpath, hub_module_newpath) - - # Since the timestamped folder exists but is empty, we can delete it. - tf.io.gfile.rmtree(path_exporter) - - def _is_on_gke(self) -> bool: - """Returns True if running on gke.""" - cluster = os.environ.get('TWML_JOB_CLUSTER') - if not cluster or cluster in {'smf1', 'atla'}: - return False - return True - - def _maybe_del_tsd_exit(self, state_files) -> None: - """Handle potential early exit and TwitterSetDeployment deletion. - - If: - - distributed training - - running GKE - - training is finished (all state_files exists) - we will exit early and not restart work - - If --distributed_training_cleanup = True then we will also handle - cleaning up the TwitterSetDeployments. - - Args: - state_files: A python list indicate state files to determine the finish - state of the job. - """ - # job type that is responsible for experiment tracking will remain alive - # until it marks the experiment as finished. 
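
# Editor's aside: how the TwitterSetDeployment resource name used by the
# cleanup code below is assembled; the environment values are hypothetical.
env = {
    "TWML_JOB_NAME": "my-job",
    "TWML_DISTRIBUTED_JOB_TYPE": "worker",
    "TWML_JOB_ENV": "prod",
}
resource_name = "-".join(
    [env["TWML_JOB_NAME"], env["TWML_DISTRIBUTED_JOB_TYPE"], env["TWML_JOB_ENV"]]
)
assert resource_name == "my-job-worker-prod"
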
-    if self.experiment_tracker._env_eligible_for_recording_experiment:
-      exp_status = self.experiment_tracker.get_run_status()
-      if exp_status and exp_status not in {'Success', 'Failed'}:
         logging.info(
-          f"Not exiting early because experiment is still {exp_status}."
+            f"Exiting early because a _SUCCESS file already exists in {self._save_dir}"
         )
-        return
-
-    # do not bother if we are on prem
-    if not self._is_on_gke():
-      logging.info("No need to exit early because running on prem.")
-      return
-
-    states = [
-      twml.util.file_exist_in_dir(self._save_dir, state_file) for state_file in state_files]
-    do_not_restart = (self._params.get('distributed') and all(states))
-    if not do_not_restart:
-      return
-
-    logging.info(
-      f"Exiting early because a _SUCCESS file already exists in {self._save_dir}")
-    if self._params.get('distributed_training_cleanup'):
-      resource_name = '-'.join([
-        os.environ['TWML_JOB_NAME'],
-        os.environ['TWML_DISTRIBUTED_JOB_TYPE'],
-        os.environ['TWML_JOB_ENV'],
-      ])
-      logging.info(f"Deleting TwitterSetDeployment {resource_name}")
-      # each job type will manage its own deletion so that deletion happens
-      # in the trainer init call for every job type
-      # otherwise we may kill another job type during an important
-      # process like experiment tracking management (handled by the evaluator
-      kubectl_delete_by_name(
-        zone=None,
-        namespace=os.environ['TWML_JOB_ROLE'],
-        resource_type=Resource.TWITTERSETDEPLOYMENTS.value,
-        resource_name=resource_name,
-        wait=False,
-      )
-      sys.exit(0)
-
-  def write_state_to_disk(self, save_dir, filename='_SUCCESS') -> None:
-    """Write state file to disk to indicate the state of training process. This is usually used
-    to mark the state of training progress and determine the start when job restarts/resumes.
-    Args:
-      save_dir: A str of local/gcs/hdfs dir to write the state file.
-      file_name: A str indicate the state file. Default to `_SUCCESS`.
-    """
-    file_path = os.path.join(save_dir, filename)
-    if tf.io.gfile.exists(file_path):
-      tf.logging.warn(f'{file_path} already exist.')
-      return
-
-    with tf.io.gfile.GFile(file_path, 'w') as f:
-      f.write('')
\ No newline at end of file
+        if self._params.get("distributed_training_cleanup"):
+            resource_name = "-".join(
+                [
+                    os.environ["TWML_JOB_NAME"],
+                    os.environ["TWML_DISTRIBUTED_JOB_TYPE"],
+                    os.environ["TWML_JOB_ENV"],
+                ]
+            )
+            logging.info(f"Deleting TwitterSetDeployment {resource_name}")
+            # each job type will manage its own deletion so that deletion happens
+            # in the trainer init call for every job type
+            # otherwise we may kill another job type during an important
+            # process like experiment tracking management (handled by the evaluator)
+            kubectl_delete_by_name(
+                zone=None,
+                namespace=os.environ["TWML_JOB_ROLE"],
+                resource_type=Resource.TWITTERSETDEPLOYMENTS.value,
+                resource_name=resource_name,
+                wait=False,
+            )
+            sys.exit(0)
+
+    def write_state_to_disk(self, save_dir, filename="_SUCCESS") -> None:
+        """Write a state file to disk to indicate the state of the training process. This is usually
+        used to mark training progress and determine where to start when the job restarts/resumes.
+        Args:
+          save_dir: A str of local/gcs/hdfs dir to write the state file.
+          filename: A str indicating the state file. Defaults to `_SUCCESS`.
+        """
+        file_path = os.path.join(save_dir, filename)
+        if tf.io.gfile.exists(file_path):
+            tf.logging.warn(f"{file_path} already exists.")
+            return
+
+        with tf.io.gfile.GFile(file_path, "w") as f:
+            f.write("")
diff --git a/twml/twml/util.py b/twml/twml/util.py
index cd7679a6f..4560e52b6 100644
--- a/twml/twml/util.py
+++ b/twml/twml/util.py
@@ -3,47 +3,39 @@
 """
 import argparse
-from datetime import datetime
 import itertools
 import json
 import logging as _logging
 import os
 import re
+from datetime import datetime
 
-from twitter.ml.common.resources import AuroraPath
-from twitter.deepbird.hparam import HParams
-from twitter.deepbird.io.util import (
-  _get_feature_id,  # noqa: F401
-  feature_id,  # noqa: F401
-  preprocess_feature_regex,  # noqa: F401
-  preprocess_path,  # noqa: F401
-  sanitize_hdfs_path,  # noqa: F401
-  is_string,  # noqa: F401
-  list_files,  # noqa: F401
-  match_files,  # noqa: F401
-)
-from twitter.deepbird.io.legacy.util import (
-  batch_apply,  # noqa: F401
-  boolean_mask,  # noqa: F401
-  fixed_length_tensor,  # noqa: F401
-)
-from twitter.deepbird.sparse.util import (
-  convert_to_sparse,  # noqa: F401
-  limit_bits,  # noqa: F401
-)
-
-from dateutil import rrule
-from joblib import delayed, Parallel
-from six import string_types
-
+import tensorflow.compat.v1 as tf
 from absl import logging
+from dateutil import rrule
+from joblib import Parallel, delayed
 from libtwml import CLIB, OPLIB  # noqa: F401
-import tensorflow.compat.v1 as tf
+from six import string_types
 from tensorflow.python.platform import tf_logging
+from twitter.deepbird.hparam import HParams
+from twitter.deepbird.io.legacy.util import batch_apply  # noqa: F401
+from twitter.deepbird.io.legacy.util import boolean_mask  # noqa: F401
+from twitter.deepbird.io.legacy.util import fixed_length_tensor  # noqa: F401
+from twitter.deepbird.io.util import _get_feature_id  # noqa: F401
+from twitter.deepbird.io.util import feature_id  # noqa: F401
+from twitter.deepbird.io.util import is_string  # noqa: F401
+from twitter.deepbird.io.util import list_files  # noqa: F401
+from twitter.deepbird.io.util import match_files  # noqa: F401
+from twitter.deepbird.io.util import preprocess_feature_regex  # noqa: F401
+from twitter.deepbird.io.util import preprocess_path  # noqa: F401
+from twitter.deepbird.io.util import sanitize_hdfs_path  # noqa: F401
+from twitter.deepbird.sparse.util import convert_to_sparse  # noqa: F401
+from twitter.deepbird.sparse.util import limit_bits  # noqa: F401
+from twitter.ml.common.resources import AuroraPath
+
 import twml
 from twml.feature_config import FeatureConfigBuilder
-
 # big_prime is less than 2**32
 # This just needs to be co-prime with powers of 2
 # any large prime is sufficient, but it's not necessary.
@@ -51,892 +43,942 @@
 
 def multiplicative_hash(input, hash_constant=HASHING_PRIME):
-  return input * hash_constant
+    return input * hash_constant
 
 
 def _return_tensors_from_checkpoint_folder(init_dir, model_name=None):
-  """Returns tensors list from a checkpoint folder
-
-  Args:
-    init_dir: Name of the checkpoint directory.
-    model_name: the model which we will use to obtain the checkpoint
-      (e.g. model.ckpt-50000) if set to None it will default to the
-      latest model saved in the checkpont file.
-
-  """
-  if model_name is None:
-    # gets the most recently generated model.cpkt file
-    model_path = tf.train.latest_checkpoint(init_dir)
-    if model_path is None:
-      raise ValueError("Could not find a valid model checkpoint inside the directory")
-  else:
-    model_path = os.path.join(init_dir, model_name)
-  reader = tf.train.NewCheckpointReader(model_path)
-  try:
-    return (reader.debug_string().decode("utf-8"))
-  except OSError:
-    logging.error('Could not decode the string')
+    """Returns tensors list from a checkpoint folder
+
+    Args:
+      init_dir: Name of the checkpoint directory.
+      model_name: the model which we will use to obtain the checkpoint
+        (e.g. model.ckpt-50000) if set to None it will default to the
+        latest model saved in the checkpoint file.
+
+    """
+    if model_name is None:
+        # gets the most recently generated model.ckpt file
+        model_path = tf.train.latest_checkpoint(init_dir)
+        if model_path is None:
+            raise ValueError(
+                "Could not find a valid model checkpoint inside the directory"
+            )
+    else:
+        model_path = os.path.join(init_dir, model_name)
+    reader = tf.train.NewCheckpointReader(model_path)
+    try:
+        return reader.debug_string().decode("utf-8")
+    except OSError:
+        logging.error("Could not decode the string")
 
 
 def get_scope_dict(init_dir, incoming_scope_name, current_scope_name, model_name=None):
-  """Returns tensors map from a checkpoint file.
-
-  Args:
-    file_name:
-      Name of the checkpoint directory.
-    incoming_scope_name:
-      scope name of the previous phase
-    current_scope_name:
-      scope name of current phase
-    model_name:
-      the model which we will use to obtain the checkpoint
-      (e.g. model.ckpt-50000) if set to None it will default
-      to the latest model saved in the checkpoint file.
-  Returns:
-    init_map:
-      init_map which will be inputted to the checkpoint
-  """
-  init_map = {}
-  reader_dump = _return_tensors_from_checkpoint_folder(init_dir=init_dir,
-                                                       model_name=model_name).splitlines()
-  for member in reader_dump:
-    # remove global_step since it is not necessary
-    if 'global_step' not in member:
-      saved_variables = str(member.split(" ")[0])
-      saved_scope = saved_variables.rsplit('/', 1)[0] + "/"
-      new_scope = saved_scope.replace(incoming_scope_name, current_scope_name, 1)
-      # create key in init_map
-      if saved_scope not in init_map.keys():  # pylint: disable=dict-keys-not-iterating
-        init_map[saved_scope] = new_scope
-  return init_map
+    """Returns tensors map from a checkpoint file.
+
+    Args:
+      init_dir:
+        Name of the checkpoint directory.
+      incoming_scope_name:
+        scope name of the previous phase
+      current_scope_name:
+        scope name of current phase
+      model_name:
+        the model which we will use to obtain the checkpoint
+        (e.g. model.ckpt-50000) if set to None it will default
+        to the latest model saved in the checkpoint file.
+ Returns: + init_map: + init_map which will be inputted to the checkpoint + """ + init_map = {} + reader_dump = _return_tensors_from_checkpoint_folder( + init_dir=init_dir, model_name=model_name + ).splitlines() + for member in reader_dump: + # remove global_step since it is not necessary + if "global_step" not in member: + saved_variables = str(member.split(" ")[0]) + saved_scope = saved_variables.rsplit("/", 1)[0] + "/" + new_scope = saved_scope.replace(incoming_scope_name, current_scope_name, 1) + # create key in init_map + if ( + saved_scope not in init_map.keys() + ): # pylint: disable=dict-keys-not-iterating + init_map[saved_scope] = new_scope + return init_map def get_init_map( - init_from_dir, - exclude_var_names=None, - exclude_name_scopes=None, - name_scope_to_remove=None, - name_scope_to_prepend=None): - """ - Builds a map for initializing from a checkpoint (see tf.train.init_from_checkpoint). - - It assumes that the latter part of the variable names are consistent between the checkpoint and - the new model, but their name_scopes may be different. If the checkpoint model has variable names - of the form old/scope/var/foo, and the corresponding variable names for the new model should be - my/new/scope/var/foo, then you should set name_scope_to_remove = 'old/' and - name_scope_to_prepend = 'my/new/'. - - This function can be used to - - 1. Generate an ``init_map`` map that can be passed to the ``Trainer`` init or - 2. Used to generate an ``init_map`` directly inside ``build_graph_fn``, in - which case it should be passed directly to ``tf.train.init_from_checkpoint`` inside - ``build_graph_fn``, in which case you do not also need to specify the ``init_map`` argument to - the trainer. - - Parameters - ---------- - init_from_dir: Directory containing checkpoint - exclude_var_names: list[str] - List of variables in the checkpoint that should be excluded from the map. - exclude_name_scopes: list[str] - List of name_scopes in the checkpoint model that should be excluded from the map. - name_scope_to_remove: str - portion of name_scope for checkpoint variables that should not be included in variable names - for new model. - name_scope_to_prepend: str - name_scope to prepend to variable names in checkpoint to give variable names for new model. - - Returns - ------- - dict - keys are variable names in the checkpoint and values are variable names in the new model, - into which the checkpoint parameters should be loaded. - """ - vars_to_restore = get_checkpoint_variable_names( init_from_dir, - exclude_var_names=exclude_var_names, - exclude_scopes=exclude_name_scopes, - ) - - if name_scope_to_prepend is not None: - if not name_scope_to_prepend.endswith('/'): - name_scope_to_prepend += '/' - - if name_scope_to_remove is not None: - if not name_scope_to_remove.endswith('/'): - name_scope_to_remove += '/' - - init_map = {} + exclude_var_names=None, + exclude_name_scopes=None, + name_scope_to_remove=None, + name_scope_to_prepend=None, +): + """ + Builds a map for initializing from a checkpoint (see tf.train.init_from_checkpoint). + + It assumes that the latter part of the variable names are consistent between the checkpoint and + the new model, but their name_scopes may be different. If the checkpoint model has variable names + of the form old/scope/var/foo, and the corresponding variable names for the new model should be + my/new/scope/var/foo, then you should set name_scope_to_remove = 'old/' and + name_scope_to_prepend = 'my/new/'. + + This function can be used to + + 1. 
Generate an ``init_map`` map that can be passed to the ``Trainer`` init or + 2. Used to generate an ``init_map`` directly inside ``build_graph_fn``, in + which case it should be passed directly to ``tf.train.init_from_checkpoint`` inside + ``build_graph_fn``, in which case you do not also need to specify the ``init_map`` argument to + the trainer. + + Parameters + ---------- + init_from_dir: Directory containing checkpoint + exclude_var_names: list[str] + List of variables in the checkpoint that should be excluded from the map. + exclude_name_scopes: list[str] + List of name_scopes in the checkpoint model that should be excluded from the map. + name_scope_to_remove: str + portion of name_scope for checkpoint variables that should not be included in variable names + for new model. + name_scope_to_prepend: str + name_scope to prepend to variable names in checkpoint to give variable names for new model. + + Returns + ------- + dict + keys are variable names in the checkpoint and values are variable names in the new model, + into which the checkpoint parameters should be loaded. + """ + vars_to_restore = get_checkpoint_variable_names( + init_from_dir, + exclude_var_names=exclude_var_names, + exclude_scopes=exclude_name_scopes, + ) - for var_name in vars_to_restore: - var_name_checkpoint = var_name + if name_scope_to_prepend is not None: + if not name_scope_to_prepend.endswith("/"): + name_scope_to_prepend += "/" if name_scope_to_remove is not None: - var_name = var_name.replace(name_scope_to_remove, '') - - var_name_new_model = var_name - - if name_scope_to_prepend is not None: - var_name_new_model = name_scope_to_prepend + var_name_new_model + if not name_scope_to_remove.endswith("/"): + name_scope_to_remove += "/" - init_map[var_name_checkpoint] = var_name_new_model + init_map = {} - return init_map + for var_name in vars_to_restore: + var_name_checkpoint = var_name + if name_scope_to_remove is not None: + var_name = var_name.replace(name_scope_to_remove, "") -def get_checkpoint_variable_names(model_dir, exclude_var_names=None, exclude_scopes=None): - """ - Gets a list of variable names from the latest checkpoint in model_dir. - Removes variables with scope defined by exclude_scopes, and/or with names defined by - exclude_var_names. + var_name_new_model = var_name - Args: - model_dir (str): Directory containing checkpoint file for the pre-trained model - exclude_var_names (list): Optional variable names to exclude (can include full/partial scope) - exclude_scopes (list): Optional scopes to exclude + if name_scope_to_prepend is not None: + var_name_new_model = name_scope_to_prepend + var_name_new_model - Returns: - list: variable names - """ - checkpoint_path = tf.train.latest_checkpoint(model_dir) - variables_and_shapes = tf.train.list_variables(checkpoint_path) + init_map[var_name_checkpoint] = var_name_new_model - def _keep(name): - if exclude_scopes and any(name.startswith(exc_scope) for exc_scope in exclude_scopes): - return False - if exclude_var_names and any(name.endswith(exc_var) for exc_var in exclude_var_names): - return False - return True + return init_map - names = [x[0] for x in variables_and_shapes if _keep(x[0])] - return names +def get_checkpoint_variable_names( + model_dir, exclude_var_names=None, exclude_scopes=None +): + """ + Gets a list of variable names from the latest checkpoint in model_dir. + Removes variables with scope defined by exclude_scopes, and/or with names defined by + exclude_var_names. 
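
# Editor's aside: the mapping get_init_map above produces, on the toy names
# from its docstring (no checkpoint is read in this sketch).
vars_to_restore = ["old/scope/var/foo"]
name_scope_to_remove = "old/"
name_scope_to_prepend = "my/new/"
init_map = {
    v: name_scope_to_prepend + v.replace(name_scope_to_remove, "")
    for v in vars_to_restore
}
assert init_map == {"old/scope/var/foo": "my/new/scope/var/foo"}
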
+ + Args: + model_dir (str): Directory containing checkpoint file for the pre-trained model + exclude_var_names (list): Optional variable names to exclude (can include full/partial scope) + exclude_scopes (list): Optional scopes to exclude + + Returns: + list: variable names + """ + checkpoint_path = tf.train.latest_checkpoint(model_dir) + variables_and_shapes = tf.train.list_variables(checkpoint_path) + + def _keep(name): + if exclude_scopes and any( + name.startswith(exc_scope) for exc_scope in exclude_scopes + ): + return False + if exclude_var_names and any( + name.endswith(exc_var) for exc_var in exclude_var_names + ): + return False + return True + + names = [x[0] for x in variables_and_shapes if _keep(x[0])] + + return names def to_snake_case(name): - """ - Changes name to snake case - """ - intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) - insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() - # If the class is private the name starts with "_" which is not secure - # for creating scopes. We prefix the name with "private" in this case. - if insecure[0] != '_': - return insecure - return 'private' + insecure + """ + Changes name to snake case + """ + intermediate = re.sub("(.)([A-Z][a-z0-9]+)", r"\1_\2", name) + insecure = re.sub("([a-z])([A-Z])", r"\1_\2", intermediate).lower() + # If the class is private the name starts with "_" which is not secure + # for creating scopes. We prefix the name with "private" in this case. + if insecure[0] != "_": + return insecure + return "private" + insecure def copy_phase_inputs(init_dir, dest_dir): - """Automatically copies the .json.tf from the init_dir to save_dir - so we can load multiple parameters at the same time. - - Args: - init_dir: - Name of the checkpoint directory. - dest_dir: - Name of the output directory. - """ - if init_dir is not None: - # we are using tf.io.gfile so we can use it with both local and hdfs paths - for files in tf.io.gfile.listdir(init_dir): - if files.endswith(".json.tf"): - src_file = os.path.join(init_dir, files) - dest_file = os.path.join(dest_dir, files) - if not tf.io.gfile.exists(dest_dir): - # creates the folder - try: - tf.io.gfile.makedirs(dest_dir) - # to prevent racing condition - except OSError: - if not tf.io.gfile.isdir(dest_dir): - raise - # dest_file may be old if it exists and - # dest_file gets copied several times in distributed training - tf.io.gfile.copy(src_file, dest_file, overwrite=True) + """Automatically copies the .json.tf from the init_dir to save_dir + so we can load multiple parameters at the same time. + + Args: + init_dir: + Name of the checkpoint directory. + dest_dir: + Name of the output directory. + """ + if init_dir is not None: + # we are using tf.io.gfile so we can use it with both local and hdfs paths + for files in tf.io.gfile.listdir(init_dir): + if files.endswith(".json.tf"): + src_file = os.path.join(init_dir, files) + dest_file = os.path.join(dest_dir, files) + if not tf.io.gfile.exists(dest_dir): + # creates the folder + try: + tf.io.gfile.makedirs(dest_dir) + # to prevent racing condition + except OSError: + if not tf.io.gfile.isdir(dest_dir): + raise + # dest_file may be old if it exists and + # dest_file gets copied several times in distributed training + tf.io.gfile.copy(src_file, dest_file, overwrite=True) def rehash_sparse_features_nbits(sp_a, nbits, hash_fn=multiplicative_hash): - """ - Rehash the feature ids of the sparse tensor, - and limit the output to n bits. 
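
# Editor's aside: rehash_sparse_features_nbits below pairs a multiplicative
# hash with an n-bit mask. Plain-Python sketch; the prime is a stand-in, since
# HASHING_PRIME's actual value is elided from this diff.
HASHING_PRIME = 1000000007  # hypothetical large prime below 2**32

def multiplicative_hash(x, hash_constant=HASHING_PRIME):
    return x * hash_constant

nbits = 20
feature_id = 12345
masked = multiplicative_hash(feature_id) & ((1 << nbits) - 1)  # keep the low nbits
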
+ """ + Rehash the feature ids of the sparse tensor, + and limit the output to n bits. - This is useful for making the distribution of - feature_ids more uniform, which may improve performance - in some situations. + This is useful for making the distribution of + feature_ids more uniform, which may improve performance + in some situations. - This would typically be used on the output of - PercentileDiscretizer, since it assigns many - bins to low-valued output feature ids. + This would typically be used on the output of + PercentileDiscretizer, since it assigns many + bins to low-valued output feature ids. - Input feature IDs should take values less than 2**32, - and nbits should be less than 32 + Input feature IDs should take values less than 2**32, + and nbits should be less than 32 - Args: - sp_a: - a tf.SparseTensor object - nbits: - integer number of bits to mask output feature_ids - hash_fn: - Function that takes integer values and returns hashes of these values. - The output does not need to be masked to the desired number of bits, - as this masking will be taken care of. Default value = multiplicative_hash. + Args: + sp_a: + a tf.SparseTensor object + nbits: + integer number of bits to mask output feature_ids + hash_fn: + Function that takes integer values and returns hashes of these values. + The output does not need to be masked to the desired number of bits, + as this masking will be taken care of. Default value = multiplicative_hash. - Returns: - a new tf.SparseTensor - """ + Returns: + a new tf.SparseTensor + """ - feature_ids = sp_a.indices[:, 1] - feature_ids = hash_fn(feature_ids) + feature_ids = sp_a.indices[:, 1] + feature_ids = hash_fn(feature_ids) - sample_ids = sp_a.indices[:, 0] - values = sp_a.values - dense_shape = sp_a.dense_shape + sample_ids = sp_a.indices[:, 0] + values = sp_a.values + dense_shape = sp_a.dense_shape - indices = tf.stack([sample_ids, feature_ids], axis=1) + indices = tf.stack([sample_ids, feature_ids], axis=1) - sp_a = tf.SparseTensor(indices, values, dense_shape) + sp_a = tf.SparseTensor(indices, values, dense_shape) - # note - we need 2**nbits >= batch size - # otherwise, sample_ids will be squashed by the mask. - return limit_sparse_tensor_size(sp_a, nbits) + # note - we need 2**nbits >= batch size + # otherwise, sample_ids will be squashed by the mask. + return limit_sparse_tensor_size(sp_a, nbits) def convert_to_hparams(opt): - """ - Converts argparse.Namespace object to twitter.deepbird.hparam.hparam.HParams. - Note that tensorflow.contrib.training.HParams is gone in TF 2.x, and we forward ported - tensorflow.contrib.training.HParams to twitter.deepbird.hparam.hapram.HParams. - - NOTE: If you are using estimators, please don't call this method and directly pass python dict - to TensorFlow estimator. Starting TensorFlow 2.0, Estimator will only accept dicts. - """ - - # Convert to dict so we can iterate through it cleanly. - if isinstance(opt, argparse.Namespace): - params_dict = vars(opt) - elif isinstance(opt, dict): - params_dict = opt - elif isinstance(opt, HParams): - logging.warning('If you are using Estimator, please pass python dict directly to Estimator.') - params_dict = opt.values() - else: - raise ValueError("Input can not be of type %s. " - "It can be one of { argparse.Namespace, dict, " - "twitter.deepbird.hparam.HParams}." 
-                     % type(opt))
-
-  params = HParams()
-  # Hack to convert all parameters from hdfs:/// format to hdfs://default/
-  # Note: .items() makes a copy in python 2.7, but that is fine since the performance isn't critical.
-  for key, val in params_dict.items():
-    val = params_dict[key]
-    # Fix the path if the value is a string
-    if isinstance(val, str):
-      params.add_hparam(key, sanitize_hdfs_path(val))
+    """
+    Converts an argparse.Namespace object to twitter.deepbird.hparam.hparam.HParams.
+    Note that tensorflow.contrib.training.HParams is gone in TF 2.x, and we forward ported
+    tensorflow.contrib.training.HParams to twitter.deepbird.hparam.hparam.HParams.
+
+    NOTE: If you are using estimators, please don't call this method; pass a python dict
+    directly to the TensorFlow estimator. Starting with TensorFlow 2.0, Estimator will
+    only accept dicts.
+    """
+
+    # Convert to dict so we can iterate through it cleanly.
+    if isinstance(opt, argparse.Namespace):
+        params_dict = vars(opt)
+    elif isinstance(opt, dict):
+        params_dict = opt
+    elif isinstance(opt, HParams):
+        logging.warning(
+            "If you are using Estimator, please pass python dict directly to Estimator."
+        )
+        params_dict = opt.values()
     else:
-    params.add_hparam(key, val)
-
-  return params
+        raise ValueError(
+            "Input can not be of type %s. "
+            "It can be one of { argparse.Namespace, dict, "
+            "twitter.deepbird.hparam.HParams}." % type(opt)
+        )
+
+    params = HParams()
+    # Hack to convert all parameters from hdfs:/// format to hdfs://default/
+    # Note: .items() makes a copy in python 2.7, but that is fine since the performance isn't critical.
+    for key, val in params_dict.items():
+        val = params_dict[key]
+        # Fix the path if the value is a string
+        if isinstance(val, str):
+            params.add_hparam(key, sanitize_hdfs_path(val))
+        else:
+            params.add_hparam(key, val)
+
+    return params
 
 
 def dynamic_partition(features, partitions, num_partitions=2, name=None):
-  """
-  Partitions each of the tensor in features using the provided mask.
-
-  Args:
-    features:
-      A single tensor or an iterable of tensors (list, tuple, dict)
-    partitions:
-      A bool or integer tensor representing the partitions.
-
-  Returns partitioned outputs as a list. Each element of the list is the same type as features.
-
-  This uses tf.dynamic_partition but adds the following niceties:
-    - features can be a list or dict of different tensor types.
-    - only a partition tensor is used to partition all the feature tensors recursively.
-    - the partition tensor is automatically converted into an integer tensor.
-    - defaults to num_partitions == 2
-  """
-
-  if not isinstance(features, (dict, list, tuple, tf.Tensor)):
-    raise AssertionError("features container must be a dict, list, or tuple, tf.Tensor")
-
-  if isinstance(partitions, tf.Tensor):
-    partitions = tf.cast(partitions, tf.int32)
-
-  if isinstance(features, tf.Tensor):
-    return tf.dynamic_partition(features, partitions, num_partitions, name)
-
-  outputs = []
-  for _ in range(num_partitions):
-    if isinstance(features, (tuple, list)):
-      # Create an empty list of lists first, will be converted to right type afterwards.
-      outputs.append([None for _ in range(len(features))])
-    else:
-      outputs.append(dict())
+    """
+    Partitions each of the tensors in features using the provided mask.
 
-  iterable = features.items() if isinstance(features, dict) else enumerate(features)
 
+    Args:
+      features:
+        A single tensor or an iterable of tensors (list, tuple, dict)
+      partitions:
+        A bool or integer tensor representing the partitions.
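
# Editor's aside: the underlying TF op that dynamic_partition wraps, applied
# directly to a single tensor (TF1-style, matching the rest of this module).
import tensorflow.compat.v1 as tf

data = tf.constant([10, 20, 30, 40])
mask = tf.constant([True, False, True, False])
parts = tf.dynamic_partition(data, tf.cast(mask, tf.int32), num_partitions=2)
# parts[0] gathers elements where the mask is False (0); parts[1] where it is True (1).
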
-  # Handling partitions of nested classes handled here:
-  # Recursively call dynamic_partition for containers
-  for key, feature in iterable:
-    name_key = None if name is None else name + "_" + str(key)
-    if isinstance(partitions, tf.Tensor):
-      results = tf.dynamic_partition(feature, partitions, num_partitions, name_key)
-    else:
-      results = tf.dynamic_partition(feature, partitions[key], num_partitions[key], name_key)
-    # Append the result to the proper output container
-    for idx, result in enumerate(results):
-      outputs[idx][key] = result
+    Returns partitioned outputs as a list. Each element of the list is the same type as features.
+
+    This uses tf.dynamic_partition but adds the following niceties:
+      - features can be a list or dict of different tensor types.
+      - only a partition tensor is used to partition all the feature tensors recursively.
+      - the partition tensor is automatically converted into an integer tensor.
+      - defaults to num_partitions == 2
+    """
 
-  # if input is tuple, convert list of lists back to list of tuples
-  if isinstance(features, tuple):
-    outputs = [type(features)(output) for output in outputs]
+    if not isinstance(features, (dict, list, tuple, tf.Tensor)):
+        raise AssertionError(
+            "features container must be a dict, list, tuple, or tf.Tensor"
+        )
 
-  return outputs
+    if isinstance(partitions, tf.Tensor):
+        partitions = tf.cast(partitions, tf.int32)
+
+    if isinstance(features, tf.Tensor):
+        return tf.dynamic_partition(features, partitions, num_partitions, name)
+
+    outputs = []
+    for _ in range(num_partitions):
+        if isinstance(features, (tuple, list)):
+            # Create an empty list of lists first, will be converted to right type afterwards.
+            outputs.append([None for _ in range(len(features))])
+        else:
+            outputs.append(dict())
+
+    iterable = features.items() if isinstance(features, dict) else enumerate(features)
+
+    # Handling partitions of nested classes handled here:
+    # Recursively call dynamic_partition for containers
+    for key, feature in iterable:
+        name_key = None if name is None else name + "_" + str(key)
+        if isinstance(partitions, tf.Tensor):
+            results = tf.dynamic_partition(
+                feature, partitions, num_partitions, name_key
+            )
+        else:
+            results = tf.dynamic_partition(
+                feature, partitions[key], num_partitions[key], name_key
+            )
+        # Append the result to the proper output container
+        for idx, result in enumerate(results):
+            outputs[idx][key] = result
+
+    # if input is tuple, convert list of lists back to list of tuples
+    if isinstance(features, tuple):
+        outputs = [type(features)(output) for output in outputs]
+
+    return outputs
 
 
 def write_file(filename, contents, encode=False):
-  '''
-  Optionally encodes contents and writes contents to a file.
-
-  Arguments:
-    filename:
-      path to file where the contents will be saved.
-      Accepts HDFS and local paths.
-    contents:
-      contents to save to the file.
-      Must be a string when encode is False.
-    encode:
-      False | 'json'. When encode='json', contents is encoded
-      with json.dumps.
-  '''
-  if encode == 'json':
-    contents = json.dumps(contents)
-  elif not is_string(contents):
-    raise ValueError("Expecting string for encode=False")
-
-  graph = tf.Graph()
-  with graph.as_default():
-    write = tf.write_file(filename, contents)
-
-  with tf.Session(graph=graph) as sess:
-    sess.run(write)
+    """
+    Optionally encodes contents and writes contents to a file.
+
+    Arguments:
+      filename:
+        path to file where the contents will be saved.
+        Accepts HDFS and local paths.
+      contents:
+        contents to save to the file.
+ Must be a string when encode is False. + encode: + False | 'json'. When encode='json', contents is encoded + with json.dumps. + """ + if encode == "json": + contents = json.dumps(contents) + elif not is_string(contents): + raise ValueError("Expecting string for encode=False") + + graph = tf.Graph() + with graph.as_default(): + write = tf.write_file(filename, contents) + + with tf.Session(graph=graph) as sess: + sess.run(write) def read_file(filename, decode=False): - ''' - Reads contents from a file and optionally decodes it. - - Arguments: - filename: - path to file where the contents will be loaded from. - Accepts HDFS and local paths. - decode: - False | 'json'. When decode='json', contents is decoded - with json.loads. When False, contents is returned as is. - - Returns: - contents - ''' - graph = tf.Graph() - with graph.as_default(): - read = tf.read_file(filename) - - with tf.Session(graph=graph) as sess: - contents = (sess.run(read)) - # particular version of TF and/or Python may or may not perform decoding step from utf-8 to str - if not isinstance(contents, str): - contents = contents.decode() - - if decode == 'json': - contents = json.loads(contents) - - return contents + """ + Reads contents from a file and optionally decodes it. -def setup_tf_logging_formatter(): - formatter = _logging.Formatter( - '%(asctime)s [%(levelname)s] %(name)s: %(message)s', - None) - # Setting up absl logging verbosity - logging.set_verbosity('info') - logging.set_stderrthreshold('info') - logging.get_absl_handler().setFormatter(formatter) - tf.logging.set_verbosity(tf.logging.INFO) - # Set tensorflow logging handler format - if len(tf_logging.get_logger().handlers) > 0: - tf_logging.get_logger().handlers[0].setFormatter(formatter) + Arguments: + filename: + path to file where the contents will be loaded from. + Accepts HDFS and local paths. + decode: + False | 'json'. When decode='json', contents is decoded + with json.loads. When False, contents is returned as is. + Returns: + contents + """ + graph = tf.Graph() + with graph.as_default(): + read = tf.read_file(filename) -def set_tensorflow_log_level(log_level): - """ - Sets tensorflow's default logging level. + with tf.Session(graph=graph) as sess: + contents = sess.run(read) + # particular version of TF and/or Python may or may not perform decoding step from utf-8 to str + if not isinstance(contents, str): + contents = contents.decode() - 0. all logs are shown. - 1. filter out INFO logs. - 2. filter out WARNINGs and INFOs. - 3. filter out ERRORs, WARNINGs, and INFOs. + if decode == "json": + contents = json.loads(contents) - Note that tf.Print output are INFO logs, so setting log_level above 0 would hide - output from tf.Print. - """ - assert isinstance(log_level, int) and log_level >= 0 and log_level <= 3 - os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(log_level) + return contents -def weighted_average(values, weights): - """ - Compute a weighted average using the given values and weights. - E.g. this is usually used to compute a weighted loss given sample weights. - """ - return tf.reduce_sum(tf.multiply(values, weights)) / tf.reduce_sum(weights) - - -def backup_checkpoint(checkpoint_path_prefix, - backup_path='backup', - empty_backup=True): - """ - Creates a backup copy of a checkpoint in backup_dir. - This function is used by the Trainer for early-stopping. - - Arguments: - checkpoint_path_prefix: - Prefix of the path to the checkpoint files. - backup_path: - path to a directory where checkpoint files will be backed up. 
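
# Editor's aside: the encode='json'/decode='json' round trip that write_file
# and read_file implement around tf.write_file/tf.read_file, shown here with
# plain json for clarity.
import json

contents = {"a": 1}
encoded = json.dumps(contents)  # what write_file does before writing
decoded = json.loads(encoded)   # what read_file does after reading
assert decoded == contents
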
- empty_backup: - When True (the default), the current contents of the backup directory - are removed before the backup is performed. - - Returns: - The number of backed up files. - """ - checkpoint_file_prefix = os.path.basename(checkpoint_path_prefix) - - if tf.io.gfile.exists(backup_path) and empty_backup: - tf.io.gfile.rmtree(backup_path) - - tf.io.gfile.mkdir(backup_path) - - n_backup = 0 - # copy all checkpoint files to backup directory (TODO use gfile.glob instead) - try: - checkpoint_files = tf.io.gfile.glob(checkpoint_path_prefix + "*") - if len(checkpoint_files) == 0: - raise twml.errors.CheckpointNotFoundError("%s not found" % checkpoint_path_prefix) - for filename in checkpoint_files: - n_backup += 1 - tf.io.gfile.copy( - src=filename, - dst=os.path.join(backup_path, os.path.basename(filename)) - ) - except tf.errors.OpError as ex: - raise twml.errors.CheckpointNotFoundError( - f"{str(ex)}\n {checkpoint_path_prefix} not found." +def setup_tf_logging_formatter(): + formatter = _logging.Formatter( + "%(asctime)s [%(levelname)s] %(name)s: %(message)s", None ) + # Setting up absl logging verbosity + logging.set_verbosity("info") + logging.set_stderrthreshold("info") + logging.get_absl_handler().setFormatter(formatter) + tf.logging.set_verbosity(tf.logging.INFO) + # Set tensorflow logging handler format + if len(tf_logging.get_logger().handlers) > 0: + tf_logging.get_logger().handlers[0].setFormatter(formatter) - # tf.train.latest_checkpoint needs the 'checkpoint' file. - with tf.io.gfile.GFile(os.path.join(backup_path, 'checkpoint'), 'w') as f: - f.write('model_checkpoint_path: "%s"\n' % checkpoint_file_prefix) - return n_backup +def set_tensorflow_log_level(log_level): + """ + Sets tensorflow's default logging level. + + 0. all logs are shown. + 1. filter out INFO logs. + 2. filter out WARNINGs and INFOs. + 3. filter out ERRORs, WARNINGs, and INFOs. + + Note that tf.Print output are INFO logs, so setting log_level above 0 would hide + output from tf.Print. + """ + assert isinstance(log_level, int) and log_level >= 0 and log_level <= 3 + os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(log_level) + + +def weighted_average(values, weights): + """ + Compute a weighted average using the given values and weights. + E.g. this is usually used to compute a weighted loss given sample weights. + """ + return tf.reduce_sum(tf.multiply(values, weights)) / tf.reduce_sum(weights) + + +def backup_checkpoint(checkpoint_path_prefix, backup_path="backup", empty_backup=True): + """ + Creates a backup copy of a checkpoint in backup_dir. + This function is used by the Trainer for early-stopping. + + Arguments: + checkpoint_path_prefix: + Prefix of the path to the checkpoint files. + backup_path: + path to a directory where checkpoint files will be backed up. + empty_backup: + When True (the default), the current contents of the backup directory + are removed before the backup is performed. + + Returns: + The number of backed up files. 
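
# Editor's aside: the weighted_average formula defined above,
# sum(values * weights) / sum(weights), on concrete numbers.
values = [1.0, 3.0]
weights = [0.25, 0.75]
wavg = sum(v * w for v, w in zip(values, weights)) / sum(weights)
assert wavg == 2.5
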
+ """ + checkpoint_file_prefix = os.path.basename(checkpoint_path_prefix) + + if tf.io.gfile.exists(backup_path) and empty_backup: + tf.io.gfile.rmtree(backup_path) + + tf.io.gfile.mkdir(backup_path) + + n_backup = 0 + # copy all checkpoint files to backup directory (TODO use gfile.glob instead) + try: + checkpoint_files = tf.io.gfile.glob(checkpoint_path_prefix + "*") + if len(checkpoint_files) == 0: + raise twml.errors.CheckpointNotFoundError( + "%s not found" % checkpoint_path_prefix + ) + for filename in checkpoint_files: + n_backup += 1 + tf.io.gfile.copy( + src=filename, dst=os.path.join(backup_path, os.path.basename(filename)) + ) + except tf.errors.OpError as ex: + raise twml.errors.CheckpointNotFoundError( + f"{str(ex)}\n {checkpoint_path_prefix} not found." + ) + + # tf.train.latest_checkpoint needs the 'checkpoint' file. + with tf.io.gfile.GFile(os.path.join(backup_path, "checkpoint"), "w") as f: + f.write('model_checkpoint_path: "%s"\n' % checkpoint_file_prefix) + + return n_backup def set_only_checkpoint(source_path, dest_path, remove_source=True): - """ - Removes the checkpoint and model.ckpt* files from dest_path. - Moves the latest checkpoint from source_path to dest_path. - - Arguments: - source_path: - path to directory containing the latest checkpoint. - Should contain a valid checkpoint file and model.ckpt files. - For early-stopping, this should be the save_dir/best_checkpoint dir. - dest_path: - path to directory where the latest checkpoint files will be moved. - All its checkpoint and model.ckpt* files will be removed. - For early-stopping, this should be the save_dir. - remove_source: - When True (the default), deletes the source directory. - Note that even when False, its checkpoint files are moved to - dest_path anyway. - This deletes the source directory (and any remaining contents). - """ - # make it so that source_path checkpoint is the only checkpoint - source_path_prefix = tf.train.latest_checkpoint(source_path) - if source_path_prefix is not None: - # remove intermediate checkpoints - for filename in tf.io.gfile.listdir(dest_path): - if filename.startswith("model.ckpt"): - tf.io.gfile.Remove(os.path.join(dest_path, filename)) - # move contents of source_path to dest_path - for filename in tf.io.gfile.listdir(source_path): - tf.io.gfile.rename( - oldname=os.path.join(source_path, filename), - newname=os.path.join(dest_path, filename), - overwrite=True) # overwrite "checkpoint" file - # delete the source_path dir - if remove_source: - tf.io.gfile.rmtree(source_path) + """ + Removes the checkpoint and model.ckpt* files from dest_path. + Moves the latest checkpoint from source_path to dest_path. + + Arguments: + source_path: + path to directory containing the latest checkpoint. + Should contain a valid checkpoint file and model.ckpt files. + For early-stopping, this should be the save_dir/best_checkpoint dir. + dest_path: + path to directory where the latest checkpoint files will be moved. + All its checkpoint and model.ckpt* files will be removed. + For early-stopping, this should be the save_dir. + remove_source: + When True (the default), deletes the source directory. + Note that even when False, its checkpoint files are moved to + dest_path anyway. + This deletes the source directory (and any remaining contents). 
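
# Editor's aside: the 'checkpoint' index file written by backup_checkpoint
# above is what tf.train.latest_checkpoint reads; it is a tiny text file like
# this (directory and checkpoint name are hypothetical).
import os
import tempfile

backup_path = tempfile.mkdtemp()
with open(os.path.join(backup_path, "checkpoint"), "w") as f:
    f.write('model_checkpoint_path: "model.ckpt-50000"\n')
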
+ """ + # make it so that source_path checkpoint is the only checkpoint + source_path_prefix = tf.train.latest_checkpoint(source_path) + if source_path_prefix is not None: + # remove intermediate checkpoints + for filename in tf.io.gfile.listdir(dest_path): + if filename.startswith("model.ckpt"): + tf.io.gfile.Remove(os.path.join(dest_path, filename)) + # move contents of source_path to dest_path + for filename in tf.io.gfile.listdir(source_path): + tf.io.gfile.rename( + oldname=os.path.join(source_path, filename), + newname=os.path.join(dest_path, filename), + overwrite=True, + ) # overwrite "checkpoint" file + # delete the source_path dir + if remove_source: + tf.io.gfile.rmtree(source_path) def list_files_by_datetime( - base_path, - start_datetime, - end_datetime=None, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1, - hour_resolution=1, - sort=False + base_path, + start_datetime, + end_datetime=None, + datetime_prefix_format="%Y/%m/%d/%H", + extension="lzo", + parallelism=1, + hour_resolution=1, + sort=False, ): - """List files matching `base_path/dt_prefix_format/*.extension` for the requested datetime range. - - Args: - base_path: - The base path. If `None`, returns `None`. - start_datetime: - A `datetime.datetime` or string representing the start of the range (inclusive). - If `None`, it returns `list_files(base_path, extension, sort)`. - end_datetime: - A `datetime.datetime` or string representing the end of the range (inclusive). - If `None`, assumed to be the same as start_datetime. - datetime_prefix_format: - Format compatible with `datetime.datetime.strftime` - (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior). - extension: - The extension of the files composing the dataset (e.g. 'lzo'). - parallelism: - The number of threads used to process list patterns (this is mostly useful - when dealing with filesystems such as HDFS in which listing files is a potentially expensive - operation). - hour_resolution: - The separation between consecutive hours. The default value is 1. - sort: - bool, whether to return a sorted list of files. Default False. - - Returns: - A list with all the matching files. - - Raises: - errors.OpError: If there are filesystem / directory listing errors. - """ - if hour_resolution is None: - hour_resolution = 1 - - if base_path is None: - return None - - if start_datetime is None: - return list_files(base_path, extension, sort) - - # Do this in case people want to use a single day for training. 
- if end_datetime is None: - end_datetime = start_datetime - - assert parallelism > 0 - assert start_datetime <= end_datetime - - if isinstance(start_datetime, str): - start_datetime = datetime.strptime(start_datetime, datetime_prefix_format) - - if isinstance(end_datetime, str): - end_datetime = datetime.strptime(end_datetime, datetime_prefix_format) - - assert isinstance(start_datetime, datetime) - assert isinstance(end_datetime, datetime) - - base_path = preprocess_path(base_path) - - def _handle_missing_globs(pattern): - try: - return tf.io.gfile.glob(pattern) - except tf.errors.NotFoundError as e: - tf.logging.warning(e.message) - return [] - - # a set is used because there might be some repeated globs depending on dt_prefix_format - globs = { - os.path.join(base_path, dt.strftime(datetime_prefix_format), '*.%s' % extension) - for dt in rrule.rrule( - freq=rrule.HOURLY, interval=hour_resolution, dtstart=start_datetime, until=end_datetime) - } - nested_files = Parallel(n_jobs=parallelism, backend='threading')( - delayed(_handle_missing_globs)(p) for p in globs - ) - flattened_files = list(itertools.chain.from_iterable(nested_files)) - - if not flattened_files: - error_msg = "Files list is empty: base_path={base_path}, start_datetime={start_datetime}, end_datetime={end_datetime}".format( - base_path=base_path, start_datetime=start_datetime, end_datetime=end_datetime + """List files matching `base_path/dt_prefix_format/*.extension` for the requested datetime range. + + Args: + base_path: + The base path. If `None`, returns `None`. + start_datetime: + A `datetime.datetime` or string representing the start of the range (inclusive). + If `None`, it returns `list_files(base_path, extension, sort)`. + end_datetime: + A `datetime.datetime` or string representing the end of the range (inclusive). + If `None`, assumed to be the same as start_datetime. + datetime_prefix_format: + Format compatible with `datetime.datetime.strftime` + (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior). + extension: + The extension of the files composing the dataset (e.g. 'lzo'). + parallelism: + The number of threads used to process list patterns (this is mostly useful + when dealing with filesystems such as HDFS in which listing files is a potentially expensive + operation). + hour_resolution: + The separation between consecutive hours. The default value is 1. + sort: + bool, whether to return a sorted list of files. Default False. + + Returns: + A list with all the matching files. + + Raises: + errors.OpError: If there are filesystem / directory listing errors. + """ + if hour_resolution is None: + hour_resolution = 1 + + if base_path is None: + return None + + if start_datetime is None: + return list_files(base_path, extension, sort) + + # Do this in case people want to use a single day for training. 
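
# Editor's aside: how the hourly glob patterns below are generated, as a
# standalone sketch with toy values (base_path and the dates are hypothetical).
import os
from datetime import datetime
from dateutil import rrule

base_path = "/logs"
start, end = datetime(2023, 1, 1, 0), datetime(2023, 1, 1, 2)
globs = {
    os.path.join(base_path, dt.strftime("%Y/%m/%d/%H"), "*.lzo")
    for dt in rrule.rrule(freq=rrule.HOURLY, interval=1, dtstart=start, until=end)
}
# -> {'/logs/2023/01/01/00/*.lzo', '/logs/2023/01/01/01/*.lzo', '/logs/2023/01/01/02/*.lzo'}
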
+ if end_datetime is None: + end_datetime = start_datetime + + assert parallelism > 0 + assert start_datetime <= end_datetime + + if isinstance(start_datetime, str): + start_datetime = datetime.strptime(start_datetime, datetime_prefix_format) + + if isinstance(end_datetime, str): + end_datetime = datetime.strptime(end_datetime, datetime_prefix_format) + + assert isinstance(start_datetime, datetime) + assert isinstance(end_datetime, datetime) + + base_path = preprocess_path(base_path) + + def _handle_missing_globs(pattern): + try: + return tf.io.gfile.glob(pattern) + except tf.errors.NotFoundError as e: + tf.logging.warning(e.message) + return [] + + # a set is used because there might be some repeated globs depending on dt_prefix_format + globs = { + os.path.join(base_path, dt.strftime(datetime_prefix_format), "*.%s" % extension) + for dt in rrule.rrule( + freq=rrule.HOURLY, + interval=hour_resolution, + dtstart=start_datetime, + until=end_datetime, + ) + } + nested_files = Parallel(n_jobs=parallelism, backend="threading")( + delayed(_handle_missing_globs)(p) for p in globs ) - raise OSError(error_msg) + flattened_files = list(itertools.chain.from_iterable(nested_files)) - if sort: - flattened_files = sorted(flattened_files) + if not flattened_files: + error_msg = "Files list is empty: base_path={base_path}, start_datetime={start_datetime}, end_datetime={end_datetime}".format( + base_path=base_path, + start_datetime=start_datetime, + end_datetime=end_datetime, + ) + raise OSError(error_msg) - return flattened_files + if sort: + flattened_files = sorted(flattened_files) + + return flattened_files def limit_sparse_tensor_size(sparse_tf, input_size_bits, mask_indices=True): - """ - Returns a ``tf.SparseTensor`` which is the input SparseTensor - limited to the specified input_size_bits - - Args: - sparse_tf: - twml.SparseTensor or tf.SparseTensor - input_size_bits: - The number of bits allocated to the input size. - Input size will be power(2,input_size_bits). - Note that twml.limit_bits truncates any feature keys that - exceed the input size. - mask_indices: - If mask indices is False; only the shape is changed. Defaults to True. - """ - if isinstance(sparse_tf, twml.SparseTensor): - sparse_tf = sparse_tf.to_tf() - if not isinstance(sparse_tf, tf.SparseTensor): - raise TypeError('Input argument `sparse_tf` should either be of type' - 'twml.SparseTensor of tf.SparseTensor. Found type: {}'. - format(type(sparse_tf))) - if mask_indices: - indices = twml.limit_bits(sparse_tf.indices, input_size_bits) - else: - indices = sparse_tf.indices - dense_shape = tf.stack([sparse_tf.dense_shape[0], 1 << input_size_bits]) - return tf.SparseTensor(indices=indices, values=sparse_tf.values, - dense_shape=dense_shape) + """ + Returns a ``tf.SparseTensor`` which is the input SparseTensor + limited to the specified input_size_bits + + Args: + sparse_tf: + twml.SparseTensor or tf.SparseTensor + input_size_bits: + The number of bits allocated to the input size. + Input size will be power(2,input_size_bits). + Note that twml.limit_bits truncates any feature keys that + exceed the input size. + mask_indices: + If mask indices is False; only the shape is changed. Defaults to True. + """ + if isinstance(sparse_tf, twml.SparseTensor): + sparse_tf = sparse_tf.to_tf() + if not isinstance(sparse_tf, tf.SparseTensor): + raise TypeError( + "Input argument `sparse_tf` should either be of type" + "twml.SparseTensor of tf.SparseTensor. 
 
 
 def create_module_spec(mlp_fn, mode, params, drop_collections=None):
-  """
-  Creates a standard tags_and_args which should be passed to the create_module_spec
-  spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args).
-
-  Args:
-    module_fn:
-      a function to build a graph for the Module.
-    mode:
-      mode in which the Estimator is run
-    params:
-      parameters passed to the Estimator
-  """
-  import tensorflow_hub as hub  # noqa: F402
-  tags_and_args = [(set(), {"params": params, "mode": mode}),  # serving graph
-                   ({"train"}, {"params": params, "mode": mode})  # training graph
-                   ]
-  spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args, drop_collections=drop_collections)
-  return spec
+    """
+    Creates a standard `tags_and_args` list and passes it to `hub.create_module_spec`,
+    i.e. spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args).
+
+    Args:
+      mlp_fn:
+        a function to build a graph for the Module.
+      mode:
+        mode in which the Estimator is run
+      params:
+        parameters passed to the Estimator
+    """
+    import tensorflow_hub as hub  # noqa: F402
+
+    tags_and_args = [
+        (set(), {"params": params, "mode": mode}),  # serving graph
+        ({"train"}, {"params": params, "mode": mode}),  # training graph
+    ]
+    spec = hub.create_module_spec(
+        mlp_fn, tags_and_args=tags_and_args, drop_collections=drop_collections
+    )
+    return spec
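
A hedged sketch of a `mlp_fn` that would work with this helper. The layer sizes and params are invented; `hub.add_signature` is the standard TF-Hub (TF1) way to declare a module's inputs and outputs:

import tensorflow as tf
import tensorflow_hub as hub

def mlp_fn(params, mode):
    # Minimal module_fn: one dense layer over a declared placeholder input.
    inputs = tf.placeholder(tf.float32, shape=[None, params["input_size"]])
    outputs = tf.layers.dense(inputs, units=params["output_size"])
    hub.add_signature(inputs=inputs, outputs=outputs)

spec = create_module_spec(
    mlp_fn,
    mode=tf.estimator.ModeKeys.TRAIN,
    params={"input_size": 8, "output_size": 2},
)
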
 
 
 def change_name_scope_from_dir(init_scope_name, final_scope_name, save_dir):
-  """
-  Changes the name of the saved scope to the desired name and saves it
-  to the same save_dir.
-
-  Args:
-    init_scope_name:
-      initial scope name
-    final_scope_name:
-      desired (final) scope name
-    save_dir:
-      directory which the scopes are saved
-
-  In the follwing section we:
-  - Read all the variables from the latest checkpoint.
-  - Make a copy of the variables with new name scope.
-  - Store both sets of variables into the latest checkpoint.
-  This essentially doubles up the size of the checkpoint.
-  But when a job is restarted after this part is done, the checkpoint size doubles again.
-  To avoid doing this, we create a copy in backup if a backup isn't found.
-  This allows us always read (from backup) and write same sized checkpoint files.
-  """
-
-  # Create a backup_checkpoints dir
-  backup_dir = os.path.join(save_dir, "change_name_scope_backups")
-  tf.io.gfile.makedirs(backup_dir)
-
-  latest_checkpoint = tf.train.latest_checkpoint(save_dir)
-
-  if latest_checkpoint is None:
-    raise OSError("No checkpoints found in save_dir: %s" % save_dir)
-
-  latest_backup_checkpoint = tf.train.latest_checkpoint(backup_dir)
-
-  if (latest_backup_checkpoint is None or
-      (os.path.basename(latest_checkpoint) !=
-       os.path.basename(latest_backup_checkpoint))):
-    backup_checkpoint(latest_checkpoint, backup_dir, empty_backup=False)
-
-  variables = tf.train.list_variables(backup_dir)
-  with tf.Graph().as_default(), tf.Session().as_default() as sess:
-    new_variables = []
-    for name, _ in variables:
-      var = tf.train.load_variable(backup_dir, name)
-      # Append both the rename and the original variable
-      new_variables.append(
-        tf.Variable(var, name=name.replace(init_scope_name, final_scope_name)))
-      new_variables.append(tf.Variable(var, name=name))
-    # Save this to the checkpoint in the save_dir
-    saver = tf.train.Saver(new_variables)
-    sess.run(tf.global_variables_initializer())
-    saver.save(sess, latest_checkpoint)  # pylint: disable=no-member
+    """
+    Changes the name of the saved scope to the desired name and saves it
+    to the same save_dir.
+
+    Args:
+      init_scope_name:
+        initial scope name
+      final_scope_name:
+        desired (final) scope name
+      save_dir:
+        directory in which the scopes are saved
+
+    In the following section we:
+    - Read all the variables from the latest checkpoint.
+    - Make a copy of the variables with the new name scope.
+    - Store both sets of variables into the latest checkpoint.
+    This essentially doubles the size of the checkpoint.
+    But when a job is restarted after this part is done, the checkpoint size doubles again.
+    To avoid this, we create a copy in backup if a backup isn't found.
+    This allows us to always read (from backup) and write same-sized checkpoint files.
+    """
+
+    # Create a backup_checkpoints dir
+    backup_dir = os.path.join(save_dir, "change_name_scope_backups")
+    tf.io.gfile.makedirs(backup_dir)
+
+    latest_checkpoint = tf.train.latest_checkpoint(save_dir)
+
+    if latest_checkpoint is None:
+        raise OSError("No checkpoints found in save_dir: %s" % save_dir)
+
+    latest_backup_checkpoint = tf.train.latest_checkpoint(backup_dir)
+
+    if latest_backup_checkpoint is None or (
+        os.path.basename(latest_checkpoint)
+        != os.path.basename(latest_backup_checkpoint)
+    ):
+        backup_checkpoint(latest_checkpoint, backup_dir, empty_backup=False)
+
+    variables = tf.train.list_variables(backup_dir)
+    with tf.Graph().as_default(), tf.Session().as_default() as sess:
+        new_variables = []
+        for name, _ in variables:
+            var = tf.train.load_variable(backup_dir, name)
+            # Append both the rename and the original variable
+            new_variables.append(
+                tf.Variable(var, name=name.replace(init_scope_name, final_scope_name))
+            )
+            new_variables.append(tf.Variable(var, name=name))
+        # Save this to the checkpoint in the save_dir
+        saver = tf.train.Saver(new_variables)
+        sess.run(tf.global_variables_initializer())
+        saver.save(sess, latest_checkpoint)  # pylint: disable=no-member
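
A sketch of driving this checkpoint rewrite; scope names and the path are invented for illustration:

# Rename variables saved under "old_tower/..." to "new_tower/..." in the
# latest checkpoint under save_dir. Originals are kept alongside the copies,
# so old consumers of the checkpoint keep working.
change_name_scope_from_dir(
    init_scope_name="old_tower",
    final_scope_name="new_tower",
    save_dir="hdfs://cluster/user/me/model_ckpts",  # invented path
)
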
 
 
 def hub_import(input, module, module_name, trainable=False):
-  """
-  Loads exported hub module.
-
-  Args:
-    input:
-      input to hub module
-    module:
-      module path
-    module_name:
-      signature of the exported hub module
-  """
-  import tensorflow_hub as hub  # noqa: F402
-  hub_module = hub.Module(module, trainable=trainable)
-  output = hub_module(input, signature=module_name)
-  return output
+    """
+    Loads an exported hub module.
+
+    Args:
+      input:
+        input to the hub module
+      module:
+        module path
+      module_name:
+        signature of the exported hub module
+    """
+    import tensorflow_hub as hub  # noqa: F402
+
+    hub_module = hub.Module(module, trainable=trainable)
+    output = hub_module(input, signature=module_name)
+    return output
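
For reference, a minimal call sketch; the module path is invented and "default" is just the conventional TF-Hub signature name:

import tensorflow as tf

inputs = tf.placeholder(tf.float32, shape=[None, 128])

# Apply the exported module's "default" signature to `inputs`;
# trainable=False keeps the imported weights frozen.
embeddings = hub_import(
    inputs,
    module="/tmp/exported_hub_module",  # invented path
    module_name="default",
)
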
 
 
 def _extract_hash_space_bits(feature_config):
-  """
-  Extract Sparse Shapes for contrib.FeatureConfig.
-  Arguments:
-    feature_config:
-      Feature Configuration of the type contrib.FeatureConfig
-  Returns:
-    Dictionary of tensor names and hash space bits.
-  """
-  if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig):
-    fc_type = type(feature_config)
-    raise TypeError(f"Feature config must be of type contrib.FeatureConfig: {fc_type}")
-  sparse_shapes_dict = {}
-  for config in feature_config.sparse_extraction_configs:
-    sparse_shapes_dict[config.output_name] = config.hash_space_bits
-  return sparse_shapes_dict
+    """
+    Extract sparse shapes for a contrib.FeatureConfig.
+    Arguments:
+      feature_config:
+        Feature configuration of the type contrib.FeatureConfig
+    Returns:
+      Dictionary of tensor names and hash space bits.
+    """
+    if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig):
+        fc_type = type(feature_config)
+        raise TypeError(
+            f"Feature config must be of type contrib.FeatureConfig: {fc_type}"
+        )
+    sparse_shapes_dict = {}
+    for config in feature_config.sparse_extraction_configs:
+        sparse_shapes_dict[config.output_name] = config.hash_space_bits
+    return sparse_shapes_dict
 
 
 def fix_shape_sparse(features, feature_config):
-  """
-  Modifies the shape of features which are extracted using the hashing trick.
-  Features itself is changed by this function.
-  Arguments:
-    features:
-      Feature dictionary extracted by the feature config
-    feature_config:
-      Feature Configuration of the type contrib.FeatureConfig
-  """
-  if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig):
-    raise TypeError(f"Feature config must be of type contrib.FeatureConfig, currently of {type(feature_config)}")
-  sparse_shape = _extract_hash_space_bits(feature_config)
-  if not isinstance(features, dict):
-    raise TypeError(f"features must be of dictionary type, it is of {type(features)} type")
-  for key in set(features) & set(sparse_shape):
-    features[key] = limit_sparse_tensor_size(features[key], sparse_shape[key], mask_indices=False)
+    """
+    Modifies the shape of features which are extracted using the hashing trick.
+    The `features` dictionary is modified in place.
+    Arguments:
+      features:
+        Feature dictionary extracted by the feature config
+      feature_config:
+        Feature configuration of the type contrib.FeatureConfig
+    """
+    if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig):
+        raise TypeError(
+            f"Feature config must be of type contrib.FeatureConfig, currently of {type(feature_config)}"
+        )
+    sparse_shape = _extract_hash_space_bits(feature_config)
+    if not isinstance(features, dict):
+        raise TypeError(
+            f"features must be of dictionary type, it is of {type(features)} type"
+        )
+    for key in set(features) & set(sparse_shape):
+        features[key] = limit_sparse_tensor_size(
+            features[key], sparse_shape[key], mask_indices=False
+        )
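
A hedged sketch of where this fits in an extraction pipeline. Building a real contrib.FeatureConfig is out of scope here, so the setup is assumed rather than shown:

# Assumes `feature_config` is a twml.contrib.feature_config.FeatureConfig and
# `features` is the dict a twml parser produced from it (extraction omitted).
fix_shape_sparse(features, feature_config)

# After the call, every sparse feature named in the config's
# sparse_extraction_configs has dense_shape[1] == 1 << hash_space_bits for
# that feature; indices are untouched because mask_indices=False is passed.
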
 
 
 def touch_file_in_dir(directory, filename):
-  """
-  Creates a file named filename in directory.
+    """
+    Creates a file named `filename` in `directory`.
 
-  Arguments:
-    filename: (str)
-    directory: (str)
-  """
-  file_path = os.path.join(directory, filename)
-  with tf.io.gfile.GFile(file_path, "w") as f:
-    f.write("")
+    Arguments:
+      filename: (str)
+      directory: (str)
+    """
+    file_path = os.path.join(directory, filename)
+    with tf.io.gfile.GFile(file_path, "w") as f:
+        f.write("")
 
 
 def file_exist_in_dir(directory: str, filename: str) -> bool:
-  file_path = os.path.join(directory, filename)
-  return tf.io.gfile.exists(file_path)
+    file_path = os.path.join(directory, filename)
+    return tf.io.gfile.exists(file_path)
 
 
 def copy_to_local(remote, local, filename, overwrite=False):
-  """Function to file from remote directory to local directory."""
-  assert "hdfs://" not in local
-  tf.io.gfile.makedirs(local)
-  return tf.io.gfile.copy(
-    os.path.join(remote, filename),
-    os.path.join(local, filename),
-    overwrite=overwrite,
-  )
+    """Copies a file from a remote directory to a local directory."""
+    assert "hdfs://" not in local
+    tf.io.gfile.makedirs(local)
+    return tf.io.gfile.copy(
+        os.path.join(remote, filename),
+        os.path.join(local, filename),
+        overwrite=overwrite,
+    )
 
 
 def copy_recursive(src, dst, overwrite=False):
-  """
-  Function to copy a directory recursively.
+    """
+    Copies a directory recursively.
 
-  Arguments:
-    src: Source directory.
-    dst: Destination directory.
-    overwrite: Specifies if files are to be overwritten if they exist.
-  """
+    Arguments:
+      src: Source directory.
+      dst: Destination directory.
+      overwrite: Specifies if files are to be overwritten if they exist.
+    """
 
-  src = src.rstrip("/")
-  dst = dst.rstrip("/")
+    src = src.rstrip("/")
+    dst = dst.rstrip("/")
 
-  for dirname, subdirs, files in tf.io.gfile.walk(src):
-    dst_dirname = dirname.replace(src, dst)
-    tf.io.gfile.makedirs(dst_dirname)
+    for dirname, subdirs, files in tf.io.gfile.walk(src):
+        dst_dirname = dirname.replace(src, dst)
+        tf.io.gfile.makedirs(dst_dirname)
 
-    for f in files:
-      src_f = os.path.join(dirname, f)
-      dst_f = os.path.join(dst_dirname, f)
+        for f in files:
+            src_f = os.path.join(dirname, f)
+            dst_f = os.path.join(dst_dirname, f)
 
-      tf.logging.info(f"Copying {src_f} to {dst_f}")
-      tf.io.gfile.copy(src_f, dst_f, overwrite=overwrite)
+            tf.logging.info(f"Copying {src_f} to {dst_f}")
+            tf.io.gfile.copy(src_f, dst_f, overwrite=overwrite)
 
 
 def delete_file_or_dir(path):
-  """
-  Delete the file or directory given by `path`
-  Arguments:
-    path:
-      string indicating path of file or directory to remove
-  """
-  if tf.io.gfile.isdir(path):
-    tf.io.gfile.rmtree(path)
-  else:
-    tf.io.gfile.remove(path)
+    """
+    Deletes the file or directory given by `path`.
+    Arguments:
+      path:
+        string indicating the path of the file or directory to remove
+    """
+    if tf.io.gfile.isdir(path):
+        tf.io.gfile.rmtree(path)
+    else:
+        tf.io.gfile.remove(path)
 
 
 def get_distributed_training_job_path():
-  """
-  Function to get distributed training job path.
-  Note: distributed training has three jobs, one parameter server job,
-  one worker job and one evaluator job. All of these three jobs' name
-  share a common base job name.
-  """
-  job_path = AuroraPath(dc=os.environ.get("TWML_JOB_CLUSTER"),
-                        role=os.environ.get("TWML_JOB_ROLE"),
-                        env=os.environ.get("TWML_JOB_ENV"),
-                        job_name=os.environ.get("TWML_DISTRIBUTED_BASE_JOBNAME"))
-  return job_path
+    """
+    Gets the distributed training job path.
+    Note: distributed training has three jobs: one parameter server job,
+    one worker job and one evaluator job. All three job names share a
+    common base job name.
+    """
+    job_path = AuroraPath(
+        dc=os.environ.get("TWML_JOB_CLUSTER"),
+        role=os.environ.get("TWML_JOB_ROLE"),
+        env=os.environ.get("TWML_JOB_ENV"),
+        job_name=os.environ.get("TWML_DISTRIBUTED_BASE_JOBNAME"),
+    )
+    return job_path
+
 
 def do_every_n_steps(action, num_steps):
-  """
-  Execute a sequence of TensorFlow operations only once in a while.
-  Specifically, `action` is performed if `global_step` is a
-  multiple of `num_steps`
-
-  Args:
-    action: callable to be performed at regular intervals. This callable
-      must return a TF op with no output tensors.
-    num_steps: period of performing the action, as measured
-      in number of training steps
-
-  Returns:
-    A TensorFlow op with no output tensors, like a tf.print() or tf.no_op().
-    You must use tf.control_dependencies() to execute the op.
-
-  """
-  global_step = tf.train.get_or_create_global_step()
-  condition = tf.math.equal(tf.math.floormod(global_step, num_steps), 0)
-  return tf.cond(condition, action, lambda: tf.no_op())
+    """
+    Executes a sequence of TensorFlow operations only once in a while.
+    Specifically, `action` is performed whenever `global_step` is a
+    multiple of `num_steps`.
+
+    Args:
+      action: callable to be performed at regular intervals. This callable
+        must return a TF op with no output tensors.
+      num_steps: period of performing the action, as measured
+        in number of training steps
+
+    Returns:
+      A TensorFlow op with no output tensors, like a tf.print() or tf.no_op().
+      You must use tf.control_dependencies() to execute the op.
+    """
+    global_step = tf.train.get_or_create_global_step()
+    condition = tf.math.equal(tf.math.floormod(global_step, num_steps), 0)
+    return tf.cond(condition, action, lambda: tf.no_op())
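
Since the docstring insists on tf.control_dependencies(), here is a minimal TF1 sketch of wiring the returned op into a training step (the toy loss and learning rate are invented):

import tensorflow as tf

x = tf.Variable(3.0)
loss = tf.square(x)
optimizer = tf.train.GradientDescentOptimizer(0.1)

# Print the loss every 100 global steps; tf.print returns an op with no
# output tensors, which is exactly what do_every_n_steps requires.
print_op = do_every_n_steps(lambda: tf.print("loss:", loss), num_steps=100)

# The conditional op only fires when something depends on it.
with tf.control_dependencies([print_op]):
    train_op = optimizer.minimize(
        loss, global_step=tf.train.get_or_create_global_step()
    )
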
diff --git a/twml/twml_common/initializer.py b/twml/twml_common/initializer.py
index 7a9c734c7..01f07ecf9 100644
--- a/twml/twml_common/initializer.py
+++ b/twml/twml_common/initializer.py
@@ -2,13 +2,13 @@
 
 class PartitionInitializer(tf.keras.initializers.Initializer):
-  """Required to initialize partitioned weight with numpy array for tests"""
+    """Required to initialize a partitioned weight with a numpy array for tests"""
 
-  def __init__(self, np_array):
-    self.np_array = np_array
+    def __init__(self, np_array):
+        self.np_array = np_array
 
-  def __call__(self, shape, dtype=None, partition_info=None):
-    offset = partition_info.var_offset
-    ix0, ix1 = offset[0], offset[0] + shape[0]
-    iy0, iy1 = offset[1], offset[1] + shape[1]
-    return self.np_array[ix0:ix1, iy0:iy1]
+    def __call__(self, shape, dtype=None, partition_info=None):
+        # Slice this partition's block out of the full array, using the
+        # partition's offset within the full variable.
+        offset = partition_info.var_offset
+        ix0, ix1 = offset[0], offset[0] + shape[0]
+        iy0, iy1 = offset[1], offset[1] + shape[1]
+        return self.np_array[ix0:ix1, iy0:iy1]
diff --git a/twml/twml_common/serialize.py b/twml/twml_common/serialize.py
index 36c53881e..744e3a051 100644
--- a/twml/twml_common/serialize.py
+++ b/twml/twml_common/serialize.py
@@ -3,14 +3,14 @@
 
 def serialize(obj):
-  tbuf = TTransport.TMemoryBuffer()
-  iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
-  obj.write(iproto)
-  return tbuf.getvalue()
+    tbuf = TTransport.TMemoryBuffer()
+    iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
+    obj.write(iproto)
+    return tbuf.getvalue()
 
 
 def deserialize(record, bytes):
-  tbuf = TTransport.TMemoryBuffer(bytes)
-  iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
-  record.read(iproto)
-  return record
+    tbuf = TTransport.TMemoryBuffer(bytes)
+    iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
+    record.read(iproto)
+    return record
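
A round-trip sketch for these Thrift helpers. `DataRecord` stands in for any Thrift-generated struct; the import path is an assumption, not taken from this diff:

# Hypothetical round trip -- the generated module path is assumed.
from com.twitter.ml.api.ttypes import DataRecord

original = DataRecord()
blob = serialize(original)  # TBinaryProtocol bytes

# deserialize takes a fresh record instance to read into, plus the bytes.
restored = deserialize(DataRecord(), blob)
assert restored == original
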
diff --git a/twml/twml_common/sparse_inputs.py b/twml/twml_common/sparse_inputs.py
index b8f7939e5..2906a316d 100644
--- a/twml/twml_common/sparse_inputs.py
+++ b/twml/twml_common/sparse_inputs.py
@@ -3,22 +3,28 @@
 
 def create_sparse_tensor(batch_size, input_size, num_values, dtype=tf.float32):
-  random_indices = np.sort(np.random.randint(batch_size * input_size, size=num_values))
-  test_indices_i = random_indices // input_size
-  test_indices_j = random_indices % input_size
-  test_indices = np.stack([test_indices_i, test_indices_j], axis=1)
-  test_values = np.random.random(num_values).astype(dtype.as_numpy_dtype)
+    # Draw flat positions in [0, batch_size * input_size) and convert them to
+    # (row, column) coordinates for the sparse tensor.
+    random_indices = np.sort(
+        np.random.randint(batch_size * input_size, size=num_values)
+    )
+    test_indices_i = random_indices // input_size
+    test_indices_j = random_indices % input_size
+    test_indices = np.stack([test_indices_i, test_indices_j], axis=1)
+    test_values = np.random.random(num_values).astype(dtype.as_numpy_dtype)
 
-  return tf.SparseTensor(indices=tf.constant(test_indices),
-                         values=tf.constant(test_values),
-                         dense_shape=(batch_size, input_size))
+    return tf.SparseTensor(
+        indices=tf.constant(test_indices),
+        values=tf.constant(test_values),
+        dense_shape=(batch_size, input_size),
+    )
 
 
 def create_reference_input(sparse_input, use_binary_values):
-  if use_binary_values:
-    sp_a = tf.SparseTensor(indices=sparse_input.indices,
-                           values=tf.ones_like(sparse_input.values),
-                           dense_shape=sparse_input.dense_shape)
-  else:
-    sp_a = sparse_input
-  return sp_a
+    if use_binary_values:
+        sp_a = tf.SparseTensor(
+            indices=sparse_input.indices,
+            values=tf.ones_like(sparse_input.values),
+            dense_shape=sparse_input.dense_shape,
+        )
+    else:
+        sp_a = sparse_input
+    return sp_a
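
To close out, a small self-contained sketch of these test helpers together (sizes invented; TF1 session style as in the rest of the file):

import tensorflow as tf

# Build a random [8, 32] sparse batch with 16 nonzeros, then a binarized
# copy whose values are all ones (a common reference input in tests).
sparse_input = create_sparse_tensor(batch_size=8, input_size=32, num_values=16)
reference = create_reference_input(sparse_input, use_binary_values=True)

with tf.Session() as sess:
    # validate_indices=False because the random draw can repeat a position.
    dense = sess.run(tf.sparse.to_dense(reference, validate_indices=False))
    print(dense.shape)  # (8, 32)
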