diff --git a/.gitignore b/.gitignore index 82da18448..b3e91143c 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ .coverage .tox test_data/ +*.bak diff --git a/Contribute.md b/Contribute.md new file mode 100644 index 000000000..4a869c931 --- /dev/null +++ b/Contribute.md @@ -0,0 +1,191 @@ +# Contributing to the Model Zoo for Intel® Architecture + +## Adding scripts for a new TensorFlow model + +### Code updates + +In order to add a new model to the zoo, there are a few things that are +required: + +1. Set up the directory structure to allow the + [launch script](/docs/general/tensorflow/LaunchBenchmark.md) to find + your model. This involves creating folders for: + `/benchmarks/<use case>/<framework>/<model name>/<mode>/<precision>`. + Note that you will need to add `__init__.py` files in each new + directory that you add, in order for python to find the code. + + ![Directory Structure](benchmarks_directory_structure.png) + +2. Next, in the leaf folder that was created in the previous step, you + will need to create `config.json` and `model_init.py` files: + + ![Add model init](add_model_init_and_config.png) + + The `config.json` file contains the best known KMP environment variable + settings to get optimal performance for the model. The default settings below are recommended for most of + the models in the Model Zoo. + + ``` + { + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } + } + ``` + + The `model_init.py` file is used to initialize the best known configuration for the + model, and then start executing inference or training. When the + [launch script](/docs/general/tensorflow/LaunchBenchmark.md) is run, + it will look for the appropriate `model_init.py` file to use + according to the model name, framework, mode, and precision that are + specified by the user. + + The contents of the `model_init.py` file will vary by framework. 
For + TensorFlow models, we typically use the + [base model init class](/benchmarks/common/base_model_init.py) that + includes functions for doing common tasks such as setting up the best + known environment variables (`KMP_BLOCKTIME`, `KMP_SETTINGS`, and + `KMP_AFFINITY`, which are loaded from `config.json`, as well as `OMP_NUM_THREADS`), num intra threads, and num + inter threads. The `model_init.py` file also sets up the string that + will ultimately be used to run inference or model training, which + normally includes the use of `numactl` and sending all of the + appropriate arguments to the model's script. Also, if your model + requires any non-standard arguments (arguments that are not part of + the [launch script flags](/docs/general/tensorflow/LaunchBenchmark.md#launch_benchmarkpy-flags)), + the `model_init.py` file is where you would define and parse those + args. + +3. [start.sh](/benchmarks/common/tensorflow/start.sh) is a shell script + that is called by the `launch_benchmark.py` script in the docker + container. This script installs dependencies that are required by + the model, sets up the `PYTHONPATH` environment variable, and then + calls the [run_tf_benchmark.py](/benchmarks/common/tensorflow/run_tf_benchmark.py) + script with the appropriate args. That run script will end up calling + the `model_init.py` file that you have defined in the previous step. + + To add support for a new model in the `start.sh` script, you will + need to add a function with the same name as your model. Note that + this function name should match the `<model name>` folder from the + first step where you set up the directories for your model. In this + function, add commands to install any third-party dependencies within + an `if [ ${NOINSTALL} != "True" ]; then` conditional block. The + purpose of the `NOINSTALL` flag is to be able to skip the installs + for quicker iteration when running on bare metal or debugging. 
If + your model requires the `PYTHONPATH` environment variable to be set up + to find model code or dependencies, that should be done in the + model's function. Next, set up the command that will be run. The + standard launch script args are already added to the `CMD` variable, + so your model function will only need to add on more args if you have + model-specific args defined in your `model_init.py`. Lastly, call the + `run_model` function with the `PYTHONPATH` and the `CMD` string. + + Below is a sample template of a `start.sh` model function that + installs dependencies from a `requirements.txt` file, sets up the + `PYTHONPATH` to find model source files, adds on a custom steps flag + to the run command, and then runs the model: + ```bash + function <model_name>() { + if [ ${PRECISION} == "fp32" ]; then + if [ ${NOINSTALL} != "True" ]; then + pip install -r ${MOUNT_EXTERNAL_MODELS_SOURCE}/requirements.txt + fi + + export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE} + CMD="${CMD} $(add_steps_args)" + PYTHONPATH=${PYTHONPATH} CMD=${CMD} run_model + else + echo "PRECISION=${PRECISION} is not supported for ${MODEL_NAME}" + exit 1 + fi + } + ``` + +Optional step: +* If there is CPU-optimized model code that has not been upstreamed to + the original repository, then it can be added to the + [models](/models) directory in the zoo repo. As with the first step + in the previous section, the directory structure should be set up like: + `/models/<use case>/<framework>/<model name>/<mode>/<precision>`. + + ![Models Directory Structure](models_directory_structure.png) + + If there are model files that can be shared by multiple modes or + precisions, they can be placed in the higher-level directory. For + example, if a file could be shared by both `FP32` and `Int8` + precisions, then it could be placed in the directory at: + `/models/<use case>/<framework>/<model name>/<mode>` (omitting the + `<precision>` directory). 
Note that if this is being done, you need to + ensure that the license that is associated with the original model + repository is compatible with the license of the model zoo. + +### Debugging + +There are a couple of options for debugging and quicker iteration when +developing new scripts: +* Use the `--debug` flag in the `launch_benchmark.py` script, which will + give you a shell into the docker container. See the + [debugging section](/docs/general/tensorflow/LaunchBenchmark.md#debugging) + of the launch script documentation for more information on using this + flag. +* Run the launch script on bare metal (without a docker container). The + launch script documentation also has a + [section](/docs/general/tensorflow/LaunchBenchmark.md#alpha-feature-running-on-bare-metal) + with instructions on how to do this. Note that when running without + docker, you are responsible for installing all dependencies on your + system before running the launch script. If you are using this option + during development, be sure to also test _with_ a docker container to + ensure that the `start.sh` script dependency installation is working + properly for your model. + +### Documentation updates + +1. Create a `README.md` file in the + `/benchmarks/<use case>/<framework>/<model name>` directory: + + ![Add README file](add_readme.png) + + This README file should describe all of the steps necessary to run + the model, including downloading and preprocessing the dataset, + downloading the pretrained model, cloning repositories, and running + the model script with the appropriate arguments. Most models + have best known settings for batch and online inference performance + testing as well as testing accuracy. The README file should specify + how to set these configs using the `launch_benchmark.py` script. + +2. Update the table in the [main `benchmarks` README](/benchmarks/README.md) + with a link to the model that you are adding. 
Note that the models + in this table are ordered alphabetically by use case, framework, and + model name. The model name should link to the original paper for the + model. The instructions column should link to the README + file that you created in the previous step. + +### Testing + +1. After you've completed the above steps, run the model according to + instructions in the README file for the new model. Ensure that the + performance and accuracy metrics are on par with what you would + expect. + +2. Add unit tests to cover the new model. + * For TensorFlow models, there is a + [parameterized test](/tests/unit/common/tensorflow/test_run_tf_benchmarks.py#L80) + that checks the flow running from `run_tf_benchmark.py` to the + inference command that is executed by the `model_init.py` file. The + test ensures that the inference command has all of the expected + arguments. + + To add a new parameterized instance of the test for your + new model, add a new JSON file `tf_<model_name>_args.json` to the [tf_model_args](/tests/unit/common/tensorflow/tf_model_args) + directory. Each file contains a list of dictionaries, and each dictionary has three + items: (1) `_comment`: a comment that describes the command, + (2) `input`: the `run_tf_benchmark.py` command with the appropriate + flags to run the model, and (3) `output`: the expected inference or training + command that should get run by the `model_init.py` file. + * If any launch script or base class files were changed, then + additional unit tests should be added. + * Unit tests and style checks are run when you post a GitHub PR, and + the tests must be passing before the PR is merged. + * For information on how to run the unit tests and style checks + locally, see the [tests documentation](/tests/README.md). 
diff --git a/Jenkinsfile b/Jenkinsfile index eac6e7fc9..0eb363206 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,8 +16,8 @@ node('skx') { sudo apt-get install -y python3-dev || sudo yum install -y python36-devel.x86_64 # virtualenv 16.3.0 is broken do not use it - python2 -m pip install --force-reinstall --user --upgrade pip virtualenv!=16.3.0 tox - python3 -m pip install --force-reinstall --user --upgrade pip virtualenv!=16.3.0 tox + python2 -m pip install --no-cache-dir --user --upgrade pip==19.0.3 virtualenv!=16.3.0 tox + python3 -m pip install --no-cache-dir --user --upgrade pip==19.0.3 virtualenv!=16.3.0 tox """ } stage('Style tests') { diff --git a/README.md b/README.md index 54b69df95..eb326584b 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ This repository contains **links to pre-trained models, sample scripts, best pra - Show how to efficiently execute, train, and deploy Intel-optimized models - Make it easy to get started running Intel-optimized models on Intel hardware in the cloud or on bare metal -***DISCLAIMER: These scripts are not intended for benchmarking Intel platforms. For any performance and/or benchmarking information on specific Intel platforms, visit [https://www.intel.ai/blog](https://www.intel.ai/blog).*** +***DISCLAIMER: These scripts are not intended for benchmarking Intel platforms. +For any performance and/or benchmarking information on specific Intel platforms, visit [https://www.intel.ai/blog](https://www.intel.ai/blog).*** ## How to Use the Model Zoo @@ -31,3 +32,6 @@ We hope this structure is intuitive and helps you find what you are looking for; ![Repo Structure](repo_structure.png) *Note: For model quantization and optimization tools, see [https://github.com/IntelAI/tools](https://github.com/IntelAI/tools)*. + +## How to Contribute +If you would like to add a new benchmarking script, please use [this guide](/Contribute.md). 
diff --git a/add_model_init_and_config.png b/add_model_init_and_config.png new file mode 100644 index 000000000..ef9b88290 Binary files /dev/null and b/add_model_init_and_config.png differ diff --git a/add_readme.png b/add_readme.png new file mode 100644 index 000000000..f28783bad Binary files /dev/null and b/add_readme.png differ diff --git a/benchmarks/README.md b/benchmarks/README.md index 787949b75..a1bac907b 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -11,7 +11,7 @@ dependencies to be installed: * [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) * `wget` for downloading pre-trained models -## Use Cases +## TensorFlow Use Cases | Use Case | Framework | Model | Mode | Instructions | | -----------------------| --------------| ------------------- | --------- |------------------------------| @@ -19,23 +19,36 @@ dependencies to be installed: | Content Creation | TensorFlow | [DRAW](https://arxiv.org/pdf/1502.04623.pdf) | Inference | [FP32](content_creation/tensorflow/draw/README.md#fp32-inference-instructions) | | Face Detection and Alignment | Tensorflow | [FaceNet](https://arxiv.org/pdf/1503.03832.pdf) | Inference | [FP32](face_detection_and_alignment/tensorflow/facenet/README.md#fp32-inference-instructions) | | Face Detection and Alignment | TensorFlow | [MTCC](https://arxiv.org/pdf/1604.02878.pdf) | Inference | [FP32](face_detection_and_alignment/tensorflow/mtcc/README.md#fp32-inference-instructions) | +| Image Recognition | TensorFlow | [DenseNet169](https://arxiv.org/pdf/1608.06993.pdf) | Inference | [FP32](image_recognition/tensorflow/densenet169/README.md#fp32-inference-instructions) | | Image Recognition | TensorFlow | [Inception ResNet V2](https://arxiv.org/pdf/1602.07261.pdf) | Inference | [Int8](image_recognition/tensorflow/inception_resnet_v2/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/inception_resnet_v2/README.md#fp32-inference-instructions) | | Image Recognition | TensorFlow | 
[Inception V3](https://arxiv.org/pdf/1512.00567.pdf) | Inference | [Int8](image_recognition/tensorflow/inceptionv3/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/inceptionv3/README.md#fp32-inference-instructions) | | Image Recognition | TensorFlow | [Inception V4](https://arxiv.org/pdf/1602.07261.pdf) | Inference | [Int8](image_recognition/tensorflow/inceptionv4/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/inceptionv4/README.md#fp32-inference-instructions) | -| Image Recognition | TensorFlow | [MobileNet V1](https://arxiv.org/pdf/1704.04861.pdf) | Inference | [FP32](image_recognition/tensorflow/mobilenet_v1/README.md#fp32-inference-instructions) | +| Image Recognition | TensorFlow | [MobileNet V1](https://arxiv.org/pdf/1704.04861.pdf) | Inference | [Int8](image_recognition/tensorflow/mobilenet_v1/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/mobilenet_v1/README.md#fp32-inference-instructions) | | Image Recognition | TensorFlow | [ResNet 101](https://arxiv.org/pdf/1512.03385.pdf) | Inference | [Int8](image_recognition/tensorflow/resnet101/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet101/README.md#fp32-inference-instructions) | | Image Recognition | TensorFlow | [ResNet 50](https://arxiv.org/pdf/1512.03385.pdf) | Inference | [Int8](image_recognition/tensorflow/resnet50/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet50/README.md#fp32-inference-instructions) | +| Image Recognition | TensorFlow | [ResNet 50v1.5](https://github.com/tensorflow/models/tree/master/official/resnet) | Inference | [Int8](image_recognition/tensorflow/resnet50v1_5/README.md#int8-inference-instructions) [FP32](image_recognition/tensorflow/resnet50v1_5/README.md#fp32-inference-instructions) | | Image Recognition | TensorFlow | [SqueezeNet](https://arxiv.org/pdf/1602.07360.pdf) | Inference | 
[FP32](image_recognition/tensorflow/squeezenet/README.md#fp32-inference-instructions) | | Image Segmentation | TensorFlow | [Mask R-CNN](https://arxiv.org/pdf/1703.06870.pdf) | Inference | [FP32](image_segmentation/tensorflow/maskrcnn/README.md#fp32-inference-instructions) | | Image Segmentation | TensorFlow | [UNet](https://arxiv.org/pdf/1505.04597.pdf) | Inference | [FP32](image_segmentation/tensorflow/unet/README.md#fp32-inference-instructions) | +| Language Modeling | TensorFlow | [LM-1B](https://arxiv.org/pdf/1602.02410.pdf) | Inference | [FP32](language_modeling/tensorflow/lm-1b/README.md#fp32-inference-instructions) | | Language Translation | TensorFlow | [GNMT](https://arxiv.org/pdf/1609.08144.pdf) | Inference | [FP32](language_translation/tensorflow/gnmt/README.md#fp32-inference-instructions) | | Language Translation | TensorFlow | [Transformer Language](https://arxiv.org/pdf/1706.03762.pdf)| Inference | [FP32](language_translation/tensorflow/transformer_language/README.md#fp32-inference-instructions) | | Language Translation | TensorFlow | [Transformer_LT_Official ](https://arxiv.org/pdf/1706.03762.pdf)| Inference | [FP32](language_translation/tensorflow/transformer_lt_official/README.md#fp32-inference-instructions) | -| Object Detection | TensorFlow | [R-FCN](https://arxiv.org/pdf/1605.06409.pdf) | Inference | [FP32](object_detection/tensorflow/rfcn/README.md#fp32-inference-instructions) | +| Object Detection | TensorFlow | [R-FCN](https://arxiv.org/pdf/1605.06409.pdf) | Inference | [Int8](object_detection/tensorflow/rfcn/README.md#int8-inference-instructions) [FP32](object_detection/tensorflow/rfcn/README.md#fp32-inference-instructions) | | Object Detection | TensorFlow | [Faster R-CNN](https://arxiv.org/pdf/1506.01497.pdf) | Inference | [Int8](object_detection/tensorflow/faster_rcnn/README.md#int8-inference-instructions) [FP32](object_detection/tensorflow/faster_rcnn/README.md#fp32-inference-instructions) | -| Object Detection | TensorFlow | 
[SSD-MobileNet](https://arxiv.org/pdf/1704.04861.pdf) | Inference | [FP32](object_detection/tensorflow/ssd-mobilenet/README.md#fp32-inference-instructions) | -| Object Detection | TensorFlow | [SSD-ResNet34](https://arxiv.org/pdf/1512.02325.pdf) | Inference | [FP32](object_detection/tensorflow/ssd-resnet34/README.md#fp32-inference-instructions) | +| Object Detection | TensorFlow | [SSD-MobileNet](https://arxiv.org/pdf/1704.04861.pdf) | Inference | [Int8](object_detection/tensorflow/ssd-mobilenet/README.md#int8-inference-instructions) [FP32](object_detection/tensorflow/ssd-mobilenet/README.md#fp32-inference-instructions) | +| Object Detection | TensorFlow | [SSD-ResNet34](https://arxiv.org/pdf/1512.02325.pdf) | Inference | [Int8](object_detection/tensorflow/ssd-resnet34/README.md#int8-inference-instructions) [FP32](object_detection/tensorflow/ssd-resnet34/README.md#fp32-inference-instructions) | +| Object Detection | TensorFlow | [SSD-VGG16](https://arxiv.org/pdf/1512.02325.pdf) | Inference | [Int8](object_detection/tensorflow/ssd_vgg16/README.md#int8-inference-instructions) [FP32](object_detection/tensorflow/ssd_vgg16/README.md#fp32-inference-instructions) | | Recommendation | TensorFlow | [NCF](https://arxiv.org/pdf/1708.05031.pdf) | Inference | [FP32](recommendation/tensorflow/ncf/README.md#fp32-inference-instructions) | | Recommendation | TensorFlow | [Wide & Deep Large Dataset](https://arxiv.org/pdf/1606.07792.pdf) | Inference | [Int8](recommendation/tensorflow/wide_deep_large_ds/README.md#int8-inference-instructions) [FP32](recommendation/tensorflow/wide_deep_large_ds/README.md#fp32-inference-instructions) | | Recommendation | TensorFlow | [Wide & Deep](https://arxiv.org/pdf/1606.07792.pdf) | Inference | [FP32](recommendation/tensorflow/wide_deep/README.md#fp32-inference-instructions) | | Text-to-Speech | TensorFlow | [WaveNet](https://arxiv.org/pdf/1609.03499.pdf) | Inference | [FP32](text_to_speech/tensorflow/wavenet/README.md#fp32-inference-instructions) | 
+ + +## TensorFlow Serving Use Cases + + +| Use Case | Framework | Model | Mode | Instructions | +| -----------------------| --------------| ------------------- | --------- |------------------------------| +| Image Recognition | TensorFlow Serving | [Inception V3](https://arxiv.org/pdf/1512.00567.pdf) | Inference | [FP32](image_recognition/tensorflow_serving/inceptionv3/README.md#fp32-inference-instructions) | + diff --git a/benchmarks/adversarial_networks/tensorflow/dcgan/README.md b/benchmarks/adversarial_networks/tensorflow/dcgan/README.md index e23fc9c6a..4950d0f63 100644 --- a/benchmarks/adversarial_networks/tensorflow/dcgan/README.md +++ b/benchmarks/adversarial_networks/tensorflow/dcgan/README.md @@ -60,7 +60,7 @@ $ python launch_benchmark.py \ --socket-id 0 \ --checkpoint /home//dcgan_fp32_unconditional_cifar10_pretrained_model \ --data-location /home//cifar10 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` 5. Log files are located at the value of `--output-dir`. 
@@ -71,8 +71,6 @@ Batch size: 100 Batches number: 500 Time spent per BATCH: 35.8268 ms Total samples/sec: 2791.2030 samples/s -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_dcgan_inference_fp32_20190117_220342.log ``` \ No newline at end of file diff --git a/benchmarks/adversarial_networks/tensorflow/dcgan/inference/fp32/config.json b/benchmarks/adversarial_networks/tensorflow/dcgan/inference/fp32/config.json new file mode 100644 index 000000000..dfac18793 --- /dev/null +++ b/benchmarks/adversarial_networks/tensorflow/dcgan/inference/fp32/config.json @@ -0,0 +1,8 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1, + "KMP_HW_SUBSET": "1T" + } +} diff --git a/benchmarks/adversarial_networks/tensorflow/dcgan/inference/fp32/model_init.py b/benchmarks/adversarial_networks/tensorflow/dcgan/inference/fp32/model_init.py index aed323e94..2e2f88104 100644 --- a/benchmarks/adversarial_networks/tensorflow/dcgan/inference/fp32/model_init.py +++ b/benchmarks/adversarial_networks/tensorflow/dcgan/inference/fp32/model_init.py @@ -37,13 +37,13 @@ def __init__(self, args, custom_args=[], platform_util=None): self.set_num_inter_intra_threads() # Set KMP env vars, if they haven't already been set - self.set_kmp_vars() - set_env_var("KMP_HW_SUBSET", "1T") + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) benchmark_script = os.path.join( self.args.intelai_models, args.mode, args.precision, "inference_bench.py") - self.benchmark_command = self.get_numactl_command(args.socket_id) + \ + self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) diff --git a/benchmarks/common/base_benchmark_util.py 
b/benchmarks/common/base_benchmark_util.py index adb102c3c..1aefdebd0 100644 --- a/benchmarks/common/base_benchmark_util.py +++ b/benchmarks/common/base_benchmark_util.py @@ -23,6 +23,7 @@ from __future__ import print_function import os +import sys from argparse import ArgumentParser from common import platform_util @@ -47,6 +48,9 @@ def _define_args(self): """define args for the benchmark interface shared by FP32 and int8 models""" + # only require the arg, if we aren't just printing out --help + required_arg = "--help" not in sys.argv + self._common_arg_parser = ArgumentParser( add_help=False, description="Parse args for base benchmark " "interface") @@ -54,7 +58,7 @@ def _define_args(self): self._common_arg_parser.add_argument( "-f", "--framework", help="Specify the name of the deep learning framework to use.", - dest="framework", default=None, required=True) + dest="framework", default=None, required=required_arg) self._common_arg_parser.add_argument( "-r", "--model-source-dir", @@ -64,15 +68,15 @@ def _define_args(self): self._common_arg_parser.add_argument( "-p", "--precision", help="Specify the model precision to use: fp32, int8, or bfloat16", - required=True, choices=["fp32", "int8", "bfloat16"], + required=required_arg, choices=["fp32", "int8", "bfloat16"], dest="precision") self._common_arg_parser.add_argument( "-mo", "--mode", help="Specify the type training or inference ", - required=True, choices=["training", "inference"], dest="mode") + required=required_arg, choices=["training", "inference"], dest="mode") self._common_arg_parser.add_argument( - "-m", "--model-name", required=True, + "-m", "--model-name", required=required_arg, help="model name to run benchmarks for", dest="model_name") self._common_arg_parser.add_argument( @@ -128,7 +132,9 @@ def _define_args(self): help="Specify the location of trained model checkpoint directory. " "If mode=training model/weights will be written to this " "location. 
If mode=inference assumes that the location points" - " to a model that has already been trained.", + " to a model that has already been trained. Note that using " + "checkpoint files for inference is being deprecated, in favor " + "of using frozen graphs.", dest="checkpoint", default=None, type=check_valid_folder) self._common_arg_parser.add_argument( @@ -155,6 +161,30 @@ def _define_args(self): "with --accuracy-only and --mode=inference.", dest="output_results", action="store_true") + # Note this can't be a normal boolean flag, because we need to know when the user + # does not explicitly set the arg value so that we can apply the appropriate + # default value, depending on the the precision. + self._common_arg_parser.add_argument( + "--disable-tcmalloc", + help="When TCMalloc is enabled, the google-perftools are installed (if running " + "using docker) and the LD_PRELOAD environment variable is set to point to " + "the TCMalloc library file. The TCMalloc memory allocator produces better " + "performance results with smaller batch sizes. This flag disables the use of " + "TCMalloc when set to True. For int8 benchmarking, TCMalloc is enabled by " + "default (--disable-tcmalloc=False). For other precisions, the flag is " + "--disable-tcmalloc=True by default.", + dest="disable_tcmalloc", choices=["True", "False"], + default=None + ) + + self._common_arg_parser.add_argument( + "--tcmalloc-large-alloc-report-threshold", + help="Sets the TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD environment variable to " + "the specified value. The environment variable sets the threshold (in bytes) " + "for when large memory allocation messages will be displayed.", + dest="tcmalloc_large_alloc_report_threshold", default=2147483648, type=int + ) + self._common_arg_parser.add_argument( "-v", "--verbose", help="Print verbose information.", dest="verbose", action="store_true") @@ -198,7 +228,8 @@ def _validate_args(self): raise ValueError("Number of cores exceeds system core number: {}". 
format(system_num_cores)) - if args.output_results and (args.model_name != "resnet50" or args.precision != "fp32"): + if args.output_results and ((args.model_name != "resnet50" and + args.model_name != "resnet50v1_5") or args.precision != "fp32"): raise ValueError("--output-results is currently only supported for resnet50 FP32 inference.") elif args.output_results and (args.mode != "inference" or not args.data_location): raise ValueError("--output-results can only be used when running inference with a dataset.") diff --git a/benchmarks/common/base_model_init.py b/benchmarks/common/base_model_init.py index 6294190d8..4a334ca65 100644 --- a/benchmarks/common/base_model_init.py +++ b/benchmarks/common/base_model_init.py @@ -18,6 +18,8 @@ # SPDX-License-Identifier: EPL-2.0 # +import glob +import json import os @@ -42,6 +44,13 @@ def __init__(self, args, custom_args=[], platform_util=None): self.custom_args = custom_args self.platform_util = platform_util + # Set default values for TCMalloc and convert string value to a boolean + if self.args.disable_tcmalloc is None: + # Set to False for int8 and True for other precisions + self.args.disable_tcmalloc = self.args.precision != "int8" + elif isinstance(self.args.disable_tcmalloc, str): + self.args.disable_tcmalloc = self.args.disable_tcmalloc == "True" + # Ensure that we are using the proper version of python to run the benchmarking script self.python_exe = os.environ["PYTHON_EXE"] @@ -61,15 +70,32 @@ def run_command(self, cmd): os.system(cmd) - def get_numactl_command(self, socket_id): + def get_command_prefix(self, socket_id, numactl=True): """ - Returns the numactl command with --cpunodebind and --membind set to the - specified socket_id. If socket_id is set to -1 (undefined) then an - empty string is returned. 
+ Returns the command prefix with: + - LD_PRELOAD for int8 models (if tcmalloc is not disabled) + - The numactl command with --cpunodebind and --membind set to the specified socket_id (if numactl=True) """ - return "" if socket_id == -1 else \ - "numactl --cpunodebind={0} --membind={0} ".format( - str(socket_id)) + command = "" + + if not self.args.disable_tcmalloc: + # Try to find the TCMalloc library file + matches = glob.glob("/usr/lib/libtcmalloc.so*") + + if len(matches) == 0: + matches = glob.glob("/usr/lib64/libtcmalloc.so*") + + if len(matches) > 0: + command += "LD_PRELOAD={} ".format(matches[0]) + else: + # Unable to find the TCMalloc library file + print("Warning: Unable to find the TCMalloc library file (libtcmalloc.so) in /usr/lib or /usr/lib64, " + "so the LD_PRELOAD environment variable will not be set.") + + if socket_id != -1 and numactl: + command += "numactl --cpunodebind={0} --membind={0} ".format(str(socket_id)) + + return command def add_args_to_command(self, command, arg_list): """ @@ -135,14 +161,28 @@ def set_num_inter_intra_threads(self, num_inter_threads=None, num_intra_threads= print("num_inter_threads: {}\nnum_intra_threads: {}".format( self.args.num_inter_threads, self.args.num_intra_threads)) - def set_kmp_vars(self, kmp_settings="1", kmp_blocktime="1", kmp_affinity="granularity=fine,verbose,compact,1,0"): + def set_kmp_vars(self, config_file_path, kmp_settings=None, kmp_blocktime=None, kmp_affinity=None): """ Sets KMP_* environment variables to the specified value, if the environment variable has not already been set. - The default values for this function's args are the most common values that we have seen in the model zoo. + The default values in the json file are the best known settings for the model. 
""" + if os.path.exists(config_file_path): + with open(config_file_path, 'r') as config: + config_object = json.load(config) + + # First sets default from config file + for param in config_object.keys(): + for env in config_object[param].keys(): + set_env_var(env, config_object[param][env]) + + else: + print("Warning: File {} does not exist and \ + cannot be used to set KMP environment variables".format(config_file_path)) + + # Override user provided envs if kmp_settings: - set_env_var("KMP_SETTINGS", kmp_settings) + set_env_var("KMP_SETTINGS", kmp_settings, overwrite_existing=True) if kmp_blocktime: - set_env_var("KMP_BLOCKTIME", kmp_blocktime) + set_env_var("KMP_BLOCKTIME", kmp_blocktime, overwrite_existing=True) if kmp_affinity: - set_env_var("KMP_AFFINITY", kmp_affinity) + set_env_var("KMP_AFFINITY", kmp_affinity, overwrite_existing=True) diff --git a/benchmarks/common/tensorflow/start.sh b/benchmarks/common/tensorflow/start.sh index bc7fd699c..9ea5f9f02 100755 --- a/benchmarks/common/tensorflow/start.sh +++ b/benchmarks/common/tensorflow/start.sh @@ -45,6 +45,8 @@ echo " NUM_CORES: ${NUM_CORES}" echo " BENCHMARK_ONLY: ${BENCHMARK_ONLY}" echo " ACCURACY_ONLY: ${ACCURACY_ONLY}" echo " OUTPUT_RESULTS: ${OUTPUT_RESULTS}" +echo " DISABLE_TCMALLOC: ${DISABLE_TCMALLOC}" +echo " TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD: ${TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD}" echo " NOINSTALL: ${NOINSTALL}" echo " OUTPUT_DIR: ${OUTPUT_DIR}" @@ -58,10 +60,23 @@ if [[ ${NOINSTALL} != "True" ]]; then ## install common dependencies apt update apt full-upgrade -y + # Set env var before installs so that user interaction is not required + export DEBIAN_FRONTEND=noninteractive apt-get install python-tk numactl -y apt install -y libsm6 libxext6 pip install --upgrade pip pip install requests + + # install libgoogle-perftools-dev for tcmalloc + if [[ ${DISABLE_TCMALLOC} != "True" ]]; then + apt-get install --no-install-recommends --fix-missing google-perftools -y + if [ ! 
-f /usr/lib/libtcmalloc.so ]; then + apt-get install --no-install-recommends --fix-missing libgoogle-perftools-dev -y + if [ ! -f /usr/lib/libtcmalloc.so ]; then + ln -sf /usr/lib/x86_64-linux-gnu/libtcmalloc.so /usr/lib/libtcmalloc.so + fi + fi + fi fi verbose_arg="" @@ -170,6 +185,10 @@ if [ ${DATA_NUM_INTRA_THREADS} != "None" ]; then CMD="${CMD} --data-num-intra-threads=${DATA_NUM_INTRA_THREADS}" fi +if [ ${DISABLE_TCMALLOC} != "None" ]; then + CMD="${CMD} --disable-tcmalloc=${DISABLE_TCMALLOC}" +fi + function install_protoc() { pushd "${MOUNT_EXTERNAL_MODELS_SOURCE}/research" @@ -177,7 +196,7 @@ function install_protoc() { if [ ! -f "bin/protoc" ]; then install_location=$1 echo "protoc not found, installing protoc from ${install_location}" - apt-get -y install wget + apt-get -y install wget unzip wget -O protobuf.zip ${install_location} unzip -o protobuf.zip rm protobuf.zip @@ -278,6 +297,19 @@ function dcgan() { fi } +# DenseNet 169 model +function densenet169() { + if [ ${PRECISION} == "fp32" ]; then + CMD="${CMD} $(add_arg "--input_height" ${input_height}) $(add_arg "--input_width" ${input_width}) \ + $(add_arg "--warmup_steps" ${warmup_steps}) $(add_arg "--steps" ${steps}) $(add_arg "--input_layer" ${input_layer}) \ + $(add_arg "--output_layer" ${output_layer})" + PYTHONPATH=${PYTHONPATH} CMD=${CMD} run_model + else + echo "PRECISION=${PRECISION} is not supported for ${MODEL_NAME}" + exit 1 + fi +} + # DRAW model function draw() { if [ ${PRECISION} == "fp32" ]; then @@ -397,6 +429,18 @@ function inception_resnet_v2() { fi } +# language modeling lm-1b +function lm-1b() { + if [ ${PRECISION} == "fp32" ]; then + CMD="${CMD} $(add_steps_args)" + + PYTHONPATH=${PYTHONPATH} CMD=${CMD} run_model + else + echo "PRECISION=${PRECISION} is not supported for ${MODEL_NAME}" + exit 1 + fi +} + # Mask R-CNN model function maskrcnn() { if [ ${PRECISION} == "fp32" ]; then @@ -405,6 +449,7 @@ function maskrcnn() { if [ ${NOINSTALL} != "True" ]; then # install dependencies 
pip3 install -r ${MOUNT_EXTERNAL_MODELS_SOURCE}/requirements.txt + pip3 install --force-reinstall scipy==1.2.1 Pillow==5.3.0 # install cocoapi get_cocoapi ${MOUNT_EXTERNAL_MODELS_SOURCE}/coco ${MOUNT_EXTERNAL_MODELS_SOURCE}/samples/coco @@ -423,6 +468,11 @@ function mobilenet_v1() { if [ ${PRECISION} == "fp32" ]; then export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE}:${MOUNT_EXTERNAL_MODELS_SOURCE}/research:${MOUNT_EXTERNAL_MODELS_SOURCE}/research/slim PYTHONPATH=${PYTHONPATH} CMD=${CMD} run_model + elif [ ${PRECISION} == "int8" ]; then + CMD="${CMD} $(add_arg "--input_height" ${input_height}) $(add_arg "--input_width" ${input_width}) \ + $(add_arg "--warmup_steps" ${warmup_steps}) $(add_arg "--steps" ${steps}) $(add_arg "--input_layer" ${input_layer}) \ + $(add_arg "--output_layer" ${output_layer})" + PYTHONPATH=${PYTHONPATH} CMD=${CMD} run_model else echo "PRECISION=${PRECISION} is not supported for ${MODEL_NAME}" exit 1 @@ -501,6 +551,7 @@ function rfcn() { if [ ${NOINSTALL} != "True" ]; then # install dependencies pip install -r "${MOUNT_BENCHMARK}/object_detection/tensorflow/rfcn/requirements.txt" + original_dir=$(pwd) cd "${MOUNT_EXTERNAL_MODELS_SOURCE}/research" @@ -511,6 +562,10 @@ function rfcn() { get_cocoapi ${MOUNT_EXTERNAL_MODELS_SOURCE}/cocoapi ${MOUNT_EXTERNAL_MODELS_SOURCE}/research/ fi + # Fix the object_detection_evaluation.py file to change unicode() to str() so that it works in py3 + chmod -R 777 ${MOUNT_EXTERNAL_MODELS_SOURCE}/research/object_detection/utils/object_detection_evaluation.py + sed -i.bak "s/unicode(/str(/g" ${MOUNT_EXTERNAL_MODELS_SOURCE}/research/object_detection/utils/object_detection_evaluation.py + split_arg="" if [ -n "${split}" ] && [ ${ACCURACY_ONLY} == "True" ]; then split_arg="--split=${split}" @@ -586,12 +641,19 @@ function ssd_mobilenet() { # SSD-ResNet34 model function ssd-resnet34() { - if [ ${PRECISION} == "fp32" ]; then + if [ ${PRECISION} == "fp32" ] || [ ${PRECISION} == "int8" ]; then if [ 
${NOINSTALL} != "True" ]; then for line in $(cat ${MOUNT_BENCHMARK}/object_detection/tensorflow/ssd-resnet34/requirements.txt) do pip install $line done + apt install -y git-all + old_dir=${PWD} + cd /tmp + git clone --single-branch https://github.com/tensorflow/benchmarks.git + cd benchmarks + git checkout 1e7d788042dfc6d5e5cd87410c57d5eccee5c664 + cd ${old_dir} fi CMD=${CMD} run_model @@ -601,6 +663,32 @@ function ssd-resnet34() { fi } +# SSD-VGG16 model +function ssd_vgg16() { + + if [ ${NOINSTALL} != "True" ]; then + pip install opencv-python Cython + + if [ ${ACCURACY_ONLY} == "True" ]; then + # get the python cocoapi + get_cocoapi ${MOUNT_EXTERNAL_MODELS_SOURCE}/coco ${MOUNT_INTELAI_MODELS_SOURCE}/inference + fi + fi + + cp ${MOUNT_INTELAI_MODELS_SOURCE}/__init__.py ${MOUNT_EXTERNAL_MODELS_SOURCE}/dataset + cp ${MOUNT_INTELAI_MODELS_SOURCE}/__init__.py ${MOUNT_EXTERNAL_MODELS_SOURCE}/preprocessing + cp ${MOUNT_INTELAI_MODELS_SOURCE}/__init__.py ${MOUNT_EXTERNAL_MODELS_SOURCE}/utility + export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE} + + if [ ${PRECISION} == "int8" ] || [ ${PRECISION} == "fp32" ]; then + CMD="${CMD} $(add_steps_args)" + PYTHONPATH=${PYTHONPATH} CMD=${CMD} run_model + else + echo "PRECISION=${PRECISION} is not supported for ${MODEL_NAME}" + exit 1 + fi +} + # UNet model function unet() { if [ ${PRECISION} == "fp32" ]; then @@ -629,10 +717,6 @@ function transformer_language() { echo "transformer-language requires -- decode_from_file arg to be defined" exit 1 fi - if [[ -z "${reference}" ]]; then - echo "transformer-language requires -- reference arg to be defined" - exit 1 - fi if [[ -z "${CHECKPOINT_DIRECTORY}" ]]; then echo "transformer-language requires --checkpoint arg to be defined" exit 1 @@ -650,8 +734,11 @@ function transformer_language() { cp ${MOUNT_INTELAI_MODELS_SOURCE}/${MODE}/${PRECISION}/decoding.py ${MOUNT_EXTERNAL_MODELS_SOURCE}/tensor2tensor/utils/decoding.py - CMD="${CMD} 
--decode_from_file=${CHECKPOINT_DIRECTORY}/${decode_from_file} \ - --reference=${CHECKPOINT_DIRECTORY}/${reference}" + CMD="${CMD} --decode_from_file=${CHECKPOINT_DIRECTORY}/${decode_from_file}" + + if [[ -n "${reference}" ]]; then + CMD="${CMD} --reference=${CHECKPOINT_DIRECTORY}/${reference}" + fi PYTHONPATH=${PYTHONPATH} CMD=${CMD} run_model else @@ -681,6 +768,10 @@ function transformer_lt_official() { exit 1 fi + if [ ${NOINSTALL} != "True" ]; then + pip install pandas + fi + cp ${MOUNT_INTELAI_MODELS_SOURCE}/${MODE}/${PRECISION}/infer_ab.py \ ${MOUNT_EXTERNAL_MODELS_SOURCE}/official/transformer/infer_ab.py @@ -752,7 +843,13 @@ function wide_deep_large_ds() { if [[ -z "${LIBTCMALLOC}" ]]; then echo "libtcmalloc.so.4 not found, trying to install" apt-get update - apt-get install google-perftools --fix-missing -y + apt-get install --no-install-recommends --fix-missing google-perftools -y + if [ ! -f /usr/lib/libtcmalloc.so ]; then + apt-get install --no-install-recommends --fix-missing libgoogle-perftools-dev -y + if [ ! 
-f /usr/lib/libtcmalloc.so ]; then + ln -sf /usr/lib/x86_64-linux-gnu/libtcmalloc.so /usr/lib/libtcmalloc.so + fi + fi fi LIBTCMALLOC="$(ldconfig -p | grep $TCMALLOC_LIB | tr ' ' '\n' | grep /)" @@ -789,6 +886,8 @@ echo "Log output location: ${LOGFILE}" MODEL_NAME=$(echo ${MODEL_NAME} | tr 'A-Z' 'a-z') if [ ${MODEL_NAME} == "dcgan" ]; then dcgan +elif [ ${MODEL_NAME} == "densenet169" ]; then + densenet169 elif [ ${MODEL_NAME} == "draw" ]; then draw elif [ ${MODEL_NAME} == "facenet" ]; then @@ -803,6 +902,8 @@ elif [ ${MODEL_NAME} == "inceptionv4" ]; then inceptionv4 elif [ ${MODEL_NAME} == "inception_resnet_v2" ]; then inception_resnet_v2 +elif [ ${MODEL_NAME} == "lm-1b" ]; then + lm-1b elif [ ${MODEL_NAME} == "maskrcnn" ]; then maskrcnn elif [ ${MODEL_NAME} == "mobilenet_v1" ]; then @@ -815,6 +916,8 @@ elif [ ${MODEL_NAME} == "resnet101" ]; then resnet50_101_inceptionv3 elif [ ${MODEL_NAME} == "resnet50" ]; then resnet50_101_inceptionv3 +elif [ ${MODEL_NAME} == "resnet50v1_5" ]; then + resnet50_101_inceptionv3 elif [ ${MODEL_NAME} == "rfcn" ]; then rfcn elif [ ${MODEL_NAME} == "squeezenet" ]; then @@ -823,6 +926,8 @@ elif [ ${MODEL_NAME} == "ssd-mobilenet" ]; then ssd_mobilenet elif [ ${MODEL_NAME} == "ssd-resnet34" ]; then ssd-resnet34 +elif [ ${MODEL_NAME} == "ssd_vgg16" ]; then + ssd_vgg16 elif [ ${MODEL_NAME} == "unet" ]; then unet elif [ ${MODEL_NAME} == "transformer_language" ]; then diff --git a/benchmarks/common/tensorflow_serving/__init__.py b/benchmarks/common/tensorflow_serving/__init__.py new file mode 100644 index 000000000..cf793ec6a --- /dev/null +++ b/benchmarks/common/tensorflow_serving/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/common/tensorflow_serving/build_tfserving_image.sh b/benchmarks/common/tensorflow_serving/build_tfserving_image.sh new file mode 100644 index 000000000..a47505f88 --- /dev/null +++ b/benchmarks/common/tensorflow_serving/build_tfserving_image.sh @@ -0,0 +1,73 @@ +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +# Bash script to build tensorflow serving image +# Setup proxy on your terminal before running the script. 
+ +# To build image separately +# TF_SERVING_VERSION=1.13.0 MKL_IMAGE_TAG=tensorflow/serving:latest-mkl bash build_tfserving_image.sh + +#!/usr/bin/env bash +set -e +set -x + +WORKDIR=serving_workspace + +if [ -d ${WORKDIR} ]; then + rm -rf ${WORKDIR} +fi + +pushd $(pwd) + +mkdir -p ${WORKDIR} +cd ${WORKDIR} + +# Build Tensorflow Serving image +TF_SERVING_VERSION=${TF_SERVING_VERSION:-"1.13.0"} +echo "Using TF_SERVING_VERSION=${TF_SERVING_VERSION} to build docker image" + +# Clone official tensorflow serving repo +git clone https://github.com/tensorflow/serving.git + +TF_SERVING_ROOT=$(pwd)/serving +cd ${TF_SERVING_ROOT}/tensorflow_serving/tools/docker/ + +# Build Dockerfile.devel-mkl +docker build \ + --build-arg TF_SERVING_BAZEL_OPTIONS="--incompatible_disallow_data_transition=false --incompatible_disallow_filetype=false" \ + --build-arg TF_SERVING_VERSION_GIT_BRANCH=${TF_SERVING_VERSION} \ + --build-arg HTTP_PROXY=${HTTP_PROXY} \ + --build-arg HTTPS_PROXY=${HTTPS_PROXY} \ + --build-arg http_proxy=${http_proxy} \ + --build-arg https_proxy=${https_proxy} \ + -f Dockerfile.devel-mkl -t tensorflow/serving:latest-devel-mkl . + +# Build Dockerfile.mkl, which uses above image as base_image +docker build \ + --build-arg TF_SERVING_VERSION_GIT_BRANCH=${TF_SERVING_VERSION} \ + --build-arg HTTP_PROXY=${HTTP_PROXY} \ + --build-arg HTTPS_PROXY=${HTTPS_PROXY} \ + --build-arg http_proxy=${http_proxy} \ + --build-arg https_proxy=${https_proxy} \ + -f Dockerfile.mkl -t ${MKL_IMAGE_TAG} . 
+ +popd + +rm -rf ${WORKDIR} + +echo "Image built with tag: ${MKL_IMAGE_TAG}" diff --git a/benchmarks/common/tensorflow_serving/start.sh b/benchmarks/common/tensorflow_serving/start.sh new file mode 100644 index 000000000..b27e40ed5 --- /dev/null +++ b/benchmarks/common/tensorflow_serving/start.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# +#!/usr/bin/env bash +set -e +set -x + +echo 'Running with parameters:' +echo " USE_CASE: ${USE_CASE}" +echo " FRAMEWORK: ${FRAMEWORK}" +echo " WORKSPACE: ${WORKSPACE}" +echo " IN_GRAPH: ${IN_GRAPH}" +echo " MODEL_NAME: ${MODEL_NAME}" +echo " MODE: ${MODE}" +echo " PRECISION: ${PRECISION}" +echo " BATCH_SIZE: ${BATCH_SIZE}" +echo " BENCHMARK_ONLY: ${BENCHMARK_ONLY}" +echo " ACCURACY_ONLY: ${ACCURACY_ONLY}" +echo " OMP_NUM_THREADS: ${OMP_NUM_THREADS}" +echo " NUM_INTRA_THREADS: ${NUM_INTRA_THREADS}" +echo " NUM_INTER_THREADS: ${NUM_INTER_THREADS}" +echo " OUTPUT_DIR: ${OUTPUT_DIR}" +echo " TF_SERVING_VERSION: ${TF_SERVING_VERSION}" + + +if [ ${ACCURACY_ONLY} == "True" ]; then + echo "Accuracy is not supported with Tensorflow Serving" + exit 1 +fi + +WORKDIR=workspace + +if [ -d ${WORKDIR} ]; then + rm -rf ${WORKDIR} +fi + +pushd $(pwd) + +mkdir -p ${WORKDIR} +cd ${WORKDIR} + +# Check docker +if ! 
[[ $(which docker) && $(docker --version) ]]; then + echo "Docker not found, please install docker to proceed." + exit 1 +fi + +# Check for pip +if ! [[ $(which pip) && $(pip --version) ]]; then + echo "pip not found, please install pip to proceed." + exit 1 +fi + +timestamp=`date +%Y%m%d_%H%M%S` +LOG_FILENAME="benchmark_${MODEL_NAME}_${MODE}_${PRECISION}_${timestamp}.log" +if [ ! -d "${OUTPUT_DIR}" ]; then + mkdir ${OUTPUT_DIR} +fi + +MKL_IMAGE_TAG=tensorflow/serving:latest-mkl + +# Build Tensorflow Serving docker image +echo "Building tensorflow serving image..." +echo "First time it takes few minutes to build images, consecutive builds are much faster" + +TF_SERVING_VERSION=${TF_SERVING_VERSION} MKL_IMAGE_TAG=${MKL_IMAGE_TAG} bash ${WORKSPACE}/build_tfserving_image.sh + +function docker_run(){ + docker run \ + --name=${CONTAINER_NAME} \ + --rm \ + -d \ + -p 8500:8500 \ + -v /tmp:/models/${MODEL_NAME} \ + -e MODEL_NAME=${MODEL_NAME} \ + -e OMP_NUM_THREADS=${OMP_NUM_THREADS} \ + -e TENSORFLOW_INTER_OP_PARALLELISM=${NUM_INTER_THREADS} \ + -e TENSORFLOW_INTRA_OP_PARALLELISM=${NUM_INTRA_THREADS} \ + ${MKL_IMAGE_TAG} +} + + +function resnet50_or_inceptionv3(){ + # Setup virtual env + pip install virtualenv + virtualenv venv + + source venv/bin/activate + # Make sure intel-tensorflow is after tensorflow-serving-api, so that + # tensorflow from intel-tensorflow get installed effectively. 
+ pip install grpc \ + requests \ + tensorflow-serving-api \ + intel-tensorflow + # cd to image recognition tfserving scripts + cd ${WORKSPACE}/../../${USE_CASE}/${FRAMEWORK}/${MODEL_NAME}/${MODE}/${PRECISION} + + # by default converted model is saved at /tmp/1 + rm -rf /tmp/1 + + # convert pretrained model to savedmodel + python model_graph_to_saved_model.py --import_path ${IN_GRAPH} + + RUNNING=$(docker ps --filter="expose=8501/tcp" -q | xargs) + if [[ -n ${RUNNING} ]]; then + docker rm -f ${RUNNING} + fi + + CONTAINER_NAME=tfserving_${RANDOM} + + # Run container + MKL_IMAGE_TAG=${MKL_IMAGE_TAG} CONTAINER_NAME=${CONTAINER_NAME} docker_run + + # Test + python image_recognition_client.py --model ${MODEL_NAME} + + + if [ ${BATCH_SIZE} == 1 ];then + # Test Average latency + python image_recognition_benchmark.py --batch_size ${BATCH_SIZE} --model ${MODEL_NAME} + else + # Test max throughput + python image_recognition_benchmark.py --batch_size ${BATCH_SIZE} --model ${MODEL_NAME} + fi + + # Clean up + docker rm -f ${CONTAINER_NAME} +} + +LOGFILE=${OUTPUT_DIR}/${LOG_FILENAME} + +MODEL_NAME=$(echo ${MODEL_NAME} | tr 'A-Z' 'a-z') +if [ ${MODEL_NAME} == "inceptionv3" ] || [ ${MODEL_NAME} == "resnet50" ] && [ ${PRECISION} == "fp32" ]; then + resnet50_or_inceptionv3 | tee -a ${LOGFILE} +else + echo "Unsupported Model: ${MODEL_NAME} or Precision: ${PRECISION}" + exit 1 +fi + +popd + +# Clean up work directory +rm -rf ${WORKDIR} + +echo "Log output location: ${LOGFILE}" | tee -a ${LOGFILE} diff --git a/benchmarks/common/utils/validators.py b/benchmarks/common/utils/validators.py index 54f280dfd..16ec18aba 100644 --- a/benchmarks/common/utils/validators.py +++ b/benchmarks/common/utils/validators.py @@ -88,3 +88,23 @@ def check_valid_file_or_dir(value): raise ArgumentTypeError("{} does not exist.".format(value)) check_for_link(value) return value + + +def check_volume_mount(value): + """ + Verifies that the value is a valid docker volume mount, where there should be + at least 
two fields separated by a : (for the local directory to mount and the + path to the where the directory will be mounted in the container. The third + optional field is for extra options like read only. + """ + if value: + # Check that we have at least 2 fields and at most 3 fields + if not 3 > value.count(":") > 0: + raise ArgumentTypeError( + "{} is not a valid volume mount string where ':' is used to separate the fields. " + "See https://docs.docker.com/storage/volumes for information on formatting the volume " + "mount string".format(value)) + + # Check that the local directory specified is a valid folder and not a link + check_valid_folder(value.split(':')[0]) + return value diff --git a/benchmarks/content_creation/tensorflow/draw/README.md b/benchmarks/content_creation/tensorflow/draw/README.md index f3ea0732f..a918d1a5a 100644 --- a/benchmarks/content_creation/tensorflow/draw/README.md +++ b/benchmarks/content_creation/tensorflow/draw/README.md @@ -48,7 +48,7 @@ modes/precisions: --model-name draw \ --mode inference \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl-py3 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --checkpoint /home//draw_fp32_pretrained_model \ --data-location /home//mnist \ --batch-size 1 \ @@ -61,7 +61,7 @@ modes/precisions: --model-name draw \ --mode inference \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl-py3 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --checkpoint /home//draw_fp32_pretrained_model \ --data-location /home//mnist \ --batch-size 100 \ @@ -82,8 +82,6 @@ modes/precisions: Time spent per BATCH: 6.6667 ms Total samples/sec: 149.9996 samples/s Outputs saved in file: /home//mnist/draw_data.npy - lscpu_path_cmd = command -v lscpu - lscpu located here: b'/usr/bin/lscpu' Ran inference with batch size 1 Log location outside container: {--output-dir 
value}/benchmark_draw_inference_fp32_20190123_012947.log ``` @@ -97,8 +95,6 @@ modes/precisions: Time spent per BATCH: 28.1952 ms Total samples/sec: 3546.7006 samples/s Outputs saved in file: /home//mnist/draw_data.npy - lscpu_path_cmd = command -v lscpu - lscpu located here: b'/usr/bin/lscpu' Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_draw_inference_fp32_20190123_013432.log ``` \ No newline at end of file diff --git a/benchmarks/content_creation/tensorflow/draw/inference/fp32/config.json b/benchmarks/content_creation/tensorflow/draw/inference/fp32/config.json new file mode 100644 index 000000000..dfac18793 --- /dev/null +++ b/benchmarks/content_creation/tensorflow/draw/inference/fp32/config.json @@ -0,0 +1,8 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1, + "KMP_HW_SUBSET": "1T" + } +} diff --git a/benchmarks/content_creation/tensorflow/draw/inference/fp32/model_init.py b/benchmarks/content_creation/tensorflow/draw/inference/fp32/model_init.py index 390bcae82..e306ecd55 100644 --- a/benchmarks/content_creation/tensorflow/draw/inference/fp32/model_init.py +++ b/benchmarks/content_creation/tensorflow/draw/inference/fp32/model_init.py @@ -22,7 +22,6 @@ import os import sys from common.base_model_init import BaseModelInitializer -from common.base_model_init import set_env_var class ModelInitializer(BaseModelInitializer): @@ -32,8 +31,8 @@ def __init__(self, args, custom_args=[], platform_util=None): super(ModelInitializer, self).__init__(args, custom_args, platform_util) # Set KMP env vars, if they haven't already been set - self.set_kmp_vars() - set_env_var("KMP_HW_SUBSET", "1T") + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) if self.args.accuracy_only: print("Accuracy testing for DRAW inference is not supported yet.") @@ -45,7 +44,7 @@ def 
__init__(self, args, custom_args=[], platform_util=None): # Create the command prefix with numactl and executing the script script_path = os.path.join(self.args.intelai_models, self.args.mode, self.args.precision, "draw_inf.py") - self.command_prefix = self.get_numactl_command(args.socket_id) + \ + self.command_prefix = self.get_command_prefix(args.socket_id) + \ " {} {} ".format(self.python_exe, script_path) # Add additional args to the command diff --git a/benchmarks/face_detection_and_alignment/tensorflow/facenet/README.md b/benchmarks/face_detection_and_alignment/tensorflow/facenet/README.md index 0a3659d20..fd27ffa2b 100644 --- a/benchmarks/face_detection_and_alignment/tensorflow/facenet/README.md +++ b/benchmarks/face_detection_and_alignment/tensorflow/facenet/README.md @@ -59,7 +59,7 @@ python launch_benchmark.py \ --checkpoint /home//checkpoints \ --data-location /home//dataset \ --model-source-dir /home//facenet/ \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` Example log tail for online inference: ``` @@ -77,8 +77,6 @@ Total samples/sec: 33.1608 samples/s 2019-03-28 21:00:02.725722: W tensorflow/core/kernels/queue_base.cc:277] _1_batch_join/fifo_queue: Skipping cancelled enqueue attempt with queue not closed 2019-03-28 21:00:02.725746: W tensorflow/core/kernels/queue_base.cc:277] _1_batch_join/fifo_queue: Skipping cancelled enqueue attempt with queue not closed 2019-03-28 21:00:02.725776: W tensorflow/core/kernels/queue_base.cc:277] _1_batch_join/fifo_queue: Skipping cancelled enqueue attempt with queue not closed -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_facenet_inference_fp32_20190328_205911.log ``` @@ -96,7 +94,7 @@ python launch_benchmark.py \ --checkpoint /home//checkpoints \ --data-location /home//dataset \ --model-source-dir 
/home//facenet/ \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` Example log tail for batch inference: ``` @@ -110,8 +108,6 @@ Accuracy: 0.98833+-0.00489 Validation rate: 0.96200+-0.01968 @ FAR=0.00100 Area Under Curve (AUC): 0.999 Equal Error Rate (EER): 0.011 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_facenet_inference_fp32_20190329_002623.log ``` @@ -130,7 +126,7 @@ python launch_benchmark.py \ --checkpoint /home//checkpoints \ --data-location /home//dataset \ --model-source-dir /home//facenet/ \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` Example log tail for accuracy: ``` @@ -144,8 +140,6 @@ Accuracy: 0.98833+-0.00489 Validation rate: 0.96200+-0.01968 @ FAR=0.00100 Area Under Curve (AUC): 0.999 Equal Error Rate (EER): 0.011 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_facenet_inference_fp32_20190328_214145.log ``` diff --git a/benchmarks/face_detection_and_alignment/tensorflow/facenet/inference/fp32/config.json b/benchmarks/face_detection_and_alignment/tensorflow/facenet/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/face_detection_and_alignment/tensorflow/facenet/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/face_detection_and_alignment/tensorflow/facenet/inference/fp32/model_init.py b/benchmarks/face_detection_and_alignment/tensorflow/facenet/inference/fp32/model_init.py index 9bd9c6243..e00bf70f7 100644 --- 
a/benchmarks/face_detection_and_alignment/tensorflow/facenet/inference/fp32/model_init.py +++ b/benchmarks/face_detection_and_alignment/tensorflow/facenet/inference/fp32/model_init.py @@ -30,11 +30,12 @@ class ModelInitializer(BaseModelInitializer): def __init__(self, args, custom_args=[], platform_util=None): super(ModelInitializer, self).__init__(args, custom_args, platform_util) - self.cmd = self.get_numactl_command(self.args.socket_id) + \ + self.cmd = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " # Set KMP env vars, if they haven't already been set - self.set_kmp_vars() + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) pairs_file = os.path.join(self.args.model_source_dir, "data/pairs.txt") diff --git a/benchmarks/face_detection_and_alignment/tensorflow/mtcc/README.md b/benchmarks/face_detection_and_alignment/tensorflow/mtcc/README.md index 1963f9cbc..36cad0fe3 100644 --- a/benchmarks/face_detection_and_alignment/tensorflow/mtcc/README.md +++ b/benchmarks/face_detection_and_alignment/tensorflow/mtcc/README.md @@ -55,7 +55,7 @@ Run: --mode inference \ --socket-id 0 \ --checkpoint /home//MTCNN_model \ - --docker-image intelaipg/intel-optimized-tensorflow:nightly-latestprs-bdw + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` 6. The log file is saved to the value of `--output-dir`. 
diff --git a/benchmarks/face_detection_and_alignment/tensorflow/mtcc/inference/fp32/config.json b/benchmarks/face_detection_and_alignment/tensorflow/mtcc/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/face_detection_and_alignment/tensorflow/mtcc/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/face_detection_and_alignment/tensorflow/mtcc/inference/fp32/model_init.py b/benchmarks/face_detection_and_alignment/tensorflow/mtcc/inference/fp32/model_init.py index 34409b702..5d1983139 100644 --- a/benchmarks/face_detection_and_alignment/tensorflow/mtcc/inference/fp32/model_init.py +++ b/benchmarks/face_detection_and_alignment/tensorflow/mtcc/inference/fp32/model_init.py @@ -33,7 +33,8 @@ def __init__(self, args, custom_args, platform_util=None): self.set_num_inter_intra_threads() # Set KMP env vars, if they haven't already been set - self.set_kmp_vars() + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) @@ -41,7 +42,7 @@ def __init__(self, args, custom_args, platform_util=None): self.args.intelai_models, self.args.mode, self.args.precision, "one_image_test.py") self.command_prefix = \ - self.get_numactl_command(self.args.socket_id) + \ + self.get_command_prefix(self.args.socket_id) + \ "{} ".format(self.python_exe) + benchmark_script self.run_cmd = \ diff --git a/benchmarks/image_recognition/tensorflow/densenet169/README.md b/benchmarks/image_recognition/tensorflow/densenet169/README.md new file mode 100644 index 000000000..aaf2fd9e2 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/densenet169/README.md @@ -0,0 +1,143 @@ +# DenseNet 169 + +This document has instructions for how to run DenseNet 169 for the 
+following modes/precisions: +* [FP32 inference](#fp32-inference-instructions) + +## FP32 Inference Instructions + +1. Download ImageNet dataset. + + This step is required only for running accuracy, for running the model for performance we do not need to provide dataset. + + Register and download the ImageNet dataset. Once you have the raw ImageNet dataset downloaded, we need to convert + it to the TFRecord format. The TensorFlow models repo provides + [scripts and instructions](https://github.com/tensorflow/models/tree/master/research/slim#an-automated-script-for-processing-imagenet-data) + to download, process and convert the ImageNet dataset to the TF records format. After converting data, you should have a directory + with the sharded dataset something like below, we only need `validation-*` files, discard `train-*` files: + ``` + $ ll /home/myuser/datasets/ImageNet_TFRecords + -rw-r--r--. 1 user 143009929 Jun 20 14:53 train-00000-of-01024 + -rw-r--r--. 1 user 144699468 Jun 20 14:53 train-00001-of-01024 + -rw-r--r--. 1 user 138428833 Jun 20 14:53 train-00002-of-01024 + ... + -rw-r--r--. 1 user 143137777 Jun 20 15:08 train-01022-of-01024 + -rw-r--r--. 1 user 143315487 Jun 20 15:08 train-01023-of-01024 + -rw-r--r--. 1 user 52223858 Jun 20 15:08 validation-00000-of-00128 + -rw-r--r--. 1 user 51019711 Jun 20 15:08 validation-00001-of-00128 + -rw-r--r--. 1 user 51520046 Jun 20 15:08 validation-00002-of-00128 + ... + -rw-r--r--. 1 user 52508270 Jun 20 15:09 validation-00126-of-00128 + -rw-r--r--. 1 user 55292089 Jun 20 15:09 validation-00127-of-00128 + ``` + +2. Download the pretrained model: + ``` + $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/densenet169_fp32_pretrained_model.pb + ``` + +3. Clone the [intelai/models](https://github.com/intelai/models) repo + and then run the model scripts for either online or batch inference or accuracy. For --dataset-location in accuracy run, please use the ImageNet validation data path from step 1. 
+ Each model run has user configurable arguments separated from regular arguments by '--' at the end of the command. + Unless configured, these arguments will run with default values. Below are the example codes for each use case: + + ``` + $ git clone https://github.com/IntelAI/models.git + + $ cd benchmarks + ``` + + For throughput (using `--benchmark-only`, `--socket-id 0` and `--batch-size 100`): + ``` + python launch_benchmark.py \ + --model-name densenet169 \ + --precision fp32 \ + --mode inference \ + --framework tensorflow \ + --benchmark-only \ + --batch-size 100 \ + --socket-id 0 \ + --in-graph /home//densenet169_fp32_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + -- input_height=224 input_width=224 warmup_steps=20 steps=100 \ + input_layer="input" output_layer="densenet169/predictions/Reshape_1" + ``` + + For latency (using `--benchmark-only`, `--socket-id 0` and `--batch-size 1`) + ``` + python launch_benchmark.py \ + --model-name densenet169 \ + --precision fp32 \ + --mode inference \ + --framework tensorflow \ + --benchmark-only \ + --batch-size 1 \ + --socket-id 0 \ + --in-graph /home//densenet169_fp32_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + -- input_height=224 input_width=224 warmup_steps=20 steps=100 \ + input_layer="input" output_layer="densenet169/predictions/Reshape_1" + ``` + + For accuracy (using your `--data-location`, `--socket-id 0`, `--accuracy-only` and + `--batch-size 100`): + ``` + python launch_benchmark.py \ + --model-name densenet169 \ + --precision fp32 \ + --mode inference \ + --framework tensorflow \ + --accuracy-only \ + --batch-size 100 \ + --socket-id 0 \ + --in-graph /home//densenet169_fp32_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --data-location /home//imagenet_validation_dataset \ + -- input_height=224 input_width=224 \ + input_layer="input" 
output_layer="densenet169/predictions/Reshape_1" + ``` + + Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands + to get additional debug output or change the default output location. + +4. The log file is saved to the `models/benchmarks/common/tensorflow/logs` directory, + or the directory specified by the `--output-dir` arg. Below are examples of + what the tail of your log file should look like for the different configs. + + Example log tail when running for batch inference: + ``` + steps = 80, 159.83471377 images/sec + Latency: 625.646317005 ms + steps = 90, 159.852789241 images/sec + Latency: 625.57557159 ms + steps = 100, 159.853966416 images/sec + Latency: 625.570964813 ms + Ran inference with batch size 100 + Log location outside container: {--output-dir value}/benchmark_densenet169_inference_fp32_20190412_023940.log + ``` + + Example log tail when running for online inference: + ``` + steps = 80, 34.9948442873 images/sec + Latency: 28.5756379366 ms + steps = 90, 34.9644341907 images/sec + Latency: 28.6004914178 ms + steps = 100, 34.9655988121 images/sec + Latency: 28.5995388031 ms + Ran inference with batch size 1 + Log location outside container: {--output-dir value}/benchmark_densenet169_inference_fp32_20190412_024505.log + ``` + + Example log tail when running for accuracy: + ``` + Iteration time: 581.6446 ms + 0.757505030181 + Iteration time: 581.5755 ms + 0.757489959839 + Iteration time: 581.5709 ms + 0.75749498998 + Iteration time: 581.1705 ms + 0.75748 + Ran inference with batch size 100 + Log location outside container: {--output-dir value}/benchmark_densenet169_inference_fp32_20190412_021545.log + ``` diff --git a/benchmarks/image_recognition/tensorflow/densenet169/__init__.py b/benchmarks/image_recognition/tensorflow/densenet169/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/densenet169/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: 
utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow/densenet169/inference/__init__.py b/benchmarks/image_recognition/tensorflow/densenet169/inference/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/densenet169/inference/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/__init__.py b/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/config.json b/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/config.json new file mode 100644 index 000000000..812311847 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters":{ + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/model_init.py new file mode 100644 index 000000000..3e4a376af --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/densenet169/inference/fp32/model_init.py @@ -0,0 +1,107 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + +from common.base_model_init import BaseModelInitializer +from common.base_model_init import set_env_var + + +class ModelInitializer(BaseModelInitializer): + """Model initializer for Densenet169 FP32 inference""" + + def __init__(self, args, custom_args=[], platform_util=None): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) + self.cmd = self.get_command_prefix(self.args.socket_id) + "{} ".format(self.python_exe) + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + + if self.args.batch_size == -1: + self.args.batch_size = 100 + + # set num_inter_threads and num_intra_threads + self.set_num_inter_intra_threads() + + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + + self.parse_args() + + if self.args.benchmark_only: + run_script = os.path.join(self.args.intelai_models, + self.args.mode, self.args.precision, + "benchmark.py") + + script_args_list = [ + "input_graph", "input_height", "input_width", "batch_size", + "input_layer", "output_layer", "num_inter_threads", + "num_intra_threads", "warmup_steps", "steps"] + + elif self.args.accuracy_only: + run_script = os.path.join(self.args.intelai_models, + self.args.mode, self.args.precision, + "accuracy.py") + + script_args_list = [ + "input_graph", "data_location", "input_height", "input_width", + "batch_size", "input_layer", "output_layer", + "num_inter_threads", "num_intra_threads"] + + self.cmd = self.add_args_to_command(self.cmd + run_script, + script_args_list) + + def parse_args(self): + if self.custom_args: + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_height", default=224, + dest='input_height', type=int, help="input height") + 
parser.add_argument( + "--input_width", default=224, + dest='input_width', type=int, help="input width") + parser.add_argument( + '--warmup_steps', dest='warmup_steps', + help='number of warmup steps', + type=int, default=20) + parser.add_argument( + '--steps', dest='steps', + help='number of steps', + type=int, default=100) + parser.add_argument( + '--input_layer', dest='input_layer', + help='name of input layer', + type=str, default="input") + parser.add_argument( + '--output_layer', dest='output_layer', + help='name of output layer', + type=str, default="densenet169/predictions/Reshape_1") + + self.args = parser.parse_args(self.custom_args, + namespace=self.args) + + def run(self): + if self.cmd: + self.run_command(self.cmd) diff --git a/benchmarks/image_recognition/tensorflow/inception_resnet_v2/README.md b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/README.md index 3cc4fdccb..c3a44d2d2 100644 --- a/benchmarks/image_recognition/tensorflow/inception_resnet_v2/README.md +++ b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/README.md @@ -7,6 +7,11 @@ following modes/precisions: ## Int8 Inference Instructions +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + 1. 
Clone this [intelai/models](https://github.com/IntelAI/models) repository: @@ -79,7 +84,7 @@ python launch_benchmark.py \ --framework tensorflow \ --accuracy-only \ --batch-size 100 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-prs-b5d67b7-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inception_resnet_v2_int8_pretrained_model.pb \ --data-location /home//datasets/ImageNet_TFRecords ``` @@ -95,7 +100,7 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-prs-b5d67b7-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inception_resnet_v2_int8_pretrained_model.pb ``` @@ -110,7 +115,7 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 128 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-prs-b5d67b7-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inception_resnet_v2_int8_pretrained_model.pb ``` @@ -136,30 +141,30 @@ Log location outside container: /benchmark_inception_resnet_v2 Example log tail when running for online inference: ``` ... -Iteration 37: 0.046 sec -Iteration 38: 0.046 sec -Iteration 39: 0.046 sec -Iteration 40: 0.046 sec -Average time: 0.045 sec +Iteration 37: 0.043 sec +Iteration 38: 0.042 sec +Iteration 39: 0.043 sec +Iteration 40: 0.043 sec +Average time: 0.043 sec Batch size = 1 -Latency: 45.441 ms -Throughput: 22.007 images/sec +Latency: 42.793 ms +Throughput: 23.368 images/sec Ran inference with batch size 1 -Log location outside container: /benchmark_inception_resnet_v2_inference_int8_20190330_012557.log +Log location outside container: /benchmark_inception_resnet_v2_inference_int8_20190415_231020.log ``` Example log tail when running for batch inference: ``` ... 
-Iteration 37: 0.975 sec -Iteration 38: 0.975 sec -Iteration 39: 0.987 sec -Iteration 40: 0.974 sec -Average time: 0.976 sec +Iteration 37: 0.932 sec +Iteration 38: 0.928 sec +Iteration 39: 0.927 sec +Iteration 40: 0.928 sec +Average time: 0.928 sec Batch size = 128 -Throughput: 131.178 images/sec +Throughput: 137.978 images/sec Ran inference with batch size 128 -Log location outside container: /benchmark_inception_resnet_v2_inference_int8_20190330_012719.log +Log location outside container: /benchmark_inception_resnet_v2_inference_int8_20190415_225215.log ``` @@ -174,21 +179,12 @@ $ git clone git@github.com:IntelAI/models.git This repository includes launch scripts for running an optimized version of the Inception ResNet V2 model code. -2. Download the pre-trained Inception ResNet V2 model files: - -For accuracy: +2. Download the pre-trained Inception ResNet V2 model: ``` $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/inception_resnet_v2_fp32_pretrained_model.pb ``` -For batch and online inference: - -``` -$ wget http://download.tensorflow.org/models/inception_resnet_v2_2016_08_30.tar.gz -$ mkdir -p checkpoints && tar -C ./checkpoints/ -zxf inception_resnet_v2_2016_08_30.tar.gz -``` - 3. If you would like to run Inception ResNet V2 inference and test for accuracy, you will need the full ImageNet dataset. Running for online and batch inference do not require the ImageNet dataset. @@ -230,7 +226,7 @@ precision, and docker image to use, along with your path to the ImageNet TF Records that you generated in step 3. Substitute in your own `--data-location` (from step 3, for accuracy -only), `--checkpoint` pre-trained model checkpoint file path (from step 2). +only), `--in-graph` frozen graph file path (from step 2). Inception ResNet V2 can be run for accuracy, online inference, or batch inference. Use one of the following examples below, depending on your use case. 
@@ -246,8 +242,8 @@ python launch_benchmark.py \ --framework tensorflow \ --accuracy-only \ --batch-size 100 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ - --in-graph /home//inception_resnet_v2_int8_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --in-graph /home//inception_resnet_v2_fp32_pretrained_model.pb \ --data-location /home//datasets/ImageNet_TFRecords ``` @@ -262,9 +258,8 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --checkpoint /home//checkpoints \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ - --data-location /home//datasets/ImageNet_TFRecords + --in-graph /home//inception_resnet_v2_fp32_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` For batch inference (using `--benchmark-only`, `--socket-id 0` and `--batch-size 128`): @@ -278,9 +273,8 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 128 \ --socket-id 0 \ - --checkpoint /home//checkpoints \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ - --data-location /home//datasets/ImageNet_TFRecords + --in-graph /home//inception_resnet_v2_fp32_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands @@ -297,36 +291,31 @@ Example log tail when running for accuracy: Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.8036, 0.9526) Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.8036, 0.9525) Processed 50000 images. 
(Top1 accuracy, Top5 accuracy) = (0.8037, 0.9525) -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_inception_resnet_v2_inference_fp32_20190109_081637.log ``` Example log tail when running for online inference: ``` -eval/Accuracy[0] -eval/Recall_5[0.01] -INFO:tensorflow:Finished evaluation at 2019-01-08-01:51:28 -self._total_images_per_sec = 69.7 -self._displayed_steps = 10 -Total images/sec = 7.0 -Latency ms/step = 143.4 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu +Iteration 38: 0.052 sec +Iteration 39: 0.051 sec +Iteration 40: 0.051 sec +Average time: 0.050 sec +Batch size = 1 +Latency: 50.094 ms +Throughput: 19.963 images/sec Ran inference with batch size 1 -Log location outside container: {--output-dir value}/benchmark_inception_resnet_v2_inference_fp32_20190108_015057.log +Log location outside container: {--output-dir value}/benchmark_inception_resnet_v2_inference_fp32_20190410_205213.log ``` Example log tail when running for batch inference: ``` -eval/Accuracy[0.00078125] -eval/Recall_5[0.00375] -INFO:tensorflow:Finished evaluation at 2019-01-08-01:59:37 -self._total_images_per_sec = 457.0 -self._displayed_steps = 10 -Total images/sec = 45.7 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu +Iteration 38: 1.848 sec +Iteration 39: 1.799 sec +Iteration 40: 1.850 sec +Average time: 1.818 sec +Batch size = 128 +Throughput: 70.402 images/sec Ran inference with batch size 128 -Log location outside container: {--output-dir value}/benchmark_inception_resnet_v2_inference_fp32_20190108_015440.log +Log location outside container: {--output-dir value}/benchmark_inception_resnet_v2_inference_fp32_20190410_205628.log +``` diff --git a/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/fp32/config.json b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/fp32/config.json 
new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/fp32/model_init.py index 045921acd..13fd8a79f 100644 --- a/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/fp32/model_init.py +++ b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/fp32/model_init.py @@ -29,10 +29,11 @@ class ModelInitializer(BaseModelInitializer): def __init__(self, args, custom_args=[], platform_util=None): super(ModelInitializer, self).__init__(args, custom_args, platform_util) - self.cmd = self.get_numactl_command(self.args.socket_id) + self.python_exe + " " + self.cmd = self.get_command_prefix(self.args.socket_id) + self.python_exe + " " # Set KMP env vars, if they haven't already been set - self.set_kmp_vars() + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # use default batch size if -1 if self.args.batch_size == -1: @@ -45,20 +46,14 @@ def __init__(self, args, custom_args=[], platform_util=None): if self.args.benchmark_only: run_script = os.path.join(self.args.intelai_models, - "eval_image_classifier.py") + "eval_image_classifier_benchmark.py") - cmd_args = " --dataset_name=imagenet" + \ - " --checkpoint_path=" + self.args.checkpoint + \ - " --eval_dir=" + self.args.checkpoint + \ - " --dataset_dir=" + self.args.data_location + \ - " --dataset_split_name=validation" + \ - " --clone_on_cpu=True" + \ - " --model_name=" + str(self.args.model_name) + \ - " --inter_op_parallelism_threads=" + \ - str(self.args.num_inter_threads) + \ - " 
--intra_op_parallelism_threads=" + \ - str(self.args.num_intra_threads) + \ - " --batch_size=" + str(self.args.batch_size) + cmd_args = " --input-graph=" + self.args.input_graph + \ + " --inter-op-parallelism-threads=" + \ + str(self.args.num_inter_threads) + \ + " --intra-op-parallelism-threads=" + \ + str(self.args.num_intra_threads) + \ + " --batch-size=" + str(self.args.batch_size) elif self.args.accuracy_only: run_script = os.path.join(self.args.intelai_models, "eval_image_classifier_accuracy.py") diff --git a/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/int8/config.json b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/int8/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/int8/model_init.py b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/int8/model_init.py index f2e2e1469..90ce7bcb2 100644 --- a/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/int8/model_init.py +++ b/benchmarks/image_recognition/tensorflow/inception_resnet_v2/inference/int8/model_init.py @@ -31,8 +31,12 @@ class ModelInitializer(BaseModelInitializer): def __init__(self, args, custom_args=[], platform_util=None): super(ModelInitializer, self).__init__(args, custom_args, platform_util) - self.set_kmp_vars() - self.cmd = self.get_numactl_command(self.args.socket_id) + "{} ".format(self.python_exe) + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + + self.cmd = self.get_command_prefix(self.args.socket_id) + "{} 
".format(self.python_exe) # use default batch size if -1 if self.args.batch_size == -1: diff --git a/benchmarks/image_recognition/tensorflow/inceptionv3/README.md b/benchmarks/image_recognition/tensorflow/inceptionv3/README.md index 9de17c994..0a9223914 100644 --- a/benchmarks/image_recognition/tensorflow/inceptionv3/README.md +++ b/benchmarks/image_recognition/tensorflow/inceptionv3/README.md @@ -9,6 +9,11 @@ Instructions for model training and inference for other precisions are coming la ## Int8 Inference Instructions +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + 1. Clone this [intelai/models](https://github.com/IntelAI/models) repository: @@ -92,7 +97,7 @@ python launch_benchmark.py \ --framework tensorflow \ --accuracy-only \ --batch-size 100 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv3_int8_pretrained_model.pb \ --data-location /home//datasets/ImageNet_TFRecords ``` @@ -113,7 +118,7 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv3_int8_pretrained_model.pb \ --data-location /home//datasets/ImageNet_TFRecords \ -- warmup_steps=50 steps=500 @@ -130,7 +135,7 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv3_int8_pretrained_model.pb \ -- warmup_steps=50 steps=500 ``` @@ -146,7 
+151,7 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 128 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv3_int8_pretrained_model.pb \ --data-location /home//datasets/ImageNet_TFRecords \ -- warmup_steps=50 steps=500 @@ -163,17 +168,11 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 128 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv3_int8_pretrained_model.pb \ -- warmup_steps=50 steps=500 ``` -The docker image (`intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl`) -used in the commands above were built using -[TensorFlow](git@github.com:tensorflow/tensorflow.git) master -([e889ea1](https://github.com/tensorflow/tensorflow/commit/e889ea1dd965c31c391106aa3518fc23d2689954)) and -[PR #25765](https://github.com/tensorflow/tensorflow/pull/25765). - Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands to get additional debug output or change the default output location.. @@ -185,9 +184,8 @@ different configs. Example log tail when running for accuracy: ``` +Iteration time: 357.3781 ms Processed 50000 images. 
(Top1 accuracy, Top5 accuracy) = (0.7666, 0.9333) -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Executing command: python /workspace/intelai_models/int8/accuracy.py --input_height=299 --input_width=299 --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/in_graph/inceptionv3_int8_pretrained_model.pb --data_location=/dataset Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_inceptionv3_inference_int8_20190104_013246.log @@ -196,27 +194,25 @@ Log location outside container: {--output-dir value}/benchmark_inceptionv3_infer Example log tail when running for online inference: ``` ... -steps = 470, 53.7256017113 images/sec -steps = 480, 52.5430812016 images/sec -steps = 490, 52.9076139058 images/sec -steps = 500, 53.5021876395 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu +steps = 470, 134.912798739 images/sec +steps = 480, 132.379245045 images/sec +steps = 490, 133.977640069 images/sec +steps = 500, 132.083262478 images/sec +Average throughput for batch size 1: 133.440858806 images/sec Ran inference with batch size 1 -Log location outside container: {--output-dir value}/benchmark_inceptionv3_inference_int8_20190223_194002.log +Log location outside container: {--output-dir value}/benchmark_inceptionv3_inference_int8_20190415_220455.log ``` Example log tail when running for batch inference: ``` ... 
-steps = 470, 370.435654276 images/sec -steps = 480, 369.710160177 images/sec -steps = 490, 369.083388904 images/sec -steps = 500, 370.287978128 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu +steps = 470, 369.151656047 images/sec +steps = 480, 373.174541014 images/sec +steps = 490, 372.402638382 images/sec +steps = 500, 371.836748659 images/sec +Average throughput for batch size 128: 371.269087408 images/sec Ran inference with batch size 128 -Log location outside container: {--output-dir value}/benchmark_inceptionv3_inference_int8_20190223_194314.log +Log location outside container: {--output-dir value}/benchmark_inceptionv3_inference_int8_20190416_162155.log ``` ## FP32 Inference Instructions @@ -262,7 +258,7 @@ python launch_benchmark.py \ --framework tensorflow \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv3_fp32_pretrained_model.pb ``` Example log tail when running for online inference: @@ -279,8 +275,6 @@ Average time: 0.014 sec Batch size = 1 Latency: 14.442 ms Throughput: 69.243 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_inceptionv3_inference_fp32_20190104_025220.log ``` @@ -295,7 +289,7 @@ python launch_benchmark.py \ --framework tensorflow \ --batch-size 128 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv3_fp32_pretrained_model.pb ``` Example log tail when running for batch inference: @@ -311,8 +305,6 @@ Iteration 40: 0.757 sec Average time: 0.760 sec Batch size = 128 Throughput: 168.431 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 
128 Log location outside container: {--output-dir value}/benchmark_inceptionv3_inference_fp32_20190104_024842.log ``` @@ -329,19 +321,20 @@ python launch_benchmark.py \ --accuracy-only \ --batch-size 100 \ --data-location /dataset/Imagenet_Validation \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home/<user>/inceptionv3_fp32_pretrained_model.pb ``` Example log tail when running for accuracy: ``` +Iteration time: 756.7571 ms Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.7673, 0.9341) +Iteration time: 757.3781 ms Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.7674, 0.9341) +Iteration time: 760.3024 ms Processed 50000 images. (Top1 accuracy, Top5 accuracy) = (0.7675, 0.9342) -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_inceptionv3_inference_fp32_20190104_023816.log ``` Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands -to get additional debug output or change the default output location.. \ No newline at end of file +to get additional debug output or change the default output location. 
diff --git a/benchmarks/image_recognition/tensorflow/inceptionv3/inference/fp32/config.json b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/inceptionv3/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/fp32/model_init.py index dd504259e..f550765f4 100644 --- a/benchmarks/image_recognition/tensorflow/inceptionv3/inference/fp32/model_init.py +++ b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/fp32/model_init.py @@ -60,15 +60,17 @@ def __init__(self, args, custom_args=[], platform_util=None): self.args = arg_parser.parse_args(self.custom_args, namespace=self.args) - # Use default KMP variable values, but override the default KMP_BLOCKTIME value - self.set_kmp_vars(kmp_blocktime=str(self.args.kmp_blocktime)) + # Set KMP env vars, if they haven't already been set, but override the default KMP_BLOCKTIME value + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) benchmark_script = os.path.join( self.args.intelai_models, self.args.precision, "eval_image_classifier_inference.py") - self.benchmark_command = self.get_numactl_command(args.socket_id) + \ + self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script num_cores = self.platform_util.num_cores_per_socket if self.args.num_cores == -1 \ diff --git a/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/__init__.py 
b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/__init__.py index 87301fd64..139d705c0 100644 --- a/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/__init__.py +++ b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/__init__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/config.json b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/model_init.py b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/model_init.py index 6d586ea80..645f2f92e 100644 --- a/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/model_init.py +++ b/benchmarks/image_recognition/tensorflow/inceptionv3/inference/int8/model_init.py @@ -60,8 +60,9 @@ def parse_args(self): self.args = parser.parse_args(self.custom_args, namespace=self.args) - # Use default KMP variable values, but override the default KMP_BLOCKTIME value - self.set_kmp_vars(kmp_blocktime=str(self.args.kmp_blocktime)) + # Set KMP env vars, if they haven't already been set, but override the default KMP_BLOCKTIME value + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) def run_benchmark(self): benchmark_script 
= os.path.join(self.args.intelai_models, @@ -73,7 +74,7 @@ def run_benchmark(self): "data_num_inter_threads", "data_num_intra_threads", "warmup_steps", "steps"] - cmd_prefix = self.get_numactl_command(self.args.socket_id) + \ + cmd_prefix = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " + benchmark_script cmd = self.add_args_to_command(cmd_prefix, script_args_list) # add num_cores @@ -93,7 +94,7 @@ def run_accuracy(self): "batch_size", "num_inter_threads", "num_intra_threads"] - cmd_prefix = self.get_numactl_command(self.args.socket_id) + \ + cmd_prefix = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " + accuracy_script cmd = self.add_args_to_command(cmd_prefix, script_args_list) self.run_command(cmd) @@ -105,7 +106,7 @@ def run_calibration(self): "input_graph", "data_location", "batch_size", "num_inter_threads", "num_intra_threads"] - cmd_prefix = self.get_numactl_command(self.args.socket_id) + \ + cmd_prefix = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " + calibration_script cmd = self.add_args_to_command(cmd_prefix, script_args_list) self.run_command(cmd) diff --git a/benchmarks/image_recognition/tensorflow/inceptionv4/README.md b/benchmarks/image_recognition/tensorflow/inceptionv4/README.md index a1228c3ad..560de9ef5 100644 --- a/benchmarks/image_recognition/tensorflow/inceptionv4/README.md +++ b/benchmarks/image_recognition/tensorflow/inceptionv4/README.md @@ -10,6 +10,11 @@ other precisions are coming later. ## Int8 Inference Instructions +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + 1. Clone this [intelai/models](https://github.com/IntelAI/models) repository: ``` @@ -51,7 +56,7 @@ other precisions are coming later. 
--accuracy-only \ --batch-size 100 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv4_int8_pretrained_model.pb \ --data-location /home//ImageNet_TFRecords ``` @@ -66,7 +71,7 @@ other precisions are coming later. --benchmark-only \ --batch-size 240 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv4_int8_pretrained_model.pb ``` @@ -80,16 +85,10 @@ other precisions are coming later. --benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv4_int8_pretrained_model.pb ``` - The docker image (`intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl`) - used in the commands above were built using - [TensorFlow](git@github.com:tensorflow/tensorflow.git) master - ([e889ea1](https://github.com/tensorflow/tensorflow/commit/e889ea1dd965c31c391106aa3518fc23d2689954)) and - [PR #25765](https://github.com/tensorflow/tensorflow/pull/25765). - Note that the `--verbose` flag can be added to any of the above commands to get additional debug output. @@ -101,47 +100,45 @@ other precisions are coming later. Example log tail when running for accuracy: ``` ... + Iteration time: 685.1976 ms Processed 49700 images. (Top1 accuracy, Top5 accuracy) = (0.7985, 0.9504) + Iteration time: 686.3845 ms Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.7983, 0.9504) + Iteration time: 686.7021 ms Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.7984, 0.9504) + Iteration time: 685.8914 ms Processed 50000 images. 
(Top1 accuracy, Top5 accuracy) = (0.7984, 0.9504) - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: /benchmark_inceptionv4_inference_int8_20190306_221608.log ``` Example log tail when running for batch inference: ``` - [Running warmup steps...] - steps = 10, 185.108768528 images/sec - [Running benchmark steps...] - steps = 10, 184.482999017 images/sec - steps = 20, 184.561572444 images/sec - steps = 30, 184.620504126 images/sec - steps = 40, 183.900309054 images/sec - steps = 50, 184.110358713 images/sec - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu - Ran inference with batch size 240 - Log location outside container: /benchmark_inceptionv4_inference_int8_20190306_215858.log + [Running warmup steps...] + steps = 10, 184.497605972 images/sec + [Running benchmark steps...] + steps = 10, 184.664702184 images/sec + steps = 20, 184.938455688 images/sec + steps = 30, 184.454197634 images/sec + steps = 40, 184.491891402 images/sec + steps = 50, 184.390001575 images/sec + Ran inference with batch size 240 + Log location outside container: /benchmark_inceptionv4_inference_int8_20190415_233517.log ``` Example log tail when running for online inference: ``` - [Running warmup steps...] - steps = 10, 30.8738415788 images/sec - [Running benchmark steps...] - steps = 10, 31.8633787623 images/sec - steps = 20, 31.1129375635 images/sec - steps = 30, 31.2716048462 images/sec - steps = 40, 31.9682931663 images/sec - steps = 50, 31.6665962009 images/sec - Latency: 31.936 ms - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu - Ran inference with batch size 1 - Log location outside container: /benchmark_inceptionv4_inference_int8_20190306_215702.log + [Running warmup steps...] + steps = 10, 32.6095380262 images/sec + [Running benchmark steps...] 
+ steps = 10, 32.9024373024 images/sec + steps = 20, 32.5328989723 images/sec + steps = 30, 32.5988932413 images/sec + steps = 40, 31.3991914957 images/sec + steps = 50, 32.7053998207 images/sec + Latency: 30.598 ms + Ran inference with batch size 1 + Log location outside container: /benchmark_inceptionv4_inference_int8_20190415_232441.log ``` ## FP32 Inference Instructions @@ -188,7 +185,7 @@ other precisions are coming later. --accuracy-only \ --batch-size 100 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv4_fp32_pretrained_model.pb \ --data-location /home//ImageNet_TFRecords ``` @@ -203,7 +200,7 @@ other precisions are coming later. --benchmark-only \ --batch-size 240 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv4_fp32_pretrained_model.pb ``` @@ -217,7 +214,7 @@ other precisions are coming later. --benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//inceptionv4_fp32_pretrained_model.pb ``` @@ -232,10 +229,15 @@ other precisions are coming later. Example log tail when running for accuracy: ``` ... + Iteration time: 1337.8728 ms Processed 49600 images. (Top1 accuracy, Top5 accuracy) = (0.8015, 0.9517) + Iteration time: 1331.8253 ms Processed 49700 images. (Top1 accuracy, Top5 accuracy) = (0.8017, 0.9518) + Iteration time: 1339.1553 ms Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.8017, 0.9518) + Iteration time: 1334.5991 ms Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.8018, 0.9519) + Iteration time: 1336.1905 ms Processed 50000 images. 
(Top1 accuracy, Top5 accuracy) = (0.8018, 0.9519) Ran inference with batch size 100 Log location outside container: /benchmark_inceptionv4_inference_fp32_20190308_182729.log diff --git a/benchmarks/image_recognition/tensorflow/inceptionv4/inference/config.json b/benchmarks/image_recognition/tensorflow/inceptionv4/inference/config.json new file mode 100644 index 000000000..6f1228ba7 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/inceptionv4/inference/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/inceptionv4/inference/inceptionv4_model_init.py b/benchmarks/image_recognition/tensorflow/inceptionv4/inference/inceptionv4_model_init.py index c7d546477..74da197fd 100644 --- a/benchmarks/image_recognition/tensorflow/inceptionv4/inference/inceptionv4_model_init.py +++ b/benchmarks/image_recognition/tensorflow/inceptionv4/inference/inceptionv4_model_init.py @@ -38,7 +38,11 @@ def __init__(self, args, custom_args=[], platform_util=None): # Environment variables set_env_var("OMP_NUM_THREADS", platform_util.num_cores_per_socket if self.args.num_cores == -1 else self.args.num_cores) - self.set_kmp_vars(kmp_blocktime="0") + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + self.set_num_inter_intra_threads(num_inter_threads=platform_util.num_threads_per_core, num_intra_threads=platform_util.num_cores_per_socket) @@ -69,7 +73,7 @@ def parse_args(self): def add_command_prefix(self, script_path): """ Uses the specified script path and adds on the command prefix """ - return self.get_numactl_command(self.args.socket_id) + self.python_exe + " " + \ + return self.get_command_prefix(self.args.socket_id) + self.python_exe + " " + \ script_path def 
run_benchmark(self): diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/README.md b/benchmarks/image_recognition/tensorflow/mobilenet_v1/README.md index 17e274aa9..e7d0d6f5d 100644 --- a/benchmarks/image_recognition/tensorflow/mobilenet_v1/README.md +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/README.md @@ -2,14 +2,164 @@ This document has instructions for how to run MobileNet V1 for the following modes/precisions: +* [Int8 inference](#int8-inference-instructions) * [FP32 inference](#fp32-inference-instructions) Instructions and scripts for model training are coming later. + +## Int8 Inference Instructions + +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + +1. Download the ImageNet dataset. + + This step is required only for running accuracy; no dataset is needed for running the benchmark. + + Register and download the ImageNet dataset. Once you have downloaded the raw ImageNet dataset, convert + it to the TFRecord format. The TensorFlow models repo provides + [scripts and instructions](https://github.com/tensorflow/models/tree/master/research/slim#an-automated-script-for-processing-imagenet-data) + to download, process and convert the ImageNet dataset to the TF records format. After converting the data, you should have a directory + with the sharded dataset similar to the listing below. Only the `validation-*` files are needed; the `train-*` files can be discarded: + ``` + $ ll /home/myuser/datasets/ImageNet_TFRecords + -rw-r--r--. 1 user 143009929 Jun 20 14:53 train-00000-of-01024 + -rw-r--r--. 1 user 144699468 Jun 20 14:53 train-00001-of-01024 + -rw-r--r--. 1 user 138428833 Jun 20 14:53 train-00002-of-01024 + ... + -rw-r--r--. 1 user 143137777 Jun 20 15:08 train-01022-of-01024 + -rw-r--r--.
1 user 143315487 Jun 20 15:08 train-01023-of-01024 + -rw-r--r--. 1 user 52223858 Jun 20 15:08 validation-00000-of-00128 + -rw-r--r--. 1 user 51019711 Jun 20 15:08 validation-00001-of-00128 + -rw-r--r--. 1 user 51520046 Jun 20 15:08 validation-00002-of-00128 + ... + -rw-r--r--. 1 user 52508270 Jun 20 15:09 validation-00126-of-00128 + -rw-r--r--. 1 user 55292089 Jun 20 15:09 validation-00127-of-00128 + ``` +2. Download the pre-trained model. +``` +$ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/mobilenetv1_int8_pretrained_model.pb +``` + +3. Clone the [intelai/models](https://github.com/intelai/models) repo + and then run the model scripts for either online or batch inference or accuracy. For `--data-location` in the accuracy run, please use the ImageNet validation data path from step 1. + Each model run has user-configurable arguments separated from regular arguments by '--' at the end of the command. + Unless configured, these arguments will run with default values.
Below are example commands for each use case: + + ``` + $ git clone https://github.com/IntelAI/models.git + + $ cd models/benchmarks + ``` + + For batch inference (using `--benchmark-only`, `--socket-id 0` and `--batch-size 240`): + ``` + python launch_benchmark.py \ + --model-name mobilenet_v1 \ + --precision int8 \ + --mode inference \ + --framework tensorflow \ + --benchmark-only \ + --batch-size 240 \ + --socket-id 0 \ + --in-graph /home//mobilenetv1_int8_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + -- input_height=224 input_width=224 warmup_steps=10 steps=50 \ + input_layer="input" output_layer="MobilenetV1/Predictions/Reshape_1" + ``` + + For online inference (using `--benchmark-only`, `--socket-id 0` and `--batch-size 1`): + ``` + python launch_benchmark.py \ + --model-name mobilenet_v1 \ + --precision int8 \ + --mode inference \ + --framework tensorflow \ + --benchmark-only \ + --batch-size 1 \ + --socket-id 0 \ + --in-graph /home//mobilenetv1_int8_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + -- input_height=224 input_width=224 warmup_steps=10 steps=50 \ + input_layer="input" output_layer="MobilenetV1/Predictions/Reshape_1" + ``` + + For accuracy (using your `--data-location`, `--accuracy-only` and + `--batch-size 100`): + ``` + python launch_benchmark.py \ + --model-name mobilenet_v1 \ + --precision int8 \ + --mode inference \ + --framework tensorflow \ + --accuracy-only \ + --batch-size 100 \ + --socket-id 0 \ + --in-graph /home//mobilenetv1_int8_pretrained_model.pb \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --data-location /home//imagenet_validation_dataset \ + -- input_height=224 input_width=224 \ + input_layer="input" output_layer="MobilenetV1/Predictions/Reshape_1" + ``` + + Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands + to get additional debug output or change the default output location.
+ +4. The log file is saved to the `models/benchmarks/common/tensorflow/logs` directory, + or the directory specified by the `--output-dir` arg. Below are examples of + what the tail of your log file should look like for the different configs. + + Example log tail when running for batch inference: + ``` + [Running warmup steps...] + steps = 10, 1865.30956528 images/sec + [Running benchmark steps...] + steps = 10, 1872.92398031 images/sec + steps = 20, 1862.64499512 images/sec + steps = 30, 1857.97283454 images/sec + steps = 40, 1864.70142784 images/sec + steps = 50, 1854.23896906 images/sec + Ran inference with batch size 240 + Log location outside container: {--output-dir value}/benchmark_mobilenet_v1_inference_int8_20190523_164626.log + ``` + + Example log tail when running for online inference: + ``` + [Running warmup steps...] + steps = 10, 197.082229114 images/sec + [Running benchmark steps...] + steps = 10, 195.201936054 images/sec + steps = 20, 195.693743293 images/sec + steps = 30, 198.999098543 images/sec + steps = 40, 189.256565292 images/sec + steps = 50, 201.252531069 images/sec + Ran inference with batch size 1 + Log location outside container: {--output-dir value}/benchmark_mobilenet_v1_inference_int8_20190523_164348.log + ``` + + Example log tail when running for accuracy: + ``` + Iteration time: 66.8541 ms + Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.7014, 0.8935) + Iteration time: 66.7909 ms + Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.7014, 0.8934) + Iteration time: 66.7001 ms + Processed 50000 images. (Top1 accuracy, Top5 accuracy) = (0.7014, 0.8934) + Ran inference with batch size 100 + Log location outside container: {--output-dir value}/benchmark_mobilenet_v1_inference_int8_20190523_164955.log + ``` + ## FP32 Inference Instructions -1. Download the ImageNet dataset and convert it to the TF records format +1. 
The ImageNet dataset is required for testing accuracy and can also be + used when running online or batch inference. If no dataset is provided when running + online or batch inference, synthetic data will be used. + + Download the ImageNet dataset and convert it to the TF records format using the instructions [here](https://github.com/tensorflow/models/tree/master/research/slim#an-automated-script-for-processing-imagenet-data). @@ -56,28 +206,35 @@ later. [tensorflow/models](https://github.com/tensorflow/models) repo that was cloned in step 3. - * Run for online inference (with `--batch-size 1` and `--checkpoint` with a path to the checkpoint file directory): + * Run for online inference (with `--batch-size 1`, `--checkpoint` + with a path to the checkpoint file directory, and the `--data-location` + is optional): + ``` python launch_benchmark.py \ --precision fp32 \ --model-name mobilenet_v1 \ --mode inference \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ --batch-size 1 \ --socket-id 0 \ --data-location /dataset/Imagenet_Validation \ --checkpoint /home//mobilenet_v1_fp32_pretrained_model ``` - * Run for batch inference (with `--batch-size 100` and `--checkpoint` with a path to the checkpoint file directory): + + * Run for batch inference (with `--batch-size 100`, + `--checkpoint` with a path to the checkpoint file directory, and + the `--data-location` is optional): + ``` python launch_benchmark.py \ --precision fp32 \ --model-name mobilenet_v1 \ --mode inference \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ --batch-size 100 \ --socket-id 0 \ @@ -91,7 +248,7 @@ later. 
--model-name mobilenet_v1 \ --mode inference \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ --batch-size 100 \ --accuracy-only \ @@ -115,8 +272,6 @@ later. self._displayed_steps = 10 Total images/sec = 81.0 Latency ms/step = 12.4 - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_mobilenet_v1_inference_fp32_20190104_200218.log ``` @@ -132,18 +287,17 @@ later. self._total_images_per_sec = 1810.2 self._displayed_steps = 10 Total images/sec = 181.0 - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_mobilenet_v1_inference_fp32_20190104_200512.log ``` * Below is a sample lof file snippet when testing accuracy: ``` + Iteration time: 119.1134 ms Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.7104, 0.8999) + Iteration time: 118.8375 ms Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.7103, 0.8999) + Iteration time: 119.9311 ms Processed 50000 images. 
(Top1 accuracy, Top5 accuracy) = (0.7102, 0.8999) - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_mobilenet_v1_inference_fp32_20190110_211648.log - ``` \ No newline at end of file + ``` diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/__init__.py b/benchmarks/image_recognition/tensorflow/mobilenet_v1/__init__.py index cf793ec6a..d9c4123de 100644 --- a/benchmarks/image_recognition/tensorflow/mobilenet_v1/__init__.py +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/__init__.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/__init__.py b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/__init__.py index cf793ec6a..d9c4123de 100644 --- a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/__init__.py +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/__init__.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/config.json b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/config.json new file mode 100644 index 000000000..f0b327528 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/config.json @@ -0,0 +1,6 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/model_init.py index e75c72194..8fa7391ae 100644 --- a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/model_init.py +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/fp32/model_init.py @@ -33,8 +33,9 @@ def __init__(self, args, custom_args=[], platform_util=None): if self.args.batch_size == -1: self.args.batch_size = 128 - # Set KMP env vars (except KMP_SETTINGS is not set) - self.set_kmp_vars(kmp_settings=None) + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # set num_inter_threads and num_intra_threads (override inter threads to 2) self.set_num_inter_intra_threads(num_inter_threads=2) @@ -56,7 +57,6 @@ def __init__(self, args, custom_args=[], platform_util=None): self.command_prefix = ("{prefix} " "--dataset_name imagenet " "--checkpoint_path {checkpoint} " - "--dataset_dir {dataset} " "--dataset_split_name=validation " "--clone_on_cpu=True " "--model_name {model} " @@ -64,9 +64,11 @@ def __init__(self, args, custom_args=[], platform_util=None): "--intra_op_parallelism_threads {intra} " "--batch_size {bz}").format( prefix=self.command_prefix, checkpoint=self.args.checkpoint, - dataset=self.args.data_location, model=self.args.model_name, - inter=self.args.num_inter_threads, + 
model=self.args.model_name, inter=self.args.num_inter_threads, intra=self.args.num_intra_threads, bz=self.args.batch_size) + + if self.args.data_location: + self.command_prefix += " --dataset_dir {}".format(self.args.data_location) else: # add args for the accuracy script script_args_list = [ diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/__init__.py b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/__init__.py new file mode 100644 index 000000000..cf793ec6a --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/config.json b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/model_init.py b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/model_init.py new file mode 100644 index 000000000..c693b055c --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/mobilenet_v1/inference/int8/model_init.py @@ -0,0 +1,100 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + +from common.base_model_init import BaseModelInitializer +from common.base_model_init import set_env_var + + +class ModelInitializer(BaseModelInitializer): + """Model initializer for Mobilenet INT8 inference""" + + def __init__(self, args, custom_args=[], platform_util=None): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) + self.cmd = self.get_command_prefix(self.args.socket_id) + "python " + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + + # Set the num_inter_threads and num_intra_threads + self.set_num_inter_intra_threads() + # Set env vars, if they haven't already been set + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + + self.parse_args() + + if self.args.benchmark_only: + run_script = os.path.join( + self.args.intelai_models, self.args.mode, + self.args.precision, "benchmark.py") + script_args_list = [ + "input_graph", "input_height", "input_width", "batch_size", + "input_layer", "output_layer", "num_inter_threads", + "num_intra_threads", "warmup_steps", "steps"] + if self.args.accuracy_only: + run_script = os.path.join( + self.args.intelai_models, self.args.mode, + self.args.precision, "accuracy.py") + script_args_list = [ + "input_graph", "data_location", "input_height", "input_width", + "batch_size", "input_layer", "output_layer", + "num_inter_threads", "num_intra_threads"] + + self.cmd = self.add_args_to_command(self.cmd + run_script, script_args_list) + + def parse_args(self): + if self.custom_args: + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_height", default=224, + dest='input_height', type=int, help="input height") + parser.add_argument( + "--input_width", 
default=224, + dest='input_width', type=int, help="input width") + parser.add_argument( + '--warmup_steps', dest='warmup_steps', + help='number of warmup steps', + type=int, default=10) + parser.add_argument( + '--steps', dest='steps', + help='number of steps', + type=int, default=50) + parser.add_argument( + '--input_layer', dest='input_layer', + help='name of input layer', + type=str, default="input") + parser.add_argument( + '--output_layer', dest='output_layer', + help='name of output layer', + type=str, default="MobilenetV1/Predictions/Reshape_1") + + self.args = parser.parse_args(self.custom_args, + namespace=self.args) + + def run(self): + if self.cmd: + self.run_command(self.cmd) diff --git a/benchmarks/image_recognition/tensorflow/resnet101/README.md b/benchmarks/image_recognition/tensorflow/resnet101/README.md index 442c9cb21..7fb3566eb 100644 --- a/benchmarks/image_recognition/tensorflow/resnet101/README.md +++ b/benchmarks/image_recognition/tensorflow/resnet101/README.md @@ -7,6 +7,11 @@ following modes/precisions: ## Int8 Inference Instructions +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + 1. 
Clone this [intelai/models](https://github.com/IntelAI/models) repository: @@ -80,7 +85,7 @@ $ python launch_benchmark.py \ --framework tensorflow \ --accuracy-only \ --batch-size 100 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --data-location /home//dataset/FullImageNetData_directory \ --in-graph=/home//resnet101_int8_pretrained_model.pb ``` @@ -101,7 +106,7 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph=/home//resnet101_int8_pretrained_model.pb \ -- warmup_steps=50 steps=500 ``` @@ -118,7 +123,7 @@ python launch_benchmark.py \ --batch-size 1 \ --socket-id 0 \ --data-location /home//dataset/FullImageNetData_directory \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph=/home//resnet101_int8_pretrained_model.pb \ -- warmup_steps=50 steps=500 ``` @@ -134,7 +139,7 @@ python launch_benchmark.py \ --benchmark-only \ --batch-size 128 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph=/home//resnet101_int8_pretrained_model.pb \ -- warmup_steps=50 steps=500 ``` @@ -151,18 +156,11 @@ python launch_benchmark.py \ --batch-size 128 \ --data-location /home//dataset/FullImageNetData_directory \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph=/home//resnet101_int8_pretrained_model.pb \ -- warmup_steps=50 steps=500 ``` - -The docker image (`intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl`) -used in the commands above were built using 
-[TensorFlow](git@github.com:tensorflow/tensorflow.git) master -([e889ea1](https://github.com/tensorflow/tensorflow/commit/e889ea1dd965c31c391106aa3518fc23d2689954)) and -[PR #25765](https://github.com/tensorflow/tensorflow/pull/25765). - Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands to get additional debug output or change the default output location.. @@ -176,8 +174,6 @@ Example log tail when running for accuracy: Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.7690, 0.9304) Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.7691, 0.9305) Processed 50000 images. (Top1 accuracy, Top5 accuracy) = (0.7691, 0.9305) -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_resnet101_inference_int8_20190104_205838.log ``` @@ -189,8 +185,6 @@ steps = 470, 48.3195530058 images/sec steps = 480, 47.2792312364 images/sec steps = 490, 46.3175214744 images/sec steps = 500, 45.4044245083 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_resnet101_inference_int8_20190223_191406.log ``` @@ -202,8 +196,6 @@ steps = 470, 328.906266308 images/sec steps = 480, 322.0451309 images/sec steps = 490, 315.455582114 images/sec steps = 500, 309.142758646 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 128 Log location outside container: {--output-dir value}/benchmark_resnet101_inference_int8_20190223_192438.log ``` @@ -258,7 +250,7 @@ $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/resnet10 --mode inference \ --model-name resnet101 \ --batch-size 128 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph 
/home//trained_models/resnet101_fp32_pretrained_model.pb \ --socket-id 0 ``` @@ -272,8 +264,6 @@ $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/resnet10 steps = 80, 169.258177508 images/sec steps = 90, 150.457869027 images/sec steps = 100, 135.433960175 images/sec - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 128 Log location outside container: {--output-dir value}/benchmark_resnet101_inference_fp32_20190104_204615.log ``` @@ -287,7 +277,7 @@ $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/resnet10 --mode inference \ --model-name resnet101 \ --batch-size 100 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /home//trained_models/resnet101_fp32_pretrained_model.pb \ --data-location /home//imagenet_validation_dataset \ --accuracy-only \ @@ -304,8 +294,6 @@ $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/resnet10 Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.7639, 0.9289) Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.7641, 0.9289) Processed 50000 images. 
(Top1 accuracy, Top5 accuracy) = (0.7640, 0.9289) - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_resnet101_inference_fp32_20190104_201506.log ``` diff --git a/benchmarks/image_recognition/tensorflow/resnet101/inference/fp32/config.json b/benchmarks/image_recognition/tensorflow/resnet101/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet101/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/resnet101/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/resnet101/inference/fp32/model_init.py index 5e35e462b..98962a670 100644 --- a/benchmarks/image_recognition/tensorflow/resnet101/inference/fp32/model_init.py +++ b/benchmarks/image_recognition/tensorflow/resnet101/inference/fp32/model_init.py @@ -60,15 +60,17 @@ def __init__(self, args, custom_args=[], platform_util=None): self.args = arg_parser.parse_args(self.custom_args, namespace=self.args) - # Use default KMP variable values, but override the default KMP_BLOCKTIME value - self.set_kmp_vars(kmp_blocktime=str(self.args.kmp_blocktime)) + # Set KMP env vars, if they haven't already been set, but override the default KMP_BLOCKTIME value + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) benchmark_script = os.path.join( self.args.intelai_models, self.args.mode, "eval_image_classifier_inference.py") - self.benchmark_command = self.get_numactl_command(args.socket_id) + \ + self.benchmark_command = 
self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script self.benchmark_command = \ diff --git a/benchmarks/image_recognition/tensorflow/resnet101/inference/int8/config.json b/benchmarks/image_recognition/tensorflow/resnet101/inference/int8/config.json new file mode 100644 index 000000000..6f1228ba7 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet101/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/resnet101/inference/int8/model_init.py b/benchmarks/image_recognition/tensorflow/resnet101/inference/int8/model_init.py index 5e32d3e92..36a9f479a 100644 --- a/benchmarks/image_recognition/tensorflow/resnet101/inference/int8/model_init.py +++ b/benchmarks/image_recognition/tensorflow/resnet101/inference/int8/model_init.py @@ -41,8 +41,9 @@ def __init__(self, args, custom_args=[], platform_util=None): set_env_var("OMP_NUM_THREADS", platform_util.num_cores_per_socket if args.num_cores == -1 else args.num_cores) - # Set KMP env vars, but override default KMP_BLOCKTIME value - self.set_kmp_vars(kmp_blocktime="0") + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) def parse_args(self): parser = argparse.ArgumentParser() @@ -77,7 +78,7 @@ def run_benchmark_or_accuracy(self): self.args.intelai_models, self.args.mode, "eval_image_classifier_inference.py") - cmd = self.get_numactl_command(self.args.socket_id) + self.python_exe + " " + cmd + cmd = self.get_command_prefix(self.args.socket_id) + self.python_exe + " " + cmd cmd += " --input-graph=" + self.args.input_graph + \ " --num-inter-threads=" + str(self.args.num_inter_threads) + \ @@ -100,13 +101,13 @@ def run_benchmark_or_accuracy(self): self.run_command(cmd) def 
run_calibration(self): - calibration_script = os.path.join(self.args.intelai_models, self.args.mode, + calibration_script = os.path.join(self.args.intelai_models, self.args.precision, "calibration.py") script_args_list = [ "input_graph", "data_location", "batch_size", "num_inter_threads", "num_intra_threads"] - cmd_prefix = self.get_numactl_command(self.args.socket_id) + \ + cmd_prefix = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " + calibration_script cmd = self.add_args_to_command(cmd_prefix, script_args_list) self.run_command(cmd) diff --git a/benchmarks/image_recognition/tensorflow/resnet50/README.md b/benchmarks/image_recognition/tensorflow/resnet50/README.md index fec96a4f2..71bbdf7cc 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50/README.md +++ b/benchmarks/image_recognition/tensorflow/resnet50/README.md @@ -10,6 +10,11 @@ precisions. ## Int8 Inference Instructions +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + 1. Download the full ImageNet dataset and convert to the TF records format. * Clone the tensorflow/models repository: @@ -38,12 +43,6 @@ $ git clone https://github.com/IntelAI/models.git The optimized ResNet50 model files are attached to the [intelai/models](https://github.com/intelai/models) repo and located at `models/models/image_recognition/tensorflow/resnet50/`. - The docker image (`intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl`) - used in the commands above were built using - [TensorFlow](git@github.com:tensorflow/tensorflow.git) master - ([e889ea1](https://github.com/tensorflow/tensorflow/commit/e889ea1dd965c31c391106aa3518fc23d2689954)) and - [PR #25765](https://github.com/tensorflow/tensorflow/pull/25765). 
- * Calculate the model accuracy, the required parameters parameters include: the `ImageNet` dataset location (from step 1), the pre-trained `final_int8_resnet50.pb` input graph file (from step 2), and the `--accuracy-only` flag. @@ -59,20 +58,23 @@ $ python launch_benchmark.py \ --mode inference \ --batch-size=100 \ --accuracy-only \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` The log file is saved to the value of `--output-dir`. The tail of the log output when the script completes should look something like this: ``` +Iteration time: 233.495 ms Processed 49600 images. (Top1 accuracy, Top5 accuracy) = (0.7361, 0.9155) +Iteration time: 233.231 ms Processed 49700 images. (Top1 accuracy, Top5 accuracy) = (0.7361, 0.9155) +Iteration time: 234.541 ms Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.7360, 0.9154) +Iteration time: 233.033 ms Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.7361, 0.9155) +Iteration time: 233.013 ms Processed 50000 images. (Top1 accuracy, Top5 accuracy) = (0.7360, 0.9154) -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_resnet50_inference_int8_20190104_212224.log ``` @@ -97,21 +99,22 @@ $ python launch_benchmark.py \ --mode inference \ --batch-size=128 \ --benchmark-only \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 -- warmup_steps=50 steps=500 ``` The tail of the log output when the script completes should look something like this: ``` ... 
-steps = 470, 460.113806562 images/sec -steps = 480, 460.073982602 images/sec -steps = 490, 463.289831148 images/sec -steps = 500, 463.521427264 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu +Iteration 497: 0.253495 sec +Iteration 498: 0.253033 sec +Iteration 499: 0.258083 sec +Iteration 500: 0.254541 sec +Average time: 0.254572 sec +Batch size = 128 +Throughput: 502.805 images/sec Ran inference with batch size 128 -Log location outside container: {--output-dir value}/benchmark_resnet50_inference_int8_20190223_180546.log +Log location outside container: {--output-dir value}/benchmark_resnet50_inference_int8_20190416_172735.log ``` Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands @@ -157,7 +160,7 @@ $ python launch_benchmark.py \ --mode inference \ --batch-size=1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` The log file is saved to the value of `--output-dir`. @@ -176,8 +179,6 @@ Average time: 0.011 sec Batch size = 1 Latency: 10.924 ms Throughput: 91.541 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_20190104_215326.log ``` @@ -194,7 +195,7 @@ $ python launch_benchmark.py \ --mode inference \ --batch-size=128 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` The log file is saved to the value of `--output-dir`. 
@@ -213,8 +214,6 @@ Iteration 40: 0.652 sec Average time: 0.653 sec Batch size = 128 Throughput: 196.065 images/sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 128 Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_20190104_215655.log ``` @@ -234,7 +233,7 @@ $ python launch_benchmark.py \ --batch-size 100 \ --socket-id 0 \ --data-location /home//dataset/ImageNetData_directory \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` The log file is saved to the value of `--output-dir`. @@ -242,9 +241,8 @@ The tail of the log output when the accuracy run completes should look something like this: ``` ... +Iteration time: 649.252 ms Processed 50000 images. (Top1 accuracy, Top5 accuracy) = (0.7430, 0.9188) -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 100 Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_20190104_213452.log ``` @@ -269,7 +267,7 @@ $ python launch_benchmark.py \ --batch-size 100 \ --socket-id 0 \ --data-location /home//dataset/ImageNetData_directory \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` The results file will be written to the `models/benchmarks/common/tensorflow/logs` directory, unless another diff --git a/benchmarks/image_recognition/tensorflow/resnet50/inference/fp32/config.json b/benchmarks/image_recognition/tensorflow/resnet50/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git 
a/benchmarks/image_recognition/tensorflow/resnet50/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/resnet50/inference/fp32/model_init.py index a2e6be8a3..88520cbdd 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50/inference/fp32/model_init.py +++ b/benchmarks/image_recognition/tensorflow/resnet50/inference/fp32/model_init.py @@ -61,15 +61,17 @@ def __init__(self, args, custom_args=[], platform_util=None): self.args = arg_parser.parse_args(self.custom_args, namespace=self.args) - # Use default KMP variable values, but override the default KMP_BLOCKTIME value - self.set_kmp_vars(kmp_blocktime=str(self.args.kmp_blocktime)) + # Set KMP env vars, if they haven't already been set, but override the default KMP_BLOCKTIME value + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) benchmark_script = os.path.join( self.args.intelai_models, self.args.mode, "eval_image_classifier_inference.py") - self.benchmark_command = self.get_numactl_command(args.socket_id) + \ + self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script num_cores = self.platform_util.num_cores_per_socket if self.args.num_cores == -1 \ diff --git a/benchmarks/image_recognition/tensorflow/resnet50/inference/int8/config.json b/benchmarks/image_recognition/tensorflow/resnet50/inference/int8/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/resnet50/inference/int8/model_init.py 
b/benchmarks/image_recognition/tensorflow/resnet50/inference/int8/model_init.py index 07dfa5d2f..41571564c 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50/inference/int8/model_init.py +++ b/benchmarks/image_recognition/tensorflow/resnet50/inference/int8/model_init.py @@ -65,15 +65,16 @@ def parse_args(self): self.args = parser.parse_args(self.custom_args, namespace=self.args) - # Use default KMP variable values, but override the default KMP_BLOCKTIME value - self.set_kmp_vars(kmp_blocktime=str(self.args.kmp_blocktime)) + # Set KMP env vars, if they haven't already been set, but override the default KMP_BLOCKTIME value + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) def run_benchmark_or_accuracy(self): cmd = os.path.join( self.args.intelai_models, self.args.mode, "eval_image_classifier_inference.py") - cmd = self.get_numactl_command(self.args.socket_id) + self.python_exe + " " + cmd + cmd = self.get_command_prefix(self.args.socket_id) + self.python_exe + " " + cmd cmd += " --input-graph=" + self.args.input_graph + \ " --num-inter-threads=" + str(self.args.num_inter_threads) + \ @@ -105,7 +106,7 @@ def run_calibration(self): "input_graph", "data_location", "batch_size", "num_inter_threads", "num_intra_threads"] - cmd_prefix = self.get_numactl_command(self.args.socket_id) + \ + cmd_prefix = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " + calibration_script cmd = self.add_args_to_command(cmd_prefix, script_args_list) self.run_command(cmd) diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/README.md b/benchmarks/image_recognition/tensorflow/resnet50v1_5/README.md new file mode 100644 index 000000000..18889005a --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/README.md @@ -0,0 +1,300 @@ +# ResNet50 + +This document has instructions for how to run ResNet50 (v1.5) for 
the +following precisions: +* [Int8 inference](#int8-inference-instructions) +* [FP32 inference](#fp32-inference-instructions) + +The original ResNet model has multiple versions which have shown better accuracy +and/or batch inference performance. As mentioned in TensorFlow's [official ResNet +model page](https://github.com/tensorflow/models/tree/master/official/resnet), three different +versions of the original ResNet model exist - ResNet50v1, ResNet50v1.5, and ResNet50v2. +As a side note, ResNet50v1.5 is also in MLPerf's [cloud inference benchmark for +image classification](https://github.com/mlperf/inference/tree/master/cloud/image_classification). + +## Int8 Inference Instructions + +1. Download the full ImageNet dataset and convert to the TF records format. + +* Clone the tensorflow/models repository: +``` +$ git clone https://github.com/tensorflow/models.git +``` +The TensorFlow models repo provides +[scripts and instructions](https://github.com/tensorflow/models/tree/master/research/slim#an-automated-script-for-processing-imagenet-data) +to download, process and convert the ImageNet dataset to the TF records format. + +* The ImageNet dataset directory location is only required to calculate the model accuracy. + +2. Download the pre-trained model. +``` +$ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/resnet50v1_5_int8_pretrained_model.pb +``` + +3. Clone the +[intelai/models](https://github.com/intelai/models) +repository +``` +$ git clone https://github.com/IntelAI/models.git +``` + +4. Run the inference script `launch_benchmark.py` with the appropriate parameters to evaluate the model performance and/or calculate the accuracy. +The optimized ResNet50v1.5 model files are attached to the [intelai/models](https://github.com/intelai/models) repo and +located at `models/models/image_recognition/tensorflow/resnet50v1_5/`.
+ + The docker image (`gcr.io/deeplearning-platform-release/tf-cpu.1-14`) + used in the commands below was built using + [TensorFlow](https://github.com/tensorflow/tensorflow) master for TensorFlow + version 1.14. + +* Calculate the model accuracy; the required parameters include: the `ImageNet` dataset location (from step 1), +the pre-trained `resnet50v1_5_int8_pretrained_model.pb` input graph file (from step 2), and the `--accuracy-only` flag. +``` +$ cd /home//models/benchmarks + +$ python launch_benchmark.py \ + --data-location /home//dataset/FullImageNetData_directory \ + --in-graph resnet50v1_5_int8_pretrained_model.pb \ + --model-name resnet50v1_5 \ + --framework tensorflow \ + --precision int8 \ + --mode inference \ + --batch-size=100 \ + --accuracy-only \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 +``` +The log file is saved to the value of `--output-dir`. + +The tail of the log output when the benchmarking completes should look +something like this: +``` +Iteration time: 239.899 ms +Processed 49700 images. (Top1 accuracy, Top5 accuracy) = (0.7622, 0.9296) +Iteration time: 239.110 ms +Processed 49800 images. (Top1 accuracy, Top5 accuracy) = (0.7621, 0.9295) +Iteration time: 239.512 ms +Processed 49900 images. (Top1 accuracy, Top5 accuracy) = (0.7622, 0.9296) +Iteration time: 239.989 ms +Processed 50000 images. (Top1 accuracy, Top5 accuracy) = (0.7623, 0.9296) +Ran inference with batch size 100 +Log location outside container: {--output-dir value}/benchmark_resnet50_inference_int8_{timestamp}.log +``` + +* Evaluate the model performance: If you are only evaluating performance with dummy data, `--data-location` is not needed. +Otherwise, the `--data-location` argument needs to be specified. +To calculate the batch inference performance in `images/sec`, the required parameters to run the inference script include: +the pre-trained `resnet50v1_5_int8_pretrained_model.pb` input graph file (from step +2), and the `--benchmark-only` flag.
It is +optional to specify the number of `warmup_steps` and `steps` as extra +args, as shown in the command below. If these values are not specified, +the script will default to use `warmup_steps=10` and `steps=50`. + +``` +$ cd /home//models/benchmarks + +$ python launch_benchmark.py \ + --in-graph resnet50v1_5_int8_pretrained_model.pb \ + --model-name resnet50v1_5 \ + --framework tensorflow \ + --precision int8 \ + --mode inference \ + --batch-size=128 \ + --benchmark-only \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + -- warmup_steps=50 steps=500 +``` +The tail of the log output when the benchmarking completes should look +something like this: +``` +... +Iteration 490: 0.249899 sec +Iteration 500: 0.249110 sec +Average time: 0.251280 sec +Batch size = 128 +Throughput: 509.392 images/sec +Ran inference with batch size 128 +Log location outside container: {--output-dir value}/benchmark_resnet50_inference_int8_{timestamp}.log +``` + +Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands +to get additional debug output or change the default output location. + +## FP32 Inference Instructions + +1. Download the pre-trained model. + +To get a pre-trained model for ResNet50v1.5, run: +``` +$ wget https://zenodo.org/record/2535873/files/resnet50_v1.pb +``` + +2. Clone the [intelai/models](https://github.com/intelai/models) repository +``` +$ git clone https://github.com/IntelAI/models.git +``` + +3. If running resnet50 for accuracy, the ImageNet dataset will be +required (if running the model for batch or online inference, then dummy +data will be used). + +The TensorFlow models repo provides +[scripts and instructions](https://github.com/tensorflow/models/tree/master/research/slim#an-automated-script-for-processing-imagenet-data) +to download, process, and convert the ImageNet dataset to the TF records format. + +4.
Run the inference script `launch_benchmark.py` with the appropriate parameters to evaluate the model performance. +The optimized ResNet50v1.5 model files are attached to the [intelai/models](https://github.com/intelai/models) repo and +located at `models/models/image_recognition/tensorflow/resnet50v1_5/`. +If benchmarking uses dummy data for inference, the `--data-location` flag is not required. Otherwise, +`--data-location` needs to point to the ImageNet dataset location. + +* To measure online inference, set `--batch-size=1` and run the model script as shown: +``` +$ cd /home//models/benchmarks + +$ python launch_benchmark.py \ + --in-graph resnet50_v1.pb \ + --model-name resnet50v1_5 \ + --framework tensorflow \ + --precision fp32 \ + --mode inference \ + --batch-size=1 \ + --socket-id 0 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 +``` + +The log file is saved to the value of `--output-dir`. + +The tail of the log output when the script completes should look +something like this: +``` +Inference with dummy data. +Iteration 1: 2.761204 sec +Iteration 2: 0.011155 sec +Iteration 3: 0.009289 sec +... +Iteration 48: 0.009315 sec +Iteration 49: 0.009343 sec +Iteration 50: 0.009278 sec +Average time: 0.009481 sec +Batch size = 1 +Latency: 9.481 ms +Throughput: 105.470 images/sec +lscpu_path_cmd = command -v lscpu +lscpu located here: /usr/bin/lscpu +Ran inference with batch size 1 +Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_{timestamp}.log +``` + +* To measure batch inference, set `--batch-size=128` and run the model script as shown: +``` +$ cd /home//models/benchmarks + +$ python launch_benchmark.py \ + --in-graph resnet50_v1.pb \ + --model-name resnet50v1_5 \ + --framework tensorflow \ + --precision fp32 \ + --mode inference \ + --batch-size=128 \ + --socket-id 0 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 +``` + +The log file is saved to the value of `--output-dir`.
+ +The tail of the log output when the script completes should look +something like this: +``` +Inference with dummy data. +Iteration 1: 3.013918 sec +Iteration 2: 0.543498 sec +Iteration 3: 0.536187 sec +Iteration 4: 0.532568 sec +... +Iteration 46: 0.532444 sec +Iteration 47: 0.535652 sec +Iteration 48: 0.532158 sec +Iteration 49: 0.538117 sec +Iteration 50: 0.532411 sec +Average time: 0.534427 sec +Batch size = 128 +Throughput: 239.509 images/sec +Ran inference with batch size 128 +Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_{timestamp}.log +``` + +* To measure the model accuracy, use the `--accuracy-only` flag and pass +the ImageNet dataset directory from step 3 as the `--data-location`: +``` +$ cd /home//models/benchmarks + +$ python launch_benchmark.py \ + --in-graph resnet50_v1.pb \ + --model-name resnet50v1_5 \ + --framework tensorflow \ + --precision fp32 \ + --mode inference \ + --accuracy-only \ + --batch-size 100 \ + --socket-id 0 \ + --data-location /home//dataset/ImageNetData_directory \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 +``` + +The log file is saved to the value of `--output-dir`. +The tail of the log output when the accuracy run completes should look +something like this: +``` +... +Iteration time: 514.427 ms +Processed 50000 images. (Top1 accuracy, Top5 accuracy) = (0.7651, 0.9307) +lscpu_path_cmd = command -v lscpu +lscpu located here: /usr/bin/lscpu +Ran inference with batch size 100 +Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_{timestamp}.log +``` + +* The `--output-results` flag can be used along with above performance +or accuracy test, in order to also output a file with the inference +results (file name, actual label, and the predicted label). The results +output can only be used with real data. 
+ +For example, the command below is the same as the accuracy test above, +except with the `--output-results` flag added: +``` +$ cd /home//models/benchmarks + +$ python launch_benchmark.py \ + --in-graph resnet50_v1.pb \ + --model-name resnet50v1_5 \ + --framework tensorflow \ + --precision fp32 \ + --mode inference \ + --accuracy-only \ + --output-results \ + --batch-size 100 \ + --socket-id 0 \ + --data-location /home//dataset/ImageNetData_directory \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 +``` +The results file will be written to the +`models/benchmarks/common/tensorflow/logs` directory, unless another +output directory is specified by the `--output-dir` arg. Below is an +example of what the inference results file will look like: +``` +filename,actual,prediction +ILSVRC2012_val_00033870.JPEG,592,592 +ILSVRC2012_val_00045598.JPEG,258,258 +ILSVRC2012_val_00047428.JPEG,736,736 +ILSVRC2012_val_00003341.JPEG,344,344 +ILSVRC2012_val_00037069.JPEG,192,192 +ILSVRC2012_val_00029701.JPEG,440,440 +ILSVRC2012_val_00016918.JPEG,286,737 +ILSVRC2012_val_00015545.JPEG,5,5 +ILSVRC2012_val_00016713.JPEG,274,274 +ILSVRC2012_val_00014735.JPEG,31,31 +... +``` + +Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands +to get additional debug output or change the default output location. diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/__init__.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/__init__.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/__init__.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/config.json b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/model_init.py new file mode 100644 index 000000000..7231243b8 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/model_init.py @@ -0,0 +1,115 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from common.base_model_init import BaseModelInitializer +from common.base_model_init import set_env_var + +import os +from argparse import ArgumentParser +import time + + +class ModelInitializer(BaseModelInitializer): + """initialize mode and run benchmark""" + + def __init__(self, args, custom_args=[], platform_util=None): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) + + self.benchmark_command = "" + if not platform_util: + raise ValueError("Did not find any platform info.") + + # use default batch size if -1 + if self.args.batch_size == -1: + self.args.batch_size = 128 + + # set num_inter_threads and num_intra_threads + self.set_num_inter_intra_threads() + + arg_parser = ArgumentParser(description='Parse args') + + arg_parser.add_argument("--warmup-steps", dest='warmup_steps', + type=int, default=10, + help="number of warmup steps") + arg_parser.add_argument("--steps", dest='steps', + type=int, default=50, + help="number of steps") + arg_parser.add_argument( + '--kmp-blocktime', dest='kmp_blocktime', + help='number of kmp block time', + type=int, default=1) + + self.args = arg_parser.parse_args(self.custom_args, namespace=self.args) + + # Set KMP env vars, if they haven't already been set, but override the default KMP_BLOCKTIME value + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) + + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + + benchmark_script = os.path.join( + self.args.intelai_models, self.args.mode, + "eval_image_classifier_inference.py") + + self.benchmark_command = self.get_command_prefix(args.socket_id) + \ + self.python_exe + " " + benchmark_script + + num_cores = self.platform_util.num_cores_per_socket if self.args.num_cores 
== -1 \ + else self.args.num_cores + + self.benchmark_command = \ + self.benchmark_command + \ + " --input-graph=" + self.args.input_graph + \ + " --num-inter-threads=" + str(self.args.num_inter_threads) + \ + " --num-intra-threads=" + str(self.args.num_intra_threads) + \ + " --num-cores=" + str(num_cores) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) + + if self.args.data_num_inter_threads: + self.benchmark_command += " --data-num-inter-threads=" + str(self.args.data_num_inter_threads) + if self.args.data_num_intra_threads: + self.benchmark_command += " --data-num-intra-threads=" + str(self.args.data_num_intra_threads) + + # if the data location directory is not empty, then include the arg + if self.args.data_location and os.listdir(self.args.data_location): + self.benchmark_command += " --data-location=" + \ + self.args.data_location + if self.args.accuracy_only: + self.benchmark_command += " --accuracy-only" + + # if output results is enabled, generate a results file name and pass it to the inference script + if self.args.output_results: + self.results_filename = "{}_{}_{}_results_{}.txt".format( + self.args.model_name, self.args.precision, self.args.mode, + time.strftime("%Y%m%d_%H%M%S", time.gmtime())) + self.results_file_path = os.path.join(self.args.output_dir, self.results_filename) + self.benchmark_command += " --results-file-path {}".format(self.results_file_path) + + def run(self): + if self.benchmark_command: + self.run_command(self.benchmark_command) + + if self.args.output_results: + print("Inference results file in the output directory: {}".format(self.results_filename)) diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/__init__.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ 
b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/config.json b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/model_init.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/model_init.py new file mode 100644 index 000000000..03b523829 --- /dev/null +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/model_init.py @@ -0,0 +1,123 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from common.base_model_init import BaseModelInitializer +from common.base_model_init import set_env_var + +import argparse +import os + + +class ModelInitializer(BaseModelInitializer): + """Model initializer for resnet50 int8 inference""" + + def __init__(self, args, custom_args=[], platform_util=None): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) + + # Set the num_inter_threads and num_intra_threads + self.set_num_inter_intra_threads() + # Set env vars, if they haven't already been set + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads, overwrite_existing=True) + + def parse_args(self): + parser = argparse.ArgumentParser() + parser.add_argument( + "--warmup-steps", dest="warmup_steps", + help="number of warmup steps", + type=int, default=10) + parser.add_argument( + "--steps", dest="steps", + help="number of steps", + type=int, default=50) + parser.add_argument( + '--kmp-blocktime', dest='kmp_blocktime', + help='number of kmp block time', + type=int, default=1) + parser.add_argument( + "--calibration-only", + help="Calibrate the accuracy.", + dest="calibration_only", action="store_true") + parser.add_argument( + "--calibrate", dest="calibrate", + help=" run accuracy with calibration data, " + "to generate min_max ranges, calibrate=[True/False]", + type=bool, default=False) + + self.args = parser.parse_args(self.custom_args, + namespace=self.args) + + # Set KMP 
env vars, if they haven't already been set, but override the default KMP_BLOCKTIME value + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) + + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + + def run_benchmark_or_accuracy(self): + cmd = os.path.join( + self.args.intelai_models, self.args.mode, + "eval_image_classifier_inference.py") + + cmd = self.get_command_prefix(self.args.socket_id) + self.python_exe + " " + cmd + + cmd += " --input-graph=" + self.args.input_graph + \ + " --num-inter-threads=" + str(self.args.num_inter_threads) + \ + " --num-intra-threads=" + str(self.args.num_intra_threads) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) + + if self.args.calibrate: + cmd += " --calibrate=" + str(self.args.calibrate) + if self.args.data_num_inter_threads: + cmd += " --data-num-inter-threads=" + str(self.args.data_num_inter_threads) + if self.args.data_num_intra_threads: + cmd += " --data-num-intra-threads=" + str(self.args.data_num_intra_threads) + + # if the data location directory is not empty, then include the arg + if self.args.data_location and os.listdir(self.args.data_location): + cmd += " --data-location=" + self.args.data_location + if self.args.accuracy_only: + cmd += " --accuracy-only" + + self.run_command(cmd) + + def run_calibration(self): + calibration_script = os.path.join(self.args.intelai_models, + self.args.precision, + "generate_calibration_data.py") + script_args_list = [ + "input_graph", "data_location", + "batch_size", + "num_inter_threads", "num_intra_threads"] + cmd_prefix = self.get_command_prefix(self.args.socket_id) + \ + self.python_exe + " " + calibration_script + cmd = self.add_args_to_command(cmd_prefix, script_args_list) + self.run_command(cmd) + + def run(self): + # Parse custom arguments and 
append to self.args + self.parse_args() + if self.args.accuracy_only and self.args.calibration_only: + self.run_calibration() + else: + self.run_benchmark_or_accuracy() diff --git a/benchmarks/image_recognition/tensorflow/squeezenet/README.md b/benchmarks/image_recognition/tensorflow/squeezenet/README.md index e5e9cc86c..feaba492a 100644 --- a/benchmarks/image_recognition/tensorflow/squeezenet/README.md +++ b/benchmarks/image_recognition/tensorflow/squeezenet/README.md @@ -79,7 +79,7 @@ $ python launch_benchmark.py \ --framework tensorflow \ --socket-id 0 \ --batch-size 64 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --checkpoint /home//squeezenet_checkpoints \ --data-location /home//datasets/ImageNet_TFRecords ``` @@ -94,7 +94,7 @@ $ python launch_benchmark.py \ --framework tensorflow \ --socket-id 0 \ --batch-size 1 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --checkpoint /home//squeezenet_checkpoints \ --data-location /home//datasets/ImageNet_TFRecords ``` @@ -114,8 +114,6 @@ SqueezeNet Inference Summary: throughput[med] = 837.1 image/sec latency[median] = 1.195 ms -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 64 Log location outside container: {--output-dir value}/benchmark_squeezenet_inference_fp32_20190104_220051.log ``` @@ -129,8 +127,6 @@ SqueezeNet Inference Summary: throughput[med] = 115.3 image/sec latency[median] = 8.67 ms -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_squeezenet_inference_fp32_20190104_220712.log ``` diff --git a/benchmarks/image_recognition/tensorflow_serving/__init__.py b/benchmarks/image_recognition/tensorflow_serving/__init__.py new file mode 100644 index 
000000000..cf793ec6a --- /dev/null +++ b/benchmarks/image_recognition/tensorflow_serving/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow_serving/inceptionv3/README.md b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/README.md new file mode 100644 index 000000000..bef280f1d --- /dev/null +++ b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/README.md @@ -0,0 +1,90 @@ +# Inception V3 + +This document has instructions for how to run Inception V3 for the +following modes/precisions: +* [FP32 inference](#fp32-inference-instructions) + +## FP32 Inference Instructions + +1. Clone this [intelai/models](https://github.com/IntelAI/models) +repository: + +``` +$ git clone https://github.com/IntelAI/models.git +``` + +2. Download the pre-trained model. +``` +$ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/inceptionv3_fp32_pretrained_model.pb +``` + +3. Navigate to the `benchmarks` directory in your local clone of +the [intelai/models](https://github.com/IntelAI/models) repo from step 1. +The `launch_benchmark.py` script in the `benchmarks` directory is +used for starting a tensorflow serving run using optimized TensorFlow Serving docker +container. It has arguments to specify which model, framework, mode, +precision, and input graph. 
+ +Substitute in your own `--in-graph` pretrained model file path (from step 2). + +4. Inception V3 can be run for measuring batch or online inference performance. Use one of the following examples below, +depending on your use case. + +* For online inference with dummy data (using `--batch-size 1`): + +``` +python launch_benchmark.py \ + --in-graph /home//inceptionv3_fp32_pretrained_model.pb \ + --model-name inceptionv3 \ + --framework tensorflow_serving \ + --precision fp32 \ + --mode inference \ + --batch-size=1 \ + --benchmark-only +``` +Example log tail when running for online inference: +``` +Iteration 35: 0.019 sec +Iteration 36: 0.020 sec +Iteration 37: 0.018 sec +Iteration 38: 0.018 sec +Iteration 39: 0.019 sec +Iteration 40: 0.018 sec +Average time: 0.019 sec +Batch size = 1 +Latency: 18.801 ms +Throughput: 53.189 images/sec +tfserving_3784 +Log output location: {--output-dir value}/benchmark_inceptionv3_inference_fp32_20190516_103531.log +``` + +* For batch inference with dummy data (using `--batch-size 128`): + +``` +python launch_benchmark.py \ + --in-graph /home//inceptionv3_fp32_pretrained_model.pb \ + --model-name inceptionv3 \ + --framework tensorflow_serving \ + --precision fp32 \ + --mode inference \ + --batch-size=128 \ + --benchmark-only +``` +Example log tail when running for batch inference: +``` +Iteration 34: 0.779 sec +Iteration 35: 0.916 sec +Iteration 36: 0.809 sec +Iteration 37: 0.793 sec +Iteration 38: 0.813 sec +Iteration 39: 0.796 sec +Iteration 40: 0.796 sec +Average time: 0.817 sec +Batch size = 128 +Throughput: 156.752 images/sec +tfserving_5299 +Log output location: {--output-dir value}/benchmark_inceptionv3_inference_fp32_20190516_103958.log +``` + +Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands +to get additional debug output or change the default output location. 
diff --git a/benchmarks/image_recognition/tensorflow_serving/inceptionv3/__init__.py b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/__init__.py new file mode 100644 index 000000000..cf793ec6a --- /dev/null +++ b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/__init__.py b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/__init__.py new file mode 100644 index 000000000..cf793ec6a --- /dev/null +++ b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/fp32/__init__.py b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/fp32/__init__.py new file mode 100644 index 000000000..cf793ec6a --- /dev/null +++ b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/fp32/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/fp32/image_recognition_benchmark.py b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/fp32/image_recognition_benchmark.py new file mode 100644 index 000000000..3178741db --- /dev/null +++ b/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/fp32/image_recognition_benchmark.py @@ -0,0 +1,117 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +"""Send simulated image data to tensorflow_model_server loaded with ResNet50 or InceptionV3 model. + +""" + +from __future__ import print_function + +import os +import random + +import grpc +import numpy as np +import sys +import tensorflow as tf +import time +from tensorflow_serving.apis import predict_pb2 +from tensorflow_serving.apis import prediction_service_pb2_grpc + +from util import preprocess_image, parse_example_proto + +tf.app.flags.DEFINE_string('server', 'localhost:8500', + 'PredictionService host:port') +tf.app.flags.DEFINE_integer('batch_size', 1, 'Batch size to use') +tf.app.flags.DEFINE_string('data_dir', '', 'path to images in TF records format') +tf.app.flags.DEFINE_string('model', 'resnet50', 'Name of model (resnet50 or inceptionv3).') +FLAGS = tf.app.flags.FLAGS + + +def sample_images(image_size): + """Pull a random batch of images from FLAGS.data_dir containing TF record formatted ImageNet validation set + Returns: + ndarray of float32 with shape [FLAGS.batch_size, image_size, image_size, 3] + """ + + sample_file = random.choice(os.listdir(FLAGS.data_dir)) + dataset = tf.data.TFRecordDataset(os.path.join(FLAGS.data_dir, sample_file)) + dataset = dataset.map(lambda x: parse_example_proto(x)).shuffle(True).batch(FLAGS.batch_size) + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + with tf.Session() as sess: + images, labels = sess.run(next_element) + images = np.array([sess.run(preprocess_image(x, FLAGS.model, image_size)) for x in images]) + + return images + 


def main(_):
    """Time 40 Predict requests against the model server and print a summary.

    The first 10 iterations are warm-up and are excluded from the average.
    Input is read from FLAGS.data_dir when set, otherwise random data
    rescaled for the selected model is sent.
    """
    image_sizes = {'resnet50': 224, 'inceptionv3': 299}
    if FLAGS.model not in image_sizes:
        print('Please specify model as either resnet50 or inceptionv3.')
        sys.exit(-1)
    image_size = image_sizes[FLAGS.model]

    channel = grpc.insecure_channel(FLAGS.server)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

    num_iteration = 40
    warm_up_iteration = 10
    timings = []
    for iteration in range(1, num_iteration + 1):
        if FLAGS.data_dir:
            image_np = sample_images(image_size)
        else:
            batch_shape = (FLAGS.batch_size, image_size, image_size, 3)
            image_np = np.random.rand(*batch_shape).astype(np.float32)
            if FLAGS.model == 'resnet50':
                # For ResNet50, rescale to [0, 256]
                image_np *= 256.0
            elif FLAGS.model == 'inceptionv3':
                # For InceptionV3, rescale to [-1, 1]
                image_np = (image_np - 0.5) * 2.0

        request = predict_pb2.PredictRequest()
        request.model_spec.name = FLAGS.model
        request.model_spec.signature_name = 'serving_default'
        tensor_proto = tf.contrib.util.make_tensor_proto(
            image_np, shape=[FLAGS.batch_size, image_size, image_size, 3])
        request.inputs['input'].CopyFrom(tensor_proto)

        # Only the Predict round trip is timed.
        start_time = time.time()
        stub.Predict(request, 10.0)  # 10 secs timeout
        time_consume = time.time() - start_time
        print('Iteration %d: %.3f sec' % (iteration, time_consume))
        timings.append(time_consume)

    # Warm-up iterations do not count towards the average.
    total_time = sum(timings[warm_up_iteration:])
    time_average = total_time / (num_iteration - warm_up_iteration)
    print('Average time: %.3f sec' % (time_average))

    print('Batch size = %d' % FLAGS.batch_size)
    if FLAGS.batch_size == 1:
        print('Latency: %.3f ms' % (time_average * 1000))

    print('Throughput: %.3f images/sec' % (FLAGS.batch_size / time_average))


if __name__ == '__main__':
    tf.app.run()
def main(_):
    """Send one image to the model server and print the predicted class.

    The image is read from FLAGS.image when set, otherwise it is downloaded
    from IMAGE_URL. It is preprocessed for FLAGS.model, sent as a single
    element batch, and the argmax of the 'predict' output is printed.
    """
    if FLAGS.model == 'resnet50':
        image_size = 224
    elif FLAGS.model == 'inceptionv3':
        image_size = 299
    else:
        # The message names the exact values accepted by the checks above.
        print('Please specify model as either resnet50 or inceptionv3.')
        sys.exit(-1)

    if FLAGS.image:
        with open(FLAGS.image, 'rb') as f:
            data = f.read()
    else:
        # Download the image URL if a path is not provided as input
        dl_request = requests.get(IMAGE_URL, stream=True)
        dl_request.raise_for_status()
        data = dl_request.content

    channel = grpc.insecure_channel(FLAGS.server)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = FLAGS.model
    request.model_spec.signature_name = 'serving_default'
    image_data = tf.reshape(preprocess_image(data, FLAGS.model, image_size), [1, image_size, image_size, 3])

    # Run the graph to turn the preprocessing tensor into a numpy array
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        image_data = (sess.run(image_data))

    request.inputs['input'].CopyFrom(
        tf.contrib.util.make_tensor_proto(image_data, shape=[1, image_size, image_size, 3]))
    result = stub.Predict(request)
    print(result)
    print('Predicted class: ', str(np.argmax(result.outputs['predict'].float_val)))


if __name__ == '__main__':
    tf.app.run()
#
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: EPL-2.0
#

from __future__ import print_function

import tensorflow as tf


def preprocess_image(image_buffer, model, image_size):
    """Preprocess JPEG encoded bytes to 3D float Tensor.

    Args:
        image_buffer: JPEG encoded image (string tensor or bytes).
        model: model name; 'resnet50' and 'inceptionv3' select a model
            specific rescale and are matched case-insensitively, so both
            'inceptionv3' and 'Inceptionv3' work. Any other name leaves the
            pixels in [0, 1).
        image_size: height and width, in pixels, of the output image.

    Returns:
        float32 Tensor of shape [image_size, image_size, 3].
    """
    # Decode the string as an RGB JPEG of unknown height and width.
    image = tf.image.decode_jpeg(image_buffer, channels=3)
    # Convert pixels to [0, 1)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    # Crop the central region to 87.5% of the original image.
    image = tf.image.central_crop(image, central_fraction=0.875)
    # Resize the image to image_size x image_size. resize_bilinear works on
    # batches, so add a batch dimension and drop it again afterwards.
    image = tf.expand_dims(image, 0)
    image = tf.image.resize_bilinear(image, [image_size, image_size], align_corners=False)
    image = tf.squeeze(image, [0])

    # Normalise the name once; a case-sensitive test against 'Inceptionv3'
    # never matches the lowercase 'inceptionv3' value.
    model_key = str(model).lower()
    if model_key == 'resnet50':
        # For ResNet50, rescale to [0, 256]
        image = tf.multiply(image, 256.0)
    elif model_key == 'inceptionv3':
        # For InceptionV3, rescale to [-1, 1]
        image = tf.subtract(image, 0.5)
        image = tf.multiply(image, 2.0)
    return image


def parse_example_proto(example_serialized):
    """Parse one serialized Example proto into its image and label.

    Args:
        example_serialized: scalar string tensor holding a serialized
            Example with 'image/encoded' and 'image/class/label' features.

    Returns:
        Tuple (encoded image string tensor, int32 label tensor of shape [1]).
    """
    # Dense features in Example proto.
    feature_map = {
        'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
                                            default_value=''),
        'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
                                                default_value=-1),
    }

    features = tf.parse_single_example(example_serialized, feature_map)
    label = tf.cast(features['image/class/label'], dtype=tf.int32)

    return features['image/encoded'], label
+# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/image_segmentation/tensorflow/maskrcnn/README.md b/benchmarks/image_segmentation/tensorflow/maskrcnn/README.md index 15edaebba..218fd7e2f 100644 --- a/benchmarks/image_segmentation/tensorflow/maskrcnn/README.md +++ b/benchmarks/image_segmentation/tensorflow/maskrcnn/README.md @@ -61,7 +61,7 @@ $ python launch_benchmark.py \ --batch-size 1 \ --socket-id 0 \ --data-location /home//COCO2014 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl-py3 + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` 5. Log files are located at the value of `--output-dir`. @@ -90,7 +90,5 @@ Batch size: 1 Time spent per BATCH: 609.6943 ms Total samples/sec: 1.6402 samples/s Total time: 35.407243490219116 -lscpu_path_cmd = command -v lscpu -lscpu located here: b'/usr/bin/lscpu' Log location outside container: {--output-dir value}/benchmark_maskrcnn_inference_fp32_20190111_205935.log ``` \ No newline at end of file diff --git a/benchmarks/image_segmentation/tensorflow/maskrcnn/inference/fp32/config.json b/benchmarks/image_segmentation/tensorflow/maskrcnn/inference/fp32/config.json new file mode 100644 index 000000000..23d5de76e --- /dev/null +++ b/benchmarks/image_segmentation/tensorflow/maskrcnn/inference/fp32/config.json @@ -0,0 +1,8 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1, + "KMP_HW_SUBSET": "1T" + } +} diff --git a/benchmarks/image_segmentation/tensorflow/maskrcnn/inference/fp32/model_init.py b/benchmarks/image_segmentation/tensorflow/maskrcnn/inference/fp32/model_init.py index 1fe96fe2b..35412be2f 100644 --- a/benchmarks/image_segmentation/tensorflow/maskrcnn/inference/fp32/model_init.py +++ b/benchmarks/image_segmentation/tensorflow/maskrcnn/inference/fp32/model_init.py @@ -37,12 +37,12 @@ def __init__(self, args, custom_args=[], platform_util=None): self.set_num_inter_intra_threads() # Set KMP 
env vars, if they haven't already been set - self.set_kmp_vars(kmp_affinity="granularity=fine, compact, 1, 0") - set_env_var("KMP_HW_SUBSET", "1T") + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) benchmark_script = os.path.join( self.args.intelai_models, "coco.py") - self.benchmark_command = self.get_numactl_command(args.socket_id) + \ + self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script + " evaluate " set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) diff --git a/benchmarks/image_segmentation/tensorflow/unet/README.md b/benchmarks/image_segmentation/tensorflow/unet/README.md index e7d9693e4..d86505a69 100644 --- a/benchmarks/image_segmentation/tensorflow/unet/README.md +++ b/benchmarks/image_segmentation/tensorflow/unet/README.md @@ -57,7 +57,7 @@ modes/precisions: --benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --checkpoint /home//unet_trained \ --model-source-dir /home//tf_unet \ -- checkpoint_name=model.cpkt @@ -73,8 +73,6 @@ modes/precisions: ``` Time spent per BATCH: 1.1043 ms Total samples/sec: 905.5344 samples/s - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_unet_inference_fp32_20190201_205601.log ``` \ No newline at end of file diff --git a/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/config.json b/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/config.json new file mode 100644 index 000000000..ca15cfe6d --- /dev/null +++ b/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine, compact", + "KMP_BLOCKTIME": 1, + 
"KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/model_init.py b/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/model_init.py index cd4f5837d..3cdcf1701 100644 --- a/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/model_init.py +++ b/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/model_init.py @@ -41,7 +41,8 @@ def __init__(self, args, custom_args=[], platform_util=None): self.set_num_inter_intra_threads() # Set KMP env vars, if they haven't already been set - self.set_kmp_vars(kmp_affinity="granularity=fine, compact") + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # Get path to the inference script script_path = os.path.join( @@ -50,7 +51,7 @@ def __init__(self, args, custom_args=[], platform_util=None): "unet_infer.py") # Create the command prefix using numactl - self.command_prefix = self.get_numactl_command(self.args.socket_id) +\ + self.command_prefix = self.get_command_prefix(self.args.socket_id) +\ "{} {}".format(self.python_exe, script_path) # Add batch size arg diff --git a/benchmarks/language_modeling/__init__.py b/benchmarks/language_modeling/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/language_modeling/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/language_modeling/tensorflow/__init__.py b/benchmarks/language_modeling/tensorflow/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/language_modeling/tensorflow/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/language_modeling/tensorflow/lm-1b/README.md b/benchmarks/language_modeling/tensorflow/lm-1b/README.md new file mode 100644 index 000000000..fa05e8b3b --- /dev/null +++ b/benchmarks/language_modeling/tensorflow/lm-1b/README.md @@ -0,0 +1,100 @@ +# LM-1B + +This document has instructions for how to run LM-1B for the +following modes/platforms: +* [FP32 inference](#fp32-inference-instructions) + +Instructions and scripts for model training and inference for +other platforms are coming later. + +## FP32 Inference Instructions + +1. Clone [mlperf/inference](https://github.com/mlperf/inference.git) +with the current SHA from master of the repo on 6/26/2019: +``` +git clone https://github.com/mlperf/inference.git +cd inference +git checkout 41eb3e489233e83e544cd25148aca177b95d7bea +``` + +To prepare the checkpoint and dataset, run the `benchmark.py` script +from the mlperf inference repo. 
Since this requires python3 and +TensorFlow to be installed, the following instructions show how to run +a docker container with your cloned mlperf inference repo mounted as a +volume: +``` +docker run --volume /home//inference:/inference -it gcr.io/deeplearning-platform-release/tf-cpu.1-14 /bin/bash +``` +In the docker container, run: +``` +cd /inference/others/cloud/language_modeling/ +python3 benchmark.py +``` + +2. Clone this [intelai/models](https://github.com/IntelAI/models) +repository: + +``` +git clone https://github.com/IntelAI/models.git +``` + +3. Next, navigate to the `benchmarks` directory in your local clone of +the [intelai/models](https://github.com/IntelAI/models) repo (from step 2). +The `launch_benchmark.py` script in the `benchmarks` directory is +used for starting a model run in an optimized TensorFlow docker +container. It has arguments to specify which model, framework, mode, +precision, and docker image to use, and the checkpoint directory. + +Set the `--model-source-dir` to `/inference/others/cloud/language_modeling`. +Before running, ensure that you have run the script to prepare checkpoint files and the dataset +from Step 1. + +LM-1B can run for online or batch inference. Use one of the following examples below, depending on +your use case.
+ +For online inference (using `--socket-id 0` and `--batch-size 1`): + +``` +python launch_benchmark.py \ + --model-name lm-1b \ + --precision fp32 \ + --mode inference \ + --framework tensorflow \ + --batch-size 1 \ + --socket-id 0 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --model-source-dir /inference/others/cloud/language_modeling + +``` + +For batch inference (using `--socket-id 0` and `--batch-size 1024`): + +``` +python launch_benchmark.py \ + --model-name lm-1b \ + --precision fp32 \ + --mode inference \ + --framework tensorflow \ + --batch-size 1024 \ + --socket-id 0 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --model-source-dir /inference/others/cloud/language_modeling \ + -- steps=4 +``` + +Note that the `--verbose` flag can be added to any of the above commands +to get additional debug output. + +4. By default, the log file is saved to the +`models/benchmarks/common/tensorflow/logs` directory. The user can specify a +different directory using `--output-dir`. + +Example log tail when running for online or batch inference: +``` +Running warmup... +Running benchmark... +Number samples: 4234 +Longest latency was: 2.9153692722320557 seconds. Average latency was:2.891982913017273 +Perplexity: 40.110043230980665, target is 40.209 . +Ran inference with batch size 1024 +``` diff --git a/benchmarks/language_modeling/tensorflow/lm-1b/__init__.py b/benchmarks/language_modeling/tensorflow/lm-1b/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/language_modeling/tensorflow/lm-1b/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/language_modeling/tensorflow/lm-1b/inference/__init__.py b/benchmarks/language_modeling/tensorflow/lm-1b/inference/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/language_modeling/tensorflow/lm-1b/inference/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/__init__.py b/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/config.json b/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/config.json new file mode 100644 index 000000000..8ae78e72a --- /dev/null +++ b/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,compact,1,0", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/model_init.py b/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/model_init.py new file mode 100644 index 000000000..535f42416 --- /dev/null +++ b/benchmarks/language_modeling/tensorflow/lm-1b/inference/fp32/model_init.py @@ -0,0 +1,77 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# + +import os +from argparse import ArgumentParser + +from common.base_model_init import BaseModelInitializer +from common.base_model_init import set_env_var + + +class ModelInitializer(BaseModelInitializer): + """Model initializer for LM-1B FP32 inference""" + + def __init__(self, args, custom_args, platform_util=None): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) + + self.cmd = self.get_command_prefix(self.args.socket_id) + + self.set_num_inter_intra_threads() + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + + if self.args.socket_id != -1: + if self.args.num_cores != -1: + self.cmd += "--physcpubind=0-" + \ + (str(self.args.num_cores - 1)) + " " + self.cmd += self.python_exe + " " + + run_script = os.path.join(self.args.model_source_dir, + "benchmark.py") + + # Model args + arg_parser = ArgumentParser(description='process custom_args') + + arg_parser.add_argument('-S', '--steps', help='Number of steps', + dest="steps", + default="100") + self.args = arg_parser.parse_args(self.custom_args, + namespace=self.args) + + # Model parameter control + cmd_args = " -b=" + str(self.args.batch_size) + \ + " -I=" + str(self.args.steps) + \ + " --inter=" + \ + str(self.args.num_inter_threads) + \ + " --intra=" + \ + str(self.args.num_intra_threads) + + self.cmd = self.cmd + run_script + cmd_args + + def run(self): + original_dir = os.getcwd() + os.chdir(self.args.model_source_dir) + self.run_command(self.cmd) + + os.chdir(original_dir) diff --git a/benchmarks/language_translation/tensorflow/gnmt/README.md b/benchmarks/language_translation/tensorflow/gnmt/README.md index fd92755b2..987be7075 100644 --- a/benchmarks/language_translation/tensorflow/gnmt/README.md +++ 
b/benchmarks/language_translation/tensorflow/gnmt/README.md @@ -82,7 +82,7 @@ python launch_benchmark.py \ --socket-id 0 \ --checkpoint /home//gnmt_checkpoints \ --data-location /home//wmt16 \ ---docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ +--docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ -- infer_mode=beam_search ``` @@ -99,7 +99,7 @@ python launch_benchmark.py \ --socket-id 0 \ --checkpoint /home//gnmt_checkpoints \ --data-location /home//wmt16 \ ---docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ +--docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ -- infer_mode=beam_search ``` @@ -118,8 +118,6 @@ Example log tail when running for online inference: done, num sentences 2169, num translations per input 1, time 1108s, Wed Feb 6 01:36:13 2019. The latency of the model is 511.2466 ms/sentences bleu: 29.2 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_gnmt_inference_fp32_20190206_011740.log ``` @@ -134,8 +132,6 @@ Example log tail when running for batch inference: done, num sentences 2169, num translations per input 1, time 302s, Wed Feb 6 01:48:30 2019. 
The throughput of the model is 7.1780 sentences/s bleu: 29.2 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 32 Log location outside container: {--output-dir value}/benchmark_gnmt_inference_fp32_20190206_014324.log ``` diff --git a/benchmarks/language_translation/tensorflow/gnmt/inference/fp32/config.json b/benchmarks/language_translation/tensorflow/gnmt/inference/fp32/config.json new file mode 100644 index 000000000..4d0e2acf5 --- /dev/null +++ b/benchmarks/language_translation/tensorflow/gnmt/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/language_translation/tensorflow/gnmt/inference/fp32/model_init.py b/benchmarks/language_translation/tensorflow/gnmt/inference/fp32/model_init.py index 61ef1bda6..6f46f2c80 100644 --- a/benchmarks/language_translation/tensorflow/gnmt/inference/fp32/model_init.py +++ b/benchmarks/language_translation/tensorflow/gnmt/inference/fp32/model_init.py @@ -30,15 +30,16 @@ class ModelInitializer(BaseModelInitializer): def __init__(self, args, custom_args=[], platform_util=None): super(ModelInitializer, self).__init__(args, custom_args, platform_util) - self.cmd = self.get_numactl_command(self.args.socket_id) + self.cmd = self.get_command_prefix(self.args.socket_id) if self.args.socket_id != -1 and self.args.num_cores != -1: self.cmd += "--physcpubind=0-" + \ (str(self.args.num_cores - 1)) + " " self.cmd += "{} ".format(self.python_exe) - # Set the KMP env vars - self.set_kmp_vars(kmp_affinity="granularity=fine,compact,1,0") + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # use default batch size if -1 if self.args.batch_size == -1: diff --git 
a/benchmarks/language_translation/tensorflow/transformer_language/README.md b/benchmarks/language_translation/tensorflow/transformer_language/README.md index 100b1e16d..2c0b700f2 100644 --- a/benchmarks/language_translation/tensorflow/transformer_language/README.md +++ b/benchmarks/language_translation/tensorflow/transformer_language/README.md @@ -67,8 +67,10 @@ Substitute the `--model-source-dir` for the location where you cloned the [tensorflow/tensor2tensor](https://github.com/tensorflow/tensor2tensor) repo (from step 1). -Transformer Language can run for online or batch inference. Use one of the following examples below, depending on -your use case. +Transformer Language can run for online or batch +inference. Use one of the following examples below, depending on +your use case. Note that if no `reference` file is provided in the +launch script parameters, then the BLEU score cannot be calculated. For online inference (using `--socket-id 0` and `--batch-size 1`): @@ -80,7 +82,7 @@ python launch_benchmark.py \ --framework tensorflow \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --checkpoint /home//transformer_lt_fp32_pretrained_model \ --data-location /home//t2t_data \ --model-source-dir /home//tensor2tensor/ \ @@ -97,7 +99,7 @@ python launch_benchmark.py \ --framework tensorflow \ --batch-size 32 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --checkpoint /home//transformer_lt_fp32_pretrained_model \ --data-location /home//t2t_data \ --model-source-dir /home//tensor2tensor/ \ @@ -124,8 +126,6 @@ INFO:tensorflow:Writing decodes into /workspace/models/out_dir/output_infer Inference time 6094.9205, Latency = 2810.0141 ms/setences BLEU_uncased = 22.63 BLEU_cased = 22.20 -lscpu_path_cmd = command -v lscpu -lscpu located here: 
/usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_transformer_language_inference_fp32_20190210_050451.log ``` @@ -140,8 +140,6 @@ INFO:tensorflow:Writing decodes into /workspace/models/out_dir/output_infer Inference time 1174.0522, Throughput = 1.8474 sentences/second BLEU_uncased = 22.63 BLEU_cased = 22.20 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 32 Log location outside container: {--output-dir value}/benchmark_transformer_language_inference_fp32_20190210_072635.log ``` \ No newline at end of file diff --git a/benchmarks/language_translation/tensorflow/transformer_language/inference/fp32/config.json b/benchmarks/language_translation/tensorflow/transformer_language/inference/fp32/config.json new file mode 100644 index 000000000..8ae78e72a --- /dev/null +++ b/benchmarks/language_translation/tensorflow/transformer_language/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,compact,1,0", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/language_translation/tensorflow/transformer_language/inference/fp32/model_init.py b/benchmarks/language_translation/tensorflow/transformer_language/inference/fp32/model_init.py index 20790b541..8d01493ae 100644 --- a/benchmarks/language_translation/tensorflow/transformer_language/inference/fp32/model_init.py +++ b/benchmarks/language_translation/tensorflow/transformer_language/inference/fp32/model_init.py @@ -32,13 +32,14 @@ class ModelInitializer(BaseModelInitializer): def __init__(self, args, custom_args, platform_util=None): super(ModelInitializer, self).__init__(args, custom_args, platform_util) - self.cmd = self.get_numactl_command(self.args.socket_id) + self.cmd = self.get_command_prefix(self.args.socket_id) self.bleu_params = "" self.set_num_inter_intra_threads() - # Set the KMP env vars - self.set_kmp_vars(kmp_blocktime="0", 
kmp_affinity="granularity=fine,compact,1,0") + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) TEMP_DIR = str(self.args.model_source_dir) + "/out_dir" if os.path.exists(TEMP_DIR): @@ -97,14 +98,15 @@ def __init__(self, args, custom_args, platform_util=None): " --output_dir=" + self.args.checkpoint + \ " --decode_from_file=" + self.args.decode_from_file + \ " --decode_to_file=" + self.args.decode_to_file + \ - " --reference=" + self.args.reference + \ " --inter_op_parallelism_threads=" + \ str(self.args.num_inter_threads) + \ " --intra_op_parallelism_threads=" + \ str(self.args.num_intra_threads) - self.bleu_params += " --translation=" + self.args.decode_to_file + \ - " --reference=" + self.args.reference + # If a reference file was provided, also calculate the bleu file + if self.args.reference: + self.bleu_params += " --translation=" + self.args.decode_to_file + \ + " --reference=" + self.args.reference self.cmd = self.cmd + run_script + cmd_args @@ -113,10 +115,12 @@ def run(self): os.chdir(self.args.model_source_dir) self.run_command(self.cmd) - # calculate the bleu number after inference is done - bleucmd = "python " + \ - os.path.join(self.args.model_source_dir, - "tensor2tensor/bin/t2t_bleu.py") + \ - self.bleu_params - os.system(bleucmd) + # calculate the bleu number after inference is done (this is skipped if no reference file is provided) + if self.bleu_params: + bleucmd = "python " + \ + os.path.join(self.args.model_source_dir, + "tensor2tensor/bin/t2t_bleu.py") + \ + self.bleu_params + os.system(bleucmd) + os.chdir(original_dir) diff --git a/benchmarks/language_translation/tensorflow/transformer_lt_official/README.md b/benchmarks/language_translation/tensorflow/transformer_lt_official/README.md index f0d79e4e3..87cc6b472 100644 --- a/benchmarks/language_translation/tensorflow/transformer_lt_official/README.md +++ 
b/benchmarks/language_translation/tensorflow/transformer_lt_official/README.md @@ -65,7 +65,7 @@ python launch_benchmark.py \ --framework tensorflow \ --batch-size 1 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow-models/models \ --in-graph /home//transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \ --data-location /home//transformer_lt_official_fp32_pretrained_model/data \ @@ -85,7 +85,7 @@ python launch_benchmark.py \ --framework tensorflow \ --batch-size 64 \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow-models/models \ --in-graph /home//transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \ --data-location /home//transformer_lt_official_fp32_pretrained_model/data \ diff --git a/benchmarks/language_translation/tensorflow/transformer_lt_official/inference/fp32/config.json b/benchmarks/language_translation/tensorflow/transformer_lt_official/inference/fp32/config.json new file mode 100644 index 000000000..8ae78e72a --- /dev/null +++ b/benchmarks/language_translation/tensorflow/transformer_lt_official/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,compact,1,0", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/language_translation/tensorflow/transformer_lt_official/inference/fp32/model_init.py b/benchmarks/language_translation/tensorflow/transformer_lt_official/inference/fp32/model_init.py index b598191f0..a8b0b9432 100644 --- a/benchmarks/language_translation/tensorflow/transformer_lt_official/inference/fp32/model_init.py +++ b/benchmarks/language_translation/tensorflow/transformer_lt_official/inference/fp32/model_init.py @@ -31,13 +31,14 @@ class 
ModelInitializer(BaseModelInitializer): def __init__(self, args, custom_args, platform_util=None): super(ModelInitializer, self).__init__(args, custom_args, platform_util) - self.cmd = self.get_numactl_command(self.args.socket_id) + self.cmd = self.get_command_prefix(self.args.socket_id) self.bleu_params = "" self.set_num_inter_intra_threads() - # Set the KMP env vars - self.set_kmp_vars(kmp_blocktime="0", kmp_affinity="granularity=fine,compact,1,0") + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) MODEL_EXEC_DIR = str(self.args.model_source_dir) + "/official/transformer/" @@ -91,7 +92,9 @@ def __init__(self, args, custom_args, platform_util=None): if self.args.batch_size != -1 else "1") + \ " --file=" + self.args.decode_from_file + \ " --file_out=" + translate_file + \ - " --vocab_file=" + self.args.vocab_file + " --vocab_file=" + self.args.vocab_file +\ + " --num_inter=" + str(self.args.num_inter_threads) +\ + " --num_intra=" + str(self.args.num_intra_threads) self.bleu_params += " --translation=" + translate_file + \ " --reference=" + self.args.reference diff --git a/benchmarks/launch_benchmark.py b/benchmarks/launch_benchmark.py old mode 100644 new mode 100755 index e3e982e70..6da9d7cb6 --- a/benchmarks/launch_benchmark.py +++ b/benchmarks/launch_benchmark.py @@ -29,7 +29,9 @@ import sys from argparse import ArgumentParser from common import base_benchmark_util -from common.utils.validators import check_no_spaces +from common import platform_util +from common.utils.validators import check_no_spaces, check_volume_mount +from common.base_model_init import BaseModelInitializer class LaunchBenchmark(base_benchmark_util.BaseBenchmarkUtil): @@ -67,6 +69,13 @@ def parse_args(self): "If no docker image is specified, then no docker container will be used.", dest="docker_image", default=None, type=check_no_spaces) + 
arg_parser.add_argument( + "--volume", + help="Specify a custom volume to mount in the container, which follows the same format as the " + "docker --volume flag (https://docs.docker.com/storage/volumes/). " + "This argument can only be used in conjunction with a --docker-image.", + action="append", dest="custom_volumes", type=check_volume_mount) + arg_parser.add_argument( "--debug", help="Launches debug mode which doesn't execute " "start.sh when running in a docker container.", action="store_true") @@ -86,6 +95,17 @@ def validate_args(self): if not self.args.benchmark_only and not self.args.accuracy_only: self.args.benchmark_only = True + # default disable_tcmalloc=False for int8 and disable_tcmalloc=True for other precisions + if not self.args.disable_tcmalloc: + self.args.disable_tcmalloc = str(self.args.precision != "int8") + + if self.args.custom_volumes and not self.args.docker_image: + raise ValueError("Volume mounts can only be used when running in a docker container " + "(a --docker-image must be specified when using --volume).") + + if self.args.mode == "inference" and self.args.checkpoint: + print("Warning: The --checkpoint argument is being deprecated in favor of using frozen graphs.") + def get_model_use_case(self, benchmark_scripts): """ Infers the use case based on the directory structure for the specified model. 
@@ -161,6 +181,8 @@ def get_env_vars(self, benchmark_scripts, use_case, intelai_models): "BENCHMARK_ONLY": args.benchmark_only, "ACCURACY_ONLY": args.accuracy_only, "OUTPUT_RESULTS": args.output_results, + "DISABLE_TCMALLOC": args.disable_tcmalloc, + "TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD": args.tcmalloc_large_alloc_report_threshold, "DOCKER": str(args.docker_image is not None), "PYTHON_EXE": sys.executable if not args.docker_image else "python" } @@ -193,13 +215,66 @@ def run_bare_metal(self, benchmark_scripts, intelai_models, env_var_dict): # setup volume directories to be the local system directories, since we aren't # mounting volumes when running bare metal, but start.sh expects these args args = self.args - mount_benchmark = benchmark_scripts - mount_external_models_source = args.model_source_dir - mount_intelai_models = intelai_models workspace = os.path.join(benchmark_scripts, "common", args.framework) + mount_benchmark = benchmark_scripts in_graph_path = args.input_graph - dataset_path = args.data_location checkpoint_path = args.checkpoint + dataset_path = args.data_location + + # To Launch Tensorflow Serving benchmark we need only --in-graph arg. + # It does not support checkpoint files. 
+ if args.framework == "tensorflow_serving": + if args.docker_image: + raise ValueError("--docker-image arg is not supported with tensorflow serving benchmarking, " + "as script automatically builds image and supplies it.") + + if checkpoint_path: + raise ValueError("--checkpoint-path arg is not supported with tensorflow serving benchmarking") + + if args.mode != "inference": + raise ValueError("--mode arg should be set to inference") + + if in_graph_path: + env_var_dict["IN_GRAPH"] = in_graph_path + else: + raise ValueError("--in-graph arg is required to run tensorflow serving benchmarking") + + for env_var_name in env_var_dict: + os.environ[env_var_name] = str(env_var_dict[env_var_name]) + + # We need this env to be set for the platform util + os.environ["PYTHON_EXE"] = str(sys.executable if not args.docker_image else "python") + + # Get Platformutil + platform_util_obj = None or platform_util.PlatformUtil(self.args) + + # Configure num_inter_threads and num_intra_threads + base_obj = BaseModelInitializer(args=self.args, custom_args=[], platform_util=platform_util_obj) + base_obj.set_num_inter_intra_threads() + + # Update num_inter_threads and num_intra_threads in env dictionary + env_var_dict["NUM_INTER_THREADS"] = self.args.num_inter_threads + env_var_dict["NUM_INTRA_THREADS"] = self.args.num_intra_threads + + # Set OMP_NUM_THREADS + env_var_dict["OMP_NUM_THREADS"] = self.args.num_intra_threads + + else: + mount_external_models_source = args.model_source_dir + mount_intelai_models = intelai_models + + # Add env vars with bare metal settings + env_var_dict["MOUNT_EXTERNAL_MODELS_SOURCE"] = mount_external_models_source + env_var_dict["MOUNT_INTELAI_MODELS_SOURCE"] = mount_intelai_models + + if in_graph_path: + env_var_dict["IN_GRAPH"] = in_graph_path + + if checkpoint_path: + env_var_dict["CHECKPOINT_DIRECTORY"] = checkpoint_path + + if dataset_path: + env_var_dict["DATASET_LOCATION"] = dataset_path # if using the default output directory, get the full path if 
args.output_dir == "/models/benchmarks/common/tensorflow/logs": @@ -208,19 +283,8 @@ def run_bare_metal(self, benchmark_scripts, intelai_models, env_var_dict): # Add env vars with bare metal settings env_var_dict["WORKSPACE"] = workspace env_var_dict["MOUNT_BENCHMARK"] = mount_benchmark - env_var_dict["MOUNT_EXTERNAL_MODELS_SOURCE"] = mount_external_models_source - env_var_dict["MOUNT_INTELAI_MODELS_SOURCE"] = mount_intelai_models env_var_dict["OUTPUT_DIR"] = args.output_dir - if in_graph_path: - env_var_dict["IN_GRAPH"] = in_graph_path - - if checkpoint_path: - env_var_dict["CHECKPOINT_DIRECTORY"] = checkpoint_path - - if dataset_path: - env_var_dict["DATASET_LOCATION"] = dataset_path - # Set env vars for bare metal for env_var_name in env_var_dict: os.environ[env_var_name] = str(env_var_dict[env_var_name]) @@ -307,6 +371,10 @@ def run_docker_container(self, benchmark_scripts, intelai_models, env_var_dict): volume_mounts.extend([ "--volume", "{}:{}".format(in_graph_dir, "/in_graph")]) + if args.custom_volumes: + for custom_volume in args.custom_volumes: + volume_mounts.extend(["--volume", custom_volume]) + docker_run_cmd = ["docker", "run"] # only use -it when debugging, otherwise we might get TTY error diff --git a/benchmarks/object_detection/tensorflow/__init__.py b/benchmarks/object_detection/tensorflow/__init__.py index cf793ec6a..d9c4123de 100644 --- a/benchmarks/object_detection/tensorflow/__init__.py +++ b/benchmarks/object_detection/tensorflow/__init__.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/benchmarks/object_detection/tensorflow/faster_rcnn/README.md b/benchmarks/object_detection/tensorflow/faster_rcnn/README.md index c5a64aecf..ff3dfce3f 100644 --- a/benchmarks/object_detection/tensorflow/faster_rcnn/README.md +++ b/benchmarks/object_detection/tensorflow/faster_rcnn/README.md @@ -44,8 +44,8 @@ sed -i.bak 95s/input_config/input_config[0]/ offline_eval_map_corloc.py ``` -2. Download the 2017 validation -[COCO dataset](http://cocodataset.org/#home) and annotations: +2. Download and unzip the 2017 validation +[COCO dataset](http://cocodataset.org/#home) images: ``` $ mkdir val @@ -53,7 +53,10 @@ $ cd val $ wget http://images.cocodataset.org/zips/val2017.zip $ unzip val2017.zip $ cd .. +``` +3. Download and unzip the coco dataset annotations file: +``` $ mkdir annotations $ cd annotations $ wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip @@ -73,13 +76,15 @@ $ echo "{ \"images\": {}, \"categories\": {}}" > empty.json $ cd .. ``` -3. Now that you have the raw COCO dataset, we need to convert it to the +4. Now that you have the raw COCO dataset and annotations files, we need to convert it to the TF records format in order to use it with the inference script. We will do this by running the `create_coco_tf_record.py` file in the TensorFlow models repo. +Follow [instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md#dependencies) to install the required dependencies (`cocoapi` and `Protobuf 3.0.0`). Follow the steps below to navigate to the proper directory and point the -script to the raw COCO dataset files that you have downloaded in step 2. +script to the raw COCO dataset files that you have downloaded in step 2 +and the annotations files that you downloaded and created in step 3. The `--output_dir` is the location where the TF record files will be located after the script has completed. 
@@ -112,13 +117,13 @@ $ git checkout master The `coco_val.record` file is what we will use in this inference example. -4. Download and extract the pre-trained model. +5. Download and extract the pre-trained model. ``` $ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/faster_rcnn_resnet50_fp32_coco_pretrained_model.tar.gz $ tar -xzvf faster_rcnn_resnet50_fp32_coco_pretrained_model.tar.gz ``` -5. Clone the [intelai/models](https://github.com/intelai/models) repo. +6. Clone the [intelai/models](https://github.com/intelai/models) repo. This repo has the launch script for running the model. ``` @@ -132,10 +137,10 @@ Receiving objects: 100% (11/11), done. Resolving deltas: 100% (3/3), done. ``` -6. Run the `launch_benchmark.py` script from the intelai/models repo +7. Run the `launch_benchmark.py` script from the intelai/models repo , with the appropriate parameters including: the -`coco_val.record` data location (from step 3), the pre-trained model -`pipeline.config` file and the checkpoint location (from step 4, and the +`coco_val.record` data location (from step 4), the pre-trained model +`pipeline.config` file and the checkpoint location (from step 5), and the location of your `tensorflow/models` clone (from step 1). 
Run for batch and online inference: @@ -151,27 +156,27 @@ $ python launch_benchmark.py \ --mode inference \ --socket-id 0 \ --checkpoint /home//faster_rcnn_resnet50_fp32_coco \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ -- config_file=pipeline.config ``` Or for accuracy where the `--data-location` is the path the directory where your `coco_val.record` file is located and the `--in-graph` is -the pre-trained graph located in the pre-trained model directory (from step 4): +the pre-trained graph located in the pre-trained model directory (from step 5): ``` python launch_benchmark.py \ --model-name faster_rcnn \ --mode inference \ --precision fp32 \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ --data-location /home//coco/output \ --in-graph /home//faster_rcnn_resnet50_fp32_coco/frozen_inference_graph.pb \ --accuracy-only ``` -7. The log file is saved to the value of `--output-dir`. +8. The log file is saved to the value of `--output-dir`. Below is a sample log file tail when running for batch and online inference: @@ -179,8 +184,6 @@ and online inference: ``` Time spent : 167.353 seconds. Time spent per BATCH: 0.167 seconds. 
-lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Received these standard args: Namespace(accuracy_only=False, batch_size=1, benchmark_only=False, checkpoint='/checkpoints', data_location='/dataset', framework='tensorflow', input_graph=None, intelai_models='/workspace/intelai_models', mode='inference', model_args=[], model_name='faster_rcnn', model_source_dir='/workspace/models', num_cores=-1, num_inter_threads=2, num_intra_threads=56, precision='fp32', socket_id=0, use_case='object_detection', verbose=True) Received these custom args: ['--config_file=pipeline.config'] Run model here. @@ -208,15 +211,24 @@ DONE (t=1.35s). Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.383 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_faster_rcnn_inference_fp32_20190114_205714.log ``` ## Int8 Inference Instructions -1. Please follow step 1, 2 and 3 of Faster R-CNN FP32 instructions written above. +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + +1. Please follow the steps from the +[Faster R-CNN FP32 instructions](#fp32-inference-instructions) written +above for cloning dependency repositories and getting the coco dataset: +* Performance benchmarking uses the raw coco dataset images. Follow steps +1 and 2 from the FP32 instructions. +* Accuracy testing requires the coco dataset to be in the TF records +format. Follow steps 1, 2, 3, and 4 from the FP32 instructions. 2. Download the pre-trained model. 
``` @@ -242,12 +254,15 @@ with the appropriate parameters. To run on single socket use `--socket_id` switc by default it will be using all available sockets. Optional parameter `number_of_steps` (default value = 5000) can be added at the end of command after `--` as shown below: -Run for batch and online inference: +Run batch and online inference using the following command. +The `--data-location` is the path to the directory that contains the raw coco dataset +validation images which you downloaded and unzipped: + ``` $ cd /home//models/benchmarks $ python launch_benchmark.py \ - --data-location /home//coco/output/ \ + --data-location /home//val2017 \ --model-source-dir /home//tensorflow/models \ --model-name faster_rcnn \ --framework tensorflow \ @@ -255,7 +270,7 @@ $ python launch_benchmark.py \ --mode inference \ --socket-id 0 \ --in-graph /home//faster_rcnn_int8_pretrained_model.pb \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --benchmark-only \ -- number_of_steps=5000 ``` @@ -270,19 +285,13 @@ python launch_benchmark.py \ --precision int8 \ --framework tensorflow \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ - --data-location /home//coco_dataset/coco_val.record \ + --data-location /home//output/coco_val.record \ --in-graph /home//faster_rcnn_int8_pretrained_model.pb \ --accuracy-only ``` -The docker image (`intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl`) -used in the commands above were built using -[TensorFlow](git@github.com:tensorflow/tensorflow.git) master -([e889ea1](https://github.com/tensorflow/tensorflow/commit/e889ea1dd965c31c391106aa3518fc23d2689954)) and -[PR #25765](https://github.com/tensorflow/tensorflow/pull/25765). - 5. The log file is saved to the value of `--output-dir`. 
Below is a sample log file tail when running for batch @@ -295,8 +304,6 @@ Step 4970: 0.070191860199 seconds Step 4980: 0.0755469799042 seconds Step 4990: 0.0742928981781 seconds Avg. Duration per Step:0.0760930150986 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size -1 Log location outside container: {--output-dir value}/benchmark_faster_rcnn_inference_int8_20190117_232539.log ``` @@ -317,8 +324,6 @@ DONE (t=1.34s). Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.375 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size -1 Log location outside container: {--output-dir value}/benchmark_faster_rcnn_inference_int8_20190117_231937.log ``` diff --git a/benchmarks/object_detection/tensorflow/faster_rcnn/inference/fp32/config.json b/benchmarks/object_detection/tensorflow/faster_rcnn/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/faster_rcnn/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/object_detection/tensorflow/faster_rcnn/inference/fp32/model_init.py b/benchmarks/object_detection/tensorflow/faster_rcnn/inference/fp32/model_init.py index 3e0167f75..c30f39ada 100644 --- a/benchmarks/object_detection/tensorflow/faster_rcnn/inference/fp32/model_init.py +++ b/benchmarks/object_detection/tensorflow/faster_rcnn/inference/fp32/model_init.py @@ -43,7 +43,8 @@ def __init__(self, args, custom_args, platform_util=None): self.set_num_inter_intra_threads() # Set KMP env vars, if they haven't already been set - self.set_kmp_vars() + config_file_path = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) @@ -64,7 +65,7 @@ def __init__(self, args, custom_args, platform_util=None): self.args.intelai_models, self.args.mode, self.args.precision, "eval.py") self.command_prefix = \ - self.get_numactl_command(self.args.socket_id) + self.python_exe + " " + \ + self.get_command_prefix(self.args.socket_id) + self.python_exe + " " + \ benchmark_script config_file_path = os.path.join(self.args.checkpoint, diff --git a/benchmarks/object_detection/tensorflow/faster_rcnn/inference/int8/config.json b/benchmarks/object_detection/tensorflow/faster_rcnn/inference/int8/config.json new file mode 100644 index 000000000..6f1228ba7 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/faster_rcnn/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/object_detection/tensorflow/faster_rcnn/inference/int8/model_init.py b/benchmarks/object_detection/tensorflow/faster_rcnn/inference/int8/model_init.py index 749026f3c..37eaf2722 100644 --- a/benchmarks/object_detection/tensorflow/faster_rcnn/inference/int8/model_init.py +++ b/benchmarks/object_detection/tensorflow/faster_rcnn/inference/int8/model_init.py @@ -41,8 +41,9 @@ def __init__(self, args, custom_args=[], platform_util=None): self.args.intelai_models, self.args.mode, self.args.precision, self.RFCN_ACCURACY_SCRIPT) - # Set KMP env vars, except override the default KMP_BLOCKTIME value - self.set_kmp_vars(kmp_blocktime="0") + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) self.validate_args() @@ -82,7 +83,7 @@ def parse_args(self): def run_perf_command(self): 
set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) self.parse_args() - command = self.get_numactl_command(self.args.socket_id) + command = self.get_command_prefix(self.args.socket_id) command += " {} ".format(self.python_exe) + self.perf_script_path command += " -g " + self.args.input_graph if self.custom_args: diff --git a/benchmarks/object_detection/tensorflow/rfcn/README.md b/benchmarks/object_detection/tensorflow/rfcn/README.md index 02db3210a..6e4a519df 100644 --- a/benchmarks/object_detection/tensorflow/rfcn/README.md +++ b/benchmarks/object_detection/tensorflow/rfcn/README.md @@ -10,6 +10,11 @@ for other precisions are coming later. ## Int8 Inference Instructions +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + 1. Clone the [tensorflow/models](https://github.com/tensorflow/models) and [cocodataset/cocoapi](https://github.com/cocodataset/cocoapi) repositories: ``` @@ -44,7 +49,7 @@ sed -i.bak 95s/input_config/input_config[0]/ offline_eval_map_corloc.py ``` -2. Download the 2017 validation +2. Download the 2017 validation [COCO dataset](http://cocodataset.org/#home) and annotations: ``` @@ -78,6 +83,7 @@ TF records format in order to use it with the inference script. We will do this by running the `create_coco_tf_record.py` file in the TensorFlow models repo. +Follow [instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md#dependencies) to install the required dependencies (`cocoapi` and `Protobuf 3.0.0`). Follow the steps below to navigate to the proper directory and point the script to the raw COCO dataset files that you have downloaded in step 2. 
The `--output_dir` is the location where the TF record files will be @@ -133,7 +139,7 @@ python launch_benchmark.py \ --mode inference \ --precision int8 \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ --data-location /home//val/val2017 \ --in-graph /home//rfcn_resnet101_int8_coco_pretrained_model.pb \ @@ -150,7 +156,7 @@ python launch_benchmark.py \ --mode inference \ --precision int8 \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ --data-location /home//coco/output/coco_val.record-00000-of-00001 \ --in-graph /home//rfcn_resnet101_int8_coco_pretrained_model.pb \ @@ -158,12 +164,6 @@ python launch_benchmark.py \ -- split="accuracy_message" ``` -The docker image (`intelaipg/intel-optimized-tensorflow:PR25765-devel-mkl`) -used in the commands above were built using -[TensorFlow](git@github.com:tensorflow/tensorflow.git) master -([e889ea1](https://github.com/tensorflow/tensorflow/commit/e889ea1dd965c31c391106aa3518fc23d2689954)) and -[PR #25765](https://github.com/tensorflow/tensorflow/pull/25765). - Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands to get additional debug output or change the default output location. @@ -173,18 +173,16 @@ to get additional debug output or change the default output location. Below is a sample log file tail when running for batch and online inference: ``` -Step 0: 10.6923000813 seconds -Step 10: 0.168856859207 seconds +Step 0: 11.4450089931 seconds +Step 10: 0.25656080246 seconds ... -Step 460: 0.181148052216 seconds -Step 470: 0.202737092972 seconds -Step 480: 0.117042064667 seconds -Step 490: 0.103501081467 seconds -Avg. 
Duration per Step:0.169812122345 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu +Step 460: 0.256786823273 seconds +Step 470: 0.267828941345 seconds +Step 480: 0.141321897507 seconds +Step 490: 0.127830982208 seconds +Avg. Duration per Step:0.195356227875 Ran inference with batch size -1 -Log location outside container: {--output-dir}/benchmark_rfcn_inference_int8_20190227_191959.log +Log location outside container: {--output-dir}/benchmark_rfcn_inference_int8_20190416_182445.log ``` And here is a sample log file tail when running for accuracy: @@ -204,8 +202,6 @@ DONE (t=1.03s). Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.150 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size -1 Log location outside container: {--output-dir}/benchmark_rfcn_inference_int8_20190227_194752.log ``` @@ -225,7 +221,7 @@ $ git clone https://github.com/cocodataset/cocoapi.git The TensorFlow models repo will be used for running inference as well as converting the coco dataset to the TF records format. -2. Download the 2017 validation +2. Download the 2017 validation [COCO dataset](http://cocodataset.org/#home) and annotations: ``` @@ -259,6 +255,7 @@ TF records format in order to use it with the inference script. We will do this by running the `create_coco_tf_record.py` file in the TensorFlow models repo. +Follow [instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md#dependencies) to install the required dependencies (`cocoapi` and `Protobuf 3.0.0`). Follow the steps below to navigate to the proper directory and point the script to the raw COCO dataset files that you have downloaded in step 2. 
The `--output_dir` is the location where the TF record files will be @@ -334,7 +331,7 @@ $ python launch_benchmark.py \ --mode inference \ --socket-id 0 \ --checkpoint /home//rfcn_resnet101_fp32_coco \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ -- config_file=rfcn_pipeline.config ``` @@ -347,7 +344,7 @@ python launch_benchmark.py \ --mode inference \ --precision fp32 \ --framework tensorflow \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ --data-location /home//coco/output/coco_val.record \ --in-graph /home//rfcn_resnet101_fp32_coco/frozen_inference_graph.pb \ @@ -363,8 +360,6 @@ online inference: ``` Average time per step: 0.262 sec -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Received these standard args: Namespace(accuracy_only=False, batch_size=1, benchmark_only=False, checkpoint='/checkpoints', data_location='/dataset', framework='tensorflow', input_graph=None, intelai_models='/workspace/intelai_models', mode='inference', model_args=[], model_name='rfcn', model_source_dir='/workspace/models', num_cores=-1, num_inter_threads=2, num_intra_threads=56, precision='fp32, socket_id=0, use_case='object_detection', verbose=True) Received these custom args: ['--config_file=rfcn_pipeline.config'] Run model here. @@ -391,8 +386,7 @@ DONE (t=1.19s). 
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.400 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.400 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu -Ran inference with batch size 1 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000 + Ran inference with batch size 1 Log location outside container: {--output-dir value}/benchmark_rfcn_inference_fp32_20181221_211905.log ``` diff --git a/benchmarks/object_detection/tensorflow/rfcn/inference/fp32/config.json b/benchmarks/object_detection/tensorflow/rfcn/inference/fp32/config.json new file mode 100644 index 000000000..d7f51a4c2 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/rfcn/inference/fp32/config.json @@ -0,0 +1,6 @@ +{ + "optimization_parameters": { + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/object_detection/tensorflow/rfcn/inference/fp32/model_init.py b/benchmarks/object_detection/tensorflow/rfcn/inference/fp32/model_init.py index 712da5777..031c0f2ca 100644 --- a/benchmarks/object_detection/tensorflow/rfcn/inference/fp32/model_init.py +++ b/benchmarks/object_detection/tensorflow/rfcn/inference/fp32/model_init.py @@ -45,8 +45,9 @@ def __init__(self, args, custom_args, platform_util): self.args.intelai_models, self.args.mode, self.args.precision, "eval.py") - # Set KMP env vars, except override the default KMP_BLOCKTIME and KMP_AFFINITY values - self.set_kmp_vars(kmp_blocktime="0", kmp_affinity=None) + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) self.run_inference_sanity_checks(self.args, self.custom_args) self.parse_custom_args() @@ -54,7 +55,7 @@ def __init__(self, args, custom_args, 
platform_util): "research") def run_benchmark(self): - command_prefix = self.get_numactl_command(self.args.socket_id) + \ + command_prefix = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " + self.benchmark_script # set num_inter_threads and num_intra_threads diff --git a/benchmarks/object_detection/tensorflow/rfcn/inference/int8/config.json b/benchmarks/object_detection/tensorflow/rfcn/inference/int8/config.json new file mode 100644 index 000000000..6f1228ba7 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/rfcn/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/object_detection/tensorflow/rfcn/inference/int8/model_init.py b/benchmarks/object_detection/tensorflow/rfcn/inference/int8/model_init.py index eec69455d..f52eed9b4 100755 --- a/benchmarks/object_detection/tensorflow/rfcn/inference/int8/model_init.py +++ b/benchmarks/object_detection/tensorflow/rfcn/inference/int8/model_init.py @@ -54,8 +54,9 @@ def __init__(self, args, custom_args=[], platform_util=None): self.parse_args() - # Set KMP env vars with defaults, except for KMP_BLOCKTIME - self.set_kmp_vars(kmp_blocktime=0) + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # Set num_inter_threads and num_intra_threads self.set_num_inter_intra_threads() @@ -110,6 +111,8 @@ def validate_args(self): format(self.args.model_source_dir)) def run_perf_command(self): + # Get the command prefix, but numactl is added later in run_perf_command() + self.command.append(self.get_command_prefix(self.args.socket_id, numactl=False)) num_cores = str(self.platform_util.num_cores_per_socket) if self.args.num_cores != -1: num_cores = str(self.args.num_cores) @@ -157,7 +160,8 @@ def run_perf_command(self): def 
run_accuracy_command(self): # already validated by parent - self.command = "FROZEN_GRAPH=" + self.args.input_graph + self.command = self.get_command_prefix(self.args.socket_id, numactl=False) + self.command += "FROZEN_GRAPH=" + self.args.input_graph if self.args.data_location and os.path.exists( self.args.data_location): diff --git a/benchmarks/object_detection/tensorflow/rfcn/requirements.txt b/benchmarks/object_detection/tensorflow/rfcn/requirements.txt index 92d9e0ba5..3ebb25335 100644 --- a/benchmarks/object_detection/tensorflow/rfcn/requirements.txt +++ b/benchmarks/object_detection/tensorflow/rfcn/requirements.txt @@ -1,6 +1,6 @@ Cython contextlib2 -pillow +pillow==5.3.0 lxml jupyter matplotlib diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md b/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md index 8dc015d61..c6400197c 100644 --- a/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md @@ -10,6 +10,11 @@ for other precisions are coming later. ## Int8 Inference Instructions +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + 1. Clone the [tensorflow/models](https://github.com/tensorflow/models) repository at the specified SHA and clone the [cocoapi repo](git clone https://github.com/cocodataset/cocoapi.git) in @@ -61,6 +66,7 @@ TF records format in order to use it with the inference script. We will do this by running the `create_coco_tf_record.py` file in the TensorFlow models repo. +Follow [instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md#dependencies) to install the required dependencies (`cocoapi` and `Protobuf 3.0.0`). 
Follow the steps below to navigate to the proper directory and point the script to the raw COCO dataset files that you have downloaded in step 2. The `--output_dir` is the location where the TF record files will be @@ -115,7 +121,7 @@ python launch_benchmark.py \ --precision int8 \ --framework tensorflow \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-prs-b5d67b7-avx2-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ --data-location /home//val/val2017 \ --in-graph /home//ssdmobilenet_int8_pretrained_model.pb \ @@ -123,8 +129,8 @@ python launch_benchmark.py \ --batch-size 1 ``` -Or for accuracy where the `--data-location` is the path the directory -where your `coco_val.record` file is located: +Or for accuracy where the `--data-location` is the path to +the tf record file that you generated in step 2: ``` python launch_benchmark.py \ --model-name ssd-mobilenet \ @@ -132,19 +138,14 @@ python launch_benchmark.py \ --precision int8 \ --framework tensorflow \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-prs-b5d67b7-avx2-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//tensorflow/models \ - --data-location /home//coco/output \ + --data-location /home//coco/output/coco_val.record \ --in-graph /home//ssdmobilenet_int8_pretrained_model.pb \ --accuracy-only \ --batch-size 1 ``` -Note that it is required to use the docker image specified in the -commands above (`intelaipg/intel-optimized-tensorflow:latest-prs-b5d67b7`) -to run SSD-MobileNet Int8, as it includes PRs that are required to run -this model. - Note that the `--verbose` or `--output-dir` flag can be added to any of the above commands to get additional debug output or change the default output location. 
@@ -154,15 +155,13 @@ Below is a sample log file tail when running for batch and online inference: ``` -Step 4970: 0.0340421199799 seconds -Step 4980: 0.0429329872131 seconds -Step 4990: 0.0358219146729 seconds -Avg. Duration per Step:0.0364457404137 -Avg. Duration per Step:0.0365921088491 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu +Step 4970: 0.0305020809174 seconds +Step 4980: 0.0294089317322 seconds +Step 4990: 0.0301029682159 seconds +Avg. Duration per Step:0.0300041775227 +Avg. Duration per Step:0.0301246762276 Ran inference with batch size 1 -Log location outside container: /benchmark_ssd-mobilenet_inference_int8_20181203_232524.log +Log location outside container: /benchmark_ssd-mobilenet_inference_int8_20190417_175418.log ``` And here is a sample log file tail when running for accuracy: @@ -185,8 +184,6 @@ DONE (t=1.10s). Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.212 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size 1 Log location outside container: /benchmark_ssd-mobilenet_inference_int8_20181204_185432.log ``` @@ -245,6 +242,7 @@ TF records format in order to use it with the inference script. We will do this by running the `create_coco_tf_record.py` file in the TensorFlow models repo. +Follow [instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md#dependencies) to install the required dependencies (`cocoapi` and `Protobuf 3.0.0`). Follow the steps below to navigate to the proper directory and point the script to the raw COCO dataset files that you have downloaded in step 2. 
The `--output_dir` is the location where the TF record files will be @@ -351,7 +349,7 @@ $ python launch_benchmark.py \ --precision fp32 \ --mode inference \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:1.12.0-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --benchmark-only ``` @@ -370,7 +368,7 @@ $ python launch_benchmark.py \ --precision fp32 \ --mode inference \ --socket-id 0 \ - --docker-image intelaipg/intel-optimized-tensorflow:1.12.0-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --accuracy-only ``` @@ -382,8 +380,6 @@ Below is a sample log file tail when running for performance: INFO:tensorflow:Processed 5001 images... moving average latency 37 ms INFO:tensorflow:Finished processing records Latency: min = 33.8, max = 6635.9, mean= 38.4, median = 37.2 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size -1 Log location outside container: {--output-dir value}/benchmark_ssd-mobilenet_inference_fp32_20190130_225108.log ``` @@ -403,8 +399,6 @@ Below is a sample log file tail when testing accuracy: Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.264 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size -1 Log location outside container: {--output-dir value}/benchmark_ssd-mobilenet_inference_fp32_20190123_225145.log ``` diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/config.json b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/config.json new file mode 100644 index 000000000..6f1228ba7 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": 
"granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py index 379e47c67..927f73048 100644 --- a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/model_init.py @@ -44,8 +44,9 @@ def __init__(self, args, custom_args, platform_util): self.run_inference_sanity_checks(self.args, self.custom_args) self.research_dir = os.path.join(args.model_source_dir, "research") - # Set KMP env vars, except override the default KMP_BLOCKTIME value - self.set_kmp_vars(kmp_blocktime="0") + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # set num_inter_threads and num_intra_threads (override inter threads to 2) self.set_num_inter_intra_threads(num_inter_threads=2) @@ -67,7 +68,7 @@ def __init__(self, args, custom_args, platform_util): self.args.precision, "infer_detections.py") # get command with numactl - self.run_cmd = self.get_numactl_command( + self.run_cmd = self.get_command_prefix( self.args.socket_id) + "{} {}".format(self.python_exe, benchmark_script) output_tf_record_path = os.path.join(os.path.dirname( diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/config.json b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git 
a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/model_init.py b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/model_init.py index 4fdfb3a06..28522ada4 100644 --- a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/model_init.py +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/model_init.py @@ -31,7 +31,10 @@ class ModelInitializer(BaseModelInitializer): def __init__(self, args, custom_args=[], platform_util=None): super(ModelInitializer, self).__init__(args, custom_args, platform_util) - self.set_kmp_vars(kmp_blocktime="0") + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # set num_inter_threads and num_intra_threads (override inter threads to 2) self.set_num_inter_intra_threads(num_inter_threads=2) @@ -49,7 +52,7 @@ def __init__(self, args, custom_args=[], platform_util=None): benchmark_script = os.path.join( self.args.intelai_models, self.args.mode, self.args.precision, "run_frozen_graph_ssdmob.py") - self.command_prefix = self.get_numactl_command(self.args.socket_id) + \ + self.command_prefix = self.get_command_prefix(self.args.socket_id) + \ "{} {}".format(self.python_exe, benchmark_script) set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) @@ -64,7 +67,7 @@ def __init__(self, args, custom_args=[], platform_util=None): accuracy_script = os.path.join( self.args.intelai_models, self.args.mode, self.args.precision, "coco_int8.sh") - self.command_prefix = "sh {} {} {}/coco_val.record".format( + self.command_prefix = "sh {} {} {}".format( accuracy_script, self.args.input_graph, self.args.data_location) diff --git a/benchmarks/object_detection/tensorflow/ssd-resnet34/README.md b/benchmarks/object_detection/tensorflow/ssd-resnet34/README.md index 4171e1984..e7b3528fb 100644 --- 
a/benchmarks/object_detection/tensorflow/ssd-resnet34/README.md +++ b/benchmarks/object_detection/tensorflow/ssd-resnet34/README.md @@ -3,6 +3,7 @@ This document has instructions for how to run SSD-ResNet34 for the following modes/precisions: * [FP32 inference](#fp32-inference-instructions) +* [INT8 inference](#int8-inference-instructions) Instructions and scripts for model training and inference for other precisions are coming later. @@ -61,6 +62,7 @@ TF records format in order to use it with the inference script. We will do this by running the `create_coco_tf_record.py` file in the TensorFlow models repo. +Follow [instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md#dependencies) to install the required dependencies (`cocoapi` and `Protobuf 3.0.0`). Follow the steps below to navigate to the proper directory and point the script to the raw COCO dataset files that you have downloaded in step 2. The `--output_dir` is the location where the TF record files will be @@ -95,7 +97,11 @@ $ git checkout f505cecde2d8ebf6fe15f40fb8bc350b2b1ed5dc The `coco_val.record` file is what we will use in this inference example. -5. A link to download the pre-trained model is coming soon. +5. Download the pretrained model: + +``` +$ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/ssd_resnet34_fp32_bs1_pretrained_model.pb +``` 6. Clone the [intelai/models](https://github.com/intelai/models) repo. This repo has the launch script for running the model, which we will @@ -109,20 +115,18 @@ $ git clone https://github.com/IntelAI/models.git [intelai/models](https://github.com/intelai/models) repo that was just cloned in the previous step. SSD-ResNet34 can be run for batch and online inference, or accuracy. Note that we are running -SSD-ResNet34 with a TensorFlow 1.13 docker image. +SSD-ResNet34 with a TensorFlow 1.14 docker image. 
To run for batch and online inference, use the following command, -but replace in your path to the unzipped coco dataset images from step 3 -for the `--dataset-location`, the path to the frozen graph that you -downloaded in step 5 as the `--in-graph`, and use the `--benchmark-only` +the path to the frozen graph that you downloaded in step 5 as +the `--in-graph`, and use the `--benchmark-only` flag: ``` $ cd /home//models/benchmarks $ python launch_benchmark.py \ - --data-location /home//coco/output/ \ - --in-graph /home//ssd_resnet34_coco_pretained_model/ssd_resnet34_bs1.pb \ + --in-graph /home//ssd_resnet34_fp32_bs1_pretrained_model.pb \ --model-source-dir /home//tensorflow/models \ --model-name ssd-resnet34 \ --framework tensorflow \ @@ -130,7 +134,7 @@ $ python launch_benchmark.py \ --mode inference \ --socket-id 0 \ --batch-size=1 \ - --docker-image intelaipg/intel-optimized-tensorflow:1.13.1-devel-mkl-py3 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --benchmark-only ``` @@ -142,7 +146,7 @@ the path to the frozen graph that you downloaded in step 5 as the ``` $ python launch_benchmark.py \ --data-location /home//coco/output/ \ - --in-graph /home//ssd_resnet34_coco_pretained_model/ssd_resnet34_bs1.pb \ + --in-graph /home//ssd_resnet34_fp32_bs1_pretrained_model.pb \ --model-source-dir /home//tensorflow/models \ --model-name ssd-resnet34 \ --framework tensorflow \ @@ -150,7 +154,7 @@ $ python launch_benchmark.py \ --mode inference \ --socket-id 0 \ --batch-size=1 \ - --docker-image intelaipg/intel-optimized-tensorflow:1.13.1-devel-mkl-py3 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --accuracy-only ``` @@ -180,8 +184,180 @@ Below is a sample log file tail when testing accuracy: Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.334 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.494 Current AP: 0.21082 -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu 
-Ran inference with batch size -1 -Log location outside container: {--output-dir value}/benchmark_ssd-mobilenet_inference_fp32_20190123_225145.log +``` + +## INT8 Inference Instructions + +1. Clone the `tensorflow/models` repository with the specified SHA, +since we are using an older version of the models repo for +SSD-ResNet34. + +``` +$ git clone https://github.com/tensorflow/models.git +$ cd models +$ git checkout f505cecde2d8ebf6fe15f40fb8bc350b2b1ed5dc +$ git clone https://github.com/cocodataset/cocoapi.git +``` + +The TensorFlow models repo will be used for running inference as well as +converting the coco dataset to the TF records format. + +2. Follow the TensorFlow models object detection +[installation instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md#installation) +to get your environment setup with the required dependencies. + +3. Download the 2017 validation +[COCO dataset](http://cocodataset.org/#home) and annotations: + +``` +$ mkdir val +$ cd val +$ wget http://images.cocodataset.org/zips/val2017.zip +$ unzip val2017.zip +$ cd .. + +$ mkdir annotations +$ cd annotations +$ wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip +$ unzip annotations_trainval2017.zip +$ cd .. +``` + +Since we are only using the validation dataset in this example, we will +create an empty directory and empty annotations json file to pass as the +train and test directories in the next step. + +``` +$ mkdir empty_dir + +$ cd annotations +$ echo "{ \"images\": {}, \"categories\": {}}" > empty.json +$ cd .. +``` + +4. Now that you have the raw COCO dataset, we need to convert it to the +TF records format in order to use it with the inference script. We will +do this by running the `create_coco_tf_record.py` file in the TensorFlow +models repo. + +Follow the steps below to navigate to the proper directory and point the +script to the raw COCO dataset files that you have downloaded in step 2. 
+The `--output_dir` is the location where the TF record files will be +located after the script has completed. + +``` + +# We are going to use an older version of the conversion script, so check out that git commit +$ cd models +$ git checkout 7a9934df2afdf95be9405b4e9f1f2480d748dc40 + +$ cd research/object_detection/dataset_tools/ +$ python create_coco_tf_record.py --logtostderr \ + --train_image_dir="/home//coco/empty_dir" \ + --val_image_dir="/home//coco/val/val2017" \ + --test_image_dir="/home//coco/empty_dir" \ + --train_annotations_file="/home//coco/annotations/empty.json" \ + --val_annotations_file="/home//coco/annotations/instances_val2017.json" \ + --testdev_annotations_file="/home//coco/annotations/empty.json" \ + --output_dir="/home//coco/output" + +$ ll /home//coco/output +total 1598276 +-rw-rw-r--. 1 0 Nov 2 21:46 coco_testdev.record +-rw-rw-r--. 1 0 Nov 2 21:46 coco_train.record +-rw-rw-r--. 1 818336740 Nov 2 21:46 coco_val.record + +# Go back to the main models directory and checkout the SHA that we are using for SSD-ResNet34 +$ cd /home//models +$ git checkout f505cecde2d8ebf6fe15f40fb8bc350b2b1ed5dc +``` + +The `coco_val.record` file is what we will use in this inference example. + +5. Download the pretrained model: + +``` +$ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/ssd_resnet34_int8_bs1_pretrained_model.pb +``` + +6. Clone the [intelai/models](https://github.com/intelai/models) repo. +This repo has the launch script for running the model, which we will +use in the next step. + +``` +$ git clone https://github.com/IntelAI/models.git +``` + +7. Next, navigate to the `benchmarks` directory of the +[intelai/models](https://github.com/intelai/models) repo that was just +cloned in the previous step. SSD-ResNet34 can be run for testing batch or online inference, or testing accuracy. Note that we are running +SSD-ResNet34 with a TensorFlow 1.14 docker image. 
+ +To run for batch and online inference, use the following command, +with the path to the frozen graph that you downloaded in step 5 as +the `--in-graph`, and use the `--benchmark-only` +flag: + +``` +$ cd /home//models/benchmarks + +$ python launch_benchmark.py \ + --in-graph /home//ssd_resnet34_int8_bs1_pretrained_model.pb \ + --model-source-dir /home//tensorflow/models \ + --model-name ssd-resnet34 \ + --framework tensorflow \ + --precision int8 \ + --mode inference \ + --socket-id 0 \ + --batch-size=1 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --benchmark-only +``` + +To test accuracy, use the following command but replace in your path to +the tf record file that you generated in step 4 for the `--data-location`, +the path to the frozen graph that you downloaded in step 5 as the +`--in-graph`, and use the `--accuracy-only` flag: + +``` +$ python launch_benchmark.py \ + --data-location /home//coco/output/ \ + --in-graph /home//ssd_resnet34_int8_bs1_pretrained_model.pb \ + --model-source-dir /home//tensorflow/models \ + --model-name ssd-resnet34 \ + --framework tensorflow \ + --precision int8 \ + --mode inference \ + --socket-id 0 \ + --batch-size=1 \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --accuracy-only +``` + +8. The log file is saved to the value of `--output-dir`. 
+ +Below is a sample log file tail when testing performance: + +``` +Batchsize: 1 +Time spent per BATCH: 12.0245 ms +Total samples/sec: 83.1635 samples/s +``` + +Below is a sample log file tail when testing accuracy: + +``` + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.204 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.360 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.208 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.051 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.213 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.335 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.210 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.294 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.301 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.083 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.327 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.484 +Current AP: 0.20408 ``` diff --git a/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/fp32/config.json b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/fp32/model_init.py b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/fp32/model_init.py index 0e6657a11..0b53a0112 100644 --- a/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/fp32/model_init.py +++ 
b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/fp32/model_init.py @@ -42,7 +42,11 @@ def __init__(self, args, custom_args, platform_util): super(ModelInitializer, self).__init__(args, custom_args, platform_util) self.run_inference_sanity_checks(self.args, self.custom_args) - self.set_kmp_vars() + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + self.set_num_inter_intra_threads() set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) @@ -53,7 +57,7 @@ def __init__(self, args, custom_args, platform_util): benchmark_script = os.path.join(self.model_dir, "infer_detections.py") # get command with numactl - self.run_cmd = self.get_numactl_command(self.args.socket_id) + self.run_cmd = self.get_command_prefix(self.args.socket_id) self.run_cmd += "{0} {1}".format(self.python_exe, benchmark_script) self.run_cmd += " --input-graph {0}".format(self.args.input_graph) self.run_cmd += " --batch-size {0}".format(args.batch_size) @@ -65,8 +69,8 @@ def __init__(self, args, custom_args, platform_util): self.run_cmd += " --data-location {0}".format(self.args.data_location) def run(self): - print(self.run_cmd) old_python_path = os.environ["PYTHONPATH"] os.environ["PYTHONPATH"] = os.path.join(self.args.model_source_dir, "research") + os.environ["PYTHONPATH"] += ":/tmp/benchmarks/scripts/tf_cnn_benchmarks/" self.run_command(self.run_cmd) os.environ["PYTHONPATH"] = old_python_path diff --git a/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/__init__.py b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 
2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/config.json b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/model_init.py b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/model_init.py new file mode 100644 index 000000000..0b53a0112 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd-resnet34/inference/int8/model_init.py @@ -0,0 +1,76 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# + +import os +import sys + +from common.base_model_init import BaseModelInitializer +from common.base_model_init import set_env_var + + +class ModelInitializer(BaseModelInitializer): + def run_inference_sanity_checks(self, args, custom_args): + if not args.input_graph: + sys.exit("Please provide a path to the frozen graph directory" + " via the '--in-graph' flag.") + if not args.data_location and self.args.accuracy_only: + sys.exit("Please provide a path to the data directory via the " + "'--data-location' flag.") + if args.socket_id == -1 and args.num_cores == -1: + print("***Warning***: Running inference on all cores could degrade" + " performance. Pass a '--socket-id' to specify running on a" + " single socket instead.\n") + + def __init__(self, args, custom_args, platform_util): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) + + self.run_inference_sanity_checks(self.args, self.custom_args) + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + + self.set_num_inter_intra_threads() + + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + + self.model_dir = os.path.join(self.args.intelai_models, self.args.mode, self.args.precision) + + # get benchmark command + benchmark_script = os.path.join(self.model_dir, "infer_detections.py") + + # get command with numactl + self.run_cmd = self.get_command_prefix(self.args.socket_id) + self.run_cmd += "{0} {1}".format(self.python_exe, benchmark_script) + self.run_cmd += " --input-graph {0}".format(self.args.input_graph) + self.run_cmd += " --batch-size {0}".format(args.batch_size) + self.run_cmd += " --inter-op-parallelism-threads {0}".format(self.args.num_inter_threads) + self.run_cmd += " --intra-op-parallelism-threads {0}".format(self.args.num_intra_threads) + + if self.args.accuracy_only: + self.run_cmd += " 
--accuracy-only " + self.run_cmd += " --data-location {0}".format(self.args.data_location) + + def run(self): + old_python_path = os.environ["PYTHONPATH"] + os.environ["PYTHONPATH"] = os.path.join(self.args.model_source_dir, "research") + os.environ["PYTHONPATH"] += ":/tmp/benchmarks/scripts/tf_cnn_benchmarks/" + self.run_command(self.run_cmd) + os.environ["PYTHONPATH"] = old_python_path diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/README.md b/benchmarks/object_detection/tensorflow/ssd_vgg16/README.md new file mode 100644 index 000000000..971311f75 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/README.md @@ -0,0 +1,285 @@ +# SSD-VGG16 + +This document has instructions for how to run SSD-VGG16 for the +following modes/precisions: +* [Int8 inference](#int8-inference-instructions) +* [FP32 inference](#fp32-inference-instructions) + +Instructions and scripts for model training and inference +other precisions are coming later. + +## Int8 Inference Instructions + +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + +1. Clone the [original model](https://github.com/HiKapok/SSD.TensorFlow) repository: +``` +$ git clone https://github.com/HiKapok/SSD.TensorFlow.git +$ cd SSD.TensorFlow +$ git checkout 2d8b0cb9b2e70281bf9dce438ff17ffa5e59075c +``` + +2. Clone the [intelai/models](https://github.com/intelai/models) repository. +It will be used to run the SSD-VGG16 model accuracy and inference performance tests. + +3. Download the 2017 validation images file: +[COCO dataset](http://cocodataset.org/#home) and annotations: +This is required if you would like to run the accuracy test, +or batch/online inference with real data. 
+ +``` +$ wget http://images.cocodataset.org/zips/val2017.zip +$ unzip val2017.zip +``` + +Download the validation annotations file: +``` +$ wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip +$ unzip annotations_trainval2017.zip +``` + +4. Convert the COCO dataset to TF records format: + +We provide a script `generate_coco_records.py` to convert the raw dataset to the TF records required pattern. +* Some dependencies are required to be installed to run the script such as `python3`, `Tensorflow` and `tqdm`, also, the `SSD.TensorFlow/dataset` from the original model directory (from step 1). + +Follow the steps below to get the COCO TF records: + +* Copy the `generate_coco_records.py` script from `models/object_detection/tensorflow/ssd_vgg16/inference/generate_coco_records.py` +from the `models` directory (step 2) to `SSD.TensorFlow/dataset` in the original model directory (step 1). + +``` +$ cp /home//models/models/object_detection/tensorflow/ssd_vgg16/inference/generate_coco_records.py /home//SSD.TensorFlow/dataset +``` + +* Create directory for the output TF records: +``` +$ mkdir tf_records +``` + +* Run the script to generate the TF records with the required prefix `val`, COCO raw dataset and annotation file (step 3): +``` +$ cd /home//SSD.TensorFlow/dataset +$ python generate_coco_records.py \ +--image_path /home//val2017/ \ +--annotations_file /home//annotations/instances_val2017.json \ +--output_prefix val \ +--output_path /home//tf_records/ +``` + +Now, you can use the `/home//tf_records/` as the dataset location to run inference with real data, and test the model accuracy. +``` +$ ls -l /home//tf_records +total 792084 +-rw-r--r--. 1 170038836 Mar 17 21:35 val-00000-of-00005 +-rw-r--r--. 1 167260232 Mar 17 21:35 val-00001-of-00005 +-rw-r--r--. 1 167326957 Mar 17 21:35 val-00002-of-00005 +-rw-r--r--. 1 166289231 Mar 17 21:35 val-00003-of-00005 +-rw-r--r--. 1 140168531 Mar 17 21:35 val-00004-of-00005 +``` + +5. 
Download the pretrained model: + +``` +$ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/ssdvgg16_int8_pretrained_model.pb +``` + +6. Navigate to the `benchmarks` directory (step 2), and run the model scripts for either batch or online +inference or accuracy. +``` +$ cd models/benchmarks +``` + +* Run the model for batch or online inference where the `--model-source-dir` is the model source directory from step 1, +and the `--in-graph` is the pretrained model graph from step 5. +If you specify the `--data-location` which is the path to the tf record file that you generated in step 4, +the model will run with real data, otherwise dummy data will be used: +``` +python launch_benchmark.py \ + --model-name ssd_vgg16 \ + --mode inference \ + --precision int8 \ + --framework tensorflow \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --model-source-dir /home//SSD.TensorFlow \ + --data-location /home//tf_records \ + --in-graph /home//ssdvgg16_int8_pretrained_model.pb \ + --batch-size 1 \ + --socket-id 0 \ + --num-inter-threads 11 \ + --num-intra-threads 21 \ + --data-num-inter-threads 21 \ + --data-num-intra-threads 28 \ + -- warmup-steps=100 steps=500 +``` + +* For the accuracy test: + + * Clone the customized [cocoapi repo](https://github.com/waleedka/coco) in +the model directory `SSD.TensorFlow` from step 1. + ``` + $ git clone https://github.com/waleedka/coco.git + + ``` + * The `--data-location` is required, which is the path to the tf record file that you generated in step 4. + * Copy the annotation file `instances_val2017.json` (from step 3) to the dataset directory `/home//tf_records/`. 
+ * Use the `--accuracy-only` flag: +``` +python launch_benchmark.py \ + --model-name ssd_vgg16 \ + --mode inference \ + --precision int8 \ + --framework tensorflow \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --model-source-dir /home//SSD.TensorFlow \ + --data-location /home//tf_records \ + --in-graph /home//ssdvgg16_int8_pretrained_model.pb \ + --accuracy-only \ + --batch-size 1 +``` + +>Notes: +>* For batch and online inference, we recommend the provided values for the arguments: `--num-inter-threads=11`, `--num-intra-threads=21`, `--data-num-inter-threads=21`, + `--data-num-intra-threads=28` for optimized performance on `28-cores Cascade Lake (CLX)` machine. + +>* SSD-VGG16 model accuracy test works only with the `Python3` based docker images. + +>* The `--verbose` or `--output-dir` flag can be added to any of the above commands +to get additional debug output or change the default output location. + +7. The log file is saved to the value of `--output-dir`. 
+ +Below is a sample log file tail when running the model for batch +and online inference, the following results are based on CLX 28-cores with hyper-threading enabled: + +``` +Batch size = 1 +Throughput: 30.382 images/sec +Latency: 32.915 ms +Ran inference with batch size 1 +Log location outside container: {--output-dir value}/benchmark_ssd_vgg16_inference_int8_20190417_231832.log +``` + +And here is a sample log file tail when running for accuracy: + +``` + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.231 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.386 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.243 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.058 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.265 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.391 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.224 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.330 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.355 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.091 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.420 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.558 +``` + +## FP32 Inference Instructions + +Use the steps 1, 2,3 and 4 as above. + +5. Download the pretrained model: +``` +$ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/ssdvgg16_fp32_pretrained_model.pb +``` + +6. Navigate to the `benchmarks` directory (step 2), and run the model scripts for either batch +and online inference or accuracy. 
+``` +$ cd models/benchmarks +``` + +* Run the model for batch and online inference where the `--model-source-dir` is the model source directory from step 1, +and the `--in-graph` is the pretrained model graph from step 5, +if you specify the `--data-location` which is the path to the tf record file that you generated in step 4, +the benchmark will run with real data, otherwise dummy data will be used: +``` +$ cd /home//models/benchmarks + +$ python launch_benchmark.py \ + --data-location /home//tf_records \ + --in-graph /home//ssdvgg16_fp32_pretrained_model.pb \ + --model-source-dir /home//SSD.TensorFlow \ + --model-name ssd_vgg16 \ + --framework tensorflow \ + --precision fp32 \ + --mode inference \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --batch-size 1 \ + --socket-id 0 \ + --num-inter-threads 11 \ + --num-intra-threads 21 \ + --data-num-inter-threads 21 \ + --data-num-intra-threads 28 \ + -- warmup-steps=100 steps=500 +``` + +* For the accuracy test: + + * Clone the customized [cocoapi repo](https://github.com/waleedka/coco) in +the model directory `SSD.TensorFlow` from step 1. + ``` + $ git clone https://github.com/waleedka/coco.git + + ``` + * The `--data-location` is required, which is the path to the tf record file that you generated in step 4. + * Copy the annotation file `instances_val2017.json` (from step 3) to the dataset directory `/home//tf_records/`. 
+ * Use the `--accuracy-only` flag: +``` +python launch_benchmark.py \ + --model-name ssd_vgg16 \ + --mode inference \ + --precision fp32 \ + --framework tensorflow \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --model-source-dir /home//SSD.TensorFlow \ + --data-location /home//tf_records \ + --in-graph /home//ssdvgg16_fp32_pretrained_model.pb \ + --accuracy-only \ + --batch-size 1 +``` + +>Notes: +>* For batch and online inference, we recommend the provided values for the arguments: `--num-inter-threads=11`, `--num-intra-threads=21`, `--data-num-inter-threads=21`, + `--data-num-intra-threads=28` for optimized performance on `28-cores Cascade Lake (CLX)` machine. + +>* SSD-VGG16 model accuracy test works only with the `Python3` based docker images. + +>* The `--verbose` or `--output-dir` flag can be added to any of the above commands +to get additional debug output or change the default output location. + +7. The log file is saved to the value of `--output-dir`. 
+ +Below is a sample log file tail when running batch and online inference, +the following results are based on CLX 28-cores with hyper-threading enabled: + +``` +Batch size = 1 +Throughput: 15.662 images/sec +Latency: 63.848 ms +Ran inference with batch size 1 +Log location outside container: {--output-dir value}/benchmark_ssd_vgg16_inference_fp32_20190417_232130.log +``` + +Below is a sample log file tail when testing accuracy: + +``` + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.236 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.391 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.248 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.058 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.264 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.399 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.227 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.334 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.358 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.091 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.423 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.564 +``` diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/__init__.py b/benchmarks/object_detection/tensorflow/ssd_vgg16/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/__init__.py b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/config.json b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/config.json new file mode 100644 index 000000000..14d129748 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/config.json @@ -0,0 +1,6 @@ +{ + "optimization_parameters": { + "KMP_SETTINGS": 1, + "TF_ENABLE_WINOGRAD_NONFUSED": 1 + } +} diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/fp32/__init__.py b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/fp32/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/fp32/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/fp32/model_init.py b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/fp32/model_init.py new file mode 100644 index 000000000..5698700f4 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/fp32/model_init.py @@ -0,0 +1,28 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +from object_detection.tensorflow.ssd_vgg16.inference.ssd_vgg16_model_init import SSDVGG16ModelInitializer + + +class ModelInitializer(SSDVGG16ModelInitializer): + """Model initializer for SSD-VGG16 FP32 inference""" + + def __init__(self, args, custom_args=[], platform_util=None): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/int8/__init__.py b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/int8/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/int8/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/int8/model_init.py b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/int8/model_init.py new file mode 100644 index 000000000..01d1822ba --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/int8/model_init.py @@ -0,0 +1,28 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +from object_detection.tensorflow.ssd_vgg16.inference.ssd_vgg16_model_init import SSDVGG16ModelInitializer + + +class ModelInitializer(SSDVGG16ModelInitializer): + """Model initializer for SSD-VGG16 Int8 inference""" + + def __init__(self, args, custom_args=[], platform_util=None): + super(ModelInitializer, self).__init__(args, custom_args, platform_util) diff --git a/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/ssd_vgg16_model_init.py b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/ssd_vgg16_model_init.py new file mode 100644 index 000000000..c54994170 --- /dev/null +++ b/benchmarks/object_detection/tensorflow/ssd_vgg16/inference/ssd_vgg16_model_init.py @@ -0,0 +1,107 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import argparse + +from common.base_model_init import BaseModelInitializer, set_env_var + + +class SSDVGG16ModelInitializer(BaseModelInitializer): + """Common model initializer for SSD-VGG16 inference""" + + def run_inference_sanity_checks(self, args, custom_args): + if not args.input_graph: + sys.exit("Please provide a path to the frozen graph directory" + " via the '--in-graph' flag.") + if not args.data_location and self.args.accuracy_only: + sys.exit("For accuracy test, please provide a path to the data directory via the " + "'--data-location' flag.") + if args.batch_size != -1 and args.batch_size != 1: + sys.exit("SSD-VGG16 inference supports 'batch-size=1' " + + "only, please modify via the '--batch_size' flag.") + + def __init__(self, args, custom_args, platform_util): + super(SSDVGG16ModelInitializer, self).__init__(args, custom_args, platform_util) + + self.parse_custom_args() + self.run_inference_sanity_checks(self.args, self.custom_args) + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) + + self.set_num_inter_intra_threads(num_inter_threads=self.args.num_inter_threads, + num_intra_threads=self.args.num_intra_threads) + + omp_num_threads = str(int(platform_util.num_cores_per_socket / 2))\ + if self.args.precision == "int8" else 
platform_util.num_cores_per_socket + + set_env_var("OMP_NUM_THREADS", omp_num_threads + if self.args.num_cores == -1 else self.args.num_cores) + + script_path = os.path.join( + self.args.intelai_models, self.args.mode, "eval_ssd.py") + + self.run_cmd = self.get_command_prefix( + self.args.socket_id) + "{} {}".format(self.python_exe, script_path) + + self.run_cmd += " --input-graph={} " \ + " --num-inter-threads={} --num-intra-threads={} ". \ + format(self.args.input_graph, self.args.num_inter_threads, + self.args.num_intra_threads) + + if self.args.data_num_inter_threads: + self.run_cmd += " --data-num-inter-threads={} ".format( + self.args.data_num_inter_threads) + + if self.args.data_num_intra_threads: + self.run_cmd += " --data-num-intra-threads={} ".format( + self.args.data_num_intra_threads) + + if self.args.benchmark_only: + self.run_cmd += " --warmup-steps={} --steps={} ". \ + format(self.args.warmup_steps, self.args.steps) + + # if the data location directory is not empty, then include the arg + if self.args.data_location and os.listdir(self.args.data_location): + self.run_cmd += " --data-location={} ".format(self.args.data_location) + + if self.args.accuracy_only: + self.run_cmd += "--accuracy-only " + + def parse_custom_args(self): + if self.custom_args: + parser = argparse.ArgumentParser() + parser.add_argument("--warmup-steps", type=int, default=10, + help="number of warmup steps") + parser.add_argument("--steps", type=int, default=50, + help="number of steps") + + self.args = parser.parse_args(self.custom_args, + namespace=self.args) + + def run(self): + self.run_command(self.run_cmd) diff --git a/benchmarks/recommendation/tensorflow/ncf/README.md b/benchmarks/recommendation/tensorflow/ncf/README.md index 2ee7b070b..a86a56b1f 100644 --- a/benchmarks/recommendation/tensorflow/ncf/README.md +++ b/benchmarks/recommendation/tensorflow/ncf/README.md @@ -14,13 +14,13 @@ This model uses official tensorflow models repo, where [ncf](https://github.com/ model 
automatically downloads movielens ml-1m dataset as default if the `--data-location` flag is not set. If you want to download movielens 1M dataset and provide that path to `--data-location`, check this [reference](https://grouplens.org/datasets/movielens/1m/) -2. Clone the official `tensorflow/models` repository with tag `v1.11` +2. Clone the official `tensorflow/models` repository with tag `v1.11` and make a small change to `data_async_generation.py`, commenting out a line that causes a crash in the model script. ``` $ git clone https://github.com/tensorflow/models.git $ cd models $ git checkout v1.11 -$ pwd +$ sed -i.bak 's/atexit.register/# atexit.register/g' official/recommendation/data_async_generation.py ``` 3. Now clone `IntelAI/models` repository and then navigate to the `benchmarks` folder: @@ -53,7 +53,7 @@ $ python launch_benchmark.py \ --framework tensorflow \ --precision fp32 \ --mode inference \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` The tail of batch inference log, looks as below. @@ -83,7 +83,7 @@ $ python launch_benchmark.py \ --framework tensorflow \ --precision fp32 \ --mode inference \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` The tail of online inference log, looks as below. @@ -115,7 +115,7 @@ $ python launch_benchmark.py \ --framework tensorflow \ --precision fp32 \ --mode inference \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 ``` The tail of accuracy log, looks as below. 
diff --git a/benchmarks/recommendation/tensorflow/ncf/inference/fp32/config.json b/benchmarks/recommendation/tensorflow/ncf/inference/fp32/config.json new file mode 100644 index 000000000..273b45b40 --- /dev/null +++ b/benchmarks/recommendation/tensorflow/ncf/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/recommendation/tensorflow/ncf/inference/fp32/model_init.py b/benchmarks/recommendation/tensorflow/ncf/inference/fp32/model_init.py index 1b6eb1eda..960c2523a 100644 --- a/benchmarks/recommendation/tensorflow/ncf/inference/fp32/model_init.py +++ b/benchmarks/recommendation/tensorflow/ncf/inference/fp32/model_init.py @@ -40,7 +40,8 @@ def __init__(self, args, custom_args=[], platform_util=None): self.args.batch_size = 256 # Set KMP env vars, if they haven't already been set - self.set_kmp_vars() + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # set num_inter_threads and num_intra_threads self.set_num_inter_intra_threads() @@ -49,7 +50,7 @@ def __init__(self, args, custom_args=[], platform_util=None): self.args.intelai_models, self.args.mode, self.args.precision, "ncf_main.py") - self.benchmark_command = self.get_numactl_command(args.socket_id) + \ + self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) diff --git a/benchmarks/recommendation/tensorflow/wide_deep/README.md b/benchmarks/recommendation/tensorflow/wide_deep/README.md index e6698bd5d..8ace58237 100644 --- a/benchmarks/recommendation/tensorflow/wide_deep/README.md +++ b/benchmarks/recommendation/tensorflow/wide_deep/README.md @@ -56,7 +56,7 @@ use in the next step. 
--batch-size 1 \ --data-location /home//widedeep_dataset \ --checkpoint /home//path/to/wide_deep_fp32_pretrained_model \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --verbose ``` * Running the model in batch inference mode, set `--batch-size` = `1024` @@ -72,7 +72,7 @@ use in the next step. --batch-size 1024 \ --data-location /home//path/to/dataset \ --checkpoint /home//path/to/wide_deep_fp32_pretrained_model \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --verbose ``` 6. The log file is saved to the value of `--output-dir`. @@ -94,8 +94,6 @@ use in the next step. recall: 0.0 End-to-End duration is %s 36.5971579552 Latency is: %s 0.00224784460139 - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu current path: /workspace/benchmarks search path: /workspace/benchmarks/*/tensorflow/wide_deep/inference/fp32/model_init.py Using model init: /workspace/benchmarks/classification/tensorflow/wide_deep/inference/fp32/model_init.py diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/README.md b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/README.md index d4fb5fef4..e2467d45f 100755 --- a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/README.md +++ b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/README.md @@ -55,6 +55,11 @@ Instructions and scripts for model training coming later. ## INT8 Inference Instructions +These instructions use the TCMalloc memory allocator, which produces +better performance results for Int8 precision models with smaller batch sizes. +If you want to disable the use of TCMalloc, set `--disable-tcmalloc=True` +when calling `launch_benchmark.py` and the script will run without TCMalloc. + 1. Download and extract the pre-trained model. 
``` wget https://storage.googleapis.com/intel-optimized-tensorflow/models/wide_deep_int8_pretrained_model.pb @@ -72,7 +77,7 @@ Instructions and scripts for model training coming later. ``` cd /home//models/benchmarks - python launch_benchmark.py + python launch_benchmark.py \ --model-name wide_deep_large_ds \ --precision int8 \ --mode inference \ @@ -92,7 +97,7 @@ Instructions and scripts for model training coming later. ``` cd /home//models/benchmarks - python launch_benchmark.py + python launch_benchmark.py \ --model-name wide_deep_large_ds \ --precision int8 \ --mode inference \ @@ -109,7 +114,7 @@ Instructions and scripts for model training coming later. ``` cd /home//models/benchmarks - python launch_benchmark.py + python launch_benchmark.py \ --model-name wide_deep_large_ds \ --precision int8 \ --mode inference \ @@ -158,7 +163,7 @@ Instructions and scripts for model training coming later. ``` cd /home//models/benchmarks - python launch_benchmark.py + python launch_benchmark.py \ --model-name wide_deep_large_ds \ --precision fp32 \ --mode inference \ @@ -166,7 +171,7 @@ Instructions and scripts for model training coming later. --batch-size 1000 \ --socket-id 0 \ --accuracy-only \ - --docker-image docker.io/intelaipg/intel-optimized-tensorflow:nightly-latestprs-bdw \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /root/user/wide_deep_files/wide_deep_fp32_pretrained_model.pb \ --data-location /root/user/wide_deep_files/dataset_preprocessed_eval.tfrecords ``` @@ -178,7 +183,7 @@ Instructions and scripts for model training coming later. ``` cd /home//models/benchmarks - python launch_benchmark.py + python launch_benchmark.py \ --model-name wide_deep_large_ds \ --precision fp32 \ --mode inference \ @@ -186,7 +191,7 @@ Instructions and scripts for model training coming later. 
--benchmark-only \ --batch-size 1 \ --socket-id 0 \ - --docker-image docker.io/intelaipg/intel-optimized-tensorflow:nightly-latestprs-bdw \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /root/user/wide_deep_files/wide_deep_fp32_pretrained_model.pb \ --data-location /root/user/wide_deep_files/dataset_preprocessed_test.tfrecords \ -- num_parallel_batches=1 @@ -195,7 +200,7 @@ Instructions and scripts for model training coming later. ``` cd /home//models/benchmarks - python launch_benchmark.py + python launch_benchmark.py \ --model-name wide_deep_large_ds \ --precision fp32 \ --mode inference \ @@ -203,7 +208,7 @@ Instructions and scripts for model training coming later. --benchmark-only \ --batch-size 512 \ --socket-id 0 \ - --docker-image docker.io/intelaipg/intel-optimized-tensorflow:nightly-latestprs-bdw \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --in-graph /root/user/wide_deep_files/wide_deep_fp32_pretrained_model.pb \ --data-location /root/user/wide_deep_files/dataset_preprocessed_test.tfrecords ``` diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/config.json b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/config.json new file mode 100644 index 000000000..4efe60b15 --- /dev/null +++ b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "noverbose,warnings,respect,granularity=core,none", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py index 8f3e15359..6293b3d0c 100755 --- a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py +++ b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py @@ -36,9 +36,10 @@ 
def __init__(self, args, custom_args=[], platform_util=None): # Set the num_inter_threads and num_intra_threads self.set_num_inter_intra_threads(num_inter_threads=platform_util.num_cores_per_socket, num_intra_threads=1) - # Use default KMP AFFINITY values, override KMP_BLOCKTIME & enable KMP SETTINGS - self.set_kmp_vars(kmp_settings="1", kmp_blocktime="0", - kmp_affinity="noverbose,warnings,respect,granularity=core,none") + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # Set env vars, if they haven't already been set set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) @@ -61,7 +62,7 @@ def run_benchmark(self): script_args_list = ["input_graph", "num_parallel_batches", "batch_size", "num_inter_threads", "num_intra_threads", "accuracy_only", "data_location"] - cmd_prefix = self.get_numactl_command(self.args.socket_id) + \ + cmd_prefix = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " + benchmark_script cmd = self.add_args_to_command(cmd_prefix, script_args_list) self.run_command(cmd) diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/config.json b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/config.json new file mode 100644 index 000000000..4efe60b15 --- /dev/null +++ b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/config.json @@ -0,0 +1,7 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "noverbose,warnings,respect,granularity=core,none", + "KMP_BLOCKTIME": 0, + "KMP_SETTINGS": 1 + } +} diff --git a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/model_init.py b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/model_init.py index 2bd55b5a5..c6a3b25fd 100755 --- a/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/model_init.py +++ 
b/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/int8/model_init.py @@ -36,9 +36,10 @@ def __init__(self, args, custom_args=[], platform_util=None): # Set the num_inter_threads and num_intra_threads self.set_num_inter_intra_threads(num_inter_threads=platform_util.num_cores_per_socket, num_intra_threads=1) - # Use default KMP AFFINITY values, override KMP_BLOCKTIME & enable KMP SETTINGS - self.set_kmp_vars(kmp_settings="1", kmp_blocktime="0", - kmp_affinity="noverbose,warnings,respect,granularity=core,none") + + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + self.set_kmp_vars(config_file_path) # Set env vars, if they haven't already been set set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) @@ -61,7 +62,7 @@ def run_benchmark(self): script_args_list = ["input_graph", "num_parallel_batches", "batch_size", "num_inter_threads", "num_intra_threads", "accuracy_only", "data_location"] - cmd_prefix = self.get_numactl_command(self.args.socket_id) + \ + cmd_prefix = self.get_command_prefix(self.args.socket_id) + \ self.python_exe + " " + benchmark_script cmd = self.add_args_to_command(cmd_prefix, script_args_list) self.run_command(cmd) diff --git a/benchmarks/text_to_speech/tensorflow/wavenet/README.md b/benchmarks/text_to_speech/tensorflow/wavenet/README.md index fa193aa07..963d892d3 100644 --- a/benchmarks/text_to_speech/tensorflow/wavenet/README.md +++ b/benchmarks/text_to_speech/tensorflow/wavenet/README.md @@ -41,7 +41,7 @@ $ pwd 2. Clone this [intelai/models](https://github.com/intelai/models) repo. This repo has the launch script for running the model, as well as checkpoint files for a pre-trained model. After cloning the repo, -navigate to the benchmarks directory, which is where the launch script +navigate to the `benchmarks` directory, which is where the launch script is located. 
``` @@ -71,7 +71,7 @@ python launch_benchmark.py \ --framework tensorflow \ --socket-id 0 \ --num-cores 1 \ - --docker-image intelaipg/intel-optimized-tensorflow:latest-devel-mkl \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ --model-source-dir /home//wavenet/tensorflow-wavenet \ --checkpoint /home//wavenet_checkpoints \ -- checkpoint_name=model.ckpt-99 sample=8510 @@ -99,8 +99,6 @@ Sample: 8500 Average Throughput of whole run: Samples / sec: 289.351783 Average Latency of whole run: msec / sample: 3.456001 Finished generating. The result can be viewed in TensorBoard. -lscpu_path_cmd = command -v lscpu -lscpu located here: /usr/bin/lscpu Ran inference with batch size -1 Log location outside container: {--output-dir value}/benchmark_wavenet_inference_fp32_20190105_015022.log ``` diff --git a/benchmarks/text_to_speech/tensorflow/wavenet/inference/fp32/config.json b/benchmarks/text_to_speech/tensorflow/wavenet/inference/fp32/config.json new file mode 100644 index 000000000..f0b327528 --- /dev/null +++ b/benchmarks/text_to_speech/tensorflow/wavenet/inference/fp32/config.json @@ -0,0 +1,6 @@ +{ + "optimization_parameters": { + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0", + "KMP_BLOCKTIME": 1 + } +} diff --git a/benchmarks/text_to_speech/tensorflow/wavenet/inference/fp32/model_init.py b/benchmarks/text_to_speech/tensorflow/wavenet/inference/fp32/model_init.py index 91ebe227c..1756e33ae 100644 --- a/benchmarks/text_to_speech/tensorflow/wavenet/inference/fp32/model_init.py +++ b/benchmarks/text_to_speech/tensorflow/wavenet/inference/fp32/model_init.py @@ -32,8 +32,9 @@ def __init__(self, args, custom_args, platform_util): self.command = "" command_prefix = "{} generate.py".format(self.python_exe) - # Set default KMP env vars, except for KMP_SETTINGS - self.set_kmp_vars(kmp_settings=None) + # Set KMP env vars, if they haven't already been set + config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") + 
self.set_kmp_vars(config_file_path) self.parse_custom_args() # Set the num_inter_threads and num_intra_threads (override inter threads to 1) diff --git a/benchmarks_directory_structure.png b/benchmarks_directory_structure.png new file mode 100644 index 000000000..1bf56d912 Binary files /dev/null and b/benchmarks_directory_structure.png differ diff --git a/docs/README.md b/docs/README.md index 7ade8475e..c5933030c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -12,11 +12,13 @@ ## Tutorials by Use Case * Inference with IntelĀ® Optimization of Tensorflow: - * [Image Recognition](/docs/image_recognition/tensorflow/Tutorial.md) (ResNet50, ResNet101, and InceptionV3) + * [Image Recognition](/docs/image_recognition/tensorflow/Tutorial.md) (ResNet50, ResNet101, and InceptionV3) + * [Language Translation](/docs/language_translation/tensorflow/Tutorial.md) (Transformer-LT) * [Recommendation Systems](/docs/recommendation/tensorflow/Tutorial.md) (Wide and Deep) * Inference with IntelĀ® Optimization of Tensorflow Serving: * [Image Recognition](/docs/image_recognition/tensorflow_serving/Tutorial.md) (ResNet50 and InceptionV3) - * [Object Detection](/docs/object_detection/tensorflow_serving/Tutorial.md) (R-FCN) + * [Object Detection](/docs/object_detection/tensorflow_serving/Tutorial.md) (R-FCN and SSD-MobileNet) + * [Language Translation](/docs/language_translation/tensorflow_serving/Tutorial.md) (Transformer-LT) * Model Quantization and Optimization * [Image Recognition](/docs/image_recognition/quantization/Tutorial.md) (ResNet50) diff --git a/docs/general/tensorflow/LaunchBenchmark.md b/docs/general/tensorflow/LaunchBenchmark.md index ad358b6aa..14e38385e 100644 --- a/docs/general/tensorflow/LaunchBenchmark.md +++ b/docs/general/tensorflow/LaunchBenchmark.md @@ -23,14 +23,17 @@ Below the general description is an [index of links](#model-scripts-for-tensorfl * Image Recognition * ResNet50: 
[init](/benchmarks/image_recognition/tensorflow/resnet50/inference/fp32/model_init.py) | - [inference](/models/image_recognition/tensorflow/resnet50/fp32/eval_image_classifier_inference.py) | - [preprocessing](/models/image_recognition/tensorflow/resnet50/fp32/preprocessing.py) + [inference](/models/image_recognition/tensorflow/resnet50/inference/eval_image_classifier_inference.py) | + [preprocessing](/models/image_recognition/tensorflow/resnet50/inference/preprocessing.py) * ResNet101: [init](/benchmarks/image_recognition/tensorflow/resnet101/inference/fp32/model_init.py) | - [inference](/models/image_recognition/tensorflow/resnet101/fp32/benchmark.py) | - [preprocessing](/models/image_recognition/tensorflow/resnet101/fp32/preprocessing.py) + [inference](/models/image_recognition/tensorflow/resnet101/inference/eval_image_classifier_inference.py) | + [preprocessing](/models/image_recognition/tensorflow/resnet101/inference/preprocessing.py) * InceptionV3: [init](/benchmarks/image_recognition/tensorflow/inceptionv3/inference/fp32/model_init.py) | - [inference](/models/image_recognition/tensorflow/inceptionv3/fp32/eval_image_classifier_inference.py) | - [preprocessing](/models/image_recognition/tensorflow/inceptionv3/fp32/preprocessing.py) + [inference](/models/image_recognition/tensorflow/inceptionv3/fp32/eval_image_classifier_inference.py) | + [preprocessing](/models/image_recognition/tensorflow/inceptionv3/fp32/preprocessing.py) +* Language Translation + * Transformer-LT: [init](/benchmarks/language_translation/tensorflow/transformer_lt_official/inference/fp32/model_init.py) | + [inference](/models/language_translation/tensorflow/transformer_lt_official/inference/fp32/infer_ab.py) * Recommendation Systems * Wide and Deep: [init](/benchmarks/recommendation/tensorflow/wide_deep_large_ds/inference/fp32/model_init.py) | [inference](/models/recommendation/tensorflow/wide_deep_large_ds/inference/inference.py) | @@ -101,11 +104,170 @@ optional arguments: conjunction with 
--accuracy-only and --mode=inference. --output-dir OUTPUT_DIR Folder to dump output into. + --disable-tcmalloc {True,False} + When TCMalloc is enabled, the google-perftools are + installed (if running using docker) and the LD_PRELOAD + environment variable is set to point to the TCMalloc + library file. The TCMalloc memory allocator produces + better performance results with smaller batch sizes. + This flag disables the use of TCMalloc when set to + True. For int8 benchmarking, TCMalloc is enabled by + default (--disable-tcmalloc=False). For other + precisions, the flag is --disable-tcmalloc=True by + default. + --tcmalloc-large-alloc-report-threshold TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD + Sets the TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD + environment variable to the specified value. The + environment variable sets the threshold (in bytes) for + when large memory allocation messages will be + displayed. -g INPUT_GRAPH, --in-graph INPUT_GRAPH Full path to the input graph + --volume CUSTOM_VOLUMES + Specify a custom volume to mount in the container, + which follows the same format as the docker --volume + flag (https://docs.docker.com/storage/volumes/). This + argument can only be used in conjunction with a + --docker-image. 
--debug Launches debug mode which doesn't execute start.sh ``` +## Volume mounts + +When running the launch script using a docker image, volumes will +automatically get mounted in the container for the following +directories: + +| Directory | Mount location in the container | +|-----------|---------------------------------| +| Model zoo `/benchmarks` code | `/workspace/benchmarks` | +| Model zoo `/models` code | `/workspace/intelai_models` | +| `--model-source-dir` code | `/workspace/models` | +| `--checkpoint` directory | `/checkpoints` | +| `--in-graph` file | `/in_graph` | +| `--data-location` | `/dataset` | + +If you would like additional directories mounted in the docker +container, you can specify them with the `--volume` flag, using the +same `:` separated field format [as docker](https://docs.docker.com/storage/volumes/). +For example, the following command will mount `/home//custom_folder_1` +in the container at `/custom_folder_1` and `/home//custom_folder_2` +in the container at `/custom_folder_2`: + +``` +$ python launch_benchmark.py \ + --in-graph /home//resnet50_fp32_pretrained_model.pb \ + --model-name resnet50 \ + --framework tensorflow \ + --precision fp32 \ + --mode inference \ + --batch-size 1 \ + --socket-id 0 \ + --data-location /home//Imagenet_Validation \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --volume /home//custom_folder_1:/custom_folder_1 \ + --volume /home//custom_folder_2:/custom_folder_2 +``` + +Note that volume mounting only applies when running in a docker +container. When running on [bare metal](#alpha-feature-running-on-bare-metal), +files are accessed in their original location. + +## Debugging + +The `--debug` flag in the `launch_benchmark.py` script gives you a +shell into the docker container with the [volumes mounted](#volume-mounts) +for any dataset, pretrained model, model source code, etc. that has been +provided by the other flags.
It does not execute the `start.sh` script, +and is intended as a way to setup an environment for quicker iteration +when debugging and doing development. From the shell, you can manually +execute the `start.sh` script and select to not re-install dependencies +each time that you re-run, so that the script takes less time to run. + +Below is an example showing how to use the `--debug` flag: + +1. Run the model using your model's `launch_benchmark.py` command, but + add on the `--debug` flag, which will take you to a shell. If you + list the files in the directory at that prompt, you will see the + `start.sh` file: + + ``` + $ python launch_benchmark.py \ + --in-graph /home//resnet50_fp32_pretrained_model.pb \ + --model-name resnet50 \ + --framework tensorflow \ + --precision fp32 \ + --mode inference \ + --batch-size=1 \ + --socket-id 0 \ + --data-location /home//Imagenet_Validation \ + --docker-image gcr.io/deeplearning-platform-release/tf-cpu.1-14 \ + --debug + + # ls + __init__.py logs run_tf_benchmark.py start.sh + ``` + +2. Flags that were passed to the launch script are set as environment + variables in the container: + + ``` + # env + EXTERNAL_MODELS_SOURCE_DIRECTORY=None + IN_GRAPH=/in_graph/resnet50_fp32_pretrained_model.pb + WORKSPACE=/workspace/benchmarks/common/tensorflow + MODEL_NAME=resnet50 + PRECISION=fp32 + BATCH_SIZE=1 + MOUNT_EXTERNAL_MODELS_SOURCE=/workspace/models + DATASET_LOCATION=/dataset + BENCHMARK_ONLY=True + ACCURACY_ONLY=False + ... + ``` +3. Run the `start.sh` script, which will setup the `PYTHONPATH`, install + dependencies, and then run the model: + ``` + # bash start.sh + ... + Iteration 48: 0.011513 sec + Iteration 49: 0.011664 sec + Iteration 50: 0.011802 sec + Average time: 0.011650 sec + Batch size = 1 + Latency: 11.650 ms + Throughput: 85.833 images/sec + Ran inference with batch size 1 + Log location outside container: /benchmark_resnet50_inference_fp32_20190403_212048.log + ``` + +4. 
Code changes that are made locally will also be made in the container + (and vice versa), since the directories are mounted in the docker + container. Once code changes are made, you can rerun the start + script, except set the `NOINSTALL` variable, since dependencies were + already installed in the previous run. You can also change the + environment variable values for other settings, like the batch size. + + ``` + # NOINSTALL=True + # BATCH_SIZE=128 + # bash start.sh + ... + Iteration 48: 0.631819 sec + Iteration 49: 0.625606 sec + Iteration 50: 0.618813 sec + Average time: 0.625285 sec + Batch size = 128 + Throughput: 204.707 images/sec + Ran inference with batch size 128 + Log location outside container: /benchmark_resnet50_inference_fp32_20190403_212310.log + ``` + +5. Once you are done with the session, exit out of the docker container: + ``` + # exit + ``` + ## Alpha feature: Running on bare metal We recommend using [Docker](https://www.docker.com) to run the @@ -173,3 +335,11 @@ the following command can be used: --batch-size=1 \ --socket-id 0 ``` + +> When running on bare metal, be aware of environment variables that you +have set on your system. The model zoo scripts intentionally do not +overwrite environment variables that have already been set, such as +`OMP_NUM_THREADS`. The same is true when running in a docker container, +but since a new docker container instance is started with each run, you +won't have previously set environment variables, like you may have on +bare metal. 
diff --git a/docs/general/tensorflow_serving/InstallationGuide.md b/docs/general/tensorflow_serving/InstallationGuide.md index 2ef0489eb..0aa6a03b7 100644 --- a/docs/general/tensorflow_serving/InstallationGuide.md +++ b/docs/general/tensorflow_serving/InstallationGuide.md @@ -36,7 +36,7 @@ We will break down the installation into 2 steps: * Step 1: Build the Intel Optimized TensorFlow Serving Docker image * Step 2: Verify the Docker image by serving a simple model - half_plus_two -### Step 1: Build TensorFlow Serving Docker image +### Step 1: Build TensorFlow Serving Docker image. The recommended way to use TensorFlow Serving is with Docker images. Let’s build a docker image with TensorFlow Serving optimized for Intel® Processors. * Login into your machine via SSH and clone the [Tensorflow Serving](https://github.com/tensorflow/serving/) repository and save the path of this cloned directory (Also, adding it to `.bashrc` ) for ease of use for the remainder of this tutorial. @@ -45,7 +45,9 @@ The recommended way to use TensorFlow Serving is with Docker images. Let’s bui $ export TF_SERVING_ROOT=$(pwd)/serving $ echo "export TF_SERVING_ROOT=$(pwd)/serving" >> ~/.bashrc ``` - + +* You can also build the image using [this](/benchmarks/common/tensorflow_serving/build_tfserving_image.sh) script; run it as described in the comments in the script. Otherwise, continue with the manual steps below. + * Using `Dockerfile.devel-mkl`, build an image with Intel optimized ModelServer. This creates an image with all the required development tools and builds from sources. The image size will be around 5GB and will take some time. On AWS c5.4xlarge instance (16 logical cores), it took about 25min. **NOTE**: It is recommended that you build an official release version using `--build-arg TF_SERVING_VERSION_GIT_BRANCH=""`, but if you wish to build the (unstable) head of master, omit the build argument and master will be used by default.
@@ -54,6 +56,7 @@ The recommended way to use TensorFlow Serving is with Docker images. Letā€™s bui $ cd $TF_SERVING_ROOT/tensorflow_serving/tools/docker/ $ docker build \ -f Dockerfile.devel-mkl \ + --build-arg TF_SERVING_BAZEL_OPTIONS="--incompatible_disallow_data_transition=false --incompatible_disallow_filetype=false" \ --build-arg TF_SERVING_VERSION_GIT_BRANCH="1.13.0" \ -t tensorflow/serving:latest-devel-mkl . ``` @@ -257,7 +260,7 @@ $ curl -s http://download.tensorflow.org/models/official/20181001_resnet/savedmo $ cd ~ $ virtualenv tfserving_venv $ source tfserving_venv/bin/activate - (tfserving_venv)$ pip install grpc requests tensorflow tensorflow-serving-api + (tfserving_venv)$ pip install requests tensorflow tensorflow-serving-api ``` * Run the example `resnet_client_grpc.py` script from the TensorFlow Serving repository, which you cloned earlier. ``` diff --git a/docs/image_recognition/tensorflow/Tutorial.md b/docs/image_recognition/tensorflow/Tutorial.md index 235ea4109..7b24c9557 100644 --- a/docs/image_recognition/tensorflow/Tutorial.md +++ b/docs/image_recognition/tensorflow/Tutorial.md @@ -359,8 +359,6 @@ Note: As per the recommended settings `socket-id` is set to 0 for InceptionV3. T steps = 30, ... images/sec steps = 40, ... images/sec steps = 50, ... images/sec - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 128 Log location outside container: {--output-dir value}/benchmark_resnet50 @@ -384,9 +382,7 @@ you can implement the same strategy on different use cases demoed in Step 3. --debug Example Output - - lscpu_path_cmd = command -v lscpu - lscpu located here: b'/usr/bin/lscpu' + root@a78677f56d69:/workspace/benchmarks/common/tensorflow# To rerun the bechmarking script, execute the ```start.sh``` bash script from your existing directory with additional or modified flags. 
For e.g to rerun with the best batch inference (batch size=128) settings run with ```BATCH_SIZE``` @@ -397,7 +393,7 @@ and to skip the run from reinstalling packages pass ```True``` to ```NOINSTALL`` NOINSTALL=True BATCH_SIZE=128 ./start.sh -All other flags will be defaulted to values passed in the first ```launch_benchmark.py``` that starts the container. [See here](google.com) to get the full list of flags. +All other flags will be defaulted to values passed in the first ```launch_benchmark.py``` that starts the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags. Example Output @@ -429,8 +425,6 @@ All other flags will be defaulted to values passed in the first ```launch_benchm . Batch size = 128 Throughput: ... images/sec - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu Ran inference with batch size 128 Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_20190205_201632.log diff --git a/docs/image_recognition/tensorflow_serving/Tutorial.md b/docs/image_recognition/tensorflow_serving/Tutorial.md index 0c9ad527f..e5e9b0153 100644 --- a/docs/image_recognition/tensorflow_serving/Tutorial.md +++ b/docs/image_recognition/tensorflow_serving/Tutorial.md @@ -1,10 +1,12 @@ # Image Recognition with TensorFlow Serving on CPU + ### Online and Batch Inference -Models: ResNet50, InceptionV3 +Model and Precision: InceptionV3 FP32, ResNet50 FP32, and ResNet50 Int8 ## Goal -This tutorial will introduce you to the CPU performance considerations for image recognition deep learning models and how to use IntelĀ® Optimizations for [TensorFlow Serving](https://www.tensorflow.org/serving/) to improve inference time on CPUs. 
+This tutorial will introduce you to the CPU performance considerations for image recognition deep learning models with different precisions and +how to use IntelĀ® Optimizations for [TensorFlow Serving](https://www.tensorflow.org/serving/) to improve inference time on CPUs. It also provides sample code that you can use to get your optimized TensorFlow model server and GRPC client up and running quickly. ## Prerequisites @@ -22,26 +24,36 @@ This tutorial assumes you have already: Convolutional neural networks (CNNs) for image recognition are computationally expensive. The IntelĀ® Math Kernel Library for Deep Neural Networks (IntelĀ® MKL-DNN) offers significant performance improvements for convolution, pooling, normalization, activation, and other operations via efficient vectorization and multi-threading. Tuning TensorFlow Serving to take full advantage of your hardware for image recognition deep learning inference involves: -1. Working through this tutorial to set up servable versions of the well-known [ResNet50](https://arxiv.org/pdf/1512.03385.pdf) and [InceptionV3](https://arxiv.org/pdf/1512.00567v1.pdf) CNN models +1. Working through this tutorial to set up servable versions of the well-known [ResNet50](https://arxiv.org/pdf/1512.03385.pdf) and [InceptionV3](https://arxiv.org/pdf/1512.00567v1.pdf) CNN models with different precisions. 2. Running a TensorFlow Serving docker container configured for performance given your hardware resources 3. Running a client script to measure online and batch inference performance 4. Experimenting with the TensorFlow Serving settings on your own to further optimize for your model and use case -## Hands-on Tutorial - ResNet50 or InceptionV3 +## Hands-on Tutorial - InceptionV3 and Resnet50 + +This section shows a step-by-step example for how to serve one of the following Image Recognition models +`(ResNet50 FP32, ResNet50 Int8, and InceptionV3 FP32)` using TensorFlow Serving. 
+It also explains the possible ways to manage the available CPU resources and tune them for optimal performance. + +For steps 1 and 2, refer to the Intel Model Zoo READMEs: +* **FP32 precision:** use the Intel Model Zoo `FP32` README sections, + * [InceptionV3 FP32 README](/benchmarks/image_recognition/tensorflow/inceptionv3#fp32-inference-instructions), and + * [ResNet50 FP32 README](/benchmarks/image_recognition/tensorflow/resnet50#fp32-inference-instructions) + +* **Int8 precision:** use the Intel Model Zoo `Int8` README sections, + * [ResNet50 Int8 README](/benchmarks/image_recognition/tensorflow/resnet50#int8-inference-instructions) -For steps 1 and 2, refer to the Intel Model Zoo FP32 READMEs: -* [ResNet50 README](/benchmarks/image_recognition/tensorflow/resnet50#fp32-inference-instructions) -* [InceptionV3 README](/benchmarks/image_recognition/tensorflow/inceptionv3#fp32-inference-instructions) +>NOTE: The example below shows InceptionV3 (FP32). The same code snippets will work for ResNet50 (FP32 and Int8) if you replace the model name with `resnet50`. -1. **Download the Model**: Download and extract the ResNet50 or InceptionV3 pre-trained model (FP32), using the instructions in one of the READMEs above. +1. **Download the Model**: Download and extract the InceptionV3 pre-trained model, using the instructions in the README above. 2. **(Optional) Download Data**: If you are interested only in testing performance, not accuracy, you can skip this step and use synthetic data. If you want to verify prediction accuracy by testing on real data, follow the instructions in one of the READMEs above to download the ImageNet dataset. -3. **Clone this repository**: Clone the [intelai/models](https://github.com/intelai/models) repository and `cd` into the `docs/image_recognition/tensorflow_serving/src` directory. +3.
**Clone this repository**: Clone the [intelai/models](https://github.com/intelai/models) repository and `cd` into the `models/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/fp32` directory. ``` $ git clone https://github.com/IntelAI/models.git - $ cd models/docs/image_recognition/tensorflow_serving/src + $ cd models/benchmarks/image_recognition/tensorflow_serving/inceptionv3/inference/fp32 ``` 4. **Set up your environment**: In this tutorial, we use a virtual environment to install a few required Python packages. @@ -51,15 +63,13 @@ For steps 1 and 2, refer to the Intel Model Zoo FP32 READMEs: $ pip install virtualenv $ virtualenv venv ``` - Then activate the virtual environment and install `grpc`, `requests`, `tensorflow`, and `tensorflow-serving-api` (at the time of this writing, the order of installation matters): + Then activate the virtual environment and install `requests`, `tensorflow`, and `tensorflow-serving-api`: ``` $ source venv/bin/activate - (venv)$ pip install grpc - (venv)$ pip install requests - (venv)$ pip install intel-tensorflow - (venv)$ pip install tensorflow-serving-api + (venv)$ pip install requests intel-tensorflow tensorflow-serving-api ``` 5. **Create a SavedModel**: Using the conversion script `model_graph_to_saved_model.py`, convert the pre-trained model graph to a SavedModel. + (For ResNet50, substitute the name of the ResNet50 FP32 or the ResNet50 Int8 pre-trained model.) Example: ``` @@ -118,13 +128,13 @@ For steps 1 and 2, refer to the Intel Model Zoo FP32 READMEs: To see average online inference performance (in ms), run the script `image_recognition_benchmark.py` using batch_size 1: ``` (venv)$ python image_recognition_benchmark.py --batch_size 1 --model inceptionv3 - Iteration 1: 0.017 sec + Iteration 1: ... sec ... - Iteration 40: 0.016 sec - Average time: 0.016 sec + Iteration 40: ... sec + Average time: ... sec Batch size = 1 - Latency: 16.496 ms - Throughput: 60.619 images/sec + Latency: ... 
ms + Throughput: ... images/sec ``` In some cases, it is desirable to constrain the inference server to a single core or socket. @@ -156,12 +166,12 @@ For steps 1 and 2, refer to the Intel Model Zoo FP32 READMEs: To see average batch inference performance (in images/sec), run the script `image_recognition_benchmark.py` using batch_size 128: ``` (venv)$ python image_recognition_benchmark.py --batch_size 128 --model inceptionv3 - Iteration 1: 1.706 sec + Iteration 1: ... sec ... - Iteration 40: 0.707 sec - Average time: 0.693 sec + Iteration 40: ... sec + Average time: ... sec Batch size = 128 - Throughput: 184.669 images/sec + Throughput: ... images/sec ``` 11. **Clean up**: @@ -171,7 +181,7 @@ For steps 1 and 2, refer to the Intel Model Zoo FP32 READMEs: ## Conclusion -You have now seen two end-to-end examples of serving an image recognition model for inference using TensorFlow Serving, and learned: +You have now seen three end-to-end examples of serving an image recognition model for inference using TensorFlow Serving, and learned: 1. How to create a SavedModel from a TensorFlow model graph 2. How to choose good values for the performance-related runtime parameters exposed by the `docker run` command 3. How to verify that the served model can correctly classify an image using a GRPC client diff --git a/docs/image_recognition/tensorflow_serving/src/image_recognition_benchmark.py b/docs/image_recognition/tensorflow_serving/src/image_recognition_benchmark.py deleted file mode 100644 index 658812cd9..000000000 --- a/docs/image_recognition/tensorflow_serving/src/image_recognition_benchmark.py +++ /dev/null @@ -1,117 +0,0 @@ -# -# -*- coding: utf-8 -*- -# -# Copyright (c) 2019 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: EPL-2.0 -# - -"""Send simulated image data to tensorflow_model_server loaded with ResNet50 or InceptionV3 model. - -""" - -from __future__ import print_function - -import os -import sys -import random -import time -import grpc -import tensorflow as tf -import numpy as np - -from tensorflow_serving.apis import predict_pb2 -from tensorflow_serving.apis import prediction_service_pb2_grpc - -from util import preprocess_image, parse_example_proto - -tf.app.flags.DEFINE_string('server', 'localhost:8500', - 'PredictionService host:port') -tf.app.flags.DEFINE_integer('batch_size', 1, 'Batch size to use') -tf.app.flags.DEFINE_string('data_dir', '', 'path to images in TF records format') -tf.app.flags.DEFINE_string('model', 'resnet50', 'Name of model (resnet50 or inceptionv3).') -FLAGS = tf.app.flags.FLAGS - - -def sample_images(image_size): - """Pull a random batch of images from FLAGS.data_dir containing TF record formatted ImageNet validation set - - Returns: - ndarray of float32 with shape [FLAGS.batch_size, image_size, image_size, 3] - """ - - sample_file = random.choice(os.listdir(FLAGS.data_dir)) - dataset = tf.data.TFRecordDataset(os.path.join(FLAGS.data_dir, sample_file)) - dataset = dataset.map(lambda x: parse_example_proto(x)).shuffle(True).batch(FLAGS.batch_size) - iterator = dataset.make_one_shot_iterator() - next_element = iterator.get_next() - with tf.Session() as sess: - images, labels = sess.run(next_element) - images = np.array([sess.run(preprocess_image(x, FLAGS.model, image_size)) for x in images]) - - return images 
- -def main(_): - if FLAGS.model == 'resnet50': - image_size = 224 - elif FLAGS.model == 'inceptionv3': - image_size = 299 - else: - print('Please specify model as either resnet50 or inceptionv3.') - sys.exit(-1) - - channel = grpc.insecure_channel(FLAGS.server) - stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) - i = 0 - num_iteration = 40 - warm_up_iteration = 10 - total_time = 0 - for _ in range(num_iteration): - i += 1 - if FLAGS.data_dir: - image_np = sample_images(image_size) - else: - image_np = np.random.rand(FLAGS.batch_size, image_size, image_size, 3).astype(np.float32) - if FLAGS.model == 'resnet50': - # For ResNet50, rescale to [0, 256] - image_np *= 256.0 - elif FLAGS.model == 'inceptionv3': - # For InceptionV3, rescale to [-1, 1] - image_np = (image_np - 0.5) * 2.0 - - request = predict_pb2.PredictRequest() - request.model_spec.name = FLAGS.model - request.model_spec.signature_name = 'serving_default' - request.inputs['input'].CopyFrom( - tf.contrib.util.make_tensor_proto(image_np, shape=[FLAGS.batch_size, image_size, image_size, 3])) - start_time = time.time() - result = stub.Predict(request, 10.0) # 10 secs timeout - time_consume = time.time() - start_time - print('Iteration %d: %.3f sec' % (i, time_consume)) - if i > warm_up_iteration: - total_time += time_consume - - time_average = total_time / (num_iteration - warm_up_iteration) - print('Average time: %.3f sec' % (time_average)) - - print('Batch size = %d' % FLAGS.batch_size) - if (FLAGS.batch_size == 1): - print('Latency: %.3f ms' % (time_average * 1000)) - - print('Throughput: %.3f images/sec' % (FLAGS.batch_size / time_average)) - - -if __name__ == '__main__': - tf.app.run() diff --git a/docs/image_recognition/tensorflow_serving/src/util.py b/docs/image_recognition/tensorflow_serving/src/util.py deleted file mode 100644 index 8877e932d..000000000 --- a/docs/image_recognition/tensorflow_serving/src/util.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# -*- coding: utf-8 -*- -# -# 
Copyright (c) 2019 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: EPL-2.0 -# - -from __future__ import print_function - -import tensorflow as tf - -def preprocess_image(image_buffer, model, image_size): - """Preprocess JPEG encoded bytes to 3D float Tensor.""" - - # Decode the string as an RGB JPEG of unknown height and width. - image = tf.image.decode_jpeg(image_buffer, channels=3) - # Convert pixels to [0, 1) - image = tf.image.convert_image_dtype(image, dtype=tf.float32) - # Crop the central region to 87.5% of the original image. - image = tf.image.central_crop(image, central_fraction=0.875) - # Resize the image to image_size x image_size. - image = tf.expand_dims(image, 0) - image = tf.image.resize_bilinear(image, [image_size, image_size], align_corners=False) - image = tf.squeeze(image, [0]) - if model == 'resnet50': - # For ResNet50, rescale to [0, 256] - image = tf.multiply(image, 256.0) - elif model == 'inceptionv3': - # For InceptionV3, rescale to [-1, 1] - image = tf.subtract(image, 0.5) - image = tf.multiply(image, 2.0) - return image - -def parse_example_proto(example_serialized): - - # Dense features in Example proto. 
- feature_map = { - 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, - default_value=''), - 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, - default_value=-1), - } - - features = tf.parse_single_example(example_serialized, feature_map) - label = tf.cast(features['image/class/label'], dtype=tf.int32) - - return features['image/encoded'], label - diff --git a/docs/language_translation/tensorflow/Tutorial.md b/docs/language_translation/tensorflow/Tutorial.md new file mode 100644 index 000000000..aee385c63 --- /dev/null +++ b/docs/language_translation/tensorflow/Tutorial.md @@ -0,0 +1,266 @@ +# Language Translation with Transformer-LT + + +## Goal +This tutorial will introduce CPU performance considerations of the deep learning Transformer-LT model for language translation and how to use IntelĀ® Optimizations for TensorFlow to improve inference time on CPUs. +This tutorial will also provide code examples to use Intel Model Zoo's pretrained English to German model that can be copy/pasted for quick off-the-ground implementation on real data. + +## Background +Language Translation with deep learning is a computationally expensive endeavor. This tutorial will show you how to reduce the inference runtime of your Transformer-LT network, a popular topology solution to translation. +It is based on an encoder-decoder architecture with an added attention mechanism. The encoder is used to encode the original sentence to a meaningful fixed-length vector, and the decoder is responsible for extracting the context data from the vector. +The encoder and decoder process the inputs and outputs, which are in the form of a time sequence. + +In a traditional encoder/decoder model, each element in the context vector is treated equally. This is typically not the ideal solution. 
+For instance, when you translate the phrase “I travel by train” from English into Chinese, the word “I” has a greater influence than other words when producing its counterpart in Chinese. +Thus, the attention mechanism was introduced to differentiate contributions of each element in the source sequence to their counterpart in the destination sequence, through the use of a hidden matrix. +This matrix contains weights of each element in the source sequence when producing elements in the destination sequence. + + +## Recommended Settings +In addition to TensorFlow optimizations that use the Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN) to utilize instruction sets appropriately, the runtime settings also significantly contribute to improved performance. +Tuning these options to optimize CPU workloads is vital to optimize performance of TensorFlow on Intel® processors. +Below is the set of run-time options tested empirically on Transformer-LT and recommended by Intel: + + +| Run-time options | Recommendations | +| ------------- | ------------- | +| Batch Size | 64. Regardless of the hardware | +| Hyperthreading | Enabled. Turn on in BIOS. Requires a restart. | +|intra_op_parallelism_threads |# physical cores | +|inter_op_parallelism_threads | 1 | +|NUMA Controls| --cpunodebind=0 --membind=0 | +|KMP_AFFINITY| KMP_AFFINITY=granularity=fine,verbose,compact,1,0| +|KMP_BLOCKTIME| 1 | +|OMP_NUM_THREADS |physical cores| + +Note 1: Refer to this [link](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) to learn more about the run time options. + +Note 2: You can remove `verbose` from the `KMP_AFFINITY` setting to avoid verbose output at runtime. + +Run the following commands to get your processor information: + +a. #physical cores per socket : `lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs` + +b.
#all physical cores: `lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l` + +Below is a code snippet you can incorporate into your existing TensorFlow application to set the best settings. +You can either set them in the CLI or in the Python script. Note that inter and intra_op_parallelism_threads settings can only be set +in the Python script. + +```bash +export OMP_NUM_THREADS=physical cores +export KMP_AFFINITY="granularity=fine,verbose,compact,1,0" +export KMP_BLOCKTIME=1 +export KMP_SETTINGS=1 +``` +(or) +``` +import os +os.environ["KMP_BLOCKTIME"] = "1" +os.environ["KMP_SETTINGS"] = "1" +os.environ["KMP_AFFINITY"]= "granularity=fine,verbose,compact,1,0" +os.environ["OMP_NUM_THREADS"]= <# physical cores> +config = tf.ConfigProto() +config.intra_op_parallelism_threads = <# physical cores> +config.inter_op_parallelism_threads = 1 +tf.Session(config=config) +``` + +## Hands-on Tutorial +This section shows how to measure inference performance on Intel's Model Zoo pretrained model (or your pretrained model) by setting the above-discussed run time flags. +### FP32 inference + +### Initial Setup + +1. The model source is based off a specific commit from the TensorFlow models repo. Follow the instructions below to clone an older commit into your home directory. + +``` +cd ~ +mkdir tensorflow-models +cd tensorflow-models +git clone https://github.com/tensorflow/models.git +cd models +git checkout 8367cf6dabe11adf7628541706b660821f397dce +``` + +2. Clone IntelAI models and download into your home directory, skip this step if you already have Intel AI models installed. + +```bash +cd ~ +git clone https://github.com/IntelAI/models.git +``` + +3. Skip to step 4 if you already have a pretrained model or download the file `transformer_lt_official_fp32_pretrained_model.tar.gz` into your ~/transformer_LT_german location. 
+``` +mkdir ~/transformer_LT_german +cd ~/transformer_LT_german +wget https://storage.googleapis.com/intel-optimized-tensorflow/models/transformer_lt_official_fp32_pretrained_model.tar.gz +tar -xzvf transformer_lt_official_fp32_pretrained_model.tar.gz +``` + +4. After extraction, you should see the following folders and files in the `transformer_lt_official_fp32_pretrained_model` directory: +``` +$ ls -l transformer_lt_official_fp32_pretrained_model/* + +transformer_lt_official_fp32_pretrained_model/data: +total 1064 +-rw-r--r--. 1 359898 Feb 20 16:05 newstest2014.en +-rw-r--r--. 1 399406 Feb 20 16:05 newstest2014.de +-rw-r--r--. 1 324025 Mar 15 17:31 vocab.txt + +transformer_lt_official_fp32_pretrained_model/graph: +total 241540 +-rwx------. 1 247333269 Mar 15 17:29 fp32_graphdef.pb + +``` +`newstest2014.en`: Input file with English text
+`newstest2014.de`: German translation of the input file for measuring accuracy
+`vocab.txt`: A dictionary of vocabulary
+`fp32_graphdef.pb`: Pretrained model + +Or, if you have your own model/data, ensure that your folder structure follows the structure depicted below to run the pretrained model in Intel Model Zoo. + +``` +├─ transformer_LT_german +│ ├── transformer_pretrained_model +│ ├── data +│ │ ├── newstest2014.en (Input file) +│ │ ├── newstest2014.de (Reference file, this is optional) +│ │ └── vocab.txt +│ └── graph +│ └── pretrained_model.pb +``` +5. Install [Docker](https://docs.docker.com/v17.09/engine/installation/) since the tutorial runs in a Docker container. + +### Run inference + +1. Pull the relevant Intel-optimized TensorFlow Docker image. + [Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images. +```bash +docker pull docker.io/intelaipg/intel-optimized-tensorflow:latest +``` +2. cd to the inference script directory in your local IntelAI repo +```bash +cd ~/models/benchmarks +``` +3. Run the Python script ```launch_benchmark.py``` with the pretrained model. +The ```launch_benchmark.py``` script can be treated as an entry point to conveniently perform out-of-box high performance +inference on pretrained models of popular topologies. +The script will automatically set the recommended run-time options for supported topologies, +but if you choose to set your own options, refer to the full list of available flags and a detailed +explanation of the ```launch_benchmark.py``` script [here](/docs/general/tensorflow/LaunchBenchmark.md). + This step will automatically launch a new container on every run and terminate. Go to [Step 4](#step_4) to interactively run the script on the container. + +Substitute the `--model-source-dir` value with the location where you cloned the +[tensorflow/models](https://github.com/tensorflow/models.git) repo: + + +``` +~/tensorflow-models/models +``` +3.1.
*Online inference* (using `--socket-id 0` and `--batch-size 1`) + +If you wish to calculate the [BLEU](https://en.wikipedia.org/wiki/BLEU) metric to find out the machine-translation quality, pass the German reference file (`newstest2014.de`) with the `reference` flag. +The `newstest2014.en` file must have only one sentence per line. + + +console in: +```bash +python launch_benchmark.py \ + --model-name transformer_lt_official \ + --precision fp32 \ + --mode inference \ + --framework tensorflow \ + --batch-size 1 \ + --socket-id 0 \ + --docker-image intelaipg/intel-optimized-tensorflow:latest \ + --model-source-dir ~/tensorflow-models/models \ + --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \ + --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \ + -- file=newstest2014.en \ + vocab_file=vocab.txt \ + file_out=translate.txt \ + reference=newstest2014.de +``` + +The translated German text will be in the file `translate.txt` located at `~/models/benchmarks/common/tensorflow/logs`. + +3.2. *Batch inference* (using `--socket-id 0` and `--batch-size 64`) + +```bash +python launch_benchmark.py \ + --model-name transformer_lt_official \ + --precision fp32 \ + --mode inference \ + --framework tensorflow \ + --batch-size 64 \ + --socket-id 0 \ + --docker-image intelaipg/intel-optimized-tensorflow:latest \ + --model-source-dir ~/tensorflow-models/models \ + --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \ + --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \ + -- file=newstest2014.en \ + vocab_file=vocab.txt \ + file_out=translate.txt \ + reference=newstest2014.de +``` +console out: +``` +Graph parsed in ..... s +import_graph_def took .....s +tokenizer took ..... s +Translating 3003 sentences from English to German. +Total inferencing time:.... +Throughput:....
sentences/second +Total number of sentences translated:3003 +I0419 22:50:49.856748 140013257643776 compute_bleu.py:106] Case-insensitive results: 27.510020 +I0419 22:50:51.203501 140013257643776 compute_bleu.py:110] Case-sensitive results: 26.964748 +Ran inference with batch size 64 +Log location outside container: /~/models/benchmarks/common/tensorflow/logs/benchmark_transformer_lt_official_inference_fp32_20190419_224047.log +``` + +The logs are captured in a directory outside of the container.
+ +4. If you want to run the ```launch_benchmark.py``` script interactively from within the docker container, add the ```--debug``` flag. This will launch a docker container based on the ```--docker-image```, +perform the necessary installs, run the ```launch_benchmark.py``` script, and keep the container process running instead of terminating it. As an example, this step will demonstrate online inference (--batch-size 1), but you can implement the same strategy for batch inference (--batch-size 64). + +console in: +```bash +python launch_benchmark.py \ + --model-name transformer_lt_official \ + --precision fp32 \ + --mode inference \ + --framework tensorflow \ + --batch-size 1 \ + --socket-id 0 \ + --docker-image intelaipg/intel-optimized-tensorflow:latest \ + --model-source-dir ~/tensorflow-models/models \ + --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \ + --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \ + --debug \ + -- file=newstest2014.en \ + vocab_file=vocab.txt \ + file_out=translate.txt \ + reference=newstest2014.de + +``` +console out: +```bash + lscpu_path_cmd = command -v lscpu + lscpu located here: b'/usr/bin/lscpu' + root@a78677f56d69:/workspace/benchmarks/common/tensorflow# +``` + +To rerun the benchmarking script, execute the ```start.sh``` bash script from your existing directory with the available flags, which in turn will run ```launch_benchmark.py```. For example, to rerun with a different batch size (batch size=64), set ```BATCH_SIZE```, +and to skip reinstalling packages, pass ```True``` to ```NOINSTALL```. + +```bash + chmod +x ./start.sh +``` +```bash + NOINSTALL=True BATCH_SIZE=64 ./start.sh +``` + +All other flags will be defaulted to values passed in the first ```launch_benchmark.py``` that starts the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags.
+ + diff --git a/docs/language_translation/tensorflow_serving/Tutorial.md b/docs/language_translation/tensorflow_serving/Tutorial.md new file mode 100644 index 000000000..c584495c1 --- /dev/null +++ b/docs/language_translation/tensorflow_serving/Tutorial.md @@ -0,0 +1,211 @@ + +# Language Translation with TensorFlow Serving on CPU using Transformer-LT + +## Goal + +This tutorial will introduce you to the CPU performance considerations for language translation and how to use [IntelĀ® Optimizations for TensorFlow Serving](https://www.tensorflow.org/serving/) to improve inference time on CPUs. +This tutorial uses a pre-trained [Transformer-LT](https://arxiv.org/pdf/1706.03762.pdf) model for translating English to German and a sample of English news excerpts. +We provide sample code that you can use to get your optimized TensorFlow model server and GRPC client up and running quickly. +In this tutorial using Transformer-LT, you will measure inference performance in two situations: +* **Online inference**, where batch_size=1. In this case, a lower number means better runtime performance. +* **Batch inference**, where batch_size>1. In this case, a higher number means better runtime performance. + +**NOTE about GRPC vs. REST**: It [has been suggested](https://medium.com/@avidaneran/tensorflow-serving-rest-vs-grpc-e8cef9d4ff62) that GRPC has faster client-side serialization and de-serialization than REST, especially if you are optimizing for batch inference. +Please note however that this tutorial is focused on optimizing the model server, not the client that sends requests. +We use GRPC in this tutorial for illustration, not as a best practice, and offer another [tutorial](/docs/object_detection/tensorflow_serving/Tutorial.md) that illustrates the use of the REST API with TensorFlow Serving, if you are interested in that protocol. 
+ +## Prerequisites + +This tutorial assumes you have already: +* [Installed TensorFlow Serving](/docs/general/tensorflow_serving/InstallationGuide.md) +* Read and understood the [General Best Practices](/docs/general/tensorflow_serving/GeneralBestPractices.md), + especially these sections: + * [Performance Metrics](/docs/general/tensorflow_serving/GeneralBestPractices.md#performance-metrics) + * [TensorFlow Serving Configuration Settings](/docs/general/tensorflow_serving/GeneralBestPractices.md#tensorflow-serving-configuration-settings) +* Ran an example end-to-end using a GRPC client, such as the [one in the Installation Guide](/docs/general/tensorflow_serving/InstallationGuide.md#option-2-query-using-grpc) + +## Background + +The Transformer-LT model is a popular solution for language translation. +It is based on an encoder-decoder architecture with an added attention mechanism. +The encoder is used to encode the original sentence to a meaningful fixed-length vector, and the decoder is responsible for extracting the context data from the vector. +The encoder and decoder process the inputs and outputs, which are in the form of a time sequence. + +In a traditional encoder/decoder model, each element in the context vector is treated equally, but this is typically not the ideal solution. +For instance, when you translate the phrase “I travel by train” from English into Chinese, the word “I” has a greater influence than other words when producing its counterpart in Chinese. +Thus, the attention mechanism was introduced to differentiate contributions of each element in the source sequence to their counterpart in the destination sequence, through the use of a hidden matrix. +This matrix contains weights of each element in the source sequence when producing elements in the destination sequence.
+ +[Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN)](https://github.com/intel/mkl-dnn) offers significant performance improvements for many neural network operations. +Tuning TensorFlow Serving to take full advantage of your hardware for language translation inference involves: +1. Running a TensorFlow Serving docker container configured for performance given your hardware resources +2. Running a GRPC client to verify prediction accuracy and measure online and batch inference performance +3. Experimenting with the TensorFlow Serving settings on your own to further optimize for your model and use case + +## Hands-on Tutorial with pre-trained Transformer-LT (Official) model + +1. **Clone this repository**: Clone the [intelai/models](https://github.com/intelai/models) repository into your home directory. + + ``` + cd ~ + git clone https://github.com/IntelAI/models.git + ``` + +2. **Clone the tensorflow/models repository**: Tokenization of the input data requires utility functions in a specific commit of the tensorflow/models repository. + + ``` + cd ~ + mkdir tensorflow-models + cd tensorflow-models + git clone https://github.com/tensorflow/models.git + cd models + git checkout 8367cf6dabe11adf7628541706b660821f397dce + ``` + + Now add the required directory to the `PYTHONPATH` variable: + + ``` + export PYTHONPATH=$PYTHONPATH:$(pwd)/official/transformer + ``` + +3. **Set up the client environment**: We need to create a virtual environment for this tutorial. + + - We will use a virtual environment to install the required packages. If you do not have pip or virtualenv, you will need to get them first: + + ``` + sudo apt-get install -y python python-pip virtualenv + ``` + + - Create and activate the python virtual environment in your home directory and install the `intel-tensorflow`, `pandas`, and `tensorflow-serving-api` packages. 
+ + ``` + cd ~ + virtualenv lt_venv + source lt_venv/bin/activate + pip install intel-tensorflow pandas tensorflow-serving-api + ``` + +4. **Download the pre-trained model and test data**: Download and extract the packaged pre-trained model and dataset ```transformer_lt_official_fp32_pretrained_model.tar.gz``` + (refer to the [model README](/benchmarks/language_translation/tensorflow/transformer_lt_official) to get the latest location of this archive). + + ``` + wget https://storage.googleapis.com/intel-optimized-tensorflow/models/transformer_lt_official_fp32_pretrained_model.tar.gz + tar -xzvf transformer_lt_official_fp32_pretrained_model.tar.gz + ``` + + After extraction, you should see the following folders and files in the `transformer_lt_official_fp32_pretrained_model` directory: + + ``` + $ ls -l transformer_lt_official_fp32_pretrained_model/* + + transformer_lt_official_fp32_pretrained_model/data: + total 1064 + -rw-r--r--. 1 359898 Feb 20 16:05 newstest2014.en + -rw-r--r--. 1 399406 Feb 20 16:05 newstest2014.de + -rw-r--r--. 1 324025 Mar 15 17:31 vocab.txt + + transformer_lt_official_fp32_pretrained_model/graph: + total 241540 + -rwx------. 1 247333269 Mar 15 17:29 fp32_graphdef.pb + ``` + + - `newstest2014.en`: Input file with English text + - `newstest2014.de`: German translation of the input file for measuring accuracy + - `vocab.txt`: Dictionary of vocabulary + - `fp32_graphdef.pb`: Pre-trained model + +5. **Create a SavedModel**: Using the conversion script `transformer_graph_to_saved_model.py`, convert the pre-trained model graph to a SavedModel. + + ``` + cd ~/models/docs/language_translation/tensorflow_serving + python transformer_graph_to_saved_model.py --import_path ~/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb + ``` + + This will create a `/tmp/1/` directory with a `saved_model.pb` file in it. This is the file we will serve from TensorFlow Serving. 
+ The [`transformer_graph_to_saved_model.py`](transformer_graph_to_saved_model.py) script attaches a signature definition to the model in order to make it compatible with TensorFlow Serving. + You can take a look at the script, its flags/options, and these resources for more information: + * [SavedModel](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/saved_model) + * [SignatureDefs](https://www.tensorflow.org/serving/signature_defs) + +6. **Discover the number of physical cores**: Compute *num_physical_cores* by executing the `lscpu` command and multiplying `Core(s) per socket` by `Socket(s)`. + For example, for a machine with `Core(s) per socket: 28` and `Socket(s): 2`, `num_physical_cores = 28 * 2 = 56`. + To compute *num_physical_cores* with bash commands: + ``` + cores_per_socket=`lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs` + num_sockets=`lscpu | grep "Socket(s)" | cut -d':' -f2 | xargs` + num_physical_cores=$((cores_per_socket * num_sockets)) + echo $num_physical_cores + ``` + +7. **Recommended Settings**: To optimize overall performance, start with the following settings from the [General Best Practices](/docs/general/tensorflow_serving/GeneralBestPractices.md). + Playing around with these settings can improve performance even further, so you should experiment with your own hardware and model if you have strict performance requirements. + + | Options | Recommendations| + | ------------- | ------------- | + |TENSORFLOW_INTER_OP_PARALLELISM | 2 | + |TENSORFLOW_INTRA_OP_PARALLELISM| Number of physical cores | + |OMP_NUM_THREADS |Number of physical cores| + | Batch Size | 64 | + +8. **Start the server**: We can now start up the TensorFlow model server. Using `-d` (for "detached") runs the container as a background process. 
+ + ``` + cd ~ + docker run \ + --name=tfserving \ + -d \ + -p 8500:8500 \ + -v "/tmp:/models/transformer" \ + -e MODEL_NAME=transformer \ + -e OMP_NUM_THREADS=$num_physical_cores \ + -e TENSORFLOW_INTER_OP_PARALLELISM=2 \ + -e TENSORFLOW_INTRA_OP_PARALLELISM=$num_physical_cores \ + tensorflow/serving:mkl + ``` + + You can make sure the container is running using the `docker ps` command. + +9. **Online and batch performance**: Run `transformer_benchmark.py` [python script](/docs/language_translation/tensorflow_serving/transformer_benchmark.py), which can measure both online and batch performance. + + If you are not already there, go to the tutorial directory: + ``` + cd ~/models/docs/language_translation/tensorflow_serving + ``` + + **Online Inference** (batch_size=1): + ``` + python transformer_benchmark.py \ + -d ~/transformer_lt_official_fp32_pretrained_model/data/newstest2014.en \ + -v ~/transformer_lt_official_fp32_pretrained_model/data/vocab.txt \ + -b 1 + ``` + + **Batch Inference** (batch_size=64): + ``` + python transformer_benchmark.py \ + -d ~/transformer_lt_official_fp32_pretrained_model/data/newstest2014.en \ + -v ~/transformer_lt_official_fp32_pretrained_model/data/vocab.txt \ + -b 64 + ``` + + Note: If you want an output file of translated sentences, set the `-o` flag to an output file name of your choice. + If this option is set, the script will take a significantly longer time to finish. + +10. **Clean up**: + * After you are finished sending requests to the server, you can stop the container running in the background. To restart the container with the same name, you need to stop and remove the container from the registry. To view your running containers run `docker ps`. + + ``` + docker rm -f tfserving + ``` + + * Deactivate your virtual environment with `deactivate`. + + +## Conclusion +You have now seen an end-to-end example of serving a language translation model for inference using TensorFlow Serving, and learned: +1. 
How to create a SavedModel from a Transformer-LT TensorFlow model graph +2. How to choose good values for the performance-related runtime parameters exposed by the `docker run` command +3. How to test online and batch inference metrics using a GRPC client + +With this knowledge and the example code provided, you should be able to get started serving your own custom language translation model with good performance. +If desired, you should also be able to investigate a variety of different settings combinations to see if further performance improvements are possible. diff --git a/docs/language_translation/tensorflow_serving/transformer_benchmark.py b/docs/language_translation/tensorflow_serving/transformer_benchmark.py new file mode 100644 index 000000000..a5cf43654 --- /dev/null +++ b/docs/language_translation/tensorflow_serving/transformer_benchmark.py @@ -0,0 +1,181 @@ +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +from __future__ import print_function + +import os +import sys +import time +import argparse +import grpc +import numpy as np +import pandas as pd +import tensorflow as tf + +from tensorflow_serving.apis import predict_pb2 +from tensorflow_serving.apis import prediction_service_pb2_grpc + +from utils import tokenizer +from utils.tokenizer import Subtokenizer + +def check_for_link(value): + """ + Throws an error if the specified path is a link. os.islink returns + True for sym links. 
For files, we also look at the number of links in + os.stat() to determine if it's a hard link. + """ + if os.path.islink(value) or \ + (os.path.isfile(value) and os.stat(value).st_nlink > 1): + raise argparse.ArgumentTypeError("{} cannot be a link.".format(value)) + +def check_valid_file_or_folder(value): + """verifies filename exists and isn't a link""" + if value is not None: + if not os.path.isfile(value) and not os.path.isdir(value): + raise argparse.ArgumentTypeError("{} does not exist or is not a file/folder.". + format(value)) + check_for_link(value) + return value + +def input_generator_ts(file_path, vocab_file): + """Read and sort lines based on token count from the file + sorted by decreasing length based on token sorting. + + Args: + file_path: String path of file to read + vocab_file: String path of vocab file + Returns: + Sorted list of inputs, and dictionary mapping original index->sorted index + of each element. + """ + with tf.gfile.Open(file_path) as f: + records = f.read().split("\n") + inputs = [record.strip() for record in records] + if not inputs[-1]: + inputs.pop() + + subtokenizer = Subtokenizer(vocab_file) + + batch = [] + token_lens = [] + for i, line in enumerate(inputs): + enc = subtokenizer.encode(line, add_eos=True) + token_lens.append((i, len(enc))) + + sorted_by_token_input_lens = sorted(token_lens, key=lambda x: x[1], reverse=True) + sorted_inputs = [None] * len(sorted_by_token_input_lens) + sorted_keys = [0] * len(sorted_by_token_input_lens) + + for i, (index, _) in enumerate(sorted_by_token_input_lens): + sorted_inputs[i] = inputs[index] + sorted_keys[index] = i + enc = subtokenizer.encode(sorted_inputs[i], add_eos=True) + batch.append(enc) + + return batch, sorted_keys + +def _trim_and_decode(ids, vocab_file): + """Trim EOS and PAD tokens from ids, and decode to return a string.""" + subtokenizer = Subtokenizer(vocab_file) + try: + index = list(ids).index(tokenizer.EOS_ID) + return subtokenizer.decode(ids[:index]) + except 
ValueError: # No EOS found in sequence + return subtokenizer.decode(ids) + +def benchmark(batch_size=1, num_iteration=20, warm_up_iteration=10): + channel = grpc.insecure_channel(SERVER_URL) + stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) + request = predict_pb2.PredictRequest() + request.model_spec.name = 'transformer' + request.model_spec.signature_name = 'serving_default' + + batches, sorted_keys = input_generator_ts(DATA_FILE, VOCAB_FILE) + + translations = [] + batch = [] + inference_time = 0.0 + sentences_to_translate = min(batch_size * num_iteration, len(batches)) + sentences_after_warmup = 0 + + for i, line in enumerate(batches[0:sentences_to_translate]): + batch.append(line) + if (i + 1) % batch_size == 0 or i == sentences_to_translate - 1: + batch_num = (i // batch_size) + 1 + request.inputs['input'].CopyFrom( + tf.contrib.util.make_tensor_proto(pd.DataFrame(batch).fillna(0).values.astype(np.int64))) + start_time = time.time() + result = stub.Predict(request) + duration = time.time() - start_time + shape = [int(dim.size) for dim in result.outputs['output'].tensor_shape.dim] + translations += np.reshape(result.outputs['output'].int_val, shape).tolist() + print('Iteration %d: %.3f sec' % (batch_num, duration)) + if batch_num > warm_up_iteration: + inference_time += duration + sentences_after_warmup += len(batch) + batch = [] + + average_time = inference_time / sentences_after_warmup + print('Inferencing time: %s' % (inference_time)) + print('Batch size = %d' % batch_size) + if batch_size == 1: + print('Latency: %.3f ms' % (average_time * 1000)) + print('Throughput: %.3f sentences/sec' % (sentences_after_warmup / inference_time)) + + if OUT_FILE: + print('Decoding and saving translations to {}...'.format(OUT_FILE)) + decoded_translations = [] + for i, tr in enumerate(translations): + decoded_translations.append(_trim_and_decode(tr, VOCAB_FILE)) + + with tf.gfile.Open(OUT_FILE, "w") as f: + for i in sorted_keys: + if i < 
len(decoded_translations): + f.write("%s\n" % decoded_translations[i]) + print('Done!') + +if __name__ == '__main__': + ap = argparse.ArgumentParser() + ap.add_argument("-d", "--data_file", type=check_valid_file_or_folder, required=True, + help="Path to English language input file") + ap.add_argument("-v", "--vocab_file", type=check_valid_file_or_folder, required=True, + help="Path to vocabulary file") + ap.add_argument("-o", "--out_file", type=str, required=False, default='', + help="Path to output file (optional") + ap.add_argument("-b", "--batch_size", required=False, type=int, default=1, + help="Batch size to use") + ap.add_argument("-n", "--num_iteration", required=False, type=int, default=20, + help="Number of times to repeat") + ap.add_argument("-w", "--warm_up_iteration", required=False, type=int, default=10, + help="Number of initial iterations to ignore in benchmarking") + + args = vars(ap.parse_args()) + + SERVER_URL = 'localhost:8500' + DATA_FILE = args['data_file'] + VOCAB_FILE = args['vocab_file'] + OUT_FILE = args['out_file'] + BATCH_SIZE = args['batch_size'] + NUM_ITERATION = args['num_iteration'] + WARM_UP_ITERATION = args['warm_up_iteration'] + + tf.logging.set_verbosity(tf.logging.WARN) + + print('\n SERVER_URL: {} \n DATA_FILE: {}'.format(SERVER_URL, DATA_FILE)) + + print('\nStarting Transformer-LT (Official) model benchmarking for Latency with batch_size={}, num_iteration={}, warm_up_iteration={}'.format(BATCH_SIZE, NUM_ITERATION, WARM_UP_ITERATION)) + benchmark(batch_size=BATCH_SIZE, num_iteration=NUM_ITERATION, warm_up_iteration=WARM_UP_ITERATION) + diff --git a/docs/language_translation/tensorflow_serving/transformer_graph_to_saved_model.py b/docs/language_translation/tensorflow_serving/transformer_graph_to_saved_model.py new file mode 100644 index 000000000..c5cc250ce --- /dev/null +++ b/docs/language_translation/tensorflow_serving/transformer_graph_to_saved_model.py @@ -0,0 +1,87 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 
Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +"""Import a Transformer-LT model graph and export a SavedModel. + +Usage: transformer_graph_to_saved_model.py [--model_version=y] import_path export_dir +""" + +from __future__ import print_function + +import sys +import tensorflow as tf + +tf.app.flags.DEFINE_integer('model_version', 1, 'Version number of the model.') +tf.app.flags.DEFINE_string('import_path', '', 'Model import path.') +tf.app.flags.DEFINE_string('export_dir', '/tmp', 'Export directory.') +FLAGS = tf.app.flags.FLAGS + + +def main(_): + if len(sys.argv) < 2 or sys.argv[-1].startswith('-'): + print('Usage: transformer_graph_to_saved_model.py [--model_version=y] import_path export_dir') + sys.exit(-1) + if FLAGS.import_path == '': + print('Please specify the path to the model graph you want to convert to SavedModel format.') + sys.exit(-1) + if FLAGS.model_version <= 0: + print('Please specify a positive value for version number.') + sys.exit(-1) + + # Import model graph + with tf.Session() as sess: + graph_def = tf.GraphDef() + with tf.gfile.GFile(FLAGS.import_path, 'rb') as input_file: + input_graph_content = input_file.read() + graph_def.ParseFromString(input_graph_content) + + sess.graph.as_default() + tf.import_graph_def(graph_def, name='') + sess.run(tf.global_variables_initializer()) + + # Build the signature_def_map. 
+ in_data = sess.graph.get_tensor_by_name('input_tensor:0') + inputs = {'input': tf.saved_model.utils.build_tensor_info(in_data)} + + out_data = sess.graph.get_tensor_by_name('model/Transformer/strided_slice_19:0') + outputs = {'output': tf.saved_model.utils.build_tensor_info(out_data)} + + signature = tf.saved_model.signature_def_utils.build_signature_def( + inputs=inputs, + outputs=outputs, + method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME + ) + + # Save out the SavedModel + print('Exporting trained model to', FLAGS.export_dir + '/' + str(FLAGS.model_version)) + builder = tf.saved_model.builder.SavedModelBuilder(FLAGS.export_dir + '/' + str(FLAGS.model_version)) + builder.add_meta_graph_and_variables( + sess, [tf.saved_model.tag_constants.SERVING], + signature_def_map={ + tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature + } + ) + builder.save() + + print('Done!') + + +if __name__ == '__main__': + tf.app.run() diff --git a/docs/object_detection/tensorflow_serving/ObjectDetection.ipynb b/docs/object_detection/tensorflow_serving/ObjectDetection.ipynb new file mode 100644 index 000000000..5e975ae0c --- /dev/null +++ b/docs/object_detection/tensorflow_serving/ObjectDetection.ipynb @@ -0,0 +1,322 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Object Detection: R-FCN and SSD-MobileNet" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "\n", + "import os\n", + "import time\n", + "import random\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from PIL import Image\n", + "\n", + "from object_detection.utils.visualization_utils import visualize_boxes_and_labels_on_image_array\n", + "\n", + "%matplotlib inline\n", + "import matplotlib\n", + "from matplotlib import pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": 
[], + "source": [ + "MODEL = 'rfcn' # Use 'rfcn' for R-FCN or 'ssdmobilenet' for SSD-MobileNet\n", + "PROTOCOL = 'grpc' # Use 'grpc' for GRPC or 'rest' for REST\n", + "IMAGES_PATH = '/home//coco/val/val2017' # Edit this to your COCO validation directory" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "if PROTOCOL == 'grpc':\n", + " import grpc\n", + " import tensorflow as tf\n", + " from tensorflow_serving.apis import predict_pb2\n", + " from tensorflow_serving.apis import prediction_service_pb2_grpc\n", + " SERVER_URL = 'localhost:8500'\n", + "elif PROTOCOL == 'rest':\n", + " import requests\n", + " SERVER_URL = 'http://localhost:8501/v1/models/{}:predict'.format(MODEL)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def get_random_image(image_dir):\n", + " image_path = os.path.join(image_dir, random.choice(os.listdir(image_dir)))\n", + " image = Image.open(image_path)\n", + " (im_width, im_height) = image.size\n", + " return np.array(image.getdata()).reshape((im_height, im_width, 3)).astype(np.uint8)\n", + "\n", + "def visualize(output_dict, image_np):\n", + " new_dict = {}\n", + " if PROTOCOL == 'grpc':\n", + " new_dict['num_detections'] = int(output_dict['num_detections'].float_val[0])\n", + " new_dict['detection_classes'] = np.array(output_dict['detection_classes'].float_val).astype(np.uint8)\n", + " new_dict['detection_boxes'] = np.array(output_dict['detection_boxes'].float_val).reshape((-1,4))\n", + " new_dict['detection_scores'] = np.array(output_dict['detection_scores'].float_val)\n", + " new_dict['instance_masks'] = np.array(output_dict['instance_masks'].float_val)\n", + " elif PROTOCOL == 'rest':\n", + " new_dict['num_detections'] = int(output_dict['num_detections'])\n", + " new_dict['detection_classes'] = np.array(output_dict['detection_classes']).astype(np.uint8)\n", + " new_dict['detection_boxes'] = 
np.array(output_dict['detection_boxes'])\n", + " new_dict['detection_scores'] = np.array(output_dict['detection_scores'])\n", + "\n", + " # Visualize the results of a detection\n", + " visualize_boxes_and_labels_on_image_array(\n", + " image_np,\n", + " new_dict['detection_boxes'],\n", + " new_dict['detection_classes'],\n", + " new_dict['detection_scores'],\n", + " {1: {'id': 1, 'name': 'object'}}, # Empty category index\n", + " instance_masks=None,\n", + " use_normalized_coordinates=True,\n", + " line_thickness=8)\n", + " plt.figure()\n", + " plt.imshow(image_np)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test Object Detection" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 1\n", + "np_image = get_random_image(IMAGES_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", + "For more information, please see:\n", + " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", + " * https://github.com/tensorflow/addons\n", + "If you depend on functionality not listed there, please file an issue.\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "if PROTOCOL == 'grpc':\n", + " np_image = np.repeat(np.expand_dims(np_image, 0), batch_size, axis=0)\n", + " channel = grpc.insecure_channel(SERVER_URL)\n", + " stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)\n", + " request = predict_pb2.PredictRequest()\n", + " request.model_spec.name = 'ssdmobilenet'\n", + " request.model_spec.signature_name = 'serving_default'\n", + " request.inputs['inputs'].CopyFrom(tf.contrib.util.make_tensor_proto(np_image))\n", + " result = stub.Predict(request)\n", + " visualize(result.outputs, np_image[0])\n", + "elif PROTOCOL == 'rest':\n", + " predict_request = '{\"instances\" : %s}' % np.expand_dims(np_image, 0).tolist()\n", + " result = requests.post(SERVER_URL, data=predict_request)\n", + " visualize(result.json()['predictions'][0], np_image)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Measure Performance" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def make_request(batch_size):\n", + " if PROTOCOL == 'rest':\n", + " np_images = np.repeat(np.expand_dims(get_random_image(IMAGES_PATH), 0).tolist(), batch_size, axis=0).tolist()\n", + " return '{\"instances\" : %s}' % np_images\n", + " elif PROTOCOL == 'grpc':\n", + " np_images = np.repeat(np.expand_dims(get_random_image(IMAGES_PATH), 0), batch_size, axis=0)\n", + " channel = grpc.insecure_channel(SERVER_URL)\n", + " stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)\n", + " request = predict_pb2.PredictRequest()\n", + " request.model_spec.name = MODEL\n", + " request.model_spec.signature_name = 'serving_default'\n", + " request.inputs['inputs'].CopyFrom(tf.contrib.util.make_tensor_proto(np_images))\n", + " return (stub, request)\n", + "\n", + "def send_request(predict_request):\n", + " if PROTOCOL == 'rest':\n", + " 
requests.post(SERVER_URL, data=predict_request)\n", + " elif PROTOCOL == 'grpc':\n", + " predict_request[0].Predict(predict_request[1])\n", + "\n", + "def benchmark(batch_size=1, num_iteration=10, warm_up_iteration=2):\n", + " i = 0\n", + " total_time = 0\n", + " for _ in range(num_iteration):\n", + " i += 1\n", + " np_images = np.repeat(np.expand_dims(get_random_image(IMAGES_PATH), 0), batch_size, axis=0)\n", + " predict_request = make_request(batch_size)\n", + " start_time = time.time()\n", + " send_request(predict_request)\n", + " time_consume = time.time() - start_time\n", + " print('Iteration %d: %.3f sec' % (i, time_consume))\n", + " if i > warm_up_iteration:\n", + " total_time += time_consume\n", + "\n", + " time_average = total_time / (num_iteration - warm_up_iteration)\n", + " print('Average time: %.3f sec' % (time_average))\n", + " print('Batch size = %d' % batch_size)\n", + " if batch_size == 1:\n", + " print('Latency: %.3f ms' % (time_average * 1000))\n", + " print('Throughput: %.3f images/sec' % (batch_size / time_average))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real-time Inference (latency, batch_size=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: 0.059 sec\n", + "Iteration 2: 0.098 sec\n", + "Iteration 3: 0.055 sec\n", + "Iteration 4: 0.052 sec\n", + "Iteration 5: 0.056 sec\n", + "Iteration 6: 0.051 sec\n", + "Iteration 7: 0.056 sec\n", + "Iteration 8: 0.052 sec\n", + "Iteration 9: 0.050 sec\n", + "Iteration 10: 0.048 sec\n", + "Average time: 0.052 sec\n", + "Batch size = 1\n", + "Latency: 52.392 ms\n", + "Throughput: 19.087 images/sec\n" + ] + } + ], + "source": [ + "benchmark()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Throughput (batch_size=128)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: 4.414 sec\n", + "Iteration 2: 2.699 sec\n", + "Iteration 3: 2.654 sec\n", + "Iteration 4: 2.409 sec\n", + "Iteration 5: 2.485 sec\n", + "Iteration 6: 2.476 sec\n", + "Iteration 7: 2.457 sec\n", + "Iteration 8: 2.497 sec\n", + "Iteration 9: 2.575 sec\n", + "Iteration 10: 2.539 sec\n", + "Average time: 2.511 sec\n", + "Batch size = 128\n", + "Throughput: 50.967 images/sec\n" + ] + } + ], + "source": [ + "benchmark(batch_size=128)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/object_detection/tensorflow_serving/RFCN.ipynb b/docs/object_detection/tensorflow_serving/RFCN.ipynb deleted file mode 100644 index 2f96cf5e7..000000000 --- a/docs/object_detection/tensorflow_serving/RFCN.ipynb +++ /dev/null @@ -1,207 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Object Detection: R-FCN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import print_function\n", - "\n", - "import os\n", - "import time\n", - "import random\n", - "import requests\n", - "import numpy as np\n", - "import tensorflow as tf\n", - "from PIL import Image\n", - "\n", - "from object_detection.utils.visualization_utils import visualize_boxes_and_labels_on_image_array\n", - "\n", - "%matplotlib inline\n", - "import matplotlib\n", - "from matplotlib import pyplot as plt" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SERVER_URL = 'http://localhost:8501/v1/models/rfcn:predict'\n", - "IMAGES_PATH = '/home//coco/val/val2017' # Edit this to your COCO validation directory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_random_image(image_dir):\n", - " image_path = os.path.join(image_dir, random.choice(os.listdir(image_dir)))\n", - " image = Image.open(image_path)\n", - " (im_width, im_height) = image.size\n", - " \n", - " return np.array(image.getdata()).reshape((im_height, im_width, 3)).astype(np.uint8)\n", - "\n", - "def visualize(output_dict, image_np):\n", - " output_dict['num_detections'] = int(output_dict['num_detections'])\n", - " output_dict['detection_classes'] = np.array(output_dict['detection_classes']).astype(np.uint8)\n", - " output_dict['detection_boxes'] = np.array(output_dict['detection_boxes'])\n", - " output_dict['detection_scores'] = np.array(output_dict['detection_scores'])\n", - "\n", - " # Visualize the results of a detection\n", - " visualize_boxes_and_labels_on_image_array(\n", - " image_np,\n", - " output_dict['detection_boxes'],\n", - " output_dict['detection_classes'],\n", - " output_dict['detection_scores'],\n", - " {1: {'id': 1, 'name': 'object'}}, # Empty category index\n", - " instance_masks=output_dict.get('detection_masks'),\n", - " use_normalized_coordinates=True,\n", - " line_thickness=8)\n", - " plt.figure()\n", - " plt.imshow(image_np)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test Object Detection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np_image = get_random_image(IMAGES_PATH)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predict_request = '{\"instances\" : %s}' % np.expand_dims(np_image, 0).tolist()\n", - "result = 
requests.post(SERVER_URL, data=predict_request)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "visualize(result.json()['predictions'][0], np_image)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Measure Performance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def benchmark(batch_size=1, num_iteration=40, warm_up_iteration=10):\n", - " i = 0\n", - " total_time = 0\n", - " for _ in range(num_iteration):\n", - " i += 1\n", - " np_images = np.repeat(np.expand_dims(get_random_image(IMAGES_PATH), 0).tolist(), batch_size, axis=0).tolist()\n", - " predict_request = '{\"instances\" : %s}' % np_images\n", - " start_time = time.time()\n", - " requests.post(SERVER_URL, data=predict_request)\n", - " time_consume = time.time() - start_time\n", - " print('Iteration %d: %.3f sec' % (i, time_consume))\n", - " if i > warm_up_iteration:\n", - " total_time += time_consume\n", - "\n", - " time_average = total_time / (num_iteration - warm_up_iteration)\n", - " print('Average time: %.3f sec' % (time_average))\n", - " print('Batch size = %d' % batch_size)\n", - " if batch_size == 1:\n", - " print('Latency: %.3f ms' % (time_average * 1000))\n", - " print('Throughput: %.3f images/sec' % (batch_size / time_average))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Real-time Inference (latency, batch_size=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "benchmark()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Throughput (batch_size=128)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "benchmark(batch_size=128)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - 
], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/object_detection/tensorflow_serving/Tutorial.md b/docs/object_detection/tensorflow_serving/Tutorial.md index dfb3d724f..224943f5c 100644 --- a/docs/object_detection/tensorflow_serving/Tutorial.md +++ b/docs/object_detection/tensorflow_serving/Tutorial.md @@ -1,15 +1,17 @@ -# Object Detection with TensorFlow Serving on CPU using R-FCN model +# Object Detection with TensorFlow Serving on CPU +Models: R-FCN and SSD-MobileNet ## Goal This tutorial will introduce you to the CPU performance considerations for object detection in deep learning models and how to use [IntelĀ® Optimizations for TensorFlow Serving](https://www.tensorflow.org/serving/) to improve inference time on CPUs. -This tutorial uses a pre-trained Region-based Fully Convolutional Network (R-FCN) model for object detection and provides sample code that you can use to get your optimized TensorFlow model server and REST client up and running quickly. In this tutorial using R-FCN, you will measure inference performance in two situations: -* **Online inference**, where batch_size=1. In this case, lower time to result means better runtime performance. +This tutorial uses two pre-trained models - a [Region-based Fully Convolutional Network (R-FCN)](https://arxiv.org/pdf/1605.06409.pdf) and a [Single-Shot MultiBox Detector MobileNet (SSD-MobileNet)](https://arxiv.org/pdf/1704.04861.pdf) - for object detection and provides sample code that you can use to get your optimized TensorFlow model server and client up and running quickly. 
+In this tutorial you will choose between R-FCN and SSD-MobileNet, and between the REST client and GRPC client, and then measure inference performance in two situations: +* **Online inference**, where batch_size=1. In this case, a lower number means better runtime performance. * **Batch inference**, where batch_size>1. In this case, a higher number means better runtime performance. **NOTE about REST vs. GRPC**: This tutorial is focused on optimizing the model server, not the client that sends requests. For optimal client-side serialization and de-serialization, you may want to use TensorFlow Serving's GRPC option instead of the REST API, especially if you are optimizing for batch inference (here is one [article](https://medium.com/@avidaneran/tensorflow-serving-rest-vs-grpc-e8cef9d4ff62) with a relevant analysis). -We use REST in this tutorial for illustration, not as a best practice, and offer another [tutorial](/docs/image_recognition/tensorflow_serving/Tutorial.md) that illustrates the use of GRPC with TensorFlow Serving. +We show both GRPC and REST in this tutorial for illustration, not as a best practice. Feel free to compare and choose the protocol that works best for you. 
## Prerequisites @@ -19,140 +21,178 @@ This tutorial assumes you have already: especially these sections: * [Performance Metrics](/docs/general/tensorflow_serving/GeneralBestPractices.md#performance-metrics) * [TensorFlow Serving Configuration Settings](/docs/general/tensorflow_serving/GeneralBestPractices.md#tensorflow-serving-configuration-settings) -* Ran an example end-to-end using a REST client, such as the one in the [Installation Guide](/docs/general/tensorflow_serving/InstallationGuide.md) +* Ran an example end-to-end using a REST or GRPC client, such as the one in the [Installation Guide](/docs/general/tensorflow_serving/InstallationGuide.md) ## Background -[Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN)](https://github.com/intel/mkl-dnn) offers significant performance improvements for convolution, pooling, normalization, activation, and other operations for object detection, using efficient vectorization and multi-threading. Tuning TensorFlow Serving to take full advantage of your hardware for object detection deep learning inference involves: +[Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN)](https://github.com/intel/mkl-dnn) offers significant performance improvements for convolution, pooling, normalization, activation, and other operations for object detection, using efficient vectorization and multi-threading. +Tuning TensorFlow Serving to take full advantage of your hardware for object detection deep learning inference involves: 1. Running a TensorFlow Serving docker container configured for performance given your hardware resources -2. Running a REST client notebook to verify object detection and measure online and batch inference performance +2. Running a REST or GRPC client to verify object detection and measure online and batch inference performance 3. 
Experimenting with the TensorFlow Serving settings on your own to further optimize for your model and use case -## Hands-on Tutorial with pre-trained R-FCN model +## Hands-on Tutorial -1. **Set up your environment**: We need to setup two things for this tutorial - #### 1.1 Install the [requests](http://docs.python-requests.org) package for making REST HTTP requests. - We will use a virtual environment to install the required packages. If you do not have pip or virtualenv, you will need to get them first: - ``` - $ sudo apt-get install -y python python-pip - $ pip install virtualenv - ``` - - Create and activate the python virtual envirnoment in your home directory and install the [`requests`](http://docs.python-requests.org) package. +1. **Download the data and clone the Model Zoo**: + + 1.1 Download the 2017 validation COCO dataset (~780MB) (**note**: do not convert the COCO dataset to TF records format): + ``` - $ cd ~ - $ virtualenv rfcn_venv - $ source rfcn_venv/bin/activate - (rfcn_venv)$ pip install requests + cd ~ + mkdir -p coco/val + wget http://images.cocodataset.org/zips/val2017.zip + unzip val2017.zip -d coco/val + export COCO_VAL_DATA=$(pwd)/coco/val/val2017 + echo "export COCO_VAL_DATA=$(pwd)/coco/val/val2017" >> ~/.bashrc ``` - #### 1.2 Install [Tensorflow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection) - For detailed instructions, [click here](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md). Following are the instructions for Ubuntu 16.04. - - - 1.2.1 Install Tensorflow Object Detection API dependencies - ``` - (rfcn_venv)$ sudo apt-get install -y protobuf-compiler python-pil python-lxml python-tk - (rfcn_venv)$ pip install tensorflow Cython contextlib2 jupyter matplotlib pillow lxml - ``` - - 1.2.2 Clone the tensorflow models repo into your home directory. 
- ``` - (rfcn_venv)$ cd ~ - (rfcn_venv)$ git clone https://github.com/tensorflow/models - (rfcn_venv)$ export TF_MODELS_ROOT=$(pwd)/models - (rfcn_venv)$ echo "export TF_MODELS_ROOT=$(pwd)/models" >> ~/.bashrc - ``` + 1.2 Clone the Intel Model Zoo into your home directory: + + ``` + cd ~ + git clone https://github.com/IntelAI/models.git + ``` + +2. **Choose your model and download the pre-trained SavedModel**: Select either R-FCN or SSD-MobileNet. + Then download and extract the pre-trained model and copy the `saved_model.pb` to `~/obj_detection/1` (the `1` subdirectory is important - don't skip it!). + This is the file we will serve from TensorFlow Serving. Finally, define a variable for your chosen model to use in later steps. + Refer to the [TensorFlow documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/saved_model) for more information about SavedModels, and refer to the FP32 model READMEs for [R-FCN](/benchmarks/object_detection/tensorflow/rfcn/README.md#download_fp32_pretrained_model) and [SSD-MobileNet](/benchmarks/object_detection/tensorflow/ssd-mobilenet/README.md#fp32-inference-instructions) to get the latest location of the pre-trained models. + + Highlight and copy one of the following download links: + * R-FCN: `https://storage.googleapis.com/intel-optimized-tensorflow/models/rfcn_resnet101_fp32_coco_pretrained_model.tar.gz` + * SSD-MobileNet: `http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tar.gz` + + Then execute the following bash commands after replacing each upper-case placeholder with the value for the model you have chosen (`MODEL_NAME` must be `rfcn` or `ssdmobilenet`): + + ``` + cd ~ + wget DOWNLOAD_LINK + tar -xzvf DOWNLOADED_TAR_FILE + mkdir -p obj_detection/1 + cp EXTRACTED_MODEL_DIR/saved_model/saved_model.pb obj_detection/1 + model_name=MODEL_NAME + ``` - 1.2.3 Install COCO API - ``` - (rfcn_venv)$ cd ~ - (rfcn_venv)$ git clone https://github.com/cocodataset/cocoapi.git - (rfcn_venv)$ cd cocoapi/PythonAPI - (rfcn_venv)$ make - (rfcn_venv)$ cp -r pycocotools $TF_MODELS_ROOT/research/ - ``` +3. 
**Set up your virtual environment**: We will use a virtual environment to install the required packages. - 1.2.4 Manually install the protobuf-compiler v3.0.0, run the compilation process, add Libraries to PYTHONPATH and to your `.bashrc` and test the installation of Tensorflow Object Detection API - ``` - (rfcn_venv)$ cd $TF_MODELS_ROOT/research/ - (rfcn_venv)$ wget -O protobuf.zip https://github.com/google/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip - (rfcn_venv)$ unzip protobuf.zip - (rfcn_venv)$ ./bin/protoc object_detection/protos/*.proto --python_out=. - (rfcn_venv)$ export PYTHONPATH=$PYTHONPATH:$(pwd):$(pwd)/slim - (rfcn_venv)$ echo "export PYTHONPATH=$PYTHONPATH:$(pwd):$(pwd)/slim" >> ~/.bashrc - (rfcn_venv)$ python object_detection/builders/model_builder_test.py - ``` - -2. **Download the Data**: Download the 2017 validation COCO dataset (~780MB) (**note**: do not convert the COCO dataset to TF records format): - + 3.1 If you do not have pip or virtualenv, you will need to get them first: + ``` + sudo apt-get install -y python python-pip virtualenv ``` - (rfcn_venv)$ cd ~ - (rfcn_venv)$ mkdir -p coco/val - (rfcn_venv)$ wget http://images.cocodataset.org/zips/val2017.zip - (rfcn_venv)$ unzip val2017.zip -d coco/val - (rfcn_venv)$ export COCO_VAL_DATA=$(pwd)/coco/val/val2017 - (rfcn_venv)$ echo "export COCO_VAL_DATA=$(pwd)/coco/val/val2017" >> ~/.bashrc + + 3.2 Create and activate the python virtual environment in your home directory: + ``` + cd ~ + virtualenv od_venv + source od_venv/bin/activate ``` -3. **Download and Prepare the pre-trained SavedModel**: Download and extract the pre-trained model and copy the `rfcn_resnet101_fp32_coco/saved_model/saved_model.pb` to `rfcn/1` (the `1` subdirectory is important - don't skip it!). This is the file we will serve from TensorFlow Serving. 
- Refer to the [TensorFlow documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/saved_model) for more information about SavedModels, and refer to this [README file](/benchmarks/object_detection/tensorflow/rfcn/README.md#download_fp32_pretrained_model) to get the latest location of the pre-trained model. + 3.3 Install the required packages using `requirements.txt`: ``` - (rfcn_venv)$ cd ~/ - (rfcn_venv)$ wget https://storage.googleapis.com/intel-optimized-tensorflow/models/rfcn_resnet101_fp32_coco_pretrained_model.tar.gz - (rfcn_venv)$ tar -xzvf rfcn_resnet101_fp32_coco_pretrained_model.tar.gz - (rfcn_venv)$ mkdir -p rfcn/1 - (rfcn_venv)$ cp rfcn_resnet101_fp32_coco/saved_model/saved_model.pb rfcn/1 + pip install -r models/docs/object_detection/tensorflow_serving/requirements.txt ``` -4. **Discover the number of physical cores**: Compute *num_physical_cores* by executing the `lscpu` command and multiplying `Core(s) per socket` by `Socket(s)`. For example, for a machine with `Core(s) per socket: 28` and `Socket(s): 2`, `num_physical_cores = 28 * 2 = 56`. To compute *num_physical_cores* and *tf_session_parallelism* with bash commands: + 3.4 Choose between the REST example or the GRPC example (the environment dependencies are different depending on the protocol you use, + and GRPC is usually faster, especially when using larger batch sizes). Define a variable for your desired protocol. + + **REST**: + ``` + protocol_name=rest + ``` + + **GRPC**: + ``` + protocol_name=grpc + ``` + +4. **Install [TensorFlow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection)**: + For detailed instructions, [click here](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md). + We have already installed the required python packages for the API. Following are the rest of the instructions for Ubuntu 16.04. 
+ + 4.1 Clone the tensorflow models repo into a new folder in your home directory. ``` - (rfcn_venv)$ cores_per_socket=`lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs` - (rfcn_venv)$ num_sockets=`lscpu | grep "Socket(s)" | cut -d':' -f2 | xargs` - (rfcn_venv)$ num_physical_cores=$((cores_per_socket * num_sockets)) - (rfcn_venv)$ echo $num_physical_cores + cd ~ + git clone https://github.com/tensorflow/models tensorflow-models + export TF_MODELS_ROOT=$(pwd)/tensorflow-models + echo "export TF_MODELS_ROOT=$(pwd)/tensorflow-models" >> ~/.bashrc ``` -5. **Start the server**: Now let's start up the TensorFlow model server. With `&` at the end of the cmd, runs the container as a background process. Press enter after executing the following cmd. -To optimize overall performance, use the following recommended settings from the [General Best Practices](/docs/general/tensorflow_serving/GeneralBestPractices.md): - * OMP_NUM_THREADS=*num_physical_cores* - * TENSORFLOW_INTER_OP_PARALLELISM=2 - * TENSORFLOW_INTRA_OP_PARALLELISM=*num_physical_cores* + 4.2 Manually install the protobuf-compiler v3.0.0, run the compilation process, add libraries to PYTHONPATH and to your `.bashrc` and test the installation of Tensorflow Object Detection API. + ``` + cd $TF_MODELS_ROOT/research/ + wget -O protobuf.zip https://github.com/protocolbuffers/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip + unzip protobuf.zip + ./bin/protoc object_detection/protos/*.proto --python_out=. + export PYTHONPATH=$PYTHONPATH:$(pwd):$(pwd)/slim + echo "export PYTHONPATH=$PYTHONPATH:$(pwd):$(pwd)/slim" >> ~/.bashrc + python object_detection/builders/model_builder_test.py + ``` +5. **Discover the number of physical cores**: Compute *num_physical_cores* by executing the `lscpu` command and multiplying `Core(s) per socket` by `Socket(s)`. + For example, for a machine with `Core(s) per socket: 28` and `Socket(s): 2`, `num_physical_cores = 28 * 2 = 56`. 
+ To compute *num_physical_cores* with bash commands: ``` - (rfcn_venv)$ cd ~ - (rfcn_venv)$ docker run \ - --name=tfserving_rfcn \ - -p 8501:8501 \ - -v "$(pwd)/rfcn:/models/rfcn" \ - -e MODEL_NAME=rfcn \ - -e OMP_NUM_THREADS=$num_physical_cores \ - -e TENSORFLOW_INTER_OP_PARALLELISM=2 \ - -e TENSORFLOW_INTRA_OP_PARALLELISM=$num_physical_cores \ - tensorflow/serving:mkl & - ``` - **Note**: For some models, playing around with these settings values can improve performance even further. - We recommend that you experiment with your own hardware and model if you have strict performance requirements. - -6. *Measure Online and Batch inference performance**: Clone the Intel Model Zoo into a directory called `intel-models` and run `rfcn-benchmark.py` [python script](/docs/object_detection/tensorflow_serving/rfcn-benchmark.py), which will test both Online and Batch performance. - ``` - (rfcn_venv)$ git clone https://github.com/IntelAI/models.git intel-models - (rfcn_venv)$ python intel-models/docs/object_detection/tensorflow_serving/rfcn-benchmark.py \ - -i $COCO_VAL_DATA + cores_per_socket=`lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs` + num_sockets=`lscpu | grep "Socket(s)" | cut -d':' -f2 | xargs` + num_physical_cores=$((cores_per_socket * num_sockets)) + echo $num_physical_cores ``` +6. **Start the server**: Now start up the TensorFlow model server. Using `-d` (for "detached") runs the container as a background process. + We will publish the ports for both REST (`-p 8501:8501`) and GRPC (`-p 8500:8500`). 
+ To optimize overall performance, use the following recommended settings from the [General Best Practices](/docs/general/tensorflow_serving/GeneralBestPractices.md): + * OMP_NUM_THREADS=*num_physical_cores* + * TENSORFLOW_INTER_OP_PARALLELISM=2 + * TENSORFLOW_INTRA_OP_PARALLELISM=*num_physical_cores* + + ``` + cd ~ + docker run \ + --name=tfserving \ + -d \ + -p 8500:8500 \ + -p 8501:8501 \ + -v "$(pwd)/obj_detection:/models/$model_name" \ + -e MODEL_NAME=$model_name \ + -e OMP_NUM_THREADS=$num_physical_cores \ + -e TENSORFLOW_INTER_OP_PARALLELISM=2 \ + -e TENSORFLOW_INTRA_OP_PARALLELISM=$num_physical_cores \ + tensorflow/serving:mkl + ``` + + **Note**: For some models, playing around with the parallelism settings can improve performance even further. + We recommend that you experiment with your own hardware and model if you have strict performance requirements. -7. **Visualize Object Detection Output**: To visually see the output of object detection results, we will use Jupyter notebook via web browser. If you are using a system that does not have a browser, such as a VM on GCP or AWS, a workaround is to use local port forwarding of port 8888 to relay the jupyter service to your localhost. You will need to quit your SSH session and log back in with port forwarding configured. -For example, with a GCP VM, add `--ssh-flag="-L 8888:localhost:8888"` to your ssh command. Once you are connected again with port forwarding, reactivate the virtual environment, navigate to the tutorial directory, and start jupyter notebook. Continue with the next instruction. - ``` - $ cd ~ - $ source rfcn_venv/bin/activate - (rfcn_venv)$ cd intel-models/docs/object_detection/tensorflow_serving - (rfcn_venv)$ jupyter notebook +7. **Measure online and batch inference performance**: Run the `object_detection_benchmark.py` [python script](/docs/object_detection/tensorflow_serving/object_detection_benchmark.py), which will test both online and batch inference performance. 
+ + ``` + cd ~ + python models/docs/object_detection/tensorflow_serving/object_detection_benchmark.py \ + -i $COCO_VAL_DATA \ + -m $model_name \ + -p $protocol_name ``` - After running `jupyter notebook` , paste the generated link into your browser and open the `RFCN.ipynb` file. You will need to edit the code in one place - in the second cell, insert the path to your downloaded COCO validation data set. Then, execute the cells in order. The output of the "Test Object Detection" section should be an image with objects correctly detected by the R-FCN model. -8. (Optional) **Using a single core**: In some cases, it is desirable to constrain the inference server to a single core or socket. Docker has many runtime flags that allow you to control the container's access to the host system's CPUs, memory, and other resources. See the [Docker document on this topic](https://docs.docker.com/config/containers/resource_constraints/#cpu) for all the options and their definitions. For example, to run the container so that a single CPU is used, you can use these settings: +8. **Visualize object detection output**: To visually see the results of object detection, we will use a Jupyter notebook via web browser. + If you are using a system that does not have a browser, such as a VM on GCP or AWS, a workaround is to use local port forwarding of port 8888 to relay the jupyter service to your localhost. + You will need to quit your SSH session and log back in with port forwarding configured. For example, with a GCP VM, add `--ssh-flag="-L 8888:localhost:8888"` to your ssh command. + Once you are connected again with port forwarding, reactivate the virtual environment, navigate to the tutorial directory, and start the jupyter notebook service. + + ``` + cd ~ + source od_venv/bin/activate + cd models/docs/object_detection/tensorflow_serving + jupyter notebook + ``` + + After running `jupyter notebook`, paste the generated link into your browser and open the `ObjectDetection.ipynb` file. 
+ You will need to edit the code in one cell - in the second cell, insert the path to your downloaded COCO validation data set and name of your chosen model and protocol. + Then, execute the cells in order. The output of the "Test Object Detection" section should be an image with objects detected by the served model. + +9. (Optional) **Using a single core**: In some cases, it is desirable to constrain the inference server to a single core or socket. + Docker has many runtime flags that allow you to control the container's access to the host system's CPUs, memory, and other resources. + See the [Docker document on this topic](https://docs.docker.com/config/containers/resource_constraints/#cpu) for all the options and their definitions. + For example, to run the container so that a single CPU is used, you can use these settings: * `--cpuset-cpus="0"` * `--cpus="1"` * `OMP_NUM_THREADS=1` @@ -160,33 +200,39 @@ For example, with a GCP VM, add `--ssh-flag="-L 8888:localhost:8888"` to your ss * `TENSORFLOW_INTRA_OP_PARALLELISM=1` ``` - (rfcn_venv)$ docker run \ - --name=tfserving_rfcn_1 \ - -p 8500:8500 \ - --cpuset-cpus="0" \ - --cpus="1" \ - -v "$(pwd)/rfcn:/models/rfcn" \ - -e MODEL_NAME=rfcn \ - -e OMP_NUM_THREADS=1 \ - -e TENSORFLOW_INTER_OP_PARALLELISM=1 \ - -e TENSORFLOW_INTRA_OP_PARALLELISM=1 \ - tensorflow/serving:mkl & + cd ~ + docker run \ + --name=tfserving_1core \ + -d \ + -p 8500:8500 \ + -p 8501:8501 \ + --cpuset-cpus="0" \ + --cpus="1" \ + -v "$(pwd)/obj_detection:/models/$model_name" \ + -e MODEL_NAME=$model_name \ + -e OMP_NUM_THREADS=1 \ + -e TENSORFLOW_INTER_OP_PARALLELISM=1 \ + -e TENSORFLOW_INTRA_OP_PARALLELISM=1 \ + tensorflow/serving:mkl ``` - + 10. **Clean up**: * After saving any changes you made to the Jupyter notebook, close the file and stop the Jupyter server by clicking `Quit` from the main file browser. - * After you are fininshed with querying, you can stop the container which is running in the background. 
To restart the container with the same name, you need to stop and remove the container from the registry. To view your running containers run `docker ps`. - ``` - (rfcn_venv)$ docker rm -f tfserving_rfcn - ``` + * After you are finished with querying, you can stop the container which is running in the background. + To restart the container with the same name, you need to stop and remove the container from the registry. + To view your running containers run `docker ps`. + + ``` + docker rm -f tfserving + ``` + * Deactivate your virtual environment with `deactivate`. - ## Conclusion You have now seen an end-to-end example of serving an object detection model for inference using TensorFlow Serving, and learned: 1. How to choose good values for the performance-related runtime parameters exposed by the `docker run` command -2. How to verify that the served model can correctly detect objects in an image using a sample Jupyter notebook -3. How to measure online and batch inference metrics using a REST client +2. How to test online and batch inference metrics using a REST or GRPC client +3. How to verify that the served model can correctly detect objects in an image using a sample Jupyter notebook With this knowledge and the example code provided, you should be able to get started serving your own custom object detection model with good performance. If desired, you should also be able to investigate a variety of different settings combinations to see if further performance improvement are possible. 
diff --git a/docs/object_detection/tensorflow_serving/rfcn-benchmark.py b/docs/object_detection/tensorflow_serving/object_detection_benchmark.py similarity index 54% rename from docs/object_detection/tensorflow_serving/rfcn-benchmark.py rename to docs/object_detection/tensorflow_serving/object_detection_benchmark.py index 6948df969..c30c1aeae 100644 --- a/docs/object_detection/tensorflow_serving/rfcn-benchmark.py +++ b/docs/object_detection/tensorflow_serving/object_detection_benchmark.py @@ -14,7 +14,7 @@ # ####### USAGE ######### -# python rfcn-benchmark.py -i +# python object_detection_benchmark.py -i COCO_VAL_DIR -m {rfcn,ssdmobilenet} -p {rest,grpc} from __future__ import print_function @@ -25,8 +25,6 @@ import requests import numpy as np from PIL import Image -import tensorflow as tf -from object_detection.utils.visualization_utils import visualize_boxes_and_labels_on_image_array def check_for_link(value): @@ -40,7 +38,7 @@ def check_for_link(value): raise argparse.ArgumentTypeError("{} cannot be a link.".format(value)) def check_valid_folder(value): - """verifies filename exists and isn't a link""" + """Verifies filename exists and isn't a link""" if value is not None: if not os.path.isdir(value): raise argparse.ArgumentTypeError("{} does not exist or is not a directory.". @@ -48,6 +46,20 @@ def check_valid_folder(value): check_for_link(value) return value +def check_valid_model(value): + """Verifies model name is supported ('rfcn' or 'ssdmobilenet'); raises argparse.ArgumentTypeError otherwise""" + if value not in ('rfcn', 'ssdmobilenet'): + raise argparse.ArgumentTypeError("Model name {} does not match 'rfcn' or 'ssdmobilenet'.". + format(value)) + return value + +def check_valid_protocol(value): + """Verifies protocol is supported ('rest' or 'grpc'); raises argparse.ArgumentTypeError otherwise""" + if value not in ('rest', 'grpc'): + raise argparse.ArgumentTypeError("Protocol name {} does not match 'rest' or 'grpc'.". 
+ format(value)) + return value + def get_random_image(image_dir): image_path = os.path.join(image_dir, random.choice(os.listdir(image_dir))) image = Image.open(image_path) @@ -55,15 +67,38 @@ def get_random_image(image_dir): return np.array(image.getdata()).reshape((im_height, im_width, 3)).astype(np.uint8) +def make_request(batch_size): + if PROTOCOL == 'rest': + np_images = np.repeat(np.expand_dims(get_random_image(IMAGES_PATH), 0).tolist(), batch_size, axis=0).tolist() + return '{"instances" : %s}' % np_images + elif PROTOCOL == 'grpc': + import grpc + import tensorflow as tf + from tensorflow_serving.apis import predict_pb2 + from tensorflow_serving.apis import prediction_service_pb2_grpc + np_images = np.repeat(np.expand_dims(get_random_image(IMAGES_PATH), 0), batch_size, axis=0) + channel = grpc.insecure_channel(SERVER_URL) + stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) + request = predict_pb2.PredictRequest() + request.model_spec.name = MODEL + request.model_spec.signature_name = 'serving_default' + request.inputs['inputs'].CopyFrom(tf.contrib.util.make_tensor_proto(np_images)) + return (stub, request) + +def send_request(predict_request): + if PROTOCOL == 'rest': + requests.post(SERVER_URL, data=predict_request) + elif PROTOCOL == 'grpc': + predict_request[0].Predict(predict_request[1]) + def benchmark(batch_size=1, num_iteration=20, warm_up_iteration=10): i = 0 total_time = 0 for _ in range(num_iteration): i += 1 - np_images = np.repeat(np.expand_dims(get_random_image(IMAGES_PATH), 0).tolist(), batch_size, axis=0).tolist() - predict_request = '{"instances" : %s}' % np_images + predict_request = make_request(batch_size) start_time = time.time() - requests.post(SERVER_URL, data=predict_request) + send_request(predict_request) time_consume = time.time() - start_time print('Iteration %d: %.3f sec' % (i, time_consume)) if i > warm_up_iteration: @@ -81,15 +116,26 @@ def benchmark(batch_size=1, num_iteration=20, warm_up_iteration=10): ap = 
argparse.ArgumentParser() ap.add_argument("-i", "--images_path", type=check_valid_folder, required=True, help="Path to COCO validation directory") + ap.add_argument("-m", "--model", type=check_valid_model, required=True, + help="Name of model (rfcn or ssdmobilenet)") + ap.add_argument("-p", "--protocol", type=check_valid_protocol, required=True, + help="Name of protocol (rest or grpc)") args = vars(ap.parse_args()) - - SERVER_URL = 'http://localhost:8501/v1/models/rfcn:predict' + IMAGES_PATH = args['images_path'] + MODEL = args['model'] + PROTOCOL = args['protocol'] + if PROTOCOL == 'rest': + SERVER_URL = 'http://localhost:8501/v1/models/{}:predict'.format(MODEL) + elif PROTOCOL == 'grpc': + SERVER_URL = 'localhost:8500' print('\n SERVER_URL: {} \n IMAGES_PATH: {}'.format(SERVER_URL, IMAGES_PATH)) - print('\nStarting R-FCN model benchmarking for Latency with batch_size=1, num_iteration=20, warm_up_iteration=10') + print('\nStarting {} model benchmarking for latency on {}:'.format(MODEL.upper(), PROTOCOL.upper())) + print('batch_size=1, num_iteration=20, warm_up_iteration=10\n') benchmark(batch_size=1, num_iteration=20, warm_up_iteration=10) - print('\nStarting R-FCN model benchmarking for Throughput with batch_size=128, num_iteration=10, warm_up_iteration=2') + print('\nStarting {} model benchmarking for throughput on {}:'.format(MODEL.upper(), PROTOCOL.upper())) + print('batch_size=128, num_iteration=10, warm_up_iteration=2\n') benchmark(batch_size=128, num_iteration=10, warm_up_iteration=2) diff --git a/docs/object_detection/tensorflow_serving/requirements.txt b/docs/object_detection/tensorflow_serving/requirements.txt new file mode 100644 index 000000000..1e77692c2 --- /dev/null +++ b/docs/object_detection/tensorflow_serving/requirements.txt @@ -0,0 +1,15 @@ +# rest +requests + +# grpc +intel-tensorflow +tensorflow-serving-api + +# object detection api +Cython +contextlib2 +jupyter +matplotlib +pillow +lxml +absl-py \ No newline at end of file diff --git 
a/docs/recommendation/tensorflow/Tutorial.md b/docs/recommendation/tensorflow/Tutorial.md index aa33a6643..96b389771 100644 --- a/docs/recommendation/tensorflow/Tutorial.md +++ b/docs/recommendation/tensorflow/Tutorial.md @@ -215,8 +215,6 @@ Set this parameter to a socket id to run the workload on a single socket. Average Latency (ms/batch) : ... Throughput is (records/sec) : ... -------------------------------------------------- - lscpu_path_cmd = command -v lscpu - lscpu located here: /usr/bin/lscpu num_inter_threads: 28 num_intra_threads: 1 Received these standard args: Namespace(accuracy_only=False, batch_size=512, benchmark_dir='/workspace/benchmarks', benchmark_only=True, checkpoint=None, data_location='/dataset', data_num_inter_threads=None, data_num_intra_threads=None, framework='tensorflow', input_graph='/in_graph/wide_deep_fp32_pretrained_model.pb', intelai_models='/workspace/intelai_models', mode='inference', model_args=[], model_name='wide_deep_large_ds', model_source_dir='/workspace/models', num_cores=-1, num_inter_threads=28, num_intra_threads=1, num_parallel_batches=28, output_dir='/workspace/benchmarks/common/tensorflow/logs', output_results=False, precision='fp32', socket_id=-1, use_case='recommendation', verbose=True) @@ -276,9 +274,7 @@ perform necessary installs, run the ```launch_benchmark.py``` script, and does n --debug   Example Output: - - lscpu_path_cmd = command -v lscpu - lscpu located here: b'/usr/bin/lscpu' + root@a78677f56d69:/workspace/benchmarks/common/tensorflow# To rerun the model script, execute the ```start.sh``` bash script from your existing directory with additional or modified flags. 
For example, to rerun with the best batch inference (batch size=512) settings, run with ```BATCH_SIZE``` diff --git a/models/image_recognition/tensorflow/densenet169/inference/fp32/accuracy.py b/models/image_recognition/tensorflow/densenet169/inference/fp32/accuracy.py new file mode 100644 index 000000000..0335ce423 --- /dev/null +++ b/models/image_recognition/tensorflow/densenet169/inference/fp32/accuracy.py @@ -0,0 +1,137 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys +import os +import time +import numpy as np +from tensorflow.core.protobuf import rewriter_config_pb2 +from google.protobuf import text_format +import tensorflow as tf +import image_preprocessing +import dataset + +NUM_TEST_IMAGES = 50000 + +def load_graph(model_file): + graph = tf.Graph() + graph_def = tf.GraphDef() + + import os + file_ext = os.path.splitext(model_file)[1] + + with open(model_file, "rb") as f: + if file_ext == '.pbtxt': + text_format.Merge(f.read(), graph_def) + else: + graph_def.ParseFromString(f.read()) + with graph.as_default(): + tf.import_graph_def(graph_def, name='') + return graph + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input_graph", default=None, + help="graph/model to be executed") + parser.add_argument("--data_location", default=None, + help="full path to the validation data") + parser.add_argument("--input_height", default=224, + type=int, help="input height") + parser.add_argument("--input_width", default=224, + type=int, help="input width") + parser.add_argument("--batch_size", default=32, + type=int, help="batch size") + parser.add_argument("--input_layer", default="input", + help="name of input layer") + parser.add_argument("--output_layer", default="densenet169/predictions/Reshape_1", + help="name of output layer") + parser.add_argument( + '--num_inter_threads', + help='number threads across operators', + type=int, default=1) + parser.add_argument( + '--num_intra_threads', + help='number threads for an operator', + type=int, default=1) + args = parser.parse_args() + + if args.input_graph: + model_file = args.input_graph + else: + sys.exit("Please provide a graph file.") + input_height = args.input_height + input_width = args.input_width + batch_size = 
args.batch_size + input_layer = args.input_layer + output_layer = args.output_layer + num_inter_threads = args.num_inter_threads + num_intra_threads = args.num_intra_threads + data_location = args.data_location + dataset = dataset.ImagenetData(data_location) + preprocessor = image_preprocessing.ImagePreprocessor( + input_height, input_width, batch_size, + 1, # device count + tf.float32, # data_type for input fed to the graph + train=False, # doing inference + resize_method='crop') + images, labels = preprocessor.minibatch(dataset, subset='validation') + graph = load_graph(model_file) + input_tensor = graph.get_tensor_by_name(input_layer + ":0") + output_tensor = graph.get_tensor_by_name(output_layer + ":0") + + rewrite_options = rewriter_config_pb2.RewriterConfig( + layout_optimizer=rewriter_config_pb2.RewriterConfig.ON) + config = tf.ConfigProto() + config.inter_op_parallelism_threads = num_inter_threads + config.intra_op_parallelism_threads = num_intra_threads + config.graph_options.rewrite_options.remapping = ( + rewriter_config_pb2.RewriterConfig.OFF) + + total_accuracy1, total_accuracy5 = (0.0, 0.0) + num_processed_images = 0 + num_remaining_images = dataset.num_examples_per_epoch(subset='validation') \ + - num_processed_images + top1 = 0 + with tf.Session(config=config) as sess: + sess_graph = tf.Session(graph=graph, config=config) + + while num_remaining_images >= batch_size: + # Reads and preprocess data + #import pdb + #pdb.set_trace() + np_images, np_labels = sess.run([images[0], labels[0]]) + np_labels -= 1 + #print(np_labels.shape) + num_processed_images += batch_size + num_remaining_images -= batch_size + start_time = time.time() + # Compute inference on the preprocessed data + predictions1 = sess_graph.run(output_tensor, + {input_tensor: np_images}) + elapsed_time = time.time() - start_time + if(batch_size !=1): + predictions1 = sess.run(tf.squeeze(predictions1)) + else : + predictions1 = sess.run(tf.reshape(predictions1,[1,1000])) + predictions2 = 
tf.argmax(predictions1, axis=1) + predictions = sess.run(predictions2) + top1 += batch_size - (np.count_nonzero(predictions - np_labels)) + print("Iteration time: %0.4f ms" % elapsed_time) + print(top1/num_processed_images) diff --git a/models/image_recognition/tensorflow/densenet169/inference/fp32/benchmark.py b/models/image_recognition/tensorflow/densenet169/inference/fp32/benchmark.py new file mode 100644 index 000000000..4091b4137 --- /dev/null +++ b/models/image_recognition/tensorflow/densenet169/inference/fp32/benchmark.py @@ -0,0 +1,161 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. # You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys +import os +import time +import numpy as np +from tensorflow.core.protobuf import rewriter_config_pb2 +from google.protobuf import text_format +import tensorflow as tf + +def load_graph(model_file): + graph = tf.Graph() + graph_def = tf.GraphDef() + + import os + file_ext = os.path.splitext(model_file)[1] + + with open(model_file, "rb") as f: + if file_ext == '.pbtxt': + text_format.Merge(f.read(), graph_def) + else: + graph_def.ParseFromString(f.read()) + with graph.as_default(): + tf.import_graph_def(graph_def, name='') + + return graph + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input_graph", default=None, + help="graph/model to be executed") + parser.add_argument("--input_height", default=224, + type=int, help="input height") + parser.add_argument("--input_width", default=224, + type=int, help="input width") + parser.add_argument("--batch_size", default=32, + type=int, help="batch size") + parser.add_argument("--input_layer", default="input", + help="name of input layer") + parser.add_argument("--output_layer", default="densenet169/predictions/Reshape_1", + help="name of output layer") + parser.add_argument( + '--num_inter_threads', + help='number threads across operators', + type=int, default=1) + parser.add_argument( + '--num_intra_threads', + help='number threads for an operator', + type=int, default=1) + parser.add_argument("-gpu", "--gpu", + default = -1, + type=int, help="Run on gpu, other wise cpu", + required=False) + + parser.add_argument("--warmup_steps", type=int, default=40, + help="number of warmup steps") + parser.add_argument("--steps", type=int, default=100, help="number of steps") + args = parser.parse_args() + + if args.input_graph: + model_file = args.input_graph + else: + 
sys.exit("Please provide a graph file.") + input_height = args.input_height + input_width = args.input_width + batch_size = args.batch_size + input_layer = args.input_layer + output_layer = args.output_layer + warmup_steps = args.warmup_steps + steps = args.steps + print(steps) + assert steps > 10, "Benchmark steps should be at least 10." + num_inter_threads = args.num_inter_threads + num_intra_threads = args.num_intra_threads + + input_shape = [batch_size, input_height, input_width, 3] + images = tf.truncated_normal( + input_shape, + dtype=tf.float32, + stddev=10, + name='synthetic_images') + + image_data = None + graph = load_graph(model_file) + + input_tensor = graph.get_tensor_by_name(input_layer + ":0"); + output_tensor = graph.get_tensor_by_name(output_layer + ":0"); + + rewrite_options = rewriter_config_pb2.RewriterConfig( + layout_optimizer=rewriter_config_pb2.RewriterConfig.ON) + config = tf.ConfigProto() + if (args.gpu < 0): + config.inter_op_parallelism_threads = num_inter_threads + config.intra_op_parallelism_threads = num_intra_threads + config.graph_options.rewrite_options.remapping = ( + rewriter_config_pb2.RewriterConfig.OFF) + #os.environ["OMP_NUM_THREADS"] = "14" + with tf.Session(config=config) as sess: + image_data = sess.run(images) + + with tf.Session(graph=graph, config=config) as sess: + sys.stdout.flush() + print("[Running warmup steps...]") + for t in range(warmup_steps): + start_time = time.time() + sess.run(output_tensor, {input_tensor: image_data}) + elapsed_time = time.time() - start_time + if((t+1) % 10 == 0): + print("steps = {0}, {1} images/sec" + "".format(t+1, batch_size/elapsed_time)) + avg = 0 + print("[Running benchmark steps...]") + total_time = 0; + total_images = 0; + for t in range(steps): + start_time = time.time() + results = sess.run(output_tensor, {input_tensor: image_data}) + elapsed_time = time.time() - start_time + avg += elapsed_time + if((t+1) % 10 == 0): + print("steps = {0}, {1} images/sec" + "".format(t+1, 
batch_size*(t+1)/avg)); + print(" Latency: {0} ms" + "".format(avg*1000. /(t+1))) diff --git a/models/image_recognition/tensorflow/densenet169/inference/fp32/cnn_util.py b/models/image_recognition/tensorflow/densenet169/inference/fp32/cnn_util.py new file mode 100644 index 000000000..32902d149 --- /dev/null +++ b/models/image_recognition/tensorflow/densenet169/inference/fp32/cnn_util.py @@ -0,0 +1,50 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ==============================================================================

"""Utilities for CNN benchmarks."""

import tensorflow as tf


def tensorflow_version_tuple():
  """Split `tf.__version__` into `(major, minor, patch)`.

  Returns:
    A tuple `(int major, int minor, str patch)`. `patch` is everything after
    the second dot, kept as a string because it is not converted to a number
    here. It is the empty string when the version has only two components.

  Raises:
    ValueError: if the version has no dot at all, or if the major or minor
      component is not an integer.
  """
  # Split on the first two dots only, so a version with extra dots still
  # unpacks instead of raising "too many values to unpack".
  parts = tf.__version__.split('.', 2)
  if len(parts) < 2:
    raise ValueError('Unrecognized TensorFlow version "%s"' % tf.__version__)
  patch = parts[2] if len(parts) > 2 else ''
  return (int(parts[0]), int(parts[1]), patch)


def tensorflow_version():
  """Return the TensorFlow version as the integer `major * 1000 + minor`."""
  major, minor, _ = tensorflow_version_tuple()
  return major * 1000 + minor

# diff --git a/models/image_recognition/tensorflow/densenet169/inference/fp32/dataset.py b/models/image_recognition/tensorflow/densenet169/inference/fp32/dataset.py
# new file mode 100644
# index 000000000..88fdebce6
# --- /dev/null
# +++ b/models/image_recognition/tensorflow/densenet169/inference/fp32/dataset.py
# @@ -0,0 +1,103 @@
#
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: EPL-2.0
#

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Benchmark dataset utilities.
"""

from abc import abstractmethod
import os

import tensorflow as tf


class Dataset(object):
  """Abstract class for cnn benchmarks dataset.

  Subclasses must override `num_classes` and `num_examples_per_epoch`.
  """

  def __init__(self, name, data_dir=None):
    """Store the dataset name and the directory holding its records.

    Args:
      name: Human-readable dataset name; returned by `str(self)`.
      data_dir: Directory containing the dataset files.

    Raises:
      ValueError: if `data_dir` is None.
    """
    self.name = name
    if data_dir is None:
      raise ValueError('Data directory not specified')
    self.data_dir = data_dir

  def tf_record_pattern(self, subset):
    """Return the glob pattern `<data_dir>/<subset>-*-of-*`."""
    return os.path.join(self.data_dir, '%s-*-of-*' % subset)

  def reader(self):
    """Return a new `tf.TFRecordReader`."""
    return tf.TFRecordReader()

  # `@abstractmethod` is not enforced here because the class has no ABC
  # metaclass. The base implementations therefore raise, so a subclass that
  # forgets an override fails loudly instead of silently returning None.
  @abstractmethod
  def num_classes(self):
    """Return the number of label classes in the dataset."""
    raise NotImplementedError('num_classes must be overridden')

  @abstractmethod
  def num_examples_per_epoch(self, subset):
    """Return the number of examples in the given subset."""
    raise NotImplementedError('num_examples_per_epoch must be overridden')

  def __str__(self):
    return self.name


class FlowersData(Dataset):
  """Flowers dataset: 5 classes, 3170 train / 500 validation examples."""

  def __init__(self, data_dir=None):
    super(FlowersData, self).__init__('Flowers', data_dir)

  def num_classes(self):
    return 5

  def num_examples_per_epoch(self, subset):
    """Return the example count for 'train' or 'validation'.

    Raises:
      ValueError: for any other subset name.
    """
    if subset == 'train':
      return 3170
    if subset == 'validation':
      return 500
    raise ValueError('Invalid data subset "%s"' % subset)


class ImagenetData(Dataset):
  """ImageNet: 1000 classes, 1281167 train / 50000 validation examples."""

  def __init__(self, data_dir=None):
    super(ImagenetData, self).__init__('ImageNet', data_dir)

  def num_classes(self):
    return 1000

  def num_examples_per_epoch(self, subset='train'):
    """Return the example count for 'train' (default) or 'validation'.

    Raises:
      ValueError: for any other subset name.
    """
    if subset == 'train':
      return 1281167
    if subset == 'validation':
      return 50000
    raise ValueError('Invalid data subset "%s"' % subset)

# diff --git a/models/image_recognition/tensorflow/densenet169/inference/fp32/densenet_preprocessing.py b/models/image_recognition/tensorflow/densenet169/inference/fp32/densenet_preprocessing.py
# new file mode 100644
# index 000000000..298694af0
# --- /dev/null
# +++ b/models/image_recognition/tensorflow/densenet169/inference/fp32/densenet_preprocessing.py
# @@ -0,0 +1,391 @@
#
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Provides utilities to preprocess images. 
+ +The preprocessing steps for VGG were introduced in the following technical +report: + + Very Deep Convolutional Networks For Large-Scale Image Recognition + Karen Simonyan and Andrew Zisserman + arXiv technical report, 2015 + PDF: http://arxiv.org/pdf/1409.1556.pdf + ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf + CC-BY-4.0 + +More information can be obtained from the VGG website: +www.robots.ox.ac.uk/~vgg/research/very_deep/ +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim + +_R_MEAN = 123.68 +_G_MEAN = 116.78 +_B_MEAN = 103.94 + +_SCALE_FACTOR = 0.017 + +_RESIZE_SIDE_MIN = 256 +_RESIZE_SIDE_MAX = 512 + + +def _crop(image, offset_height, offset_width, crop_height, crop_width): + """Crops the given image using the provided offsets and sizes. + + Note that the method doesn't assume we know the input image size but it does + assume we know the input image rank. + + Args: + image: an image of shape [height, width, channels]. + offset_height: a scalar tensor indicating the height offset. + offset_width: a scalar tensor indicating the width offset. + crop_height: the height of the cropped image. + crop_width: the width of the cropped image. + + Returns: + the cropped (and resized) image. + + Raises: + InvalidArgumentError: if the rank is not 3 or if the image dimensions are + less than the crop size. 
+ """ + original_shape = tf.shape(image) + + rank_assertion = tf.Assert( + tf.equal(tf.rank(image), 3), + ['Rank of image must be equal to 3.']) + with tf.control_dependencies([rank_assertion]): + cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]]) + + size_assertion = tf.Assert( + tf.logical_and( + tf.greater_equal(original_shape[0], crop_height), + tf.greater_equal(original_shape[1], crop_width)), + ['Crop size greater than the image size.']) + + offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0])) + + # Use tf.slice instead of crop_to_bounding box as it accepts tensors to + # define the crop size. + with tf.control_dependencies([size_assertion]): + image = tf.slice(image, offsets, cropped_shape) + return tf.reshape(image, cropped_shape) + + +def _random_crop(image_list, crop_height, crop_width): + """Crops the given list of images. + + The function applies the same crop to each image in the list. This can be + effectively applied when there are multiple image inputs of the same + dimension such as: + + image, depths, normals = _random_crop([image, depths, normals], 120, 150) + + Args: + image_list: a list of image tensors of the same dimension but possibly + varying channel. + crop_height: the new height. + crop_width: the new width. + + Returns: + the image_list with cropped images. + + Raises: + ValueError: if there are multiple image inputs provided with different size + or the images are smaller than the crop dimensions. + """ + if not image_list: + raise ValueError('Empty image_list.') + + # Compute the rank assertions. 
+ rank_assertions = [] + for i in range(len(image_list)): + image_rank = tf.rank(image_list[i]) + rank_assert = tf.Assert( + tf.equal(image_rank, 3), + ['Wrong rank for tensor %s [expected] [actual]', + image_list[i].name, 3, image_rank]) + rank_assertions.append(rank_assert) + + with tf.control_dependencies([rank_assertions[0]]): + image_shape = tf.shape(image_list[0]) + image_height = image_shape[0] + image_width = image_shape[1] + crop_size_assert = tf.Assert( + tf.logical_and( + tf.greater_equal(image_height, crop_height), + tf.greater_equal(image_width, crop_width)), + ['Crop size greater than the image size.']) + + asserts = [rank_assertions[0], crop_size_assert] + + for i in range(1, len(image_list)): + image = image_list[i] + asserts.append(rank_assertions[i]) + with tf.control_dependencies([rank_assertions[i]]): + shape = tf.shape(image) + height = shape[0] + width = shape[1] + + height_assert = tf.Assert( + tf.equal(height, image_height), + ['Wrong height for tensor %s [expected][actual]', + image.name, height, image_height]) + width_assert = tf.Assert( + tf.equal(width, image_width), + ['Wrong width for tensor %s [expected][actual]', + image.name, width, image_width]) + asserts.extend([height_assert, width_assert]) + + # Create a random bounding box. + # + # Use tf.random_uniform and not numpy.random.rand as doing the former would + # generate random numbers at graph eval time, unlike the latter which + # generates random numbers at graph definition time. 
+ with tf.control_dependencies(asserts): + max_offset_height = tf.reshape(image_height - crop_height + 1, []) + with tf.control_dependencies(asserts): + max_offset_width = tf.reshape(image_width - crop_width + 1, []) + offset_height = tf.random_uniform( + [], maxval=max_offset_height, dtype=tf.int32) + offset_width = tf.random_uniform( + [], maxval=max_offset_width, dtype=tf.int32) + + return [_crop(image, offset_height, offset_width, + crop_height, crop_width) for image in image_list] + + +def _central_crop(image_list, crop_height, crop_width): + """Performs central crops of the given image list. + + Args: + image_list: a list of image tensors of the same dimension but possibly + varying channel. + crop_height: the height of the image following the crop. + crop_width: the width of the image following the crop. + + Returns: + the list of cropped images. + """ + outputs = [] + for image in image_list: + image_height = tf.shape(image)[0] + image_width = tf.shape(image)[1] + + offset_height = (image_height - crop_height) / 2 + offset_width = (image_width - crop_width) / 2 + + outputs.append(_crop(image, offset_height, offset_width, + crop_height, crop_width)) + return outputs + + +def _mean_image_subtraction(image, means): + """Subtracts the given means from each image channel. + + For example: + means = [123.68, 116.779, 103.939] + image = _mean_image_subtraction(image, means) + + Note that the rank of `image` must be known. + + Args: + image: a tensor of size [height, width, C]. + means: a C-vector of values to subtract from each channel. + + Returns: + the centered image. + + Raises: + ValueError: If the rank of `image` is unknown, if `image` has a rank other + than three or if the number of channels in `image` doesn't match the + number of values in `means`. 
+ """ + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + num_channels = image.get_shape().as_list()[-1] + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image) + for i in range(num_channels): + channels[i] -= means[i] + return tf.concat(axis=2, values=channels) + + +def _smallest_size_at_least(height, width, smallest_side): + """Computes new shape with the smallest side equal to `smallest_side`. + + Computes new shape with the smallest side equal to `smallest_side` while + preserving the original aspect ratio. + + Args: + height: an int32 scalar tensor indicating the current height. + width: an int32 scalar tensor indicating the current width. + smallest_side: A python integer or scalar `Tensor` indicating the size of + the smallest side after resize. + + Returns: + new_height: an int32 scalar tensor indicating the new height. + new_width: and int32 scalar tensor indicating the new width. + """ + smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32) + + height = tf.to_float(height) + width = tf.to_float(width) + smallest_side = tf.to_float(smallest_side) + + scale = tf.cond(tf.greater(height, width), + lambda: smallest_side / width, + lambda: smallest_side / height) + new_height = tf.to_int32(height * scale) + new_width = tf.to_int32(width * scale) + return new_height, new_width + + +def _aspect_preserving_resize(image, smallest_side): + """Resize images preserving the original aspect ratio. + + Args: + image: A 3-D image `Tensor`. + smallest_side: A python integer or scalar `Tensor` indicating the size of + the smallest side after resize. + + Returns: + resized_image: A 3-D tensor containing the resized image. 
+ """ + smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32) + + shape = tf.shape(image) + height = shape[0] + width = shape[1] + new_height, new_width = _smallest_size_at_least(height, width, smallest_side) + image = tf.expand_dims(image, 0) + resized_image = tf.image.resize_bilinear(image, [new_height, new_width], + align_corners=False) + resized_image = tf.squeeze(resized_image) + resized_image.set_shape([None, None, 3]) + return resized_image + + +def preprocess_for_train(image, + output_height, + output_width, + resize_side_min=_RESIZE_SIDE_MIN, + resize_side_max=_RESIZE_SIDE_MAX): + """Preprocesses the given image for training. + + Note that the actual resizing scale is sampled from + [`resize_size_min`, `resize_size_max`]. + + Args: + image: A `Tensor` representing an image of arbitrary size. + output_height: The height of the image after preprocessing. + output_width: The width of the image after preprocessing. + resize_side_min: The lower bound for the smallest side of the image for + aspect-preserving resizing. + resize_side_max: The upper bound for the smallest side of the image for + aspect-preserving resizing. + + Returns: + A preprocessed image. + """ + resize_side = tf.random_uniform( + [], minval=resize_side_min, maxval=resize_side_max+1, dtype=tf.int32) + + image = _aspect_preserving_resize(image, resize_side) + image = _random_crop([image], output_height, output_width)[0] + image.set_shape([output_height, output_width, 3]) + image = tf.to_float(image) + image = tf.image.random_flip_left_right(image) + + image = _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN]) + return image * _SCALE_FACTOR + + +def preprocess_for_eval(image, output_height, output_width, resize_side): + """Preprocesses the given image for evaluation. + + Args: + image: A `Tensor` representing an image of arbitrary size. + output_height: The height of the image after preprocessing. + output_width: The width of the image after preprocessing. 
+ resize_side: The smallest side of the image for aspect-preserving resizing. + + Returns: + A preprocessed image. + """ + image = _aspect_preserving_resize(image, resize_side) + image = _central_crop([image], output_height, output_width)[0] + image.set_shape([output_height, output_width, 3]) + image = tf.to_float(image) + + image = _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN]) + return image * _SCALE_FACTOR + + +def preprocess_image(image, output_height, output_width, is_training=False, + resize_side_min=_RESIZE_SIDE_MIN, + resize_side_max=_RESIZE_SIDE_MAX): + """Preprocesses the given image. + + Args: + image: A `Tensor` representing an image of arbitrary size. + output_height: The height of the image after preprocessing. + output_width: The width of the image after preprocessing. + is_training: `True` if we're preprocessing the image for training and + `False` otherwise. + resize_side_min: The lower bound for the smallest side of the image for + aspect-preserving resizing. If `is_training` is `False`, then this value + is used for rescaling. + resize_side_max: The upper bound for the smallest side of the image for + aspect-preserving resizing. If `is_training` is `False`, this value is + ignored. Otherwise, the resize side is sampled from + [resize_size_min, resize_size_max]. + + Returns: + A preprocessed image. 
+ """ + if is_training: + return preprocess_for_train(image, output_height, output_width, + resize_side_min, resize_side_max) + else: + return preprocess_for_eval(image, output_height, output_width, + resize_side_min) diff --git a/models/image_recognition/tensorflow/densenet169/inference/fp32/image_preprocessing.py b/models/image_recognition/tensorflow/densenet169/inference/fp32/image_preprocessing.py new file mode 100644 index 000000000..fe5d0eee0 --- /dev/null +++ b/models/image_recognition/tensorflow/densenet169/inference/fp32/image_preprocessing.py @@ -0,0 +1,420 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Image pre-processing utilities. 
+""" +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf +from random import randint +import densenet_preprocessing +from tensorflow.python.ops import data_flow_ops +import cnn_util + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields: + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized + Example protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + text: Tensor tf.string containing the human-readable label. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + } + sparse_float32 = tf.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. 
+ feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.parse_single_example(example_serialized, feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) + + # Note that we impose an ordering of (y, x) just to make life difficult. + bbox = tf.concat([ymin, xmin, ymax, xmax], 0) + + # Force the variable number of bounding boxes into the shape + # [1, num_boxes, coords]. + bbox = tf.expand_dims(bbox, 0) + bbox = tf.transpose(bbox, [0, 2, 1]) + + return features['image/encoded'], label, bbox, features['image/class/text'] + + +def decode_jpeg(image_buffer, scope=None): # , dtype=tf.float32): + """Decode a JPEG string into one 3-D uint8 image Tensor. + + Args: + image_buffer: scalar string Tensor. + scope: Optional scope for name_scope. + Returns: + 3-D uint8 Tensor with values in [0, 255]. + """ + # with tf.op_scope([image_buffer], scope, 'decode_jpeg'): + # with tf.name_scope(scope, 'decode_jpeg', [image_buffer]): + with tf.name_scope(scope or 'decode_jpeg'): + # Decode the string as an RGB JPEG. + # Note that the resulting image contains an unknown height and width + # that is set dynamically by decode_jpeg. In other words, the height + # and width of image is unknown at compile-time.
+ image = tf.image.decode_jpeg(image_buffer, channels=3, + fancy_upscaling=False, + dct_method='INTEGER_FAST') + + # image = tf.Print(image, [tf.shape(image)], 'Image shape: ') + + return image + + +def eval_image(image, height, width, bbox, thread_id, resize): + """Get the image for model evaluation.""" + with tf.name_scope('eval_image'): + if not thread_id: + tf.summary.image( + 'original_image', tf.expand_dims(image, 0)) + + if resize == 'crop': + # Note: This is much slower than crop_to_bounding_box + # It seems that the redundant pad step has huge overhead + # distorted_image = tf.image.resize_image_with_crop_or_pad(image, + # height, width) + shape = tf.shape(image) + image = tf.cond(tf.less(shape[0], shape[1]), + lambda: tf.image.resize_images(image, tf.convert_to_tensor([256, 256 * shape[1] // shape[0]], dtype=tf.int32)), + lambda: tf.image.resize_images(image, tf.convert_to_tensor([256 * shape[0] // shape[1], 256], dtype=tf.int32))) + shape = tf.shape(image) + + y0 = (shape[0] - height) // 2 + x0 = (shape[1] - width) // 2 + #y0=tf.random_uniform([],minval=0,maxval=(shape[0] - height + 1), dtype=tf.int32) + #x0=tf.random_uniform([],minval=0,maxval=(shape[1] - width + 1), dtype=tf.int32) + ## distorted_image = tf.slice(image, [y0,x0,0], [height,width,3]) + distorted_image = tf.image.crop_to_bounding_box(image, y0, x0, height, + width) + else: + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + tf.shape(image), + bounding_boxes=bbox, + min_object_covered=0.5, + aspect_ratio_range=[0.90, 1.10], + area_range=[0.10, 1.0], + max_attempts=100, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, _ = sample_distorted_bounding_box + # Crop the image to the specified bounding box.
+ distorted_image = tf.slice(image, bbox_begin, bbox_size) + resize_method = { + 'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR, + 'bilinear': tf.image.ResizeMethod.BILINEAR, + 'bicubic': tf.image.ResizeMethod.BICUBIC, + 'area': tf.image.ResizeMethod.AREA + }[resize] + # This resizing operation may distort the images because the aspect + # ratio is not respected. + if cnn_util.tensorflow_version() >= 11: + distorted_image = tf.image.resize_images( + distorted_image, [height, width], + resize_method, + align_corners=False) + else: + distorted_image = tf.image.resize_images( + distorted_image, height, width, resize_method, align_corners=False) + distorted_image.set_shape([height, width, 3]) + if not thread_id: + tf.summary.image( + 'cropped_resized_image', tf.expand_dims(distorted_image, 0)) + image = distorted_image + return image + + +def distort_image(image, height, width, bbox, thread_id=0, scope=None): + """Distort one image for training a network. + + Distorting images provides a useful technique for augmenting the data + set during training in order to make the network invariant to aspects + of the image that do not affect the label. + + Args: + image: 3-D float Tensor of image + height: integer + width: integer + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged + as [ymin, xmin, ymax, xmax]. + thread_id: integer indicating the preprocessing thread. + scope: Optional scope for op_scope. + Returns: + 3-D float Tensor of distorted image used for training. + """ + # with tf.op_scope([image, height, width, bbox], scope, 'distort_image'): + # with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]): + with tf.name_scope(scope or 'distort_image'): + # Each bounding box has shape [1, num_boxes, box coords] and + # the coordinates are ordered [ymin, xmin, ymax, xmax].
+ + # After this point, all image pixels reside in [0,1) + # until the very end, when they're rescaled to (-1, 1). The various + # adjust_* ops all require this range for dtype float. + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + + # Display the bounding box in the first thread only. + if not thread_id: + image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), + bbox) + tf.summary.image( + 'image_with_bounding_boxes', image_with_box) + + # A large fraction of image datasets contain a human-annotated bounding + # box delineating the region of the image containing the object of interest. + # We choose to create a new bounding box for the object which is a randomly + # distorted version of the human-annotated bounding box that obeys an allowed + # range of aspect ratios, sizes and overlap with the human-annotated + # bounding box. If no box is supplied, then we assume the bounding box is + # the entire image. + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + tf.shape(image), + bounding_boxes=bbox, + min_object_covered=0.1, + aspect_ratio_range=[0.99, 1.01], + area_range=[0.05, 1.0], + max_attempts=100, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box + if not thread_id: + image_with_distorted_box = tf.image.draw_bounding_boxes( + tf.expand_dims(image, 0), distort_bbox) + tf.summary.image( + 'images_with_distorted_bounding_box', + image_with_distorted_box) + + # Crop the image to the specified bounding box. + distorted_image = tf.slice(image, bbox_begin, bbox_size) + + # This resizing operation may distort the images because the aspect + # ratio is not respected. We select a resize method in a round robin + # fashion based on the thread number. + # Note that ResizeMethod contains 4 enumerated resizing methods. 
+ resize_method = thread_id % 4 + if cnn_util.tensorflow_version() >= 11: + distorted_image = tf.image.resize_images( + distorted_image, [height, width], resize_method, align_corners=False) + else: + distorted_image = tf.image.resize_images( + distorted_image, height, width, resize_method, align_corners=False) + # Restore the shape since the dynamic slice based upon the bbox_size loses + # the third dimension. + distorted_image.set_shape([height, width, 3]) + if not thread_id: + tf.summary.image( + 'cropped_resized_image', + tf.expand_dims(distorted_image, 0)) + + # Randomly flip the image horizontally. + distorted_image = tf.image.random_flip_left_right(distorted_image) + + # Randomly distort the colors. + distorted_image = distort_color(distorted_image, thread_id) + + # Note: This ensures the scaling matches the output of eval_image + distorted_image *= 256 + + if not thread_id: + tf.summary.image( + 'final_distorted_image', + tf.expand_dims(distorted_image, 0)) + return distorted_image + + +def distort_color(image, thread_id=0, scope=None): + """Distort the color of the image. + + Each color distortion is non-commutative and thus ordering of the color ops + matters. Ideally we would randomly permute the ordering of the color ops. + Rather than adding that level of complication, we select a distinct ordering + of color ops for each preprocessing thread. + + Args: + image: Tensor containing single image. + thread_id: preprocessing thread ID. + scope: Optional scope for op_scope. + Returns: + color-distorted image + """ + # with tf.op_scope([image], scope, 'distort_color'): + # with tf.name_scope(scope, 'distort_color', [image]): + with tf.name_scope(scope or 'distort_color'): + color_ordering = thread_id % 2 + + if color_ordering == 0: + image = tf.image.random_brightness(image, max_delta=32. / 255.)
+ image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + elif color_ordering == 1: + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + + # The random_* ops do not necessarily clamp. + image = tf.clip_by_value(image, 0.0, 1.0) + return image + + +class ImagePreprocessor(object): + """Preprocessor for input images.""" + + def __init__(self, + height, + width, + batch_size, + device_count, + dtype=tf.float32, + train=True, + distortions=None, + resize_method=None): + self.height = height + self.width = width + self.batch_size = batch_size + self.device_count = device_count + self.dtype = dtype + self.train = train + self.resize_method = resize_method + if distortions is None: + distortions = False + self.distortions = distortions + if self.batch_size % self.device_count != 0: + raise ValueError( + ('batch_size must be a multiple of device_count: ' + 'batch_size %d, device_count: %d') % + (self.batch_size, self.device_count)) + self.batch_size_per_device = self.batch_size // self.device_count + + def preprocess(self, image_buffer, bbox, thread_id): + """Preprocessing image_buffer using thread_id.""" + # Note: Width and height of image is known only at runtime. 
+ image = tf.image.decode_jpeg(image_buffer, channels=3, + dct_method='INTEGER_FAST') + if self.train and self.distortions: + image = distort_image(image, self.height, self.width, bbox, thread_id) + else: + #image = eval_image(image, self.height, self.width, bbox, thread_id, + # self.resize_method) + image = densenet_preprocessing.preprocess_image(image, self.height, self.width, False) + # Note: image is now float32 [height,width,3] with range [0, 255] + + # image = tf.cast(image, tf.uint8) # HACK TESTING + + return image + + def minibatch(self, dataset, subset): + with tf.name_scope('batch_processing'): + images = [[] for i in range(self.device_count)] + labels = [[] for i in range(self.device_count)] + record_input = data_flow_ops.RecordInput( + file_pattern=dataset.tf_record_pattern(subset), + seed=randint(0, 9000), + parallelism=64, + buffer_size=10000, + batch_size=self.batch_size, + name='record_input') + records = record_input.get_yield_op() + records = tf.split(records, self.batch_size, 0) + records = [tf.reshape(record, []) for record in records] + for i in xrange(self.batch_size): + value = records[i] + image_buffer, label_index, bbox, _ = parse_example_proto(value) + image = self.preprocess(image_buffer, bbox, i % 4) + + device_index = i % self.device_count + images[device_index].append(image) + labels[device_index].append(label_index) + label_index_batch = [None] * self.device_count + for device_index in xrange(self.device_count): + images[device_index] = tf.parallel_stack(images[device_index]) + label_index_batch[device_index] = tf.concat(labels[device_index], 0) + + # dynamic_pad=True) # HACK TESTING dynamic_pad=True + images[device_index] = tf.cast(images[device_index], self.dtype) + depth = 3 + images[device_index] = tf.reshape( + images[device_index], + shape=[self.batch_size_per_device, self.height, self.width, depth]) + label_index_batch[device_index] = tf.reshape( + label_index_batch[device_index], [self.batch_size_per_device]) + # Display the training images in
the visualizer. + # tf.summary.image('images', images) + + return images, label_index_batch diff --git a/models/image_recognition/tensorflow/inception_resnet_v2/eval_image_classifier.py b/models/image_recognition/tensorflow/inception_resnet_v2/eval_image_classifier.py deleted file mode 100644 index 361836891..000000000 --- a/models/image_recognition/tensorflow/inception_resnet_v2/eval_image_classifier.py +++ /dev/null @@ -1,277 +0,0 @@ -# -# -*- coding: utf-8 -*- -# -# Copyright (c) 2019 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: EPL-2.0 -# - - -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Generic evaluation script that evaluates a model using a given dataset.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import tensorflow as tf -import os -import time -from datetime import datetime - -import dataset_factory -import nets_factory -import preprocessing_factory - -slim = tf.contrib.slim - -tf.app.flags.DEFINE_integer( - 'batch_size', 100, 'The number of samples in each batch.') - -tf.app.flags.DEFINE_integer( - 'max_num_batches', 1, - 'Max number of batches to evaluate by default use all.') - -tf.app.flags.DEFINE_string( - 'master', '', 'The address of the TensorFlow master to use.') - -tf.app.flags.DEFINE_string( - 'checkpoint_path', '/tmp/tfmodel/', - 'The directory where the model was written to or an absolute path to a ' - 'checkpoint file.') - -tf.app.flags.DEFINE_string( - 'eval_dir', '/tmp/tfmodel/', 'Directory where the results are saved to.') - -tf.app.flags.DEFINE_integer( - 'num_preprocessing_threads', 4, - 'The number of threads used to create the batches.') - -tf.app.flags.DEFINE_string( - 'dataset_name', 'imagenet', 'The name of the dataset to load.') - -tf.app.flags.DEFINE_string( - 'dataset_split_name', 'test', 'The name of the train/test split.') - -tf.app.flags.DEFINE_string( - 'dataset_dir', None, 'The directory where the dataset files are stored.') - -tf.app.flags.DEFINE_integer( - 'labels_offset', 0, - 'An offset for the labels in the dataset. This flag is primarily used to ' - 'evaluate the VGG and ResNet architectures which do not use a background ' - 'class for the ImageNet dataset.') - -tf.app.flags.DEFINE_string( - 'model_name', 'inception_resnet_v2', - 'The name of the architecture to evaluate.') - -tf.app.flags.DEFINE_string( - 'preprocessing_name', None, - 'The name of the preprocessing to use. 
If left ' - 'as `None`, then the model_name flag is used.') - -tf.app.flags.DEFINE_float( - 'moving_average_decay', None, - 'The decay to use for the moving average.' - 'If left as None, then moving averages are not used.') - -tf.app.flags.DEFINE_integer( - 'eval_image_size', None, 'Eval image size') - -tf.app.flags.DEFINE_integer( - 'eval_log_frequency', 10, - 'Number of eval steps to run between displaying ' - 'eval metrics.') - -tf.app.flags.DEFINE_integer( - 'inter_op_parallelism_threads', 1, 'The number of inter-thread.') - -tf.app.flags.DEFINE_integer( - 'intra_op_parallelism_threads', 28, 'The number of intra-thread.') - - -FLAGS = tf.app.flags.FLAGS - -class _LoggerHook(tf.train.SessionRunHook): - """ Logs loss and runtime.""" - - def begin(self): - self._step = -1 - self._displayed_steps = 0 - self._total_images_per_sec = 0 - - def before_run(self, run_context): - self._step += 1 - self._start_time = time.time() - - def after_run(self, run_context, run_values): - duration = time.time() - self._start_time - if (self._step + 1) % FLAGS.eval_log_frequency == 0: - images_per_sec = FLAGS.batch_size / duration - self._displayed_steps += 1 - self._total_images_per_sec += images_per_sec - - format_str = ('%s: step %d, %.1f images/sec') - print ( - format_str % (datetime.now(), (self._step+1), images_per_sec)) - - def end(self, run_context): - print( - 'self._total_images_per_sec = %.1f' % self._total_images_per_sec) - print('self._displayed_steps = %d' % self._displayed_steps) - images_per_sec = self._total_images_per_sec / self._displayed_steps - print('Total images/sec = %.1f' %(images_per_sec)) - if FLAGS.batch_size == 1: - latency = 1000 / images_per_sec - print('Latency ms/step = %.1f' % (latency)) - -def main(_): - if not FLAGS.dataset_dir: - raise ValueError( - 'You must supply the dataset directory with --dataset_dir') - - tf.logging.set_verbosity(tf.logging.INFO) - #os.environ["OMP_NUM_THREADS"] = "54" - with tf.Graph().as_default(): - tf_global_step = 
slim.get_or_create_global_step() - - ###################### - # Select the dataset # - ###################### - dataset = dataset_factory.get_dataset( - FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir) - - #################### - # Select the model # - #################### - network_fn = nets_factory.get_network_fn( - FLAGS.model_name, - num_classes=(dataset.num_classes - FLAGS.labels_offset), - is_training=False) - - ############################################################## - # Create a dataset provider that loads data from the dataset # - ############################################################## - provider = slim.dataset_data_provider.DatasetDataProvider( - dataset, - shuffle=False, - common_queue_capacity=2 * FLAGS.batch_size, - common_queue_min=FLAGS.batch_size) - [image, label] = provider.get(['image', 'label']) - label -= FLAGS.labels_offset - - ##################################### - # Select the preprocessing function # - ##################################### - preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name - image_preprocessing_fn = preprocessing_factory.get_preprocessing( - preprocessing_name, - is_training=False) - - eval_image_size = \ - FLAGS.eval_image_size or network_fn.default_image_size - - image = image_preprocessing_fn( - image, eval_image_size, eval_image_size) - - images, labels = tf.train.batch( - [image, label], - batch_size=FLAGS.batch_size, - num_threads=FLAGS.num_preprocessing_threads, - capacity=5 * FLAGS.batch_size) - - #################### - # Define the model # - #################### - logits, _ = network_fn(images) - - if FLAGS.moving_average_decay: - variable_averages = tf.train.ExponentialMovingAverage( - FLAGS.moving_average_decay, tf_global_step) - variables_to_restore = variable_averages.variables_to_restore( - slim.get_model_variables()) - variables_to_restore[tf_global_step.op.name] = tf_global_step - else: - variables_to_restore = slim.get_variables_to_restore() - - predictions = 
tf.argmax(logits, 1) - #labels = tf.squeeze(labels) - - # Define the metrics: - names_to_values, names_to_updates = \ - slim.metrics.aggregate_metric_map({ - 'Accuracy': slim.metrics.streaming_accuracy( - predictions, labels), - 'Recall_5': slim.metrics.streaming_recall_at_k( - logits, labels, 5), - }) - - # Print the summaries to screen. - for name, value in names_to_values.items(): - summary_name = 'eval/%s' % name - op = tf.summary.scalar(summary_name, value, collections=[]) - op = tf.Print(op, [value], summary_name) - tf.add_to_collection(tf.GraphKeys.SUMMARIES, op) - - # TODO(sguada) use num_epochs=1 - if FLAGS.max_num_batches: - num_batches = FLAGS.max_num_batches - else: - # This ensures that we make a single pass over all of the data. - num_batches = math.ceil( - dataset.num_samples / float(FLAGS.batch_size)) - - num_batches = 100 - - config = tf.ConfigProto( - inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads, - intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads) - - if tf.gfile.IsDirectory(FLAGS.checkpoint_path): - checkpoint_path = tf.train.latest_checkpoint( - FLAGS.checkpoint_path) - else: - checkpoint_path = FLAGS.checkpoint_path - - tf.logging.info('Evaluating %s' % checkpoint_path) - - slim.evaluation.evaluate_once( - master=FLAGS.master, - checkpoint_path=checkpoint_path, - logdir=FLAGS.eval_dir, - num_evals=num_batches, - eval_op=list(names_to_updates.values()), - variables_to_restore=variables_to_restore, - hooks=[_LoggerHook()], - session_config=config) - - -if __name__ == '__main__': - tf.app.run() diff --git a/models/image_recognition/tensorflow/inception_resnet_v2/eval_image_classifier_accuracy.py b/models/image_recognition/tensorflow/inception_resnet_v2/eval_image_classifier_accuracy.py index 5671f2287..595b252a4 100644 --- a/models/image_recognition/tensorflow/inception_resnet_v2/eval_image_classifier_accuracy.py +++ b/models/image_recognition/tensorflow/inception_resnet_v2/eval_image_classifier_accuracy.py @@ 
-147,9 +147,11 @@ def load_graph(model_file): np_images, np_labels = sess.run([images[0], labels[0]]) num_processed_images += batch_size num_remaining_images -= batch_size + start_time = time.time() # Compute inference on the preprocessed data predictions = sess_graph.run(output_tensor, {input_tensor: np_images}) + elapsed_time = time.time() - start_time accuracy1 = tf.reduce_sum( tf.cast(tf.nn.in_top_k(tf.constant(predictions), tf.constant(np_labels), 1), tf.float32)) @@ -160,6 +162,7 @@ def load_graph(model_file): np_accuracy1, np_accuracy5 = sess.run([accuracy1, accuracy5]) total_accuracy1 += np_accuracy1 total_accuracy5 += np_accuracy5 + print("Iteration time: %0.4f ms" % (elapsed_time * 1000)) print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ % (num_processed_images, total_accuracy1 / num_processed_images, total_accuracy5 / num_processed_images)) diff --git a/models/image_recognition/tensorflow/inceptionv3/fp32/eval_image_classifier_inference.py b/models/image_recognition/tensorflow/inceptionv3/fp32/eval_image_classifier_inference.py index 98b826ac9..b84d28ae3 100644 --- a/models/image_recognition/tensorflow/inceptionv3/fp32/eval_image_classifier_inference.py +++ b/models/image_recognition/tensorflow/inceptionv3/fp32/eval_image_classifier_inference.py @@ -189,9 +189,11 @@ def run(self): num_processed_images += self.args.batch_size num_remaining_images -= self.args.batch_size + start_time = time.time() # Compute inference on the preprocessed data predictions = infer_sess.run(output_tensor, {input_tensor: np_images}) + elapsed_time = time.time() - start_time with tf.Graph().as_default() as accu_graph: accuracy1 = tf.reduce_sum( @@ -207,6 +209,7 @@ def run(self): total_accuracy1 += np_accuracy1 total_accuracy5 += np_accuracy5 + print("Iteration time: %0.4f ms" % (elapsed_time * 1000)) print("Processed %d images.
(Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ % (num_processed_images, total_accuracy1 / num_processed_images, total_accuracy5 / num_processed_images)) diff --git a/models/image_recognition/tensorflow/inceptionv3/int8/accuracy.py b/models/image_recognition/tensorflow/inceptionv3/int8/accuracy.py index 7d79593aa..8062bd6be 100644 --- a/models/image_recognition/tensorflow/inceptionv3/int8/accuracy.py +++ b/models/image_recognition/tensorflow/inceptionv3/int8/accuracy.py @@ -120,9 +120,11 @@ def load_graph(model_file): np_images, np_labels = sess.run([images[0], labels[0]]) num_processed_images += batch_size num_remaining_images -= batch_size + start_time = time.time() # Compute inference on the preprocessed data predictions = sess_graph.run(output_tensor, {input_tensor: np_images}) + elapsed_time = time.time() - start_time accuracy1 = tf.reduce_sum( tf.cast(tf.nn.in_top_k(tf.constant(predictions), tf.constant(np_labels), 1), tf.float32)) @@ -133,6 +135,7 @@ def load_graph(model_file): np_accuracy1, np_accuracy5 = sess.run([accuracy1, accuracy5]) total_accuracy1 += np_accuracy1 total_accuracy5 += np_accuracy5 + print("Iteration time: %0.4f ms" % (elapsed_time * 1000)) print("Processed %d images.
(Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ % (num_processed_images, total_accuracy1/num_processed_images, total_accuracy5/num_processed_images)) diff --git a/models/image_recognition/tensorflow/inceptionv4/inference/accuracy.py b/models/image_recognition/tensorflow/inceptionv4/inference/accuracy.py index 3dc0b90f9..a3bdf7c58 100644 --- a/models/image_recognition/tensorflow/inceptionv4/inference/accuracy.py +++ b/models/image_recognition/tensorflow/inceptionv4/inference/accuracy.py @@ -144,9 +144,11 @@ def load_graph(model_file): np_images, np_labels = sess.run([images[0], labels[0]]) num_processed_images += batch_size num_remaining_images -= batch_size + start_time = time.time() # Compute inference on the preprocessed data predictions = sess_graph.run(output_tensor, {input_tensor: np_images}) + elapsed_time = time.time() - start_time accuracy1 = tf.reduce_sum( tf.cast(tf.nn.in_top_k(tf.constant(predictions), tf.constant(np_labels), 1), tf.float32)) @@ -157,6 +159,7 @@ def load_graph(model_file): np_accuracy1, np_accuracy5 = sess.run([accuracy1, accuracy5]) total_accuracy1 += np_accuracy1 total_accuracy5 += np_accuracy5 + print("Iteration time: %0.4f ms" % (elapsed_time * 1000)) print( "Processed %d images.
(Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ % ( diff --git a/models/image_recognition/tensorflow/mobilenet_v1/inference/fp32/accuracy.py b/models/image_recognition/tensorflow/mobilenet_v1/inference/fp32/accuracy.py index 7d6a37abc..f5d45fb9f 100644 --- a/models/image_recognition/tensorflow/mobilenet_v1/inference/fp32/accuracy.py +++ b/models/image_recognition/tensorflow/mobilenet_v1/inference/fp32/accuracy.py @@ -143,9 +143,11 @@ def load_graph(model_file): np_images, np_labels = sess.run([images[0], labels[0]]) num_processed_images += batch_size num_remaining_images -= batch_size + start_time = time.time() # Compute inference on the preprocessed data predictions = sess_graph.run(output_tensor, {input_tensor: np_images}) + elapsed_time = time.time() - start_time accuracy1 = tf.reduce_sum( tf.cast(tf.nn.in_top_k(tf.constant(predictions), tf.constant(np_labels), 1), tf.float32)) @@ -156,6 +158,7 @@ def load_graph(model_file): np_accuracy1, np_accuracy5 = sess.run([accuracy1, accuracy5]) total_accuracy1 += np_accuracy1 total_accuracy5 += np_accuracy5 + print("Iteration time: %0.4f ms" % (elapsed_time * 1000)) print( "Processed %d images.
(Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ % ( diff --git a/models/image_recognition/tensorflow/mobilenet_v1/inference/fp32/eval_image_classifier.py b/models/image_recognition/tensorflow/mobilenet_v1/inference/fp32/eval_image_classifier.py index fd3165387..974913258 100644 --- a/models/image_recognition/tensorflow/mobilenet_v1/inference/fp32/eval_image_classifier.py +++ b/models/image_recognition/tensorflow/mobilenet_v1/inference/fp32/eval_image_classifier.py @@ -146,56 +146,66 @@ def end(self, run_context): print('Latency ms/step = %.1f' % (latency)) def main(_): - if not FLAGS.dataset_dir: - raise ValueError('You must supply the dataset directory with --dataset_dir') - tf.logging.set_verbosity(tf.logging.INFO) - #os.environ["OMP_NUM_THREADS"] = "54" + with tf.Graph().as_default(): tf_global_step = slim.get_or_create_global_step() ###################### # Select the dataset # ###################### - dataset = dataset_factory.get_dataset( - FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir) + if FLAGS.dataset_dir: + print("Inference using real data") + dataset = dataset_factory.get_dataset( + FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir) + num_classes = dataset.num_classes - FLAGS.labels_offset + else: + print("Inference using synthetic data") + num_classes = 1000 #################### # Select the model # #################### network_fn = nets_factory.get_network_fn( FLAGS.model_name, - num_classes=(dataset.num_classes - FLAGS.labels_offset), - is_training=False) - - ############################################################## - # Create a dataset provider that loads data from the dataset # - ############################################################## - provider = slim.dataset_data_provider.DatasetDataProvider( - dataset, - shuffle=False, - common_queue_capacity=2 * FLAGS.batch_size, - common_queue_min=FLAGS.batch_size) - [image, label] = provider.get(['image', 'label']) - label -= FLAGS.labels_offset - - 
##################################### - # Select the preprocessing function # - ##################################### - preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name - image_preprocessing_fn = preprocessing_factory.get_preprocessing( - preprocessing_name, + num_classes=num_classes, is_training=False) eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size - image = image_preprocessing_fn(image, eval_image_size, eval_image_size) - - images, labels = tf.train.batch( - [image, label], - batch_size=FLAGS.batch_size, - num_threads=FLAGS.num_preprocessing_threads, - capacity=5 * FLAGS.batch_size) + if FLAGS.dataset_dir: + ############################################################## + # Create a dataset provider that loads data from the dataset # + ############################################################## + provider = slim.dataset_data_provider.DatasetDataProvider( + dataset, + shuffle=False, + common_queue_capacity=2 * FLAGS.batch_size, + common_queue_min=FLAGS.batch_size) + [image, label] = provider.get(['image', 'label']) + label -= FLAGS.labels_offset + + ##################################### + # Select the preprocessing function # + ##################################### + preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name + image_preprocessing_fn = preprocessing_factory.get_preprocessing( + preprocessing_name, + is_training=False) + + image = image_preprocessing_fn(image, eval_image_size, eval_image_size) + + images, labels = tf.train.batch( + [image, label], + batch_size=FLAGS.batch_size, + num_threads=FLAGS.num_preprocessing_threads, + capacity=5 * FLAGS.batch_size) + else: + # Generate random images and labels with constant 0 when no dataset is used + input_shape = [FLAGS.batch_size, eval_image_size, eval_image_size, 3] + label_shape = [FLAGS.batch_size] + images = tf.random.uniform(input_shape, 0.0, 255.0, dtype=tf.float32, name='synthetic_images') + labels = tf.constant(0, shape=label_shape, 
dtype=tf.int64) #################### # Define the model # @@ -258,4 +268,4 @@ def main(_): if __name__ == '__main__': - tf.app.run() \ No newline at end of file + tf.app.run() diff --git a/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/__init__.py b/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/__init__.py new file mode 100644 index 000000000..159180624 --- /dev/null +++ b/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/__init__.py @@ -0,0 +1,20 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + diff --git a/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/accuracy.py b/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/accuracy.py new file mode 100644 index 000000000..6d7acaf50 --- /dev/null +++ b/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/accuracy.py @@ -0,0 +1,135 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
if __name__ == "__main__":
  # Measures top-1 / top-5 accuracy of a frozen graph on the ImageNet
  # validation subset and prints the running accuracy after every batch.
  parser = argparse.ArgumentParser()
  parser.add_argument("--input_graph", default=None,
                      help="graph/model to be executed")
  parser.add_argument("--data_location", default=None,
                      help="full path to the validation data")
  parser.add_argument("--input_height", default=224,
                      type=int, help="input height")
  parser.add_argument("--input_width", default=224,
                      type=int, help="input width")
  parser.add_argument("--batch_size", default=32,
                      type=int, help="batch size")
  parser.add_argument("--input_layer", default="input",
                      help="name of input layer")
  parser.add_argument("--output_layer",
                      default="MobilenetV1/Predictions/Reshape_1",
                      help="name of output layer")
  parser.add_argument(
      '--num_inter_threads',
      help='number threads across operators',
      type=int, default=1)
  parser.add_argument(
      '--num_intra_threads',
      help='number threads for an operator',
      type=int, default=1)
  args = parser.parse_args()

  if args.input_graph:
    model_file = args.input_graph
  else:
    sys.exit("Please provide a graph file.")
  input_height = args.input_height
  input_width = args.input_width
  batch_size = args.batch_size
  input_layer = args.input_layer
  output_layer = args.output_layer
  num_inter_threads = args.num_inter_threads
  num_intra_threads = args.num_intra_threads
  data_location = args.data_location
  dataset = datasets.ImagenetData(data_location)
  preprocessor = dataset.get_image_preprocessor()(
      input_height, input_width, batch_size,
      1,  # device count
      tf.float32,  # data_type for input fed to the graph
      train=False,  # doing inference
      resize_method='bilinear')

  images, labels = preprocessor.minibatch(dataset, subset='validation',
                                          use_datasets=True, cache_data=False)
  graph = load_graph(model_file)
  input_tensor = graph.get_tensor_by_name(input_layer + ":0")
  output_tensor = graph.get_tensor_by_name(output_layer + ":0")

  config = tf.ConfigProto()
  config.inter_op_parallelism_threads = num_inter_threads
  config.intra_op_parallelism_threads = num_intra_threads

  # Build the accuracy ops ONCE and feed each batch through placeholders.
  # Creating tf.constant / in_top_k ops inside the loop adds new nodes to the
  # default graph on every iteration, so the graph keeps growing.
  predictions_ph = tf.placeholder(output_tensor.dtype, shape=[None, None])
  labels_ph = tf.placeholder(labels[0].dtype, shape=[None])
  accuracy1 = tf.reduce_sum(
      tf.cast(tf.nn.in_top_k(predictions_ph, labels_ph, 1), tf.float32))
  accuracy5 = tf.reduce_sum(
      tf.cast(tf.nn.in_top_k(predictions_ph, labels_ph, 5), tf.float32))

  total_accuracy1, total_accuracy5 = (0.0, 0.0)
  num_processed_images = 0
  num_remaining_images = dataset.num_examples_per_epoch(subset='validation') \
      - num_processed_images
  # `sess` runs the input pipeline and the accuracy ops (default graph);
  # `sess_graph` runs the imported model graph with the thread settings.
  with tf.Session() as sess:
    sess_graph = tf.Session(graph=graph, config=config)
    try:
      # Only full batches are evaluated; a trailing partial batch is skipped.
      while num_remaining_images >= batch_size:
        # Reads and preprocess data
        np_images, np_labels = sess.run([images[0], labels[0]])
        num_processed_images += batch_size
        num_remaining_images -= batch_size
        start_time = time.time()
        # Compute inference on the preprocessed data
        predictions = sess_graph.run(output_tensor,
                                     {input_tensor: np_images})
        elapsed_time = time.time() - start_time
        np_accuracy1, np_accuracy5 = sess.run(
            [accuracy1, accuracy5],
            {predictions_ph: predictions, labels_ph: np_labels})
        total_accuracy1 += np_accuracy1
        total_accuracy5 += np_accuracy5
        # time.time() deltas are in seconds; convert to match the "ms" label.
        print("Iteration time: %0.4f ms" % (elapsed_time * 1000))
        print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
            % (num_processed_images, total_accuracy1/num_processed_images,
               total_accuracy5/num_processed_images))
    finally:
      # The model session is not managed by a `with` block, so release it
      # explicitly even if a batch fails.
      sess_graph.close()
if __name__ == "__main__":
  # Benchmarks inference throughput of a frozen graph on one batch of
  # synthetic images: warmup steps first, then the timed benchmark steps.
  parser = argparse.ArgumentParser()
  parser.add_argument("--input_graph", default=None,
                      help="graph/model to be executed")
  parser.add_argument("--input_height", default=224,
                      type=int, help="input height")
  parser.add_argument("--input_width", default=224,
                      type=int, help="input width")
  parser.add_argument("--batch_size", default=32,
                      type=int, help="batch size")
  parser.add_argument("--input_layer", default="input",
                      help="name of input layer")
  parser.add_argument("--output_layer",
                      default="MobilenetV1/Predictions/Reshape_1",
                      help="name of output layer")
  parser.add_argument(
      '--num_inter_threads',
      help='number threads across operators',
      type=int, default=1)
  parser.add_argument(
      '--num_intra_threads',
      help='number threads for an operator',
      type=int, default=1)
  parser.add_argument("--warmup_steps", type=int, default=10,
                      help="number of warmup steps")
  parser.add_argument("--steps", type=int, default=50, help="number of steps")
  args = parser.parse_args()

  if args.input_graph:
    model_file = args.input_graph
  else:
    sys.exit("Please provide a graph file.")
  input_height = args.input_height
  input_width = args.input_width
  batch_size = args.batch_size
  input_layer = args.input_layer
  output_layer = args.output_layer
  warmup_steps = args.warmup_steps
  steps = args.steps
  # Explicit check instead of `assert` (asserts are stripped under -O), and
  # the bound now matches the message: exactly 10 steps is accepted.
  if steps < 10:
    sys.exit("Benchmark steps should be at least 10.")
  num_inter_threads = args.num_inter_threads
  num_intra_threads = args.num_intra_threads

  input_shape = [batch_size, input_height, input_width, 3]
  images = tf.truncated_normal(
      input_shape,
      dtype=tf.float32,
      stddev=10,
      name='synthetic_images')

  config = tf.ConfigProto()
  config.inter_op_parallelism_threads = num_inter_threads
  config.intra_op_parallelism_threads = num_intra_threads
  config.use_per_session_threads = True

  # Materialize one synthetic batch up front so that generating the data is
  # not part of the timed session runs below.
  with tf.Session() as sess:
    image_data = sess.run(images)
  graph = load_graph(model_file)

  input_tensor = graph.get_tensor_by_name(input_layer + ":0")
  output_tensor = graph.get_tensor_by_name(output_layer + ":0")

  with tf.Session(graph=graph, config=config) as sess:
    sys.stdout.flush()
    print("[Running warmup steps...]")
    for t in range(warmup_steps):
      start_time = time.time()
      sess.run(output_tensor, {input_tensor: image_data})
      elapsed_time = time.time() - start_time
      if (t + 1) % 10 == 0:
        print("steps = {0}, {1} images/sec"
              "".format(t + 1, batch_size / elapsed_time))
        # print(..., flush=True) is not accepted by Python 2's print
        # function, so flush stdout explicitly instead.
        sys.stdout.flush()

    print("[Running benchmark steps...]")
    for t in range(steps):
      start_time = time.time()
      sess.run(output_tensor, {input_tensor: image_data})
      elapsed_time = time.time() - start_time
      # Throughput is reported for every 10th step only, using that single
      # step's elapsed time.
      if (t + 1) % 10 == 0:
        print("steps = {0}, {1} images/sec"
              "".format(t + 1, batch_size / elapsed_time))
        sys.stdout.flush()
def tensorflow_version_tuple():
  """Return the TensorFlow version as a `(major, minor, patch)` tuple.

  Returns:
    A 3-tuple where `major` and `minor` are ints and `patch` is the remaining
    text of `tf.__version__` after the second dot, kept as a string.

  Raises:
    ValueError: if the version string has fewer than three dot-separated
      components, or if `major`/`minor` are not integers.
  """
  # Split at most twice so that extra dotted components end up in `patch`
  # instead of breaking the three-way unpacking.
  major, minor, patch = tf.__version__.split('.', 2)
  return (int(major), int(minor), patch)
def create_dataset(data_dir, data_name):
  """Create a Dataset instance based on data_dir and data_name.

  Args:
    data_dir: directory holding the data; may be empty/None, in which case the
      dataset classes fall back to synthetic data.
    data_name: one of the supported dataset names ('imagenet', 'cifar10'), or
      None to infer the name from `data_dir`.

  Returns:
    An instance of the dataset class registered for the resolved name,
    constructed with `data_dir`.

  Raises:
    ValueError: if the name cannot be inferred from `data_dir`, or if
      `data_name` is not a supported dataset.
  """
  supported_datasets = {
      'imagenet': ImagenetData,
      'cifar10': Cifar10Data,
  }
  if not data_dir and not data_name:
    # When using synthetic data, use synthetic imagenet images by default.
    data_name = 'imagenet'

  if data_name is None:
    # Infer the dataset from the directory path: the first supported name that
    # occurs as a substring of data_dir wins.
    for supported_name in supported_datasets:
      if supported_name in data_dir:
        data_name = supported_name
        break

  if data_name is None:
    raise ValueError('Could not identify name of dataset. '
                     'Please specify with --data_name option.')

  if data_name not in supported_datasets:
    # Interpolate the names into the message; passing them as a second
    # argument to ValueError leaves the '%s' placeholder unformatted.
    raise ValueError('Unknown dataset. Must be one of %s' %
                     ', '.join(sorted(supported_datasets)))

  return supported_datasets[data_name](data_dir)
+ """ + + def __init__(self, data_dir=None): + super(Cifar10Data, self).__init__('cifar10', 32, 32, data_dir=data_dir, + queue_runner_required=True, + num_classes=10) + + def read_data_files(self, subset='train'): + """Reads from data file and returns images and labels in a numpy array.""" + assert self.data_dir, ('Cannot call `read_data_files` when using synthetic ' + 'data') + if subset == 'train': + filenames = [os.path.join(self.data_dir, 'data_batch_%d' % i) + for i in xrange(1, 6)] + elif subset == 'validation': + filenames = [os.path.join(self.data_dir, 'test_batch')] + else: + raise ValueError('Invalid data subset "%s"' % subset) + + inputs = [] + for filename in filenames: + with gfile.Open(filename, 'r') as f: + inputs.append(cPickle.load(f)) + # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the + # input format. + all_images = np.concatenate( + [each_input['data'] for each_input in inputs]).astype(np.float32) + all_labels = np.concatenate( + [each_input['labels'] for each_input in inputs]) + return all_images, all_labels + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return 50000 + elif subset == 'validation': + return 10000 + else: + raise ValueError('Invalid data subset "%s"' % subset) + + def get_image_preprocessor(self): + if self.use_synthetic_gpu_images(): + return preprocessing.SyntheticImagePreprocessor + else: + return preprocessing.Cifar10ImagePreprocessor diff --git a/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/preprocessing.py b/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/preprocessing.py new file mode 100644 index 000000000..ef94d3e3d --- /dev/null +++ b/models/image_recognition/tensorflow/mobilenet_v1/inference/int8/preprocessing.py @@ -0,0 +1,637 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Image pre-processing utilities. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import math +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +from tensorflow.contrib.data.python.ops import batching +from tensorflow.contrib.data.python.ops import interleave_ops +from tensorflow.contrib.image.python.ops import distort_image_ops +from tensorflow.python.layers import utils +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.platform import gfile +import cnn_util + +from tensorflow.python.ops import control_flow_ops + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. 
+ + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields: + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized + Example protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + text: Tensor tf.string containing the human-readable label. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + } + sparse_float32 = tf.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. 
def get_image_resize_method(resize_method, batch_position=0):
  """Get tensorflow resize method.

  If resize_method is 'round_robin', return different methods based on batch
  position in a round-robin fashion. NOTE: If the batch size is not a multiple
  of the number of methods, then the distribution of methods will not be
  uniform.

  Args:
    resize_method: (string) nearest, bilinear, bicubic, area, or round_robin.
    batch_position: position of the image in a batch. NOTE: this argument can
      be an integer or a tensor
  Returns:
    one of resize type defined in tf.image.ResizeMethod.
  Raises:
    KeyError: if resize_method is neither 'round_robin' nor one of the
      supported method names.
  """
  resize_methods_map = {
      'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR,
      'bilinear': tf.image.ResizeMethod.BILINEAR,
      'bicubic': tf.image.ResizeMethod.BICUBIC,
      'area': tf.image.ResizeMethod.AREA
  }

  if resize_method != 'round_robin':
    return resize_methods_map[resize_method]

  # return a resize method based on batch position in a round-robin fashion.
  # On Python 3, dict.values() is a view that cannot be indexed, so it is
  # materialized as a list before lookup() subscripts it.
  resize_methods = list(resize_methods_map.values())

  def lookup(index):
    return resize_methods[index]

  def resize_method_0():
    return utils.smart_cond(batch_position % len(resize_methods) == 0,
                            lambda: lookup(0), resize_method_1)

  def resize_method_1():
    return utils.smart_cond(batch_position % len(resize_methods) == 1,
                            lambda: lookup(1), resize_method_2)

  def resize_method_2():
    return utils.smart_cond(batch_position % len(resize_methods) == 2,
                            lambda: lookup(2), lambda: lookup(3))

  # NOTE(jsimsa): Unfortunately, we cannot use a single recursive function here
  # because TF would not be able to construct a finite graph.

  return resize_method_0()
def apply_with_random_selector(x, func, num_cases):
  """Applies func(x, case) for one randomly drawn case in [0, num_cases).

  A branch is built for every case index, but the selector drawn at run time
  decides which branch receives the real input.

  Args:
    x: input Tensor.
    func: Python function called as func(tensor, case), where case is a
      Python int.
    num_cases: Python int32, number of cases the selector is sampled from.

  Returns:
    The merged output of the per-case func calls.
  """
  selector = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)

  branch_outputs = []
  for case_index in range(num_cases):
    # Route the real x only into the branch whose index equals the selector.
    is_selected = tf.equal(selector, case_index)
    routed_input = control_flow_ops.switch(x, is_selected)[1]
    branch_outputs.append(func(routed_input, case_index))

  # Element 0 of the merge result is the output tensor.
  return control_flow_ops.merge(branch_outputs)[0]
+ + Each color distortion is non-commutative and thus ordering of the color ops + matters. Ideally we would randomly permute the ordering of the color ops. + Rather then adding that level of complication, we select a distinct ordering + of color ops for each preprocessing thread. + + Args: + image: 3-D Tensor containing single image in [0, 1]. + color_ordering: Python int, a type of distortion (valid values: 0-3). + fast_mode: Avoids slower ops (random_hue and random_contrast) + scope: Optional scope for name_scope. + Returns: + 3-D Tensor color-distorted image on range [0, 1] + Raises: + ValueError: if color_ordering not in [0, 3] + """ + with tf.name_scope(scope, 'distort_color', [image]): + if fast_mode: + if color_ordering == 0: + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + else: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_brightness(image, max_delta=32. / 255.) + else: + if color_ordering == 0: + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + elif color_ordering == 1: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + elif color_ordering == 2: + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_brightness(image, max_delta=32. / 255.) 
+ image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + elif color_ordering == 3: + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + image = tf.image.random_brightness(image, max_delta=32. / 255.) + else: + raise ValueError('color_ordering must be in [0, 3]') + + # The random_* ops do not necessarily clamp. + return tf.clip_by_value(image, 0.0, 1.0) + + +def distorted_bounding_box_crop(image, + bbox, + min_object_covered=0.1, + aspect_ratio_range=(0.75, 1.33), + area_range=(0.05, 1.0), + max_attempts=100, + scope=None): + """Generates cropped_image using a one of the bboxes randomly distorted. + + See `tf.image.sample_distorted_bounding_box` for more documentation. + + Args: + image: 3-D Tensor of image (it will be converted to floats in [0, 1]). + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged + as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole + image. + min_object_covered: An optional `float`. Defaults to `0.1`. The cropped + area of the image must contain at least this fraction of any bounding box + supplied. + aspect_ratio_range: An optional list of `floats`. The cropped area of the + image must have an aspect ratio = width / height within this range. + area_range: An optional list of `floats`. The cropped area of the image + must contain a fraction of the supplied image within in this range. + max_attempts: An optional `int`. Number of attempts at generating a cropped + region of the image of the specified constraints. After `max_attempts` + failures, return the entire image. + scope: Optional scope for name_scope. 
def preprocess_for_train(image, height, width, bbox,
                         batch_position,
                         fast_mode=True,
                         scope=None,
                         add_image_summaries=True):
    """Distort one image for training a network.

    Distorting images provides a useful technique for augmenting the data
    set during training in order to make the network invariant to aspects
    of the image that do not affect the label.

    Args:
      image: 3-D Tensor of image. If dtype is tf.float32 then the range should
        be [0, 1], otherwise it is converted to tf.float32 assuming that the
        range is [0, MAX], where MAX is the largest positive representable
        number for the int(8/16/32) data type (see
        `tf.image.convert_image_dtype` for details).
      height: integer
      width: integer
      bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
        where each coordinate is [0, 1) and the coordinates are arranged
        as [ymin, xmin, ymax, xmax]. If None, a single box covering the whole
        image is used.
      batch_position: position of the image in a batch. Accepted for interface
        compatibility; this function does not read it.
      fast_mode: if True, a single resize case and a single color-distortion
        case are offered to `apply_with_random_selector`; otherwise four of
        each.
      scope: Optional scope for name_scope.
      add_image_summaries: Enable image summaries.
    Returns:
      3-D float Tensor of distorted image used for training with range [-1, 1].
    """
    with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]):
        if bbox is None:
            bbox = tf.constant([0.0, 0.0, 1.0, 1.0],
                               dtype=tf.float32,
                               shape=[1, 1, 4])
        if image.dtype != tf.float32:
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)

        # Each bounding box has shape [1, num_boxes, box coords] and
        # the coordinates are ordered [ymin, xmin, ymax, xmax].
        # The annotated image is only consumed by the summary, so it is only
        # built when summaries are requested.
        if add_image_summaries:
            image_with_box = tf.image.draw_bounding_boxes(
                tf.expand_dims(image, 0), bbox)
            tf.summary.image('image_with_bounding_boxes', image_with_box)

        distorted_image, distorted_bbox = distorted_bounding_box_crop(image,
                                                                      bbox)
        # Restore the shape since the dynamic slice based upon the bbox_size
        # loses the third dimension.
        distorted_image.set_shape([None, None, 3])
        if add_image_summaries:
            image_with_distorted_box = tf.image.draw_bounding_boxes(
                tf.expand_dims(image, 0), distorted_bbox)
            tf.summary.image('images_with_distorted_bounding_box',
                             image_with_distorted_box)

        # This resizing operation may distort the images because the aspect
        # ratio is not respected. ResizeMethod contains 4 enumerated resizing
        # methods; fast_mode restricts the selection to a single case.
        num_resize_cases = 1 if fast_mode else 4
        distorted_image = apply_with_random_selector(
            distorted_image,
            lambda x, method: tf.image.resize_images(x, [height, width],
                                                     method),
            num_cases=num_resize_cases)

        if add_image_summaries:
            tf.summary.image('cropped_resized_image',
                             tf.expand_dims(distorted_image, 0))

        # Randomly flip the image horizontally.
        distorted_image = tf.image.random_flip_left_right(distorted_image)

        # Randomly distort the colors. There are 1 or 4 cases to select from.
        # Only the selected case index is forwarded: the `distort_color`
        # defined later in this file takes
        # (image, batch_position, distort_color_in_yiq, scope), so passing
        # `fast_mode` as a third positional argument bound it to
        # `distort_color_in_yiq` by accident.
        num_distort_cases = 1 if fast_mode else 4
        distorted_image = apply_with_random_selector(
            distorted_image,
            lambda x, ordering: distort_color(x, ordering),
            num_cases=num_distort_cases)

        if add_image_summaries:
            tf.summary.image('final_distorted_image',
                             tf.expand_dims(distorted_image, 0))

        # Rescale from [0, 1] to [-1, 1].
        distorted_image = tf.subtract(distorted_image, 0.5)
        distorted_image = tf.multiply(distorted_image, 2.0)
        return distorted_image
+ #if distort_color_in_yiq: + # image = distort_image_ops.random_hsv_in_yiq( + # image, lower_saturation=0.5, upper_saturation=1.5, + # max_delta_hue=0.2 * math.pi) + #else: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + return image + + def distort_fn_1(image=image): + """Variant 1 of distort function.""" + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + #if distort_color_in_yiq: + # image = distort_image_ops.random_hsv_in_yiq( + # image, lower_saturation=0.5, upper_saturation=1.5, + # max_delta_hue=0.2 * math.pi) + #else: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + return image + + image = utils.smart_cond(batch_position % 2 == 0, distort_fn_0, + distort_fn_1) + # The random_* ops do not necessarily clamp. 
class RecordInputImagePreprocessor(object):
    """Preprocessor for images with RecordInput format.

    Reads the TFRecord files matched by `dataset.tf_record_pattern(subset)`,
    decodes and preprocesses each JPEG, and returns one image tensor and one
    label tensor per split.
    """

    def __init__(self,
                 height,
                 width,
                 batch_size,
                 num_splits,
                 dtype,
                 train,
                 distortions=False,
                 resize_method="bilinear",
                 shift_ratio=0,
                 summary_verbosity=1,
                 distort_color_in_yiq=False,
                 fuse_decode_and_crop=False):
        """Store the configuration and derive the per-split batch size.

        Args:
          height: output image height.
          width: output image width.
          batch_size: total number of images per step across all splits.
          num_splits: number of splits the batch is divided into.
          dtype: dtype the image tensors are cast to in `minibatch`.
          train: if truthy, use `preprocess_for_train`; otherwise
            `preprocess_for_eval`.
          distortions, resize_method, summary_verbosity,
          distort_color_in_yiq, fuse_decode_and_crop: stored on the instance;
            not read by the methods of this class.
          shift_ratio: default shift ratio for the RecordInput op, used when
            `minibatch` is called with a negative `shift_ratio`.

        Raises:
          ValueError: if `batch_size` is not a multiple of `num_splits`.
        """
        self.height = height
        self.width = width
        self.batch_size = batch_size
        self.num_splits = num_splits
        self.dtype = dtype
        self.train = train
        self.resize_method = resize_method
        self.shift_ratio = shift_ratio
        self.distortions = distortions
        self.distort_color_in_yiq = distort_color_in_yiq
        self.fuse_decode_and_crop = fuse_decode_and_crop
        if self.batch_size % self.num_splits != 0:
            raise ValueError(
                ('batch_size must be a multiple of num_splits: '
                 'batch_size %d, num_splits: %d') %
                (self.batch_size, self.num_splits))
        self.batch_size_per_split = self.batch_size // self.num_splits
        self.summary_verbosity = summary_verbosity

    def image_preprocess(self, image_buffer, bbox, batch_position):
        """Preprocessing image_buffer as a function of its batch position."""
        # Both the training and the evaluation path start from the same
        # decode, so it is done once up front.
        image = tf.image.decode_jpeg(
            image_buffer, channels=3, dct_method='INTEGER_FAST')
        if self.train:
            return preprocess_for_train(image, self.height, self.width, bbox,
                                        batch_position)
        return preprocess_for_eval(image, self.height, self.width)

    def parse_and_preprocess(self, value, batch_position):
        """Parse one serialized Example and return `(label, image)`."""
        image_buffer, label_index, bbox, _ = parse_example_proto(value)
        image = self.image_preprocess(image_buffer, bbox, batch_position)
        return (label_index, image)

    def minibatch(self, dataset, subset, use_datasets, cache_data,
                  shift_ratio=-1):
        """Build the input pipeline and return per-split images and labels.

        Args:
          dataset: object providing `tf_record_pattern(subset)`.
          subset: name of the data subset to read.
          use_datasets: if truthy, read through `tf.data`; otherwise through
            the `RecordInput` op.
          cache_data: with `use_datasets`, cache and repeat the first record
            only.
          shift_ratio: shift ratio for `RecordInput`; a negative value selects
            `self.shift_ratio`.

        Returns:
          `(images, labels)`: two lists of length `num_splits`. Each image
          tensor is reshaped to
          `[batch_size_per_split, height, width, 3]` and each label tensor to
          `[batch_size_per_split]`.

        Raises:
          ValueError: with `use_datasets`, if no file matches the pattern.
        """
        if shift_ratio < 0:
            shift_ratio = self.shift_ratio
        with tf.name_scope('batch_processing'):
            # Build final results per split.
            images = [[] for _ in range(self.num_splits)]
            labels = [[] for _ in range(self.num_splits)]
            if use_datasets:
                glob_pattern = dataset.tf_record_pattern(subset)
                file_names = gfile.Glob(glob_pattern)
                if not file_names:
                    raise ValueError(
                        'Found no files in --data_dir matching: {}'
                        .format(glob_pattern))
                ds = tf.data.TFRecordDataset.list_files(file_names)
                ds = ds.apply(
                    interleave_ops.parallel_interleave(
                        tf.data.TFRecordDataset, cycle_length=10))
                if cache_data:
                    ds = ds.take(1).cache().repeat()
                # Pair every record with a repeating 0..batch_size-1 counter
                # that is passed to the map function as the batch position.
                counter = tf.data.Dataset.range(self.batch_size)
                counter = counter.repeat()
                ds = tf.data.Dataset.zip((ds, counter))
                ds = ds.prefetch(buffer_size=self.batch_size)
                ds = ds.shuffle(buffer_size=10000)
                ds = ds.repeat()
                ds = ds.apply(
                    batching.map_and_batch(
                        map_func=self.parse_and_preprocess,
                        batch_size=self.batch_size_per_split,
                        num_parallel_batches=self.num_splits))
                ds = ds.prefetch(buffer_size=self.num_splits)
                ds_iterator = ds.make_one_shot_iterator()
                # `range` (already used above) works on Python 2 and 3,
                # whereas `xrange` needs a compatibility shim on Python 3.
                for d in range(self.num_splits):
                    labels[d], images[d] = ds_iterator.get_next()
            else:
                record_input = data_flow_ops.RecordInput(
                    file_pattern=dataset.tf_record_pattern(subset),
                    seed=301,
                    parallelism=64,
                    buffer_size=10000,
                    batch_size=self.batch_size,
                    shift_ratio=shift_ratio,
                    name='record_input')
                records = record_input.get_yield_op()
                records = tf.split(records, self.batch_size, 0)
                records = [tf.reshape(record, []) for record in records]
                # Records are dealt to the splits round-robin.
                for idx, value in enumerate(records):
                    (label, image) = self.parse_and_preprocess(value, idx)
                    split_index = idx % self.num_splits
                    labels[split_index].append(label)
                    images[split_index].append(image)

            depth = 3
            for split_index in range(self.num_splits):
                if not use_datasets:
                    # The RecordInput path holds Python lists of per-image
                    # tensors that still have to be combined.
                    images[split_index] = tf.parallel_stack(
                        images[split_index])
                    labels[split_index] = tf.concat(labels[split_index], 0)
                images[split_index] = tf.cast(images[split_index], self.dtype)
                images[split_index] = tf.reshape(
                    images[split_index],
                    shape=[self.batch_size_per_split, self.height, self.width,
                           depth])
                labels[split_index] = tf.reshape(labels[split_index],
                                                 [self.batch_size_per_split])
            return images, labels
(Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ % (num_processed_images, total_accuracy1 / num_processed_images, total_accuracy5 / num_processed_images)) diff --git a/models/image_recognition/tensorflow/resnet50/inference/eval_image_classifier_inference.py b/models/image_recognition/tensorflow/resnet50/inference/eval_image_classifier_inference.py index 198509a23..21a1b465e 100644 --- a/models/image_recognition/tensorflow/resnet50/inference/eval_image_classifier_inference.py +++ b/models/image_recognition/tensorflow/resnet50/inference/eval_image_classifier_inference.py @@ -165,12 +165,12 @@ def run(self): input_tensor = infer_graph.get_tensor_by_name('input:0') output_tensor = infer_graph.get_tensor_by_name('predict:0') - data_sess = tf.Session(graph=data_graph, config=data_config) + data_sess = tf.Session(graph=data_graph, config=data_config) infer_sess = tf.Session(graph=infer_graph, config=infer_config) num_processed_images = 0 num_remaining_images = dataset.num_examples_per_epoch(subset=subset) - num_processed_images \ - if self.args.data_location else datasets.IMAGENET_NUM_VAL_IMAGES + if self.args.data_location else (self.args.batch_size * self.args.steps) if (not self.args.accuracy_only): iteration = 0 @@ -230,9 +230,11 @@ def run(self): num_processed_images += self.args.batch_size num_remaining_images -= self.args.batch_size + start_time = time.time() # Compute inference on the preprocessed data predictions = infer_sess.run(output_tensor, {input_tensor: np_images}) + elapsed_time = time.time() - start_time # Write out the file name, expected label, and top prediction self.write_results_output(predictions, tf_filenames, np_labels) @@ -251,6 +253,7 @@ def run(self): total_accuracy1 += np_accuracy1 total_accuracy5 += np_accuracy5 + print("Iteration time: %0.4f ms" % elapsed_time) print("Processed %d images. 
(Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ % (num_processed_images, total_accuracy1 / num_processed_images, total_accuracy5 / num_processed_images)) diff --git a/models/image_recognition/tensorflow/resnet50v1_5/__init__.py b/models/image_recognition/tensorflow/resnet50v1_5/__init__.py new file mode 100644 index 000000000..159180624 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/__init__.py @@ -0,0 +1,20 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/__init__.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/__init__.py new file mode 100644 index 000000000..159180624 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/__init__.py @@ -0,0 +1,20 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/datasets.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/datasets.py new file mode 100644 index 000000000..cb848e467 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/datasets.py @@ -0,0 +1,96 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Benchmark dataset utilities. 
IMAGENET_NUM_TRAIN_IMAGES = 1281167
IMAGENET_NUM_VAL_IMAGES = 50000
IMAGENET_NUM_CLASSES = 1000


class Dataset(object):
    """Base class describing a benchmark dataset stored as TFRecord shards.

    Subclasses supply `num_classes` and `num_examples_per_epoch`.
    """

    def __init__(self, name, data_dir=None):
        """Remember the dataset name and directory.

        Raises:
          ValueError: if `data_dir` is None.
        """
        self.name = name
        if data_dir is None:
            raise ValueError('Data directory not specified')
        self.data_dir = data_dir

    def tf_record_pattern(self, subset):
        """Return the glob pattern for the shard files of `subset`."""
        shard_glob = '%s-*-of-*' % subset
        return os.path.join(self.data_dir, shard_glob)

    def reader(self):
        """Return a new `tf.TFRecordReader`."""
        return tf.TFRecordReader()

    @abstractmethod
    def num_classes(self):
        """Number of label classes; provided by subclasses."""
        pass

    @abstractmethod
    def num_examples_per_epoch(self, subset):
        """Number of examples in `subset`; provided by subclasses."""
        pass

    def __str__(self):
        return self.name


class ImagenetData(Dataset):
    """ImageNet dataset description."""

    def __init__(self, data_dir=None):
        super(ImagenetData, self).__init__('ImageNet', data_dir)

    def num_classes(self):
        """Return the number of ImageNet classes."""
        return IMAGENET_NUM_CLASSES

    def num_examples_per_epoch(self, subset='train'):
        """Return the number of examples in `subset`.

        Raises:
          ValueError: if `subset` is not 'train', 'validation', 'calibrate'
            or 'calibration'.
        """
        if subset == 'train':
            return IMAGENET_NUM_TRAIN_IMAGES
        if subset == 'validation':
            return IMAGENET_NUM_VAL_IMAGES
        if subset in ('calibrate', 'calibration'):
            return 100
        raise ValueError('Invalid data subset "%s"' % subset)

    def get_image_preprocessor(self):
        """Return the preprocessor class (not an instance) for this dataset."""
        return preprocessing.RecordInputImagePreprocessor
class eval_classifier_optimized_graph:
    """Evaluate image classifier with optimized TensorFlow graph.

    Command-line arguments are parsed in the constructor. `run` then either
    times inference (default) or, with `--accuracy-only`, reports running
    top-1/top-5 accuracy.
    """

    @staticmethod
    def _parse_bool(value):
        """Convert command-line text such as 'True' or 'false' to a bool.

        `type=bool` treats every non-empty string, including 'False', as
        True. A ValueError raised here is reported by argparse as a usage
        error.
        """
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise ValueError('expected a boolean value, got %r' % (value,))

    def __init__(self):
        """Parse the command line into `self.args` and validate it."""
        arg_parser = ArgumentParser(description='Parse args')

        arg_parser.add_argument('-b', "--batch-size",
                                help="Specify the batch size. If this "
                                     "parameter is not specified or is -1, the "
                                     "largest ideal batch size for the model will "
                                     "be used.",
                                dest="batch_size", type=int, default=-1)

        arg_parser.add_argument('-e', "--num-inter-threads",
                                help='The number of inter-thread.',
                                dest='num_inter_threads', type=int, default=0)

        arg_parser.add_argument('-a', "--num-intra-threads",
                                help='The number of intra-thread.',
                                dest='num_intra_threads', type=int, default=0)

        arg_parser.add_argument('-m', "--model-name",
                                help='Specify the model name to run benchmark for',
                                dest='model_name')

        arg_parser.add_argument('-g', "--input-graph",
                                help='Specify the input graph for the transform tool',
                                dest='input_graph')

        arg_parser.add_argument('-d', "--data-location",
                                help='Specify the location of the data. '
                                     'If this parameter is not specified, '
                                     'the benchmark will use random/dummy data.',
                                dest="data_location", default=None)

        arg_parser.add_argument('-r', "--accuracy-only",
                                help='For accuracy measurement only.',
                                dest='accuracy_only', action='store_true')
        # Parsed with _parse_bool so that "--calibrate False" really is False.
        arg_parser.add_argument('--calibrate', dest='calibrate',
                                help='Run accuracy with calibration data,'
                                     'to generate min_max ranges, calibrate=[True/False]',
                                type=self._parse_bool, default=False)
        arg_parser.add_argument("--results-file-path",
                                help="File path for the inference results",
                                dest="results_file_path", default=None)
        arg_parser.add_argument("--warmup-steps", type=int, default=10,
                                help="number of warmup steps")
        arg_parser.add_argument("--steps", type=int, default=50,
                                help="number of steps")

        arg_parser.add_argument(
            '--data-num-inter-threads', dest='data_num_inter_threads',
            help='number threads across operators',
            type=int, default=32)
        arg_parser.add_argument(
            '--data-num-intra-threads', dest='data_num_intra_threads',
            help='number threads for data layer operator',
            type=int, default=14)
        arg_parser.add_argument(
            '--num-cores', dest='num_cores',
            help='number of cores',
            type=int, default=28)

        self.args = arg_parser.parse_args()
        # validate the arguments
        self.validate_args()

    def write_results_output(self, predictions, filenames, labels):
        """Append one `filename,label,top_prediction` line per image.

        Nothing is written unless a results file path was given and both
        `filenames` and `labels` are available (they are None when the
        caller did not fetch them, e.g. with dummy data).
        """
        if not self.args.results_file_path:
            return
        if filenames is None or labels is None:
            return
        top_predictions = np.argmax(predictions, 1)
        with open(self.args.results_file_path, "a") as fp:
            for filename, expected_label, top_prediction in zip(
                    filenames, labels, top_predictions):
                fp.write("{},{},{}\n".format(filename, expected_label,
                                             top_prediction))

    def run(self):
        """Run the benchmark or the accuracy check with the optimized graph."""
        print("Run inference")

        data_config = tf.ConfigProto()
        data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads
        data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads
        data_config.use_per_session_threads = 1

        infer_config = tf.ConfigProto()
        infer_config.intra_op_parallelism_threads = self.args.num_intra_threads
        infer_config.inter_op_parallelism_threads = self.args.num_inter_threads
        infer_config.use_per_session_threads = 1

        # Only the real-data pipeline produces labels and file names.
        labels = None
        filenames = None

        data_graph = tf.Graph()
        with data_graph.as_default():
            if self.args.data_location:
                print("Inference with real data.")
                subset = 'calibration' if self.args.calibrate else 'validation'
                dataset = datasets.ImagenetData(self.args.data_location)
                preprocessor = dataset.get_image_preprocessor()(
                    RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, self.args.batch_size,
                    num_cores=self.args.num_cores,
                    resize_method='crop')

                images, labels, filenames = preprocessor.minibatch(
                    dataset, subset=subset)

                # If a results file path is provided, then start the
                # prediction output file
                if self.args.results_file_path:
                    with open(self.args.results_file_path, "w+") as fp:
                        fp.write("filename,actual,prediction\n")

                num_remaining_images = dataset.num_examples_per_epoch(
                    subset=subset)
            else:
                print("Inference with dummy data.")
                input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE,
                               RESNET_IMAGE_SIZE, 3]
                images = tf.random.uniform(input_shape, 0.0, 255.0,
                                           dtype=tf.float32,
                                           name='synthetic_images')
                # Synthetic data is unlimited; budget exactly the requested
                # number of steps instead of the ImageNet validation size.
                num_remaining_images = self.args.batch_size * self.args.steps

        infer_graph = tf.Graph()
        with infer_graph.as_default():
            graph_def = tf.GraphDef()
            with tf.gfile.FastGFile(self.args.input_graph, 'rb') as input_file:
                input_graph_content = input_file.read()
                graph_def.ParseFromString(input_graph_content)

            output_graph = graph_transforms.TransformGraph(
                graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])
            tf.import_graph_def(output_graph, name='')

        # Input and output tensors of the inference graph
        input_tensor = infer_graph.get_tensor_by_name('input_tensor:0')
        output_tensor = infer_graph.get_tensor_by_name('softmax_tensor:0')

        data_sess = tf.Session(graph=data_graph, config=data_config)
        infer_sess = tf.Session(graph=infer_graph, config=infer_config)
        try:
            if not self.args.accuracy_only:
                self._run_benchmark(data_sess, infer_sess, images, labels,
                                    filenames, input_tensor, output_tensor,
                                    num_remaining_images)
            else:
                self._run_accuracy(data_sess, infer_sess, images, labels,
                                   filenames, input_tensor, output_tensor,
                                   num_remaining_images)
        finally:
            # Release the sessions even if a step raised.
            infer_sess.close()
            data_sess.close()

    def _run_benchmark(self, data_sess, infer_sess, images, labels, filenames,
                       input_tensor, output_tensor, num_remaining_images):
        """Time inference for up to `--steps` batches and print statistics."""
        batch_size = self.args.batch_size
        warm_up_iteration = self.args.warmup_steps
        total_run = self.args.steps
        # Labels and file names can only be fetched when they exist.
        fetch_results = bool(self.args.results_file_path) and labels is not None

        iteration = 0
        timed_iterations = 0
        total_time = 0

        while num_remaining_images >= batch_size and iteration < total_run:
            iteration += 1
            tf_filenames = None
            np_labels = None
            data_load_start = time.time()
            if fetch_results:
                image_np, np_labels, tf_filenames = data_sess.run(
                    [images, labels, filenames])
            else:
                image_np = data_sess.run(images)
            data_load_time = time.time() - data_load_start

            num_remaining_images -= batch_size

            start_time = time.time()
            predictions = infer_sess.run(output_tensor,
                                         feed_dict={input_tensor: image_np})
            time_consume = time.time() - start_time

            # Write out the file name, expected label, and top prediction
            self.write_results_output(predictions, tf_filenames, np_labels)

            # only add data loading time for real data, not for dummy data
            if self.args.data_location:
                time_consume += data_load_time

            print('Iteration %d: %.6f sec' % (iteration, time_consume))
            if iteration > warm_up_iteration:
                total_time += time_consume
                timed_iterations += 1

        # With --steps <= --warmup-steps (or too little data) nothing was
        # timed, and an average would divide by zero.
        if timed_iterations == 0:
            print('No timed iterations: ran %d iteration(s) with %d warmup '
                  'step(s); increase --steps to get timing statistics.'
                  % (iteration, warm_up_iteration))
            return

        time_average = total_time / timed_iterations
        print('Average time: %.6f sec' % (time_average))

        print('Batch size = %d' % batch_size)
        if batch_size == 1:
            print('Latency: %.3f ms' % (time_average * 1000))
        # print throughput for both batch size 1 and 128
        print('Throughput: %.3f images/sec' % (batch_size / time_average))

    def _run_accuracy(self, data_sess, infer_sess, images, labels, filenames,
                      input_tensor, output_tensor, num_remaining_images):
        """Run every full batch and print running top-1/top-5 accuracy."""
        batch_size = self.args.batch_size
        fetch_filenames = bool(self.args.results_file_path)
        total_accuracy1, total_accuracy5 = (0.0, 0.0)
        num_processed_images = 0

        while num_remaining_images >= batch_size:
            # Reads and preprocess data
            tf_filenames = None
            if fetch_filenames:
                np_images, np_labels, tf_filenames = data_sess.run(
                    [images, labels, filenames])
            else:
                np_images, np_labels = data_sess.run([images, labels])
            num_processed_images += batch_size
            num_remaining_images -= batch_size

            start_time = time.time()
            # Compute inference on the preprocessed data
            predictions = infer_sess.run(output_tensor,
                                         {input_tensor: np_images})
            elapsed_time = time.time() - start_time

            # Write out the file name, expected label, and top prediction
            self.write_results_output(predictions, tf_filenames, np_labels)

            with tf.Graph().as_default():
                accuracy1 = tf.reduce_sum(
                    tf.cast(tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 1),
                            tf.float32))

                accuracy5 = tf.reduce_sum(
                    tf.cast(tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 5),
                            tf.float32))
                with tf.Session() as accu_sess:
                    np_accuracy1, np_accuracy5 = accu_sess.run(
                        [accuracy1, accuracy5])

            total_accuracy1 += np_accuracy1
            total_accuracy5 += np_accuracy5

            # time.time() differences are seconds; convert to match the label.
            print("Iteration time: %0.4f ms" % (elapsed_time * 1000))
            print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)"
                  % (num_processed_images,
                     total_accuracy1 / num_processed_images,
                     total_accuracy5 / num_processed_images))

    def validate_args(self):
        """Validate the arguments.

        Raises:
          ValueError: if accuracy measurement is requested without a data
            location.
        """
        if not self.args.data_location:
            if self.args.accuracy_only:
                raise ValueError("You must use real data for accuracy measurement.")
def parse_example_proto(example_serialized):
    """Parse a serialized Example proto holding one image.

    Args:
      example_serialized: the serialized Example, passed to
        `tf.parse_single_example`.

    Returns:
      A tuple `(encoded_image, label, filename)`: the 'image/encoded' string,
      the 'image/class/label' feature cast to int32, and the
      'image/filename' string.
    """
    bbox_keys = ['image/object/bbox/xmin',
                 'image/object/bbox/ymin',
                 'image/object/bbox/xmax',
                 'image/object/bbox/ymax']

    # Dense features in Example proto.
    feature_spec = {
        'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
                                            default_value=''),
        'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
                                                default_value=-1),
        'image/filename': tf.FixedLenFeature([], dtype=tf.string,
                                             default_value="")
    }
    # Sparse features in Example proto. The bounding-box coordinates are
    # part of the parse spec but are not returned by this function.
    var_len_float = tf.VarLenFeature(dtype=tf.float32)
    for key in bbox_keys:
        feature_spec[key] = var_len_float

    parsed = tf.parse_single_example(example_serialized, feature_spec)
    encoded_image = parsed['image/encoded']
    label = tf.cast(parsed['image/class/label'], dtype=tf.int32)
    filename = tf.cast(parsed['image/filename'], dtype=tf.string)

    return encoded_image, label, filename
class RecordInputImagePreprocessor(object):
    """Preprocessor for images with RecordInput format.

    Builds a `tf.data` pipeline over TFRecord files that yields batches of
    evaluation images together with their labels and file names.
    """

    def __init__(self,
                 height,
                 width,
                 batch_size,
                 num_cores,
                 resize_method="bilinear"):
        """Store the output image size, batch size, core count and resize mode."""
        self.resize_method = resize_method
        self.num_cores = num_cores
        self.batch_size = batch_size
        self.height, self.width = height, width

    def parse_and_preprocess(self, value):
        """Turn one serialized Example into `(image, label, filename)`."""
        encoded_jpeg, label_index, filename = parse_example_proto(value)
        decoded = tf.image.decode_jpeg(
            encoded_jpeg, channels=3, fancy_upscaling=False,
            dct_method='INTEGER_FAST')
        processed = eval_image(decoded, self.height, self.width,
                               self.resize_method)
        return (processed, label_index, filename)

    def minibatch(self, dataset, subset, cache_data=False):
        """Build the input pipeline and return one batch of tensors.

        Args:
          dataset: object providing `tf_record_pattern(subset)`.
          subset: name of the data subset to read.
          cache_data: if truthy, cache and repeat the first record only.

        Returns:
          `(images, labels, filename)` from the iterator's `get_next()`;
          `labels` and `filename` are reshaped to `[batch_size]`.

        Raises:
          ValueError: if no file matches the subset's pattern.
        """
        with tf.name_scope('batch_processing'):
            pattern = dataset.tf_record_pattern(subset)
            matched_files = gfile.Glob(pattern)
            if not matched_files:
                raise ValueError('Found no files in --data_dir matching: {}'
                                 .format(pattern))

            # Read the record files in parallel, one reader per core.
            interleave = parallel_interleave(
                tf.data.TFRecordDataset, cycle_length=self.num_cores,
                block_length=5,
                sloppy=True,
                buffer_output_elements=10000, prefetch_input_elements=10000)
            pipeline = tf.data.TFRecordDataset.list_files(matched_files)
            pipeline = pipeline.apply(interleave)

            if cache_data:
                pipeline = pipeline.take(1).cache().repeat()

            pipeline = pipeline.prefetch(buffer_size=10000)

            # num of parallel batches not greater than 56
            parallel_batches = min(56, 2 * self.num_cores)
            pipeline = pipeline.apply(
                map_and_batch(
                    map_func=self.parse_and_preprocess,
                    batch_size=self.batch_size,
                    num_parallel_batches=parallel_batches,
                    num_parallel_calls=None))

            pipeline = pipeline.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

            next_batch = pipeline.make_one_shot_iterator().get_next()
            images, labels, filename = next_batch
            # Reshape labels and file names to [batch_size].
            flat_shape = [self.batch_size]
            labels = tf.reshape(labels, flat_shape)
            filename = tf.reshape(filename, flat_shape)

            return images, labels, filename
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
if __name__ == "__main__":
    # Benchmark driver: builds one graph/session for input data and a second
    # graph/session for the frozen model, then reports images/sec.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_graph", default=None,
                        help="graph/model to be executed")
    parser.add_argument("--input_height", default=224,
                        type=int, help="input height")
    parser.add_argument("--input_width", default=224,
                        type=int, help="input width")
    parser.add_argument("--batch_size", default=32,
                        type=int, help="batch size")
    parser.add_argument("--data_location", default=None,
                        help="dataset location")
    parser.add_argument("--input_layer", default="input",
                        help="name of input layer")
    parser.add_argument("--output_layer", default="predict",
                        help="name of output layer")
    parser.add_argument("--num_cores", default=28,
                        type=int, help="number of physical cores")
    parser.add_argument(
        '--num_inter_threads',
        help='number threads across operators',
        type=int, default=1)
    parser.add_argument(
        '--num_intra_threads',
        help='number threads for an operator',
        type=int, default=1)
    parser.add_argument(
        '--data_num_inter_threads',
        help='number threads across data layer operators',
        type=int, default=16)
    parser.add_argument(
        '--data_num_intra_threads',
        help='number threads for an data layer operator',
        type=int, default=14)
    parser.add_argument("--warmup_steps", type=int, default=10,
                        help="number of warmup steps")
    parser.add_argument("--steps", type=int, default=50, help="number of steps")
    args = parser.parse_args()

    if args.input_graph:
        model_file = args.input_graph
    else:
        sys.exit("Please provide a graph file.")

    # A zero height/width falls back to 224, as before.
    input_height = args.input_height or 224
    input_width = args.input_width or 224
    batch_size = args.batch_size
    input_layer = args.input_layer
    output_layer = args.output_layer
    warmup_steps = args.warmup_steps
    steps = args.steps
    # The message promises "at least 10", so exactly 10 must be accepted.
    # An explicit check is used because `assert` is stripped under -O.
    if steps < 10:
        sys.exit("Benchmark steps should be at least 10.")
    num_inter_threads = args.num_inter_threads
    num_intra_threads = args.num_intra_threads

    # Separate thread-pool settings for the input pipeline and for inference.
    data_config = tf.ConfigProto()
    data_config.intra_op_parallelism_threads = args.data_num_intra_threads
    data_config.inter_op_parallelism_threads = args.data_num_inter_threads
    data_config.use_per_session_threads = 1

    infer_config = tf.ConfigProto()
    infer_config.intra_op_parallelism_threads = num_intra_threads
    infer_config.inter_op_parallelism_threads = num_inter_threads
    infer_config.use_per_session_threads = 1

    data_graph = tf.Graph()
    with data_graph.as_default():
        if args.data_location:
            print("inference with real data")
            # get the images from dataset
            dataset = datasets.ImagenetData(args.data_location)
            preprocessor = dataset.get_image_preprocessor(benchmark=True)(
                input_height, input_width, batch_size,
                num_cores=args.num_cores,
                resize_method='crop')
            images = preprocessor.minibatch(dataset, subset='validation')
        else:
            # synthetic images
            print("inference with dummy data")
            input_shape = [batch_size, input_height, input_width, 3]
            images = tf.random.uniform(
                input_shape, 0.0, 255.0, dtype=tf.float32,
                name='synthetic_images')

    infer_graph = tf.Graph()
    with infer_graph.as_default():
        graph_def = tf.GraphDef()
        with open(model_file, "rb") as f:
            graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name='')

        input_tensor = infer_graph.get_tensor_by_name(input_layer + ":0")
        output_tensor = infer_graph.get_tensor_by_name(output_layer + ":0")
        # NOTE(review): the op returned here is never run by this script.
        tf.global_variables_initializer()

    data_sess = tf.Session(graph=data_graph, config=data_config)
    infer_sess = tf.Session(graph=infer_graph, config=infer_config)

    try:
        print("[Running warmup steps...]")
        step_total_time = 0
        step_total_images = 0

        for t in range(warmup_steps):
            data_start_time = time.time()
            image_data = data_sess.run(images)
            data_load_time = time.time() - data_start_time

            start_time = time.time()
            infer_sess.run(output_tensor, {input_tensor: image_data})
            elapsed_time = time.time() - start_time

            # only count the data loading and processing time for real data
            if args.data_location:
                elapsed_time += data_load_time

            step_total_time += elapsed_time
            step_total_images += batch_size

            if (t + 1) % 10 == 0:
                print("steps = {0}, {1} images/sec"
                      "".format(t + 1, step_total_images / step_total_time))
                step_total_time = 0
                step_total_images = 0

        print("[Running benchmark steps...]")
        total_time = 0
        total_images = 0

        step_total_time = 0
        step_total_images = 0

        for t in range(steps):
            try:
                data_start_time = time.time()
                image_data = data_sess.run(images)
                data_load_time = time.time() - data_start_time

                start_time = time.time()
                infer_sess.run(output_tensor, {input_tensor: image_data})
                elapsed_time = time.time() - start_time

                # only count the data loading and processing time for real data
                if args.data_location:
                    elapsed_time += data_load_time

                total_time += elapsed_time
                total_images += batch_size

                step_total_time += elapsed_time
                step_total_images += batch_size

                if (t + 1) % 10 == 0:
                    print("steps = {0}, {1} images/sec"
                          "".format(t + 1,
                                    step_total_images / step_total_time))
                    step_total_time = 0
                    step_total_images = 0

            except tf.errors.OutOfRangeError:
                print("Running out of images from dataset.")
                break
    finally:
        # Release both sessions even if a step raised.
        infer_sess.close()
        data_sess.close()

    # total_time stays 0 when the dataset ran out before the first benchmark
    # step completed; avoid a ZeroDivisionError in that case.
    if total_time > 0:
        print("Average throughput for batch size {0}: {1} images/sec".format(
            batch_size, total_images / total_time))
    else:
        print("No benchmark step completed; throughput is unavailable.")
+1,51 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def tensorflow_version_tuple():
    """Split ``tf.__version__`` into ``(major, minor, patch)``.

    Returns:
      A tuple ``(int, int, str)``. ``patch`` is everything after the second
      dot, kept as a string (e.g. ``'0'`` or ``'0.dev20190821'``), or ``''``
      when the version string has only two components.

    Raises:
      ValueError: if the major or minor component is not an integer.
    """
    # Split at most twice so any extra dots stay inside the patch component
    # instead of breaking a strict three-way unpack.
    parts = tf.__version__.split('.', 2)
    major, minor = parts[0], parts[1]
    patch = parts[2] if len(parts) > 2 else ''
    return (int(major), int(minor), patch)


def tensorflow_version():
    """Return the TensorFlow version as the integer ``major * 1000 + minor``."""
    major, minor, _ = tensorflow_version_tuple()
    return major * 1000 + minor
IMAGENET_NUM_TRAIN_IMAGES = 1281167
IMAGENET_NUM_VAL_IMAGES = 50000


class Dataset(object):
    """Base description of a dataset used by the CNN benchmarks.

    Stores the dataset name, image dimensions, data directory and class
    count. Subclasses supply ``num_examples_per_epoch``.
    """

    def __init__(self, name, height=None, width=None, depth=None, data_dir=None,
                 queue_runner_required=False, num_classes=1000):
        self.name = name
        self.height = height
        self.width = width
        # Any falsy depth (None, 0) falls back to 3.
        self.depth = depth if depth else 3

        self.data_dir = data_dir
        self._queue_runner_required = queue_runner_required
        self._num_classes = num_classes

    def tf_record_pattern(self, subset):
        """Return the glob pattern for the record files of `subset`."""
        file_glob = '%s-*-of-*' % subset
        return os.path.join(self.data_dir, file_glob)

    def reader(self):
        """Return a new ``tf.TFRecordReader``."""
        return tf.TFRecordReader()

    @property
    def num_classes(self):
        """Number of classes in the dataset."""
        return self._num_classes

    @num_classes.setter
    def num_classes(self, val):
        self._num_classes = val

    @abstractmethod
    def num_examples_per_epoch(self, subset):
        """Return the number of examples in `subset`; subclasses override."""

    def __str__(self):
        return self.name

    def get_image_preprocessor(self):
        """The base dataset has no preprocessor; returns None."""
        return None

    def queue_runner_required(self):
        """Return the ``queue_runner_required`` flag given at construction."""
        return self._queue_runner_required

    def use_synthetic_gpu_images(self):
        """Return True when no data directory was provided."""
        return not self.data_dir


class ImagenetData(Dataset):
    """Configuration for Imagenet dataset."""

    def __init__(self, data_dir=None):
        super(ImagenetData, self).__init__(
            'imagenet', 300, 300, data_dir=data_dir)

    def num_examples_per_epoch(self, subset='train'):
        """Return the example count for `subset`.

        Raises:
          ValueError: if `subset` is not 'train', 'validation', 'calibrate'
            or 'calibration'.
        """
        if subset == 'train':
            return IMAGENET_NUM_TRAIN_IMAGES
        if subset == 'validation':
            return IMAGENET_NUM_VAL_IMAGES
        if subset in ('calibrate', 'calibration'):
            return 100
        raise ValueError('Invalid data subset "%s"' % subset)

    def get_image_preprocessor(self, benchmark=False):
        """Return the ``RecordInputImagePreprocessor`` class to use.

        The class comes from ``preprocessing_benchmark`` when `benchmark` is
        truthy and from ``preprocessing`` otherwise. The module is imported
        only when this method is called.
        """
        if benchmark:
            import preprocessing_benchmark as chosen_module
        else:
            import preprocessing as chosen_module
        return chosen_module.RecordInputImagePreprocessor
+ diff --git a/models/image_recognition/tensorflow/resnet50v1_5/int8/generate_calibration_data.py b/models/image_recognition/tensorflow/resnet50v1_5/int8/generate_calibration_data.py new file mode 100644 index 000000000..abf62345b --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/int8/generate_calibration_data.py @@ -0,0 +1,183 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
NUM_TEST_IMAGES = 50000


def load_graph(model_file):
    """Load a serialized GraphDef from `model_file` into a new ``tf.Graph``.

    Args:
      model_file: path to the graph file. A ``.pbtxt`` extension is parsed as
        protobuf text format; any other extension is parsed as binary.

    Returns:
      A ``tf.Graph`` with the graph definition imported under an empty name
      scope.
    """
    import os

    graph = tf.Graph()
    graph_def = tf.GraphDef()
    file_ext = os.path.splitext(model_file)[1]

    with open(model_file, "rb") as f:
        if file_ext == '.pbtxt':
            text_format.Merge(f.read(), graph_def)
        else:
            graph_def.ParseFromString(f.read())
    with graph.as_default():
        tf.import_graph_def(graph_def, name='')

    return graph


if __name__ == "__main__":
    # Calibration-set generator: runs the model over the 'train' subset,
    # pools the records it classifies correctly, and writes the most
    # confident ones to 'calibration-1-of-1'.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_graph", default=None,
                        help="graph/model to be executed")
    parser.add_argument("--data_location", default=None,
                        help="full path to the validation data")
    parser.add_argument("--input_height", default=224,
                        type=int, help="input height")
    parser.add_argument("--input_width", default=224,
                        type=int, help="input width")
    parser.add_argument("--batch_size", default=32,
                        type=int, help="batch size")
    parser.add_argument("--input_layer", default="input",
                        help="name of input layer")
    parser.add_argument("--output_layer", default="predict",
                        help="name of output layer")
    parser.add_argument(
        '--num_inter_threads',
        help='number threads across operators',
        type=int, default=1)
    parser.add_argument(
        '--num_intra_threads',
        help='number threads for an operator',
        type=int, default=1)
    args = parser.parse_args()

    if args.input_graph:
        model_file = args.input_graph
    else:
        sys.exit("Please provide a graph file.")

    # A zero height/width falls back to 224, as before.
    input_height = args.input_height or 224
    input_width = args.input_width or 224
    batch_size = args.batch_size
    input_layer = args.input_layer
    output_layer = args.output_layer
    num_inter_threads = args.num_inter_threads
    num_intra_threads = args.num_intra_threads
    data_location = args.data_location
    dataset = datasets.ImagenetData(data_location)
    preprocessor = preprocessing.ImagePreprocessor(
        input_height, input_width, batch_size,
        1,  # device count
        tf.float32,  # data_type for input fed to the graph
        train=False,  # doing inference
        resize_method='crop')
    images, labels, tf_records = preprocessor.minibatch(dataset, subset='train')
    graph = load_graph(model_file)
    input_tensor = graph.get_tensor_by_name(input_layer + ":0")
    output_tensor = graph.get_tensor_by_name(output_layer + ":0")

    config = tf.ConfigProto()
    config.inter_op_parallelism_threads = num_inter_threads
    config.intra_op_parallelism_threads = num_intra_threads

    total_accuracy1, total_accuracy5 = (0.0, 0.0)
    num_processed_images = 0
    num_remaining_images = dataset.num_examples_per_epoch(subset='train') \
        - num_processed_images

    CALIBRATION_POOL_SIZE = 1000
    CALIBRATION_SET_SIZE = 100
    calibration_pool = []
    ImageWithConfidence = namedtuple('ImageWithConfidence',
                                     ['tf_record', 'confidence'])
    current_pool_size = 0
    with tf.Session() as sess:
        # The inference session is deliberately not entered as a context
        # manager: that would make `graph` the default graph, and the accuracy
        # ops below must be created in the graph that `sess` runs.
        sess_graph = tf.Session(graph=graph, config=config)
        try:
            while num_remaining_images >= batch_size:
                # Reads and preprocess data
                np_images, np_labels, serialized_images = sess.run(
                    [images[0], labels[0], tf_records])
                num_processed_images += batch_size
                num_remaining_images -= batch_size
                # Compute inference on the preprocessed data
                predictions = sess_graph.run(output_tensor,
                                             {input_tensor: np_images})
                # Only correctly classified images are calibration candidates.
                selected_img_indices = np.where(
                    predictions.argmax(axis=1) == np_labels)[0].tolist()
                current_pool_size += len(selected_img_indices)
                for indx in selected_img_indices:
                    calibration_pool.append(ImageWithConfidence(
                        serialized_images[indx], predictions[indx].max()))

                # NOTE(review): these ops are rebuilt on every iteration, so
                # the default graph appears to grow with each batch; consider
                # building them once with placeholders.
                accuracy1 = tf.reduce_sum(
                    tf.cast(tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 1),
                            tf.float32))

                accuracy5 = tf.reduce_sum(
                    tf.cast(tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 5),
                            tf.float32))
                np_accuracy1, np_accuracy5 = sess.run([accuracy1, accuracy5])
                total_accuracy1 += np_accuracy1
                total_accuracy5 += np_accuracy5
                print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)"
                      % (num_processed_images,
                         total_accuracy1/num_processed_images,
                         total_accuracy5/num_processed_images))
                if current_pool_size >= CALIBRATION_POOL_SIZE:
                    break
        finally:
            sess_graph.close()

    calibration_pool = sorted(calibration_pool,
                              key=attrgetter('confidence'), reverse=True)
    if len(calibration_pool) < CALIBRATION_SET_SIZE:
        # Fewer candidates than requested: write what exists rather than
        # failing with an IndexError.
        print("Warning: only %d calibration images were collected "
              "(requested %d)." % (len(calibration_pool),
                                   CALIBRATION_SET_SIZE))

    writer = tf.python_io.TFRecordWriter('calibration-1-of-1')
    try:
        # Slicing caps the output at CALIBRATION_SET_SIZE and is safe when the
        # pool is shorter.
        for entry in calibration_pool[:CALIBRATION_SET_SIZE]:
            writer.write(entry.tf_record)
    finally:
        writer.close()
+# +# SPDX-License-Identifier: EPL-2.0 +# + + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Image pre-processing utilities. +""" +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf +from random import randint + +from tensorflow.python.ops import data_flow_ops +import cnn_util + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields: + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized + Example protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. 
+ bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + text: Tensor tf.string containing the human-readable label. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + } + sparse_float32 = tf.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.parse_single_example(example_serialized, feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) + + # Note that we impose an ordering of (y, x) just to make life difficult. + bbox = tf.concat([ymin, xmin, ymax, xmax], 0) + + # Force the variable number of bounding boxes into the shape + # [1, num_boxes, coords]. + bbox = tf.expand_dims(bbox, 0) + bbox = tf.transpose(bbox, [0, 2, 1]) + + return features['image/encoded'], label, bbox, features['image/class/text'] + + +def decode_jpeg(image_buffer, scope=None): # , dtype=tf.float32): + """Decode a JPEG string into one 3-D float image Tensor. + + Args: + image_buffer: scalar string Tensor. + scope: Optional scope for op_scope. + Returns: + 3-D float Tensor with values ranging from [0, 1). 
+ """ + # with tf.op_scope([image_buffer], scope, 'decode_jpeg'): + # with tf.name_scope(scope, 'decode_jpeg', [image_buffer]): + with tf.name_scope(scope or 'decode_jpeg'): + # Decode the string as an RGB JPEG. + # Note that the resulting image contains an unknown height and width + # that is set dynamically by decode_jpeg. In other words, the height + # and width of image is unknown at compile-time. + image = tf.image.decode_jpeg(image_buffer, channels=3, + fancy_upscaling=False, + dct_method='INTEGER_FAST') + + # image = tf.Print(image, [tf.shape(image)], 'Image shape: ') + + return image + + +def eval_image(image, height, width, bbox, thread_id, resize): + """Get the image for model evaluation.""" + with tf.name_scope('eval_image'): + if not thread_id: + tf.summary.image( + 'original_image', tf.expand_dims(image, 0)) + + if resize == 'crop': + # Note: This is much slower than crop_to_bounding_box + # It seems that the redundant pad step has huge overhead + # distorted_image = tf.image.resize_image_with_crop_or_pad(image, + # height, width) + shape = tf.shape(image) + image = tf.cond(tf.less(shape[0], shape[1]), + lambda: tf.image.resize_images(image, tf.convert_to_tensor([256, 256*shape[1]/shape[0]], dtype=tf.int32)), + lambda: tf.image.resize_images(image, tf.convert_to_tensor([256*shape[0]/shape[1], 256], dtype=tf.int32))) + shape = tf.shape(image) + + y0 = (shape[0] - height) // 2 + x0 = (shape[1] - width) // 2 + #y0=tf.random_uniform([],minval=0,maxval=(shape[0] - height + 1), dtype=tf.int32) + #x0=tf.random_uniform([],minval=0,maxval=(shape[1] - width + 1), dtype=tf.int32) + ## distorted_image = tf.slice(image, [y0,x0,0], [height,width,3]) + distorted_image = tf.image.crop_to_bounding_box(image, y0, x0, height, + width) + else: + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + tf.shape(image), + bounding_boxes=bbox, + min_object_covered=0.5, + aspect_ratio_range=[0.90, 1.10], + area_range=[0.10, 1.0], + max_attempts=100, + 
use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, _ = sample_distorted_bounding_box + # Crop the image to the specified bounding box. + distorted_image = tf.slice(image, bbox_begin, bbox_size) + resize_method = { + 'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR, + 'bilinear': tf.image.ResizeMethod.BILINEAR, + 'bicubic': tf.image.ResizeMethod.BICUBIC, + 'area': tf.image.ResizeMethod.AREA + }[resize] + # This resizing operation may distort the images because the aspect + # ratio is not respected. + if cnn_util.tensorflow_version() >= 11: + distorted_image = tf.image.resize_images( + distorted_image, [height, width], + resize_method, + align_corners=False) + else: + distorted_image = tf.image.resize_images( + distorted_image, height, width, resize_method, align_corners=False) + distorted_image.set_shape([height, width, 3]) + if not thread_id: + tf.summary.image( + 'cropped_resized_image', tf.expand_dims(distorted_image, 0)) + image = distorted_image + return image + + +def distort_image(image, height, width, bbox, thread_id=0, scope=None): + """Distort one image for training a network. + + Distorting images provides a useful technique for augmenting the data + set during training in order to make the network invariant to aspects + of the image that do not effect the label. + + Args: + image: 3-D float Tensor of image + height: integer + width: integer + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged + as [ymin, xmin, ymax, xmax]. + thread_id: integer indicating the preprocessing thread. + scope: Optional scope for op_scope. + Returns: + 3-D float Tensor of distorted image used for training. 
+ """ + # with tf.op_scope([image, height, width, bbox], scope, 'distort_image'): + # with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]): + with tf.name_scope(scope or 'distort_image'): + # Each bounding box has shape [1, num_boxes, box coords] and + # the coordinates are ordered [ymin, xmin, ymax, xmax]. + + # After this point, all image pixels reside in [0,1) + # until the very end, when they're rescaled to (-1, 1). The various + # adjust_* ops all require this range for dtype float. + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + + # Display the bounding box in the first thread only. + if not thread_id: + image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), + bbox) + tf.summary.image( + 'image_with_bounding_boxes', image_with_box) + + # A large fraction of image datasets contain a human-annotated bounding + # box delineating the region of the image containing the object of interest. + # We choose to create a new bounding box for the object which is a randomly + # distorted version of the human-annotated bounding box that obeys an allowed + # range of aspect ratios, sizes and overlap with the human-annotated + # bounding box. If no box is supplied, then we assume the bounding box is + # the entire image. + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + tf.shape(image), + bounding_boxes=bbox, + min_object_covered=0.1, + aspect_ratio_range=[0.99, 1.01], + area_range=[0.05, 1.0], + max_attempts=100, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box + if not thread_id: + image_with_distorted_box = tf.image.draw_bounding_boxes( + tf.expand_dims(image, 0), distort_bbox) + tf.summary.image( + 'images_with_distorted_bounding_box', + image_with_distorted_box) + + # Crop the image to the specified bounding box. 
+ distorted_image = tf.slice(image, bbox_begin, bbox_size) + + # This resizing operation may distort the images because the aspect + # ratio is not respected. We select a resize method in a round robin + # fashion based on the thread number. + # Note that ResizeMethod contains 4 enumerated resizing methods. + resize_method = thread_id % 4 + if cnn_util.tensorflow_version() >= 11: + distorted_image = tf.image.resize_images( + distorted_image, [height, width], resize_method, align_corners=False) + else: + distorted_image = tf.image.resize_images( + distorted_image, height, width, resize_method, align_corners=False) + # Restore the shape since the dynamic slice based upon the bbox_size loses + # the third dimension. + distorted_image.set_shape([height, width, 3]) + if not thread_id: + tf.summary.image( + 'cropped_resized_image', + tf.expand_dims(distorted_image, 0)) + + # Randomly flip the image horizontally. + distorted_image = tf.image.random_flip_left_right(distorted_image) + + # Randomly distort the colors. + distorted_image = distort_color(distorted_image, thread_id) + + # Note: This ensures the scaling matches the output of eval_image + distorted_image *= 256 + + if not thread_id: + tf.summary.image( + 'final_distorted_image', + tf.expand_dims(distorted_image, 0)) + return distorted_image + + +def distort_color(image, thread_id=0, scope=None): + """Distort the color of the image. + + Each color distortion is non-commutative and thus ordering of the color ops + matters. Ideally we would randomly permute the ordering of the color ops. + Rather then adding that level of complication, we select a distinct ordering + of color ops for each preprocessing thread. + + Args: + image: Tensor containing single image. + thread_id: preprocessing thread ID. + scope: Optional scope for op_scope. 
+ Returns: + color-distorted image + """ + # with tf.op_scope([image], scope, 'distort_color'): + # with tf.name_scope(scope, 'distort_color', [image]): + with tf.name_scope(scope or 'distort_color'): + color_ordering = thread_id % 2 + + if color_ordering == 0: + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + elif color_ordering == 1: + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + + # The random_* ops do not necessarily clamp. + image = tf.clip_by_value(image, 0.0, 1.0) + return image + + +class ImagePreprocessor(object): + """Preprocessor for input images.""" + + def __init__(self, + height, + width, + batch_size, + device_count, + dtype=tf.float32, + train=True, + distortions=None, + resize_method=None): + self.height = height + self.width = width + self.batch_size = batch_size + self.device_count = device_count + self.dtype = dtype + self.train = train + self.resize_method = resize_method + if distortions is None: + distortions = False + self.distortions = distortions + if self.batch_size % self.device_count != 0: + raise ValueError( + ('batch_size must be a multiple of device_count: ' + 'batch_size %d, device_count: %d') % + (self.batch_size, self.device_count)) + self.batch_size_per_device = self.batch_size // self.device_count + + def preprocess(self, image_buffer, bbox, thread_id): + """Preprocessing image_buffer using thread_id.""" + # Note: Width and height of image is known only at runtime. 
+ image = tf.image.decode_jpeg(image_buffer, channels=3, + dct_method='INTEGER_FAST') + if self.train and self.distortions: + image = distort_image(image, self.height, self.width, bbox, thread_id) + else: + image = eval_image(image, self.height, self.width, bbox, thread_id, + self.resize_method) + # Note: image is now float32 [height,width,3] with range [0, 255] + + # image = tf.cast(image, tf.uint8) # HACK TESTING + + return image + + def minibatch(self, dataset, subset): + with tf.name_scope('batch_processing'): + images = [[] for i in range(self.device_count)] + labels = [[] for i in range(self.device_count)] + record_input = data_flow_ops.RecordInput( + file_pattern=dataset.tf_record_pattern(subset), + seed=randint(0, 9000), + parallelism=64, + buffer_size=10000, + batch_size=self.batch_size, + name='record_input') + records = record_input.get_yield_op() + records = tf.split(records, self.batch_size, 0) + records = [tf.reshape(record, []) for record in records] + for i in xrange(self.batch_size): + value = records[i] + image_buffer, label_index, bbox, _ = parse_example_proto(value) + image = self.preprocess(image_buffer, bbox, i % 4) + device_index = i % self.device_count + images[device_index].append(image) + labels[device_index].append(label_index) + label_index_batch = [None] * self.device_count + for device_index in xrange(self.device_count): + images[device_index] = tf.parallel_stack(images[device_index]) + label_index_batch[device_index] = tf.concat(labels[device_index], 0) + + # dynamic_pad=True) # HACK TESTING dynamic_pad=True + images[device_index] = tf.cast(images[device_index], self.dtype) + depth = 3 + images[device_index] = tf.reshape( + images[device_index], + shape=[self.batch_size_per_device, self.height, self.width, depth]) + label_index_batch[device_index] = tf.reshape( + label_index_batch[device_index], [self.batch_size_per_device]) + # Display the training images in the visualizer. 
+ # tf.summary.image('images', images) + + return images, label_index_batch, records diff --git a/models/image_recognition/tensorflow/resnet50v1_5/int8/preprocessing_benchmark.py b/models/image_recognition/tensorflow/resnet50v1_5/int8/preprocessing_benchmark.py new file mode 100644 index 000000000..8e3556556 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/int8/preprocessing_benchmark.py @@ -0,0 +1,173 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.data.experimental import parallel_interleave +from tensorflow.data.experimental import map_and_batch +from tensorflow.python.platform import gfile + + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + } + sparse_float32 = tf.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.parse_single_example(example_serialized, feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + return features['image/encoded'], label + + +def eval_image(image, height, width, resize_method, + central_fraction=0.875, scope=None): + with tf.name_scope('eval_image'): + if resize_method == 'crop': + shape = tf.shape(image) + image = tf.cond(tf.less(shape[0], shape[1]), + lambda: tf.image.resize_images(image, + tf.convert_to_tensor([256, 256 * shape[1] / shape[0]], + dtype=tf.int32)), + lambda: tf.image.resize_images(image, + tf.convert_to_tensor([256 * shape[0] / shape[1], 256], + dtype=tf.int32))) + shape = tf.shape(image) + y0 = (shape[0] - height) // 2 + x0 = (shape[1] - width) // 2 + distorted_image = tf.image.crop_to_bounding_box(image, y0, x0, height, width) + distorted_image.set_shape([height, width, 3]) + means = tf.broadcast_to([123.68, 116.78, 103.94], tf.shape(distorted_image)) + return distorted_image - means + else: # bilinear + if 
image.dtype != tf.float32: + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + # Crop the central region of the image with an area containing 87.5% of + # the original image. + if central_fraction: + image = tf.image.central_crop(image, central_fraction=central_fraction) + + if height and width: + # Resize the image to the specified height and width. + image = tf.expand_dims(image, 0) + image = tf.image.resize_bilinear(image, [height, width], + align_corners=False) + image = tf.squeeze(image, [0]) + image = tf.subtract(image, 0.5) + image = tf.multiply(image, 2.0) + return image + + +class RecordInputImagePreprocessor(object): + """Preprocessor for images with RecordInput format.""" + + def __init__(self, + height, + width, + batch_size, + num_cores, + resize_method): + + self.height = height + self.width = width + self.batch_size = batch_size + self.num_cores = num_cores + self.resize_method = resize_method + + def parse_and_preprocess(self, value): + # parse + image_buffer, label_index = parse_example_proto(value) + # preprocess + image = tf.image.decode_jpeg( + image_buffer, channels=3, fancy_upscaling=False, dct_method='INTEGER_FAST') + image = eval_image(image, self.height, self.width, self.resize_method) + + return (image, label_index) + + def minibatch(self, dataset, subset, cache_data=False): + + with tf.name_scope('batch_processing'): + + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names) + + ds = ds.apply( + parallel_interleave( + tf.data.TFRecordDataset, cycle_length=self.num_cores, block_length=5, + sloppy=True, + buffer_output_elements=10000, prefetch_input_elements=10000)) + + if cache_data: + ds = ds.take(1).cache().repeat() + + ds = ds.prefetch(buffer_size=10000) + # ds = ds.prefetch(buffer_size=self.batch_size) + + # num of parallel 
batches not greater than 56 + max_num_parallel_batches = min(56, 2*self.num_cores) + ds = ds.apply( + map_and_batch( + map_func=self.parse_and_preprocess, + batch_size=self.batch_size, + num_parallel_batches=max_num_parallel_batches, + num_parallel_calls=None)) # this number should be tuned + + ds = ds.prefetch(buffer_size=tf.contrib.data.AUTOTUNE) # this number can be tuned + + ds_iterator = ds.make_one_shot_iterator() + images, _ = ds_iterator.get_next() + + return images diff --git a/models/object_detection/tensorflow/faster_rcnn/inference/int8/coco_int8.sh b/models/object_detection/tensorflow/faster_rcnn/inference/int8/coco_int8.sh old mode 100644 new mode 100755 diff --git a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/run_frozen_graph_ssdmob.py b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/run_frozen_graph_ssdmob.py index 89b890ab1..90a1d1fd0 100644 --- a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/run_frozen_graph_ssdmob.py +++ b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/run_frozen_graph_ssdmob.py @@ -36,10 +36,6 @@ import argparse from tensorflow.python.client import timeline -os.environ["KMP_BLOCKTIME"] = "0" -os.environ["KMP_SETTINGS"] = "1" -os.environ["KMP_AFFINITY"] = "granularity=fine,verbose,compact,1,0" - parser = argparse.ArgumentParser() parser.add_argument('-g', '--graph', help='Path to input graph to run', type=str, required=True) parser.add_argument('-d', '--dataset', help='Full Path to input dataset to run', type=str, required=True) diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/coco_metric.py b/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/coco_metric.py deleted file mode 100644 index 08f3b7e5a..000000000 --- a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/coco_metric.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright 2018 Google. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""COCO-style evaluation metrics. - -Forked from reference model implementation. - -COCO API: github.com/cocodataset/cocoapi/ -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import atexit -import tempfile - -from absl import flags - -import numpy as np -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval -import six - -import tensorflow as tf - -import ssd_constants - -FLAGS = flags.FLAGS - - -# https://github.com/cocodataset/cocoapi/issues/49 -if six.PY3: - import pycocotools.coco - pycocotools.coco.unicode = str - - -def async_eval_runner(queue_predictions, queue_results, val_json_file): - """Load intermediate eval results and get COCO metrics.""" - while True: - message = queue_predictions.get() - if message == 'STOP': # poison pill - break - step, predictions = message - results = compute_map(predictions, val_json_file) - queue_results.put((step, results)) - - -def compute_map(predictions, val_json_file): - """Use model predictions to compute mAP. 
- - Args: - predictions: a list of tuples returned by decoded_predictions function, - each containing the following elements: - image source_id, box coordinates in XYWH order, probability score, label - val_json_file: path to COCO annotation file - Returns: - A dictionary that maps all COCO metrics (keys) to their values - """ - - if val_json_file.startswith("gs://"): - _, local_val_json = tempfile.mkstemp(suffix=".json") - tf.gfile.Remove(local_val_json) - - tf.gfile.Copy(val_json_file, local_val_json) - atexit.register(tf.gfile.Remove, local_val_json) - else: - local_val_json = val_json_file - - cocoGt = COCO(local_val_json) - cocoDt = cocoGt.loadRes(np.array(predictions)) - E = COCOeval(cocoGt, cocoDt, iouType='bbox') - E.evaluate() - E.accumulate() - E.summarize() - print("Current AP: {:.5f}".format(E.stats[0])) - metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', - 'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl'] - - # Prefix with "COCO" to group in TensorBoard. - return {"COCO/" + key: value for key, value in zip(metric_names, E.stats)} - - -def calc_iou(target, candidates): - target_tiled = np.tile(target[np.newaxis, :], (candidates.shape[0], 1)) - # Left Top & Right Bottom - lt = np.maximum(target_tiled[:,:2], candidates[:,:2]) - - rb = np.minimum(target_tiled[:,2:], candidates[:,2:]) - - delta = np.maximum(rb - lt, 0) - - intersect = delta[:,0] * delta[:,1] - - delta1 = target_tiled[:,2:] - candidates[:,:2] - area1 = delta1[:,0] * delta1[:,1] - delta2 = target_tiled[:,2:] - candidates[:,:2] - area2 = delta2[:,0] * delta2[:,1] - - iou = intersect/(area1 + area2 - intersect) - return iou - - -# TODO(haoyuzhang): Rewrite this NumPy based implementation to TensorFlow based -# implementation under ssd_model.py accuracy_function. 
-def decode_predictions(labels_and_predictions): - """Decode predictions and remove unused boxes and labels.""" - predictions = [] - for example in labels_and_predictions: - source_id = int(example[ssd_constants.SOURCE_ID]) - pred_box = example[ssd_constants.PRED_BOXES] - pred_scores = example[ssd_constants.PRED_SCORES] - - locs, labels, probs = decode_single( - pred_box, pred_scores, ssd_constants.OVERLAP_CRITERIA, - ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES) - - raw_height, raw_width, _ = example[ssd_constants.RAW_SHAPE] - for loc, label, prob in zip(locs, labels, probs): - # Ordering convention differs, hence [1], [0] rather than [0], [1] - x, y = loc[1] * raw_width, loc[0] * raw_height - w, h = (loc[3] - loc[1]) * raw_width, (loc[2] - loc[0]) * raw_height - predictions.append( - [source_id, x, y, w, h, prob, ssd_constants.CLASS_INV_MAP[label]]) - return predictions - - -def decode_single(bboxes_in, scores_in, criteria, max_output, max_num=200): - # Reference to https://github.com/amdegroot/ssd.pytorch - - bboxes_out = [] - scores_out = [] - labels_out = [] - - for i, score in enumerate(np.split(scores_in, scores_in.shape[1], 1)): - score = np.squeeze(score, 1) - - # skip background - if i == 0: - continue - - mask = score > ssd_constants.MIN_SCORE - if not np.any(mask): - continue - - bboxes, score = bboxes_in[mask, :], score[mask] - - score_idx_sorted = np.argsort(score) - score_sorted = score[score_idx_sorted] - - score_idx_sorted = score_idx_sorted[-max_num:] - candidates = [] - - # perform non-maximum suppression - while len(score_idx_sorted): - idx = score_idx_sorted[-1] - bboxes_sorted = bboxes[score_idx_sorted, :] - bboxes_idx = bboxes[idx, :] - iou = calc_iou(bboxes_idx, bboxes_sorted) - - score_idx_sorted = score_idx_sorted[iou < criteria] - candidates.append(idx) - - bboxes_out.append(bboxes[candidates, :]) - scores_out.append(score[candidates]) - labels_out.extend([i]*len(candidates)) - - if len(scores_out) == 0: - 
tf.logging.info("No objects detected. Returning dummy values.") - return ( - np.zeros(shape=(1, 4), dtype=np.float32), - np.zeros(shape=(1,), dtype=np.int32), - np.ones(shape=(1,), dtype=np.float32) * ssd_constants.DUMMY_SCORE, - ) - - bboxes_out = np.concatenate(bboxes_out, axis=0) - scores_out = np.concatenate(scores_out, axis=0) - labels_out = np.array(labels_out) - - max_ids = np.argsort(scores_out)[-max_output:] - - return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/datasets.py b/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/datasets.py deleted file mode 100644 index 58c0f0dff..000000000 --- a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/datasets.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Benchmark dataset utilities. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from abc import abstractmethod -import os - -import numpy as np -import six -from six.moves import cPickle -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -from tensorflow.python.platform import gfile -import preprocessing - -IMAGENET_NUM_TRAIN_IMAGES = 1281167 -IMAGENET_NUM_VAL_IMAGES = 50000 - -COCO_NUM_TRAIN_IMAGES = 118287 -COCO_NUM_VAL_IMAGES = 4952 - - -class Dataset(object): - """Abstract class for cnn benchmarks dataset.""" - - def __init__(self, - name, - data_dir=None, - queue_runner_required=False, - num_classes=None): - self.name = name - self.data_dir = data_dir - self._queue_runner_required = queue_runner_required - self._num_classes = num_classes - - def tf_record_pattern(self, subset): - return os.path.join(self.data_dir, '%s-*-of-*' % subset) - - def reader(self): - return tf.TFRecordReader() - - @property - def num_classes(self): - return self._num_classes - - @num_classes.setter - def num_classes(self, val): - self._num_classes = val - - @abstractmethod - def num_examples_per_epoch(self, subset): - pass - - def __str__(self): - return self.name - - def get_input_preprocessor(self, input_preprocessor='default'): - assert not self.use_synthetic_gpu_inputs() - return _SUPPORTED_INPUT_PREPROCESSORS[self.name][input_preprocessor] - - def queue_runner_required(self): - return self._queue_runner_required - - def use_synthetic_gpu_inputs(self): - return not self.data_dir - - -class LibrispeechDataset(Dataset): - """Configuration for LibriSpeech dataset.""" - - def __init__(self, data_dir=None): - super(LibrispeechDataset, self).__init__( - 'librispeech', data_dir, num_classes=29) - - def tf_record_pattern(self, subset): - if subset == 'train': - return os.path.join(self.data_dir, 'train-clean-*.tfrecords') - elif subset == 'validation': - return os.path.join(self.data_dir, 
'test-clean.tfrecords') - else: - return '' - - def num_examples_per_epoch(self, subset='train'): - del subset - return 2 # TODO(laigd): currently this is an arbitrary number. - - -class ImageDataset(Dataset): - """Abstract class for image datasets.""" - - def __init__(self, - name, - height, - width, - depth=None, - data_dir=None, - queue_runner_required=False, - num_classes=1001): - super(ImageDataset, self).__init__(name, data_dir, queue_runner_required, - num_classes) - self.height = height - self.width = width - self.depth = depth or 3 - - -class ImagenetDataset(ImageDataset): - """Configuration for Imagenet dataset.""" - - def __init__(self, data_dir=None): - super(ImagenetDataset, self).__init__( - 'imagenet', 300, 300, data_dir=data_dir) - - def num_examples_per_epoch(self, subset='train'): - if subset == 'train': - return IMAGENET_NUM_TRAIN_IMAGES - elif subset == 'validation': - return IMAGENET_NUM_VAL_IMAGES - else: - raise ValueError('Invalid data subset "%s"' % subset) - - -class Cifar10Dataset(ImageDataset): - """Configuration for cifar 10 dataset. - - It will mount all the input images to memory. 
- """ - - def __init__(self, data_dir=None): - super(Cifar10Dataset, self).__init__( - 'cifar10', - 32, - 32, - data_dir=data_dir, - queue_runner_required=True, - num_classes=11) - - def read_data_files(self, subset='train'): - """Reads from data file and returns images and labels in a numpy array.""" - assert self.data_dir, ('Cannot call `read_data_files` when using synthetic ' - 'data') - if subset == 'train': - filenames = [ - os.path.join(self.data_dir, 'data_batch_%d' % i) - for i in xrange(1, 6) - ] - elif subset == 'validation': - filenames = [os.path.join(self.data_dir, 'test_batch')] - else: - raise ValueError('Invalid data subset "%s"' % subset) - - inputs = [] - for filename in filenames: - with gfile.Open(filename, 'rb') as f: - # python2 does not have the encoding parameter - encoding = {} if six.PY2 else {'encoding': 'bytes'} - inputs.append(cPickle.load(f, **encoding)) - # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the - # input format. - all_images = np.concatenate( - [each_input[b'data'] for each_input in inputs]).astype(np.float32) - all_labels = np.concatenate( - [each_input[b'labels'] for each_input in inputs]) - return all_images, all_labels - - def num_examples_per_epoch(self, subset='train'): - if subset == 'train': - return 50000 - elif subset == 'validation': - return 10000 - else: - raise ValueError('Invalid data subset "%s"' % subset) - - -class COCODataset(ImageDataset): - """COnfiguration for COCO dataset.""" - - def __init__(self, data_dir=None, image_size=300): - super(COCODataset, self).__init__( - 'coco', image_size, image_size, data_dir=data_dir, num_classes=81) - - def num_examples_per_epoch(self, subset='train'): - if subset == 'train': - return COCO_NUM_TRAIN_IMAGES - elif subset == 'validation': - return COCO_NUM_VAL_IMAGES - else: - raise ValueError('Invalid data subset "%s"' % subset) - - -_SUPPORTED_DATASETS = { - 'imagenet': ImagenetDataset, - 'cifar10': Cifar10Dataset, - 'librispeech': 
LibrispeechDataset, - 'coco': COCODataset, -} - -_SUPPORTED_INPUT_PREPROCESSORS = { - 'imagenet': { - 'default': preprocessing.RecordInputImagePreprocessor, - 'official_models_imagenet': preprocessing.ImagenetPreprocessor, - }, - 'cifar10': { - 'default': preprocessing.Cifar10ImagePreprocessor - }, - 'librispeech': { - 'default': preprocessing.LibrispeechPreprocessor - }, - 'coco': { - 'default': preprocessing.COCOPreprocessor - }, -} - - -def create_dataset(data_dir, data_name): - """Create a Dataset instance based on data_dir and data_name.""" - if not data_dir and not data_name: - # When using synthetic data, use synthetic imagenet images by default. - data_name = 'imagenet' - - # Infere dataset name from data_dir if data_name is not provided. - if data_name is None: - for supported_name in _SUPPORTED_DATASETS: - if supported_name in data_dir: - data_name = supported_name - break - else: # Failed to identify dataset name from data dir. - raise ValueError('Could not identify name of dataset. ' - 'Please specify with --data_name option.') - if data_name not in _SUPPORTED_DATASETS: - raise ValueError('Unknown dataset. 
Must be one of %s' % ', '.join( - [key for key in sorted(_SUPPORTED_DATASETS.keys())])) - - return _SUPPORTED_DATASETS[data_name](data_dir) diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/infer_detections.py b/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/infer_detections.py index f2666a94c..657469658 100644 --- a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/infer_detections.py +++ b/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/infer_detections.py @@ -23,9 +23,10 @@ from argparse import ArgumentParser +import benchmark_cnn import datasets import ssd_constants -import ssd_model +from models import ssd_model from preprocessing import COCOPreprocessor IMAGE_SIZE = 300 @@ -168,7 +169,8 @@ def accuracy_check(self): ds_init = tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS) ds_sess = tf.Session() - self.model = ssd_model.SSD300Model(self.args.data_location) + params = benchmark_cnn.make_params(data_dir=self.args.data_location) + self.model = ssd_model.SSD300Model(params=params) print("Inference for accuracy check.") with tf.Session(graph=self.freeze_graph, config=self.config) as sess: diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/preprocessing.py b/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/preprocessing.py deleted file mode 100644 index 6814a48cd..000000000 --- a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/preprocessing.py +++ /dev/null @@ -1,1259 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Image pre-processing utilities. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -from tensorflow.contrib.data.python.ops import threadpool -from tensorflow.contrib.image.python.ops import distort_image_ops -from tensorflow.contrib.data.python.ops import interleave_ops -from tensorflow.contrib.data.python.ops import batching -from tensorflow.python.framework import function -from tensorflow.python.layers import utils -from tensorflow.python.ops import data_flow_ops -from tensorflow.python.platform import gfile - - -def parse_example_proto(example_serialized): - """Parses an Example proto containing a training example of an image. - - The output of the build_image_data.py image preprocessing script is a dataset - containing serialized Example protocol buffers. 
Each Example proto contains - the following fields: - - image/height: 462 - image/width: 581 - image/colorspace: 'RGB' - image/channels: 3 - image/class/label: 615 - image/class/synset: 'n03623198' - image/class/text: 'knee pad' - image/object/bbox/xmin: 0.1 - image/object/bbox/xmax: 0.9 - image/object/bbox/ymin: 0.2 - image/object/bbox/ymax: 0.6 - image/object/bbox/label: 615 - image/format: 'JPEG' - image/filename: 'ILSVRC2012_val_00041207.JPEG' - image/encoded: - - Args: - example_serialized: scalar Tensor tf.string containing a serialized - Example protocol buffer. - - Returns: - image_buffer: Tensor tf.string containing the contents of a JPEG file. - label: Tensor tf.int32 containing the label. - bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] - where each coordinate is [0, 1) and the coordinates are arranged as - [ymin, xmin, ymax, xmax]. - text: Tensor tf.string containing the human-readable label. - """ - # Dense features in Example proto. - feature_map = { - 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, - default_value=''), - 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, - default_value=-1), - 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, - default_value=''), - } - sparse_float32 = tf.VarLenFeature(dtype=tf.float32) - # Sparse features in Example proto. 
- feature_map.update( - {k: sparse_float32 for k in ['image/object/bbox/xmin', - 'image/object/bbox/ymin', - 'image/object/bbox/xmax', - 'image/object/bbox/ymax']}) - - features = tf.parse_single_example(example_serialized, feature_map) - label = tf.cast(features['image/class/label'], dtype=tf.int32) - - xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) - ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) - xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) - ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) - - # Note that we impose an ordering of (y, x) just to make life difficult. - bbox = tf.concat([ymin, xmin, ymax, xmax], 0) - - # Force the variable number of bounding boxes into the shape - # [1, num_boxes, coords]. - bbox = tf.expand_dims(bbox, 0) - bbox = tf.transpose(bbox, [0, 2, 1]) - - return features['image/encoded'], label, bbox, features['image/class/text'] - - -_RESIZE_METHOD_MAP = { - 'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR, - 'bilinear': tf.image.ResizeMethod.BILINEAR, - 'bicubic': tf.image.ResizeMethod.BICUBIC, - 'area': tf.image.ResizeMethod.AREA -} - - -def get_image_resize_method(resize_method, batch_position=0): - """Get tensorflow resize method. - - If resize_method is 'round_robin', return different methods based on batch - position in a round-robin fashion. NOTE: If the batch size is not a multiple - of the number of methods, then the distribution of methods will not be - uniform. - - Args: - resize_method: (string) nearest, bilinear, bicubic, area, or round_robin. - batch_position: position of the image in a batch. NOTE: this argument can - be an integer or a tensor - Returns: - one of resize type defined in tf.image.ResizeMethod. - """ - - if resize_method != 'round_robin': - return _RESIZE_METHOD_MAP[resize_method] - - # return a resize method based on batch position in a round-robin fashion. 
- resize_methods = list(_RESIZE_METHOD_MAP.values()) - def lookup(index): - return resize_methods[index] - - def resize_method_0(): - return utils.smart_cond(batch_position % len(resize_methods) == 0, - lambda: lookup(0), resize_method_1) - - def resize_method_1(): - return utils.smart_cond(batch_position % len(resize_methods) == 1, - lambda: lookup(1), resize_method_2) - - def resize_method_2(): - return utils.smart_cond(batch_position % len(resize_methods) == 2, - lambda: lookup(2), lambda: lookup(3)) - - # NOTE(jsimsa): Unfortunately, we cannot use a single recursive function here - # because TF would not be able to construct a finite graph. - - return resize_method_0() - - -def decode_jpeg(image_buffer, scope=None): # , dtype=tf.float32): - """Decode a JPEG string into one 3-D float image Tensor. - - Args: - image_buffer: scalar string Tensor. - scope: Optional scope for op_scope. - Returns: - 3-D float Tensor with values ranging from [0, 1). - """ - # with tf.op_scope([image_buffer], scope, 'decode_jpeg'): - # with tf.name_scope(scope, 'decode_jpeg', [image_buffer]): - with tf.name_scope(scope or 'decode_jpeg'): - # Decode the string as an RGB JPEG. - # Note that the resulting image contains an unknown height and width - # that is set dynamically by decode_jpeg. In other words, the height - # and width of image is unknown at compile-time. - image = tf.image.decode_jpeg(image_buffer, channels=3, - fancy_upscaling=False, - dct_method='INTEGER_FAST') - - # image = tf.Print(image, [tf.shape(image)], 'Image shape: ') - - return image - - -_R_MEAN = 123.68 -_G_MEAN = 116.78 -_B_MEAN = 103.94 -_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN] - - -def normalized_image(images): - # Rescale from [0, 255] to [0, 2] - images = tf.multiply(images, 1. / 127.5) - # Rescale to [-1, 1] - return tf.subtract(images, 1.0) - - -def eval_image(image, - height, - width, - batch_position, - resize_method, - summary_verbosity=0): - """Get the image for model evaluation. 
- - We preprocess the image simiarly to Slim, see - https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/vgg_preprocessing.py - Validation images do not have bounding boxes, so to crop the image, we first - resize the image such that the aspect ratio is maintained and the resized - height and width are both at least 1.145 times `height` and `width` - respectively. Then, we do a central crop to size (`height`, `width`). - - Args: - image: 3-D float Tensor representing the image. - height: The height of the image that will be returned. - width: The width of the image that will be returned. - batch_position: position of the image in a batch, which affects how images - are distorted and resized. NOTE: this argument can be an integer or a - tensor - resize_method: one of the strings 'round_robin', 'nearest', 'bilinear', - 'bicubic', or 'area'. - summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both - summaries and checkpoints. - Returns: - An image of size (output_height, output_width, 3) that is resized and - cropped as described above. - """ - # TODO(reedwm): Currently we resize then crop. Investigate if it's faster to - # crop then resize. - with tf.name_scope('eval_image'): - if summary_verbosity >= 3: - tf.summary.image( - 'original_image', tf.expand_dims(image, 0)) - - shape = tf.shape(image) - image_height = shape[0] - image_width = shape[1] - image_height_float = tf.cast(image_height, tf.float32) - image_width_float = tf.cast(image_width, tf.float32) - - # This value is chosen so that in resnet, images are cropped to a size of - # 256 x 256, which matches what other implementations do. The final image - # size for resnet is 224 x 224, and floor(224 * 1.145) = 256. - scale_factor = 1.145 - - # Compute resize_height and resize_width to be the minimum values such that - # 1. The aspect ratio is maintained (i.e. resize_height / resize_width is - # image_height / image_width), and - # 2. 
resize_height >= height * `scale_factor`, and - # 3. resize_width >= width * `scale_factor` - max_ratio = tf.maximum(height / image_height_float, - width / image_width_float) - resize_height = tf.cast(image_height_float * max_ratio * scale_factor, - tf.int32) - resize_width = tf.cast(image_width_float * max_ratio * scale_factor, - tf.int32) - - # Resize the image to shape (`resize_height`, `resize_width`) - image_resize_method = get_image_resize_method(resize_method, batch_position) - distorted_image = tf.image.resize_images(image, - [resize_height, resize_width], - image_resize_method, - align_corners=False) - - # Do a central crop of the image to size (height, width). - # MLPerf requires us to log (height, width) with two different keys. - total_crop_height = (resize_height - height) - crop_top = total_crop_height // 2 - total_crop_width = (resize_width - width) - crop_left = total_crop_width // 2 - distorted_image = tf.slice(distorted_image, [crop_top, crop_left, 0], - [height, width, 3]) - - distorted_image.set_shape([height, width, 3]) - if summary_verbosity >= 3: - tf.summary.image( - 'cropped_resized_image', tf.expand_dims(distorted_image, 0)) - image = distorted_image - return image - - -def train_image(image_buffer, - height, - width, - bbox, - batch_position, - resize_method, - distortions, - scope=None, - summary_verbosity=0, - distort_color_in_yiq=False, - fuse_decode_and_crop=False): - """Distort one image for training a network. - - Distorting images provides a useful technique for augmenting the data - set during training in order to make the network invariant to aspects - of the image that do not effect the label. - - Args: - image_buffer: scalar string Tensor representing the raw JPEG image buffer. - height: integer - width: integer - bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] - where each coordinate is [0, 1) and the coordinates are arranged - as [ymin, xmin, ymax, xmax]. 
- batch_position: position of the image in a batch, which affects how images - are distorted and resized. NOTE: this argument can be an integer or a - tensor - resize_method: round_robin, nearest, bilinear, bicubic, or area. - distortions: If true, apply full distortions for image colors. - scope: Optional scope for op_scope. - summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both - summaries and checkpoints. - distort_color_in_yiq: distort color of input images in YIQ space. - fuse_decode_and_crop: fuse the decode/crop operation. - Returns: - 3-D float Tensor of distorted image used for training. - """ - # with tf.op_scope([image, height, width, bbox], scope, 'distort_image'): - # with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]): - with tf.name_scope(scope or 'distort_image'): - # A large fraction of image datasets contain a human-annotated bounding box - # delineating the region of the image containing the object of interest. We - # choose to create a new bounding box for the object which is a randomly - # distorted version of the human-annotated bounding box that obeys an - # allowed range of aspect ratios, sizes and overlap with the human-annotated - # bounding box. If no box is supplied, then we assume the bounding box is - # the entire image. 
- min_object_covered = 0.1 - aspect_ratio_range = [0.75, 1.33] - area_range = [0.05, 1.0] - max_attempts = 100 - - sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( - tf.image.extract_jpeg_shape(image_buffer), - bounding_boxes=bbox, - min_object_covered=min_object_covered, - aspect_ratio_range=aspect_ratio_range, - area_range=area_range, - max_attempts=max_attempts, - use_image_if_no_bounding_boxes=True) - bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box - if summary_verbosity >= 3: - image = tf.image.decode_jpeg(image_buffer, channels=3, - dct_method='INTEGER_FAST') - image = tf.image.convert_image_dtype(image, dtype=tf.float32) - image_with_distorted_box = tf.image.draw_bounding_boxes( - tf.expand_dims(image, 0), distort_bbox) - tf.summary.image( - 'images_with_distorted_bounding_box', - image_with_distorted_box) - - # Crop the image to the specified bounding box. - if fuse_decode_and_crop: - offset_y, offset_x, _ = tf.unstack(bbox_begin) - target_height, target_width, _ = tf.unstack(bbox_size) - crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) - image = tf.image.decode_and_crop_jpeg( - image_buffer, crop_window, channels=3) - else: - image = tf.image.decode_jpeg(image_buffer, channels=3, - dct_method='INTEGER_FAST') - image = tf.slice(image, bbox_begin, bbox_size) - - distorted_image = tf.image.random_flip_left_right(image) - - # This resizing operation may distort the images because the aspect - # ratio is not respected. - image_resize_method = get_image_resize_method(resize_method, batch_position) - distorted_image = tf.image.resize_images( - distorted_image, [height, width], - image_resize_method, - align_corners=False) - # Restore the shape since the dynamic slice based upon the bbox_size loses - # the third dimension. 
- distorted_image.set_shape([height, width, 3]) - if summary_verbosity >= 3: - tf.summary.image('cropped_resized_maybe_flipped_image', - tf.expand_dims(distorted_image, 0)) - - if distortions: - distorted_image = tf.cast(distorted_image, dtype=tf.float32) - # Images values are expected to be in [0,1] for color distortion. - distorted_image /= 255. - # Randomly distort the colors. - distorted_image = distort_color(distorted_image, batch_position, - distort_color_in_yiq=distort_color_in_yiq) - - # Note: This ensures the scaling matches the output of eval_image - distorted_image *= 255 - - if summary_verbosity >= 3: - tf.summary.image( - 'final_distorted_image', - tf.expand_dims(distorted_image, 0)) - return distorted_image - - -def distort_color(image, batch_position=0, distort_color_in_yiq=False, - scope=None): - """Distort the color of the image. - - Each color distortion is non-commutative and thus ordering of the color ops - matters. Ideally we would randomly permute the ordering of the color ops. - Rather then adding that level of complication, we select a distinct ordering - of color ops based on the position of the image in a batch. - - Args: - image: float32 Tensor containing single image. Tensor values should be in - range [0, 1]. - batch_position: the position of the image in a batch. NOTE: this argument - can be an integer or a tensor - distort_color_in_yiq: distort color of input images in YIQ space. - scope: Optional scope for op_scope. - Returns: - color-distorted image - """ - with tf.name_scope(scope or 'distort_color'): - - def distort_fn_0(image=image): - """Variant 0 of distort function.""" - image = tf.image.random_brightness(image, max_delta=32. / 255.) 
- if distort_color_in_yiq: - image = distort_image_ops.random_hsv_in_yiq( - image, lower_saturation=0.5, upper_saturation=1.5, - max_delta_hue=0.2 * math.pi) - else: - image = tf.image.random_saturation(image, lower=0.5, upper=1.5) - image = tf.image.random_hue(image, max_delta=0.2) - image = tf.image.random_contrast(image, lower=0.5, upper=1.5) - return image - - def distort_fn_1(image=image): - """Variant 1 of distort function.""" - image = tf.image.random_brightness(image, max_delta=32. / 255.) - image = tf.image.random_contrast(image, lower=0.5, upper=1.5) - if distort_color_in_yiq: - image = distort_image_ops.random_hsv_in_yiq( - image, lower_saturation=0.5, upper_saturation=1.5, - max_delta_hue=0.2 * math.pi) - else: - image = tf.image.random_saturation(image, lower=0.5, upper=1.5) - image = tf.image.random_hue(image, max_delta=0.2) - return image - - image = utils.smart_cond(batch_position % 2 == 0, distort_fn_0, - distort_fn_1) - # The random_* ops do not necessarily clamp. - image = tf.clip_by_value(image, 0.0, 1.0) - return image - - -class InputPreprocessor(object): - """Base class for all model preprocessors.""" - - def __init__(self, batch_size, output_shapes): - self.batch_size = batch_size - self.output_shapes = output_shapes - - def supports_datasets(self): - """Whether this preprocessor supports dataset.""" - return False - - def minibatch(self, dataset, subset, params, shift_ratio=-1): - """Returns tensors representing a minibatch of all the input.""" - raise NotImplementedError('Must be implemented by subclass.') - - # The methods added below are only supported/used if supports_datasets() - # returns True. - # TODO(laigd): refactor benchmark_cnn.py and put the logic of - # _build_input_processing() into InputPreprocessor. 
- - def parse_and_preprocess(self, value, batch_position): - """Function to parse and preprocess an Example proto in input pipeline.""" - raise NotImplementedError('Must be implemented by subclass.') - - def build_prefetch_input_processing(self, batch_size, model_input_shapes, - num_splits, cpu_device, params, - gpu_devices, model_input_data_types, - dataset, doing_eval): - """"Returns FunctionBufferingResources that do input pre(processing).""" - assert self.supports_datasets() - with tf.device(cpu_device): - if doing_eval: - subset = 'validation' - else: - subset = 'train' - - function_buffering_resources = [] - remote_fn, args = self.minibatch_fn( - batch_size=batch_size, - model_input_shapes=model_input_shapes, - num_splits=num_splits, - dataset=dataset, - subset=subset, - train=(not doing_eval), - datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, - num_threads=params.datasets_num_private_threads, - datasets_use_caching=params.datasets_use_caching, - datasets_parallel_interleave_cycle_length=( - params.datasets_parallel_interleave_cycle_length), - datasets_sloppy_parallel_interleave=( - params.datasets_sloppy_parallel_interleave), - datasets_parallel_interleave_prefetch=( - params.datasets_parallel_interleave_prefetch)) - for device_num in range(len(gpu_devices)): - with tf.device(gpu_devices[device_num]): - buffer_resource_handle = prefetching_ops.function_buffering_resource( - f=remote_fn, - output_types=model_input_data_types, - target_device=cpu_device, - string_arg=args[0], - buffer_size=params.datasets_prefetch_buffer_size, - shared_name=None) - function_buffering_resources.append(buffer_resource_handle) - return function_buffering_resources - - # TODO(laigd): figure out how to remove these parameters, since the - # preprocessor itself has self.batch_size, self.num_splits, etc defined. 
- def build_multi_device_iterator(self, batch_size, num_splits, cpu_device, - params, gpu_devices, dataset, doing_eval): - """Creates a MultiDeviceIterator.""" - assert self.supports_datasets() - assert num_splits == len(gpu_devices) - with tf.name_scope('batch_processing'): - if doing_eval: - subset = 'validation' - else: - subset = 'train' - batch_size_per_split = batch_size // num_splits - ds = self.create_dataset( - batch_size, - num_splits, - batch_size_per_split, - dataset, - subset, - train=(not doing_eval), - datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, - num_threads=params.datasets_num_private_threads, - datasets_use_caching=params.datasets_use_caching, - datasets_parallel_interleave_cycle_length=( - params.datasets_parallel_interleave_cycle_length), - datasets_sloppy_parallel_interleave=( - params.datasets_sloppy_parallel_interleave), - datasets_parallel_interleave_prefetch=( - params.datasets_parallel_interleave_prefetch)) - multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator( - ds, - gpu_devices, - source_device=cpu_device, - max_buffer_size=params.multi_device_iterator_max_buffer_size) - tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, - multi_device_iterator.initializer) - return multi_device_iterator - - def create_dataset(self, - batch_size, - num_splits, - batch_size_per_split, - dataset, - subset, - train, - datasets_repeat_cached_sample, - num_threads=None, - datasets_use_caching=False, - datasets_parallel_interleave_cycle_length=None, - datasets_sloppy_parallel_interleave=False, - datasets_parallel_interleave_prefetch=None): - """Creates a dataset for the benchmark.""" - raise NotImplementedError('Must be implemented by subclass.') - - def create_iterator(self, ds): - ds_iterator = ds.make_initializable_iterator() - tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, - ds_iterator.initializer) - return ds_iterator - - def minibatch_fn(self, batch_size, model_input_shapes, num_splits, - dataset, 
subset, train, datasets_repeat_cached_sample, - num_threads, datasets_use_caching, - datasets_parallel_interleave_cycle_length, - datasets_sloppy_parallel_interleave, - datasets_parallel_interleave_prefetch): - """Returns a function and list of args for the fn to create a minibatch.""" - assert self.supports_datasets() - batch_size_per_split = batch_size // num_splits - assert batch_size_per_split == model_input_shapes[0][0] - with tf.name_scope('batch_processing'): - ds = self.create_dataset(batch_size, num_splits, batch_size_per_split, - dataset, subset, train, - datasets_repeat_cached_sample, num_threads, - datasets_use_caching, - datasets_parallel_interleave_cycle_length, - datasets_sloppy_parallel_interleave, - datasets_parallel_interleave_prefetch) - ds_iterator = self.create_iterator(ds) - - ds_iterator_string_handle = ds_iterator.string_handle() - - @function.Defun(tf.string) - def _fn(h): - remote_iterator = tf.data.Iterator.from_string_handle( - h, ds_iterator.output_types, ds_iterator.output_shapes) - input_list = remote_iterator.get_next() - reshaped_input_list = [ - tf.reshape(input_list[i], shape=model_input_shapes[i]) - for i in range(len(input_list)) - ] - return reshaped_input_list - - return _fn, [ds_iterator_string_handle] - - -class BaseImagePreprocessor(InputPreprocessor): - """Base class for all image model preprocessors.""" - - def __init__(self, - batch_size, - output_shapes, - num_splits, - dtype, - train, - distortions, - resize_method, - shift_ratio=-1, - summary_verbosity=0, - distort_color_in_yiq=True, - fuse_decode_and_crop=True, - match_mlperf=False): - super(BaseImagePreprocessor, self).__init__(batch_size, output_shapes) - image_shape = output_shapes[0] - # image_shape is in form (batch_size, height, width, depth) - self.height = image_shape[1] - self.width = image_shape[2] - self.depth = image_shape[3] - self.num_splits = num_splits - self.dtype = dtype - self.train = train - self.resize_method = resize_method - self.shift_ratio = 
shift_ratio - self.distortions = distortions - self.distort_color_in_yiq = distort_color_in_yiq - self.fuse_decode_and_crop = fuse_decode_and_crop - if self.batch_size % self.num_splits != 0: - raise ValueError( - ('batch_size must be a multiple of num_splits: ' - 'batch_size %d, num_splits: %d') % - (self.batch_size, self.num_splits)) - self.batch_size_per_split = self.batch_size // self.num_splits - self.summary_verbosity = summary_verbosity - self.match_mlperf = match_mlperf - - def parse_and_preprocess(self, value, batch_position): - assert self.supports_datasets() - image_buffer, label_index, bbox, _ = parse_example_proto(value) - if self.match_mlperf: - bbox = tf.zeros((1, 0, 4), dtype=bbox.dtype) - image = self.preprocess(image_buffer, bbox, batch_position) - return (image, label_index) - - def preprocess(self, image_buffer, bbox, batch_position): - raise NotImplementedError('Must be implemented by subclass.') - - def create_dataset(self, - batch_size, - num_splits, - batch_size_per_split, - dataset, - subset, - train, - datasets_repeat_cached_sample, - num_threads=None, - datasets_use_caching=False, - datasets_parallel_interleave_cycle_length=None, - datasets_sloppy_parallel_interleave=False, - datasets_parallel_interleave_prefetch=None): - """Creates a dataset for the benchmark.""" - assert self.supports_datasets() - glob_pattern = dataset.tf_record_pattern(subset) - file_names = gfile.Glob(glob_pattern) - if not file_names: - raise ValueError('Found no files in --data_dir matching: {}' - .format(glob_pattern)) - ds = tf.data.TFRecordDataset.list_files(file_names) - ds = ds.apply( - interleave_ops.parallel_interleave( - tf.data.TFRecordDataset, - cycle_length=datasets_parallel_interleave_cycle_length or 10, - sloppy=datasets_sloppy_parallel_interleave, - prefetch_input_elements=datasets_parallel_interleave_prefetch)) - if datasets_repeat_cached_sample: - # Repeat a single sample element indefinitely to emulate memory-speed IO. 
- ds = ds.take(1).cache().repeat() - counter = tf.data.Dataset.range(batch_size) - counter = counter.repeat() - ds = tf.data.Dataset.zip((ds, counter)) - ds = ds.prefetch(buffer_size=batch_size) - if datasets_use_caching: - ds = ds.cache() - if train: - buffer_size = 10000 - ds = ds.apply( - tf.data.experimental.shuffle_and_repeat(buffer_size=buffer_size)) - else: - ds = ds.repeat() - ds = ds.apply( - batching.map_and_batch( - map_func=self.parse_and_preprocess, - batch_size=batch_size_per_split, - num_parallel_batches=num_splits)) - ds = ds.prefetch(buffer_size=num_splits) - if num_threads: - ds = threadpool.override_threadpool( - ds, - threadpool.PrivateThreadPool( - num_threads, display_name='input_pipeline_thread_pool')) - return ds - - -class RecordInputImagePreprocessor(BaseImagePreprocessor): - """Preprocessor for images with RecordInput format.""" - - def preprocess(self, image_buffer, bbox, batch_position): - """Preprocessing image_buffer as a function of its batch position.""" - if self.train: - image = train_image(image_buffer, self.height, self.width, bbox, - batch_position, self.resize_method, self.distortions, - None, summary_verbosity=self.summary_verbosity, - distort_color_in_yiq=self.distort_color_in_yiq, - fuse_decode_and_crop=self.fuse_decode_and_crop) - else: - image = tf.image.decode_jpeg( - image_buffer, channels=3, dct_method='INTEGER_FAST') - image = eval_image(image, self.height, self.width, batch_position, - self.resize_method, - summary_verbosity=self.summary_verbosity) - # Note: image is now float32 [height,width,3] with range [0, 255] - - # image = tf.cast(image, tf.uint8) # HACK TESTING - - if self.match_mlperf: - normalized = image - _CHANNEL_MEANS - else: - normalized = normalized_image(image) - return tf.cast(normalized, self.dtype) - - def minibatch(self, - dataset, - subset, - params, - shift_ratio=-1): - if shift_ratio < 0: - shift_ratio = self.shift_ratio - with tf.name_scope('batch_processing'): - # Build final results per 
split. - images = [[] for _ in range(self.num_splits)] - labels = [[] for _ in range(self.num_splits)] - if params.use_datasets: - ds = self.create_dataset( - self.batch_size, self.num_splits, self.batch_size_per_split, - dataset, subset, self.train, - datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, - num_threads=params.datasets_num_private_threads, - datasets_use_caching=params.datasets_use_caching, - datasets_parallel_interleave_cycle_length=( - params.datasets_parallel_interleave_cycle_length), - datasets_sloppy_parallel_interleave=( - params.datasets_sloppy_parallel_interleave), - datasets_parallel_interleave_prefetch=( - params.datasets_parallel_interleave_prefetch)) - ds_iterator = self.create_iterator(ds) - for d in xrange(self.num_splits): - images[d], labels[d] = ds_iterator.get_next() - - # TODO(laigd): consider removing the --use_datasets option, it should - # always use datasets. - else: - record_input = data_flow_ops.RecordInput( - file_pattern=dataset.tf_record_pattern(subset), - seed=301, - parallelism=64, - buffer_size=10000, - batch_size=self.batch_size, - shift_ratio=shift_ratio, - name='record_input') - records = record_input.get_yield_op() - records = tf.split(records, self.batch_size, 0) - records = [tf.reshape(record, []) for record in records] - for idx in xrange(self.batch_size): - value = records[idx] - (image, label) = self.parse_and_preprocess(value, idx) - split_index = idx % self.num_splits - labels[split_index].append(label) - images[split_index].append(image) - - for split_index in xrange(self.num_splits): - if not params.use_datasets: - images[split_index] = tf.parallel_stack(images[split_index]) - labels[split_index] = tf.concat(labels[split_index], 0) - images[split_index] = tf.reshape( - images[split_index], - shape=[self.batch_size_per_split, self.height, self.width, - self.depth]) - labels[split_index] = tf.reshape(labels[split_index], - [self.batch_size_per_split]) - return images, labels - - def 
supports_datasets(self): - return True - - -class ImagenetPreprocessor(RecordInputImagePreprocessor): - - def preprocess(self, image_buffer, bbox, batch_position): - # pylint: disable=g-import-not-at-top - try: - from official.resnet.imagenet_preprocessing import preprocess_image - except ImportError: - tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.') - raise - if self.train: - image = preprocess_image( - image_buffer, bbox, self.height, self.width, self.depth, - is_training=True) - else: - image = preprocess_image( - image_buffer, bbox, self.height, self.width, self.depth, - is_training=False) - return tf.cast(image, self.dtype) - - -class Cifar10ImagePreprocessor(BaseImagePreprocessor): - """Preprocessor for Cifar10 input images.""" - - def _distort_image(self, image): - """Distort one image for training a network. - - Adopted the standard data augmentation scheme that is widely used for - this dataset: the images are first zero-padded with 4 pixels on each side, - then randomly cropped to again produce distorted images; half of the images - are then horizontally mirrored. - - Args: - image: input image. - Returns: - distorted image. - """ - image = tf.image.resize_image_with_crop_or_pad( - image, self.height + 8, self.width + 8) - distorted_image = tf.random_crop(image, - [self.height, self.width, self.depth]) - # Randomly flip the image horizontally. 
- distorted_image = tf.image.random_flip_left_right(distorted_image) - if self.summary_verbosity >= 3: - tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0)) - return distorted_image - - def _eval_image(self, image): - """Get the image for model evaluation.""" - distorted_image = tf.image.resize_image_with_crop_or_pad( - image, self.width, self.height) - if self.summary_verbosity >= 3: - tf.summary.image('cropped.image', tf.expand_dims(distorted_image, 0)) - return distorted_image - - def preprocess(self, raw_image): - """Preprocessing raw image.""" - if self.summary_verbosity >= 3: - tf.summary.image('raw.image', tf.expand_dims(raw_image, 0)) - if self.train and self.distortions: - image = self._distort_image(raw_image) - else: - image = self._eval_image(raw_image) - normalized = normalized_image(image) - return tf.cast(normalized, self.dtype) - - def minibatch(self, - dataset, - subset, - params, - shift_ratio=-1): - # TODO(jsimsa): Implement datasets code path - del shift_ratio, params - with tf.name_scope('batch_processing'): - all_images, all_labels = dataset.read_data_files(subset) - all_images = tf.constant(all_images) - all_labels = tf.constant(all_labels) - input_image, input_label = tf.train.slice_input_producer( - [all_images, all_labels]) - input_image = tf.cast(input_image, self.dtype) - input_label = tf.cast(input_label, tf.int32) - # Ensure that the random shuffling has good mixing properties. - min_fraction_of_examples_in_queue = 0.4 - min_queue_examples = int(dataset.num_examples_per_epoch(subset) * - min_fraction_of_examples_in_queue) - raw_images, raw_labels = tf.train.shuffle_batch( - [input_image, input_label], batch_size=self.batch_size, - capacity=min_queue_examples + 3 * self.batch_size, - min_after_dequeue=min_queue_examples) - - images = [[] for i in range(self.num_splits)] - labels = [[] for i in range(self.num_splits)] - - # Create a list of size batch_size, each containing one image of the - # batch. 
Without the unstack call, raw_images[i] would still access the - # same image via a strided_slice op, but would be slower. - raw_images = tf.unstack(raw_images, axis=0) - raw_labels = tf.unstack(raw_labels, axis=0) - for i in xrange(self.batch_size): - split_index = i % self.num_splits - # The raw image read from data has the format [depth, height, width] - # reshape to the format returned by minibatch. - raw_image = tf.reshape(raw_images[i], - [dataset.depth, dataset.height, dataset.width]) - raw_image = tf.transpose(raw_image, [1, 2, 0]) - image = self.preprocess(raw_image) - images[split_index].append(image) - - labels[split_index].append(raw_labels[i]) - - for split_index in xrange(self.num_splits): - images[split_index] = tf.parallel_stack(images[split_index]) - labels[split_index] = tf.parallel_stack(labels[split_index]) - return images, labels - - -class COCOPreprocessor(BaseImagePreprocessor): - """Preprocessor for COCO dataset input images, boxes, and labels.""" - - def minibatch(self, - dataset, - subset, - params, - shift_ratio=-1): - del shift_ratio # Not used when using datasets instead of data_flow_ops - with tf.name_scope('batch_processing'): - ds = self.create_dataset( - self.batch_size, self.num_splits, self.batch_size_per_split, - dataset, subset, self.train, params.datasets_repeat_cached_sample) - ds_iterator = self.create_iterator(ds) - - # Training data: 4 tuple - # Validation data: 5 tuple - # See get_input_shapes in models/ssd_model.py for details. 
- input_len = 4 if subset == 'train' else 5 - input_lists = [[None for _ in range(self.num_splits)] - for _ in range(input_len)] - for d in xrange(self.num_splits): - input_list = ds_iterator.get_next() - for i in range(input_len): - input_lists[i][d] = input_list[i] - return input_lists - - def preprocess(self, data): - try: - import ssd_dataloader # pylint: disable=g-import-not-at-top - import ssd_constants # pylint: disable=g-import-not-at-top - from object_detection.core import preprocessor # pylint: disable=g-import-not-at-top - except ImportError: - raise ImportError('To use the COCO dataset, you must clone the ' - 'repo https://github.com/tensorflow/models and add ' - 'tensorflow/models and tensorflow/models/research to ' - 'the PYTHONPATH, and compile the protobufs by ' - 'following https://github.com/tensorflow/models/blob/' - 'master/research/object_detection/g3doc/installation.md' - '#protobuf-compilation') - image_buffer = data['image_buffer'] - boxes = data['groundtruth_boxes'] - classes = tf.reshape(data['groundtruth_classes'], [-1, 1]) - source_id = tf.string_to_number(data['source_id']) - raw_shape = data['raw_shape'] - - ssd_encoder = ssd_dataloader.Encoder() - - # Only 80 of the 90 COCO classes are used. - class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP) - classes = tf.gather(class_map, classes) - classes = tf.cast(classes, dtype=tf.float32) - - if self.train: - image, boxes, classes = ssd_dataloader.ssd_decode_and_crop( - image_buffer, boxes, classes, raw_shape) - # ssd_crop resizes and returns image of dtype float32 and does not change - # its range (i.e., value in between 0--255). Divide by 255. converts it - # to [0, 1] range. Not doing this before cropping to avoid dtype cast - # (which incurs additional memory copy). - image /= 255. 
- - image, boxes = preprocessor.random_horizontal_flip( - image=image, boxes=boxes) - # Random horizontal flip probability is 50% - # See https://github.com/tensorflow/models/blob/master/research/object_detection/core/preprocessor.py # pylint: disable=line-too-long - - image = ssd_dataloader.color_jitter( - image, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05) - image = ssd_dataloader.normalize_image(image) - image = tf.cast(image, self.dtype) - - encoded_returns = ssd_encoder.encode_labels(boxes, classes) - encoded_classes, encoded_boxes, num_matched_boxes = encoded_returns - - # Shape of image: [width, height, channel] - # Shape of encoded_boxes: [NUM_SSD_BOXES, 4] - # Shape of encoded_classes: [NUM_SSD_BOXES, 1] - # Shape of num_matched_boxes: [1] - return (image, encoded_boxes, encoded_classes, num_matched_boxes) - - else: - image = tf.image.decode_jpeg(image_buffer) - image = tf.image.resize_images( - image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE)) - # resize_image returns image of dtype float32 and does not change its - # range. Divide by 255 to convert image to [0, 1] range. - image /= 255. 
- - image = ssd_dataloader.normalize_image(image) - image = tf.cast(image, self.dtype) - - def trim_and_pad(inp_tensor): - """Limit the number of boxes, and pad if necessary.""" - inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES] - num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0] - inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]]) - return tf.reshape(inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES, - inp_tensor.get_shape()[1]]) - - boxes, classes = trim_and_pad(boxes), trim_and_pad(classes) - - # Shape of boxes: [MAX_NUM_EVAL_BOXES, 4] - # Shape of classes: [MAX_NUM_EVAL_BOXES, 1] - # Shape of source_id: [] (scalar tensor) - # Shape of raw_shape: [3] - return (image, boxes, classes, source_id, raw_shape) - - def create_dataset(self, - batch_size, - num_splits, - batch_size_per_split, - dataset, - subset, - train, - datasets_repeat_cached_sample, - num_threads=None, - datasets_use_caching=False, - datasets_parallel_interleave_cycle_length=None, - datasets_sloppy_parallel_interleave=False, - datasets_parallel_interleave_prefetch=None): - """Creates a dataset for the benchmark.""" - try: - import ssd_dataloader # pylint: disable=g-import-not-at-top - except ImportError: - raise ImportError('To use the COCO dataset, you must clone the ' - 'repo https://github.com/tensorflow/models and add ' - 'tensorflow/models and tensorflow/models/research to ' - 'the PYTHONPATH, and compile the protobufs by ' - 'following https://github.com/tensorflow/models/blob/' - 'master/research/object_detection/g3doc/installation.md' - '#protobuf-compilation') - assert self.supports_datasets() - - glob_pattern = dataset.tf_record_pattern(subset) - file_names = gfile.Glob(glob_pattern) - if not file_names: - raise ValueError('Found no files in --data_dir matching: {}' - .format(glob_pattern)) - - ds = tf.data.TFRecordDataset.list_files(file_names) - # TODO(haoyuzhang): Enable map+filter fusion after cl/218399112 in release - # options = tf.data.Options() - # 
options.experimental_map_and_filter_fusion = True - # ds = ds.with_options(options) - - ds = ds.apply( - interleave_ops.parallel_interleave( - tf.data.TFRecordDataset, - cycle_length=datasets_parallel_interleave_cycle_length or 10, - sloppy=datasets_sloppy_parallel_interleave)) - if datasets_repeat_cached_sample: - # Repeat a single sample element indefinitely to emulate memory-speed IO. - ds = ds.take(1).cache().repeat() - ds = ds.prefetch(buffer_size=batch_size) - if datasets_use_caching: - ds = ds.cache() - if train: - ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000)) - else: - ds = ds.repeat() - - ds = ds.map(ssd_dataloader.ssd_parse_example_proto, num_parallel_calls=64) - ds = ds.filter( - lambda data: tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0)) - ds = ds.apply( - batching.map_and_batch( - map_func=self.preprocess, - batch_size=batch_size_per_split, - num_parallel_batches=num_splits, - drop_remainder=train)) - ds = ds.prefetch(buffer_size=num_splits) - if num_threads: - ds = threadpool.override_threadpool( - ds, - threadpool.PrivateThreadPool( - num_threads, display_name='input_pipeline_thread_pool')) - return ds - - def supports_datasets(self): - return True - - -class LibrispeechPreprocessor(InputPreprocessor): - """Preprocessor for librispeech class for all image model preprocessors.""" - - def __init__(self, batch_size, output_shapes, num_splits, dtype, train, - **kwargs): - del kwargs - super(LibrispeechPreprocessor, self).__init__(batch_size, output_shapes) - self.num_splits = num_splits - self.dtype = dtype - self.is_train = train - if self.batch_size % self.num_splits != 0: - raise ValueError(('batch_size must be a multiple of num_splits: ' - 'batch_size %d, num_splits: %d') % (self.batch_size, - self.num_splits)) - self.batch_size_per_split = self.batch_size // self.num_splits - - def create_dataset(self, - batch_size, - num_splits, - batch_size_per_split, - dataset, - subset, - train, - 
datasets_repeat_cached_sample, - num_threads=None, - datasets_use_caching=False, - datasets_parallel_interleave_cycle_length=None, - datasets_sloppy_parallel_interleave=False, - datasets_parallel_interleave_prefetch=None): - """Creates a dataset for the benchmark.""" - # TODO(laigd): currently the only difference between this and the one in - # BaseImagePreprocessor is, this uses map() and padded_batch() while the - # latter uses tf.data.experimental.map_and_batch(). Try to merge them. - assert self.supports_datasets() - glob_pattern = dataset.tf_record_pattern(subset) - file_names = gfile.Glob(glob_pattern) - if not file_names: - raise ValueError('Found no files in --data_dir matching: {}' - .format(glob_pattern)) - ds = tf.data.TFRecordDataset.list_files(file_names) - ds = ds.apply( - tf.data.experimental.parallel_interleave( - tf.data.TFRecordDataset, - cycle_length=datasets_parallel_interleave_cycle_length or 10, - sloppy=datasets_sloppy_parallel_interleave, - prefetch_input_elements=datasets_parallel_interleave_prefetch)) - if datasets_repeat_cached_sample: - # Repeat a single sample element indefinitely to emulate memory-speed IO. 
- ds = ds.take(1).cache().repeat() - counter = tf.data.Dataset.range(batch_size) - counter = counter.repeat() - ds = tf.data.Dataset.zip((ds, counter)) - ds = ds.prefetch(buffer_size=batch_size) - if datasets_use_caching: - ds = ds.cache() - if train: - ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000)) - else: - ds = ds.repeat() - ds = ds.map(map_func=self.parse_and_preprocess, - num_parallel_calls=batch_size_per_split*num_splits) - ds = ds.padded_batch( - batch_size=batch_size_per_split, - padded_shapes=tuple([ - tf.TensorShape(output_shape[1:]) - for output_shape in self.output_shapes - ]), - drop_remainder=True) - ds = ds.prefetch(buffer_size=num_splits) - if num_threads: - ds = threadpool.override_threadpool( - ds, - threadpool.PrivateThreadPool( - num_threads, display_name='input_pipeline_thread_pool')) - return ds - - def minibatch(self, dataset, subset, params, shift_ratio=-1): - assert params.use_datasets - # TODO(laigd): unify this with CNNModel's minibatch() - # TODO(laigd): in distributed mode we use shift_ratio so different workers - # won't work on same inputs, so we should respect that. 
- del shift_ratio - with tf.name_scope('batch_processing'): - ds = self.create_dataset( - self.batch_size, - self.num_splits, - self.batch_size_per_split, - dataset, - subset, - self.is_train, - datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, - num_threads=params.datasets_num_private_threads, - datasets_use_caching=params.datasets_use_caching, - datasets_parallel_interleave_cycle_length=( - params.datasets_parallel_interleave_cycle_length), - datasets_sloppy_parallel_interleave=( - params.datasets_sloppy_parallel_interleave), - datasets_parallel_interleave_prefetch=( - params.datasets_parallel_interleave_prefetch)) - ds_iterator = self.create_iterator(ds) - - # The four lists are: input spectrogram feature, labels, input lengths, - # label lengths - input_lists = [[None for _ in range(self.num_splits)] for _ in range(4)] - for d in xrange(self.num_splits): - input_list = ds_iterator.get_next() - for i in range(4): - input_lists[i][d] = input_list[i] - - assert self.output_shapes == [ - input_lists[i][0].shape.as_list() for i in range(4) - ] - return tuple(input_lists) - - def supports_datasets(self): - return True - - def parse_and_preprocess(self, value, batch_position): - """Parse an TFRecord.""" - del batch_position - assert self.supports_datasets() - context_features = { - 'labels': tf.VarLenFeature(dtype=tf.int64), - 'input_length': tf.FixedLenFeature([], dtype=tf.int64), - 'label_length': tf.FixedLenFeature([], dtype=tf.int64), - } - sequence_features = { - 'features': tf.FixedLenSequenceFeature([161], dtype=tf.float32) - } - context_parsed, sequence_parsed = tf.parse_single_sequence_example( - serialized=value, - context_features=context_features, - sequence_features=sequence_features, - ) - - return [ - # Input - tf.expand_dims(sequence_parsed['features'], axis=2), - # Label - tf.cast( - tf.reshape( - tf.sparse_tensor_to_dense(context_parsed['labels']), [-1]), - dtype=tf.int32), - # Input length - tf.cast( - 
tf.reshape(context_parsed['input_length'], [1]), - dtype=tf.int32), - # Label length - tf.cast( - tf.reshape(context_parsed['label_length'], [1]), - dtype=tf.int32), - ] diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_constants.py b/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_constants.py deleted file mode 100644 index 77fa0149b..000000000 --- a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_constants.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright 2018 Google. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Central location for all constants related to MLPerf SSD.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# ============================================================================== -# == Model ===================================================================== -# ============================================================================== -IMAGE_SIZE = 300 - -# TODO(taylorrobie): MLPerf uses 80, but COCO documents 90. (RetinaNet uses 90) -# Update(taylorrobie): Labels > 81 show up in the pipeline. This will need to -# be resolved. -NUM_CLASSES = 81 # Including "no class". Not all COCO classes are used. - -# Note: Zero is special. (Background class) CLASS_INV_MAP[0] must be zero. 
-CLASS_INV_MAP = ( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, - 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, - 88, 89, 90) -_MAP = {j: i for i, j in enumerate(CLASS_INV_MAP)} -CLASS_MAP = tuple(_MAP.get(i, -1) for i in range(max(CLASS_INV_MAP) + 1)) - -NUM_SSD_BOXES = 8732 - -RESNET_DEPTH = 34 - -"""SSD specific""" -MIN_LEVEL = 3 -MAX_LEVEL = 8 - -FEATURE_SIZES = (38, 19, 10, 5, 3, 1) -STEPS = (8, 16, 32, 64, 100, 300) - -# https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py -SCALES = (21, 45, 99, 153, 207, 261, 315) -ASPECT_RATIOS = ((2,), (2, 3), (2, 3), (2, 3), (2,), (2,)) -NUM_DEFAULTS = (4, 6, 6, 6, 4, 4) -NUM_DEFAULTS_BY_LEVEL = {3: 4, 4: 6, 5: 6, 6: 6, 7: 4, 8: 4} -SCALE_XY = 0.1 -SCALE_HW = 0.2 -BOX_CODER_SCALES = (1 / SCALE_XY, 1 / SCALE_XY, 1 / SCALE_HW, 1 / SCALE_HW) -MATCH_THRESHOLD = 0.5 - -# https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683 -NORMALIZATION_MEAN = (0.485, 0.456, 0.406) -NORMALIZATION_STD = (0.229, 0.224, 0.225) - -# SSD Cropping -NUM_CROP_PASSES = 50 -CROP_MIN_IOU_CHOICES = (0, 0.1, 0.3, 0.5, 0.7, 0.9) -P_NO_CROP_PER_PASS = 1 / (len(CROP_MIN_IOU_CHOICES) + 1) - -# Hard example mining -NEGS_PER_POSITIVE = 3 - -# Batch normalization -BATCH_NORM_DECAY = 0.997 -BATCH_NORM_EPSILON = 1e-4 - - -# ============================================================================== -# == Optimizer ================================================================= -# ============================================================================== -LEARNING_RATE_SCHEDULE = ( - (0, 1e-3), - (160000, 1e-4), - (200000, 1e-5), -) -MOMENTUM = 0.9 -WEIGHT_DECAY = 5e-4 - - -# ============================================================================== -# == Keys 
====================================================================== -# ============================================================================== -BOXES = "boxes" -CLASSES = "classes" -NUM_MATCHED_BOXES = "num_matched_boxes" -IMAGE = "image" -SOURCE_ID = "source_id" -RAW_SHAPE = "raw_shape" -PRED_BOXES = "pred_boxes" -PRED_SCORES = "pred_scores" - - -# ============================================================================== -# == Evaluation ================================================================ -# ============================================================================== - -# Note: This is based on a batch size of 32 -# https://github.com/mlperf/reference/blob/master/single_stage_detector/ssd/train.py#L21-L37 -CHECKPOINT_FREQUENCY = 20000 -MAX_NUM_EVAL_BOXES = 200 -OVERLAP_CRITERIA = 0.5 # Used for nonmax supression -MIN_SCORE = 0.05 # Minimum score to be considered during evaluation. -DUMMY_SCORE = -1e5 # If no boxes are matched. - -ANNOTATION_FILE = "annotations/instances_val2017.json" -COCO_NUM_TRAIN_IMAGES = 118287 -COCO_NUM_VAL_IMAGES = 4952 diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_dataloader.py b/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_dataloader.py deleted file mode 100644 index 2f291fd85..000000000 --- a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_dataloader.py +++ /dev/null @@ -1,382 +0,0 @@ -# Copyright 2018 Google. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Data loader and processing.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import itertools as it -import math - -import numpy as np -import tensorflow as tf - -from object_detection.box_coders import faster_rcnn_box_coder -from object_detection.core import box_list -from object_detection.core import region_similarity_calculator -from object_detection.core import target_assigner -from object_detection.matchers import argmax_matcher -import ssd_constants - - -class DefaultBoxes(object): - """Default bounding boxes for 300x300 5 layer SSD. - - Default bounding boxes generation follows the order of (W, H, anchor_sizes). - Therefore, the tensor converted from DefaultBoxes has a shape of - [anchor_sizes, H, W, 4]. The last dimension is the box coordinates; 'ltrb' - is [ymin, xmin, ymax, xmax] while 'xywh' is [cy, cx, h, w]. 
- """ - - def __init__(self): - fk = ssd_constants.IMAGE_SIZE / np.array(ssd_constants.STEPS) - - self.default_boxes = [] - # size of feature and number of feature - for idx, feature_size in enumerate(ssd_constants.FEATURE_SIZES): - sk1 = ssd_constants.SCALES[idx] / ssd_constants.IMAGE_SIZE - sk2 = ssd_constants.SCALES[idx+1] / ssd_constants.IMAGE_SIZE - sk3 = math.sqrt(sk1*sk2) - all_sizes = [(sk1, sk1), (sk3, sk3)] - - for alpha in ssd_constants.ASPECT_RATIOS[idx]: - w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha) - all_sizes.append((w, h)) - all_sizes.append((h, w)) - - assert len(all_sizes) == ssd_constants.NUM_DEFAULTS[idx] - - for w, h in all_sizes: - for i, j in it.product(range(feature_size), repeat=2): - cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx] - box = tuple(np.clip(k, 0, 1) for k in (cy, cx, h, w)) - self.default_boxes.append(box) - - assert len(self.default_boxes) == ssd_constants.NUM_SSD_BOXES - - def to_ltrb(cy, cx, h, w): - return cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2 - - # For IoU calculation - self.default_boxes_ltrb = tuple(to_ltrb(*i) for i in self.default_boxes) - - def __call__(self, order='ltrb'): - if order == 'ltrb': return self.default_boxes_ltrb - if order == 'xywh': return self.default_boxes - - -def calc_iou_tensor(boxes1, boxes2): - """Calculation of IoU based on two boxes tensor. 
- - Reference to https://github.com/kuangliu/pytorch-ssd - - Args: - boxes1: shape (N, 4), four coordinates of N boxes - boxes2: shape (M, 4), four coordinates of M boxes - Returns: - IoU: shape (N, M), IoU of the i-th box in `boxes1` and j-th box in `boxes2` - """ - b1_left, b1_top, b1_right, b1_bottom = tf.split(boxes1, 4, axis=1) - b2_left, b2_top, b2_right, b2_bottom = tf.split(boxes2, 4, axis=1) - - # Shape of intersect_* (N, M) - intersect_left = tf.maximum(b1_left, tf.transpose(b2_left)) - intersect_top = tf.maximum(b1_top, tf.transpose(b2_top)) - intersect_right = tf.minimum(b1_right, tf.transpose(b2_right)) - intersect_bottom = tf.minimum(b1_bottom, tf.transpose(b2_bottom)) - - boxes1_area = (b1_right - b1_left) * (b1_bottom - b1_top) - boxes2_area = (b2_right - b2_left) * (b2_bottom - b2_top) - - intersect = tf.multiply(tf.maximum((intersect_right - intersect_left), 0), - tf.maximum((intersect_bottom - intersect_top), 0)) - union = boxes1_area + tf.transpose(boxes2_area) - intersect - iou = intersect / union - - return iou - - -def ssd_parse_example_proto(example_serialized): - """Parses an Example proto containing a training example of an image. - - Each Example proto contains the following fields that we care about: - - image/encoded: - image/source_id: tf.string - image/height: tf.int64 - image/width: tf.int64 - image/object/bbox/xmin: tf.VarLenFeature(tf.float32) - image/object/bbox/xmax: tf.VarLenFeature(tf.float32) - image/object/bbox/ymin: tf.VarLenFeature(tf.float32 - image/object/bbox/ymax: tf.VarLenFeature(tf.float32) - image/object/class/label: tf.VarLenFeature(tf.int64) - image/object/class/text: tf.VarLenFeature(tf.string) - - Complete decoder can be found in: - https://github.com/tensorflow/models/blob/master/research/object_detection/data_decoders/tf_example_decoder.py - - Args: - example_serialized: scalar Tensor tf.string containing a serialized - Example protocol buffer. 
- - Returns: - A dictionary with the following key-values: - image_buffer: Tensor tf.string containing the contents of a JPEG file. - groundtruth_boxes: Tensor tf.float32 of shape [num_boxes, 4], containing - coordinates of object bounding boxes. - groundtruth_classeS: Tensor tf.int64 of shape [num_boxes, 1], containing - class labels of objects. - source_id: unique image identifier. - raw_shape: [height, width, 3]. - """ - feature_map = { - 'image/encoded': tf.FixedLenFeature( - (), dtype=tf.string, default_value=''), - 'image/source_id': tf.FixedLenFeature((), tf.string, default_value=''), - 'image/height': tf.FixedLenFeature((), tf.int64, default_value=1), - 'image/width': tf.FixedLenFeature((), tf.int64, default_value=1), - 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), - 'image/object/class/label': tf.VarLenFeature(dtype=tf.int64), - } - features = tf.parse_single_example(example_serialized, feature_map) - - xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 1) - ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 1) - xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 1) - ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 1) - - image_buffer = features['image/encoded'] - # Bounding box coordinates should be in ltrb order - boxes = tf.concat([ymin, xmin, ymax, xmax], 1) - classes = tf.expand_dims(features['image/object/class/label'].values, 1) - source_id = features['image/source_id'] - raw_shape = tf.stack([features['image/height'], features['image/width'], 3]) - - return {'image_buffer': image_buffer, - 'groundtruth_boxes': boxes, - 'groundtruth_classes': classes, - 'source_id': source_id, - 'raw_shape': raw_shape} - - -def ssd_decode_and_crop(image_buffer, boxes, classes, raw_shape): - """Crop image 
randomly and decode the cropped region. - - This function will crop an image to meet the following requirements: - 1. height to width ratio between 0.5 and 2; - 2. IoUs of some boxes exceed specified threshold; - 3. At least one box center is in the cropped region. - We defer the jpeg decoding task until after the crop to avoid wasted work. - - Reference: https://github.com/chauhan-utk/ssd.DomainAdaptation - - Args: - image_buffer: Tensor tf.string containing the contents of a JPEG file. - boxes: Tensor tf.float32 of shape [num_boxes, 4], containing coordinates of - object bounding boxes. - classes: Tensor tf.int64 of shape [num_boxes, 1], containing class labels - of objects. - raw_shape: [height, width, 3]. - - Returns: - resized_image: decoded, cropped, and resized image Tensor tf.float32 of - shape [ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE, 3], value - range 0--255. - cropped_boxes: box coordinates for objects in the cropped region. - cropped_classes: class labels for objects in the cropped region. 
- """ - - num_boxes = tf.shape(boxes)[0] - - def no_crop_check(): - return (tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32) - < ssd_constants.P_NO_CROP_PER_PASS) - - def no_crop_proposal(): - return ( - tf.ones((), tf.bool), - tf.convert_to_tensor([0, 0, 1, 1], dtype=tf.float32), - tf.ones((num_boxes,), tf.bool), - ) - - def crop_proposal(): - rand_vec = lambda minval, maxval: tf.random_uniform( - shape=(ssd_constants.NUM_CROP_PASSES, 1), minval=minval, maxval=maxval, - dtype=tf.float32) - - width, height = rand_vec(0.3, 1), rand_vec(0.3, 1) - left, top = rand_vec(0, 1-width), rand_vec(0, 1-height) - - right = left + width - bottom = top + height - - ltrb = tf.concat([left, top, right, bottom], axis=1) - - min_iou = tf.random_shuffle(ssd_constants.CROP_MIN_IOU_CHOICES)[0] - ious = calc_iou_tensor(ltrb, boxes) - - # discard any bboxes whose center not in the cropped image - xc, yc = [tf.tile(0.5 * (boxes[:, i + 0] + boxes[:, i + 2])[tf.newaxis, :], - (ssd_constants.NUM_CROP_PASSES, 1)) for i in range(2)] - - masks = tf.reduce_all(tf.stack([ - tf.greater(xc, tf.tile(left, (1, num_boxes))), - tf.less(xc, tf.tile(right, (1, num_boxes))), - tf.greater(yc, tf.tile(top, (1, num_boxes))), - tf.less(yc, tf.tile(bottom, (1, num_boxes))), - ], axis=2), axis=2) - - # Checks of whether a crop is valid. - valid_aspect = tf.logical_and(tf.less(height/width, 2), - tf.less(width/height, 2)) - valid_ious = tf.reduce_all(tf.greater(ious, min_iou), axis=1, keepdims=True) - valid_masks = tf.reduce_any(masks, axis=1, keepdims=True) - - valid_all = tf.cast(tf.reduce_all(tf.concat( - [valid_aspect, valid_ious, valid_masks], axis=1), axis=1), tf.int32) - - # One indexed, as zero is needed for the case of no matches. - index = tf.range(1, 1 + ssd_constants.NUM_CROP_PASSES, dtype=tf.int32) - - # Either one-hot, or zeros if there is no valid crop. 
- selection = tf.equal(tf.reduce_max(index * valid_all), index) - - use_crop = tf.reduce_any(selection) - output_ltrb = tf.reduce_sum(tf.multiply(ltrb, tf.tile(tf.cast( - selection, tf.float32)[:, tf.newaxis], (1, 4))), axis=0) - output_masks = tf.reduce_any(tf.logical_and(masks, tf.tile( - selection[:, tf.newaxis], (1, num_boxes))), axis=0) - - return use_crop, output_ltrb, output_masks - - def proposal(*args): - return tf.cond( - pred=no_crop_check(), - true_fn=no_crop_proposal, - false_fn=crop_proposal, - ) - - _, crop_bounds, box_masks = tf.while_loop( - cond=lambda x, *_: tf.logical_not(x), - body=proposal, - loop_vars=[tf.zeros((), tf.bool), tf.zeros((4,), tf.float32), tf.zeros((num_boxes,), tf.bool)], - ) - - filtered_boxes = tf.boolean_mask(boxes, box_masks, axis=0) - - # Clip boxes to the cropped region. - filtered_boxes = tf.stack([ - tf.maximum(filtered_boxes[:, 0], crop_bounds[0]), - tf.maximum(filtered_boxes[:, 1], crop_bounds[1]), - tf.minimum(filtered_boxes[:, 2], crop_bounds[2]), - tf.minimum(filtered_boxes[:, 3], crop_bounds[3]), - ], axis=1) - - left = crop_bounds[0] - top = crop_bounds[1] - width = crop_bounds[2] - left - height = crop_bounds[3] - top - - cropped_boxes = tf.stack([ - (filtered_boxes[:, 0] - left) / width, - (filtered_boxes[:, 1] - top) / height, - (filtered_boxes[:, 2] - left) / width, - (filtered_boxes[:, 3] - top) / height, - ], axis=1) - - # crop_window containing integer coordinates of cropped region. A normalized - # coordinate value of y should be mapped to the image coordinate at - # y * (height - 1). 
- raw_shape = tf.cast(raw_shape, tf.float32) - crop_window = tf.stack([left * (raw_shape[0] - 1), - top * (raw_shape[1] - 1), - width * raw_shape[0], - height * raw_shape[1]]) - crop_window = tf.cast(crop_window, tf.int32) - - # Fused op only decodes the cropped portion of an image - cropped_image = tf.image.decode_and_crop_jpeg( - image_buffer, crop_window, channels=3) - - # Resize converts image dtype from uint8 to float32, without rescaling values. - resized_image = tf.image.resize_images( - cropped_image, [ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE]) - - cropped_classes = tf.boolean_mask(classes, box_masks, axis=0) - - return resized_image, cropped_boxes, cropped_classes - - -def color_jitter(image, brightness=0, contrast=0, saturation=0, hue=0): - """Distort the color of the image.""" - with tf.name_scope('distort_color'): - if brightness > 0: - image = tf.image.random_brightness(image, max_delta=brightness) - if contrast > 0: - image = tf.image.random_contrast( - image, lower=1-contrast, upper=1+contrast) - if saturation > 0: - image = tf.image.random_saturation( - image, lower=1-saturation, upper=1+saturation) - if hue > 0: - image = tf.image.random_hue(image, max_delta=hue) - return image - - -def normalize_image(image): - """Normalize the image to zero mean and unit variance. - - Args: - image: 3D tensor of type float32, value in [0, 1] - Returns: - image normalized by mean and stdev. 
- """ - image = tf.subtract(image, ssd_constants.NORMALIZATION_MEAN) - image = tf.divide(image, ssd_constants.NORMALIZATION_STD) - - return image - - -class Encoder(object): - """Encoder for SSD boxes and labels.""" - - def __init__(self): - similarity_calc = region_similarity_calculator.IouSimilarity() - matcher = argmax_matcher.ArgMaxMatcher( - matched_threshold=ssd_constants.MATCH_THRESHOLD, - unmatched_threshold=ssd_constants.MATCH_THRESHOLD, - negatives_lower_than_unmatched=True, - force_match_for_each_row=True) - - box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( - scale_factors=ssd_constants.BOX_CODER_SCALES) - - self.default_boxes = DefaultBoxes()('ltrb') - self.default_boxes = box_list.BoxList( - tf.convert_to_tensor(self.default_boxes)) - self.assigner = target_assigner.TargetAssigner( - similarity_calc, matcher, box_coder) - - def encode_labels(self, gt_boxes, gt_labels): - target_boxes = box_list.BoxList(gt_boxes) - encoded_classes, _, encoded_boxes, _, matches = self.assigner.assign( - self.default_boxes, target_boxes, gt_labels) - num_matched_boxes = tf.reduce_sum( - tf.cast(tf.not_equal(matches.match_results, -1), tf.float32)) - return encoded_classes, encoded_boxes, num_matched_boxes diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_model.py b/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_model.py deleted file mode 100644 index c8d67c24d..000000000 --- a/models/object_detection/tensorflow/ssd-resnet34/inference/fp32/ssd_model.py +++ /dev/null @@ -1,171 +0,0 @@ -# -# -*- coding: utf-8 -*- -# -# Copyright (c) 2019 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: EPL-2.0 -# - -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - - -"""SSD300 Model Configuration. - -References: - Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, - Cheng-Yang Fu, Alexander C. 
Berg - SSD: Single Shot MultiBox Detector - arXiv:1512.02325 - -Ported from MLPerf reference implementation: - https://github.com/mlperf/reference/tree/ssd/single_stage_detector/ssd - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import multiprocessing -import os -import re -import threading -import tensorflow as tf - -import ssd_constants - - -class SSD300Model(): - """Single Shot Multibox Detection (SSD) model for 300x300 image datasets.""" - - def __init__(self, data_dir, label_num=ssd_constants.NUM_CLASSES): - # For COCO dataset, 80 categories + 1 background = 81 labels - self.label_num = label_num - self.data_dir = data_dir - - # Collected predictions for eval stage. It maps each image id in eval - # dataset to a dict containing the following information: - # source_id: raw ID of image - # raw_shape: raw shape of image - # pred_box: encoded box coordinates of prediction - # pred_scores: scores of classes in prediction - self.predictions = {} - - # Global step when predictions are collected. - self.eval_global_step = 0 - - # Average precision. In asynchronous eval mode, this is the latest AP we - # get so far and may not be the results at current eval step. - self.eval_coco_ap = 0 - - # Process, queues, and thread for asynchronous evaluation. When enabled, - # create a separte process (async_eval_process) that continously pull - # intermediate results from the predictions queue (a multiprocessing queue), - # process them, and push final results into results queue (another - # multiprocessing queue). The main thread is responsible to push message - # into predictions queue, and start a separate thread to continuously pull - # messages from results queue to update final results. 
- # Message in predictions queue should be a tuple of two elements: - # (evaluation step, predictions) - # Message in results queue should be a tuple of two elements: - # (evaluation step, final results) - self.async_eval_process = None - self.async_eval_predictions_queue = None - self.async_eval_results_queue = None - self.async_eval_results_getter_thread = None - - # The MLPerf reference uses a starting lr of 1e-3 at bs=32. - self.base_lr_batch_size = 32 - - def skip_final_affine_layer(self): - return True - - def postprocess(self, results): - """Postprocess results returned from model.""" - try: - import coco_metric # pylint: disable=g-import-not-at-top - except ImportError: - raise ImportError('To use the COCO dataset, you must clone the ' - 'repo https://github.com/tensorflow/models and add ' - 'tensorflow/models and tensorflow/models/research to ' - 'the PYTHONPATH, and compile the protobufs by ' - 'following https://github.com/tensorflow/models/blob/' - 'master/research/object_detection/g3doc/installation.md' - '#protobuf-compilation ; To evaluate using COCO' - 'metric, download and install Python COCO API from' - 'https://github.com/cocodataset/cocoapi') - - pred_boxes = results[ssd_constants.PRED_BOXES] - pred_scores = results[ssd_constants.PRED_SCORES] - # TODO(haoyuzhang): maybe use these values for visualization. - # gt_boxes = results['gt_boxes'] - # gt_classes = results['gt_classes'] - source_id = results[ssd_constants.SOURCE_ID] - raw_shape = results[ssd_constants.RAW_SHAPE] - - # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due - # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting - # `num_eval_epochs` to 1 is not enough and will often miss some images. We - # expect user to set `num_eval_epochs` to >1, which will leave some unused - # images from previous steps in `predictions`. Here we check if we are doing - # eval at a new global step. 
- if results['global_step'] > self.eval_global_step: - self.eval_global_step = results['global_step'] - self.predictions.clear() - - for i, sid in enumerate(source_id): - self.predictions[int(sid)] = { - ssd_constants.PRED_BOXES: pred_boxes[i], - ssd_constants.PRED_SCORES: pred_scores[i], - ssd_constants.SOURCE_ID: source_id[i], - ssd_constants.RAW_SHAPE: raw_shape[i] - } - - # COCO metric calculates mAP only after a full epoch of evaluation. Return - # dummy results for top_N_accuracy to be compatible with benchmar_cnn.py. - if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES: - print('Got results for all {:d} eval examples. Calculate mAP...'.format( - ssd_constants.COCO_NUM_VAL_IMAGES)) - - annotation_file = os.path.join(self.data_dir, - ssd_constants.ANNOTATION_FILE) - # Size of predictions before decoding about 15--30GB, while size after - # decoding is 100--200MB. When using async eval mode, decoding takes - # 20--30 seconds of main thread time but is necessary to avoid OOM during - # inter-process communication. - decoded_preds = coco_metric.decode_predictions(self.predictions.values()) - self.predictions.clear() - - eval_results = coco_metric.compute_map(decoded_preds, annotation_file) - self.eval_coco_ap = eval_results['COCO/AP'] - ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.} - return ret - print('Got {:d} out of {:d} eval examples.' 
- ' Waiting for the remaining to calculate mAP...'.format( - len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES)) - return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.} diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/int8/__init__.py b/models/object_detection/tensorflow/ssd-resnet34/inference/int8/__init__.py new file mode 100644 index 000000000..159180624 --- /dev/null +++ b/models/object_detection/tensorflow/ssd-resnet34/inference/int8/__init__.py @@ -0,0 +1,20 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + diff --git a/models/object_detection/tensorflow/ssd-resnet34/inference/int8/infer_detections.py b/models/object_detection/tensorflow/ssd-resnet34/inference/int8/infer_detections.py new file mode 100644 index 000000000..657469658 --- /dev/null +++ b/models/object_detection/tensorflow/ssd-resnet34/inference/int8/infer_detections.py @@ -0,0 +1,211 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: EPL-2.0
#

import os
import time
from argparse import ArgumentParser

import tensorflow as tf

import benchmark_cnn
import datasets
import ssd_constants
from models import ssd_model
from preprocessing import COCOPreprocessor

IMAGE_SIZE = 300


class ssd_resnet34_infer:
    """Command-line driver that runs a frozen SSD-ResNet34 graph.

    Two modes are selected by ``--accuracy-only``:

    * benchmark mode (default): feeds one synthetic batch repeatedly and
      reports time per batch and samples per second;
    * accuracy mode: feeds the COCO validation set and hands every batch of
      predictions to ``SSD300Model.postprocess``.
    """

    def __init__(self):
        """Parse the command line, load the frozen graph and resolve tensors."""
        arg_parser = ArgumentParser(description='Parse args')

        arg_parser.add_argument('-b', "--batch-size",
                                help="Specify the batch size. If this "
                                     "parameter is not specified or is -1, the "
                                     "largest ideal batch size for the model will "
                                     "be used.",
                                dest="batch_size", type=int, default=-1)

        arg_parser.add_argument('-e', "--inter-op-parallelism-threads",
                                help='The number of inter-thread.',
                                dest='num_inter_threads', type=int, default=0)

        arg_parser.add_argument('-a', "--intra-op-parallelism-threads",
                                help='The number of intra-thread.',
                                dest='num_intra_threads', type=int, default=0)

        # The graph is loaded unconditionally below, so make the flag
        # mandatory: argparse then reports a clear usage error instead of the
        # TypeError that concatenating None to a string used to raise.
        arg_parser.add_argument('-g', "--input-graph",
                                help='Specify the input graph.',
                                dest='input_graph', required=True)

        arg_parser.add_argument('-d', "--data-location",
                                help='Specify the location of the data. '
                                     'If this parameter is not specified, '
                                     'the benchmark will use random/dummy data.',
                                dest="data_location", default=None)

        arg_parser.add_argument('-r', "--accuracy-only",
                                help='For accuracy measurement only.',
                                dest='accuracy_only', action='store_true')

        arg_parser.add_argument("--results-file-path",
                                help="File path for the inference results",
                                dest="results_file_path", default=None)

        # parse the arguments
        self.args = arg_parser.parse_args()

        # -1 is the documented "use the default" sentinel.
        if self.args.batch_size == -1:
            self.args.batch_size = 64
        # Any other non-positive value would divide by zero (or yield a
        # meaningless batch count) below, so reject it up front.
        if self.args.batch_size < 1:
            arg_parser.error("--batch-size must be a positive integer or -1")

        self.freeze_graph = self.load_graph(self.args.input_graph)
        self.config = tf.ConfigProto()
        self.config.intra_op_parallelism_threads = self.args.num_intra_threads
        self.config.inter_op_parallelism_threads = self.args.num_inter_threads

        # Number of batches needed to cover every validation image once
        # (ceiling division).
        full_batches, leftover = divmod(ssd_constants.COCO_NUM_VAL_IMAGES,
                                        self.args.batch_size)
        self.num_batches = full_batches + (1 if leftover else 0)

        input_layer = 'input'
        # accuracy_check() treats the first output as predicted boxes and the
        # second as predicted scores.
        output_layers = ['v/stack', 'v/Softmax']
        self.input_tensor = self.freeze_graph.get_tensor_by_name(input_layer + ":0")
        self.output_tensors = [self.freeze_graph.get_tensor_by_name(x + ":0")
                               for x in output_layers]

    def load_graph(self, frozen_graph_filename):
        """Read a serialized ``GraphDef`` and import it into a new ``tf.Graph``.

        Args:
            frozen_graph_filename: path of the binary protobuf file.

        Returns:
            The new graph; nodes are imported with an empty name prefix.
        """
        print('load graph from: ' + frozen_graph_filename)
        with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        # Import into a fresh graph so nothing leaks into the default graph.
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def, name='')
        return graph

    def run_benchmark(self):
        """Time inference on one synthetic batch and print throughput."""
        print("Inference with dummy data.")
        with tf.Session(graph=self.freeze_graph, config=self.config) as sess:

            # Materialize the random batch once so that data generation is
            # not part of the timed region.
            input_images = sess.run(tf.truncated_normal(
                [self.args.batch_size, IMAGE_SIZE, IMAGE_SIZE, 3],
                dtype=tf.float32,
                stddev=10,
                name='synthetic_images'))

            total_iter = 1000
            warmup_iter = 200
            ttime = 0.0

            print('total iteration is {0}'.format(str(total_iter)))
            print('warm up iteration is {0}'.format(str(warmup_iter)))

            for step in range(total_iter):
                start_time = time.time()
                _ = sess.run(self.output_tensors, {self.input_tensor: input_images})
                end_time = time.time()

                duration = end_time - start_time
                if (step + 1) % 10 == 0:
                    print('steps = {0}, {1} sec'.format(str(step), str(duration)))

                # Only iterations after the warm-up count towards the total.
                if step + 1 > warmup_iter:
                    ttime += duration

            total_batches = total_iter - warmup_iter
            print('Batchsize: {0}'.format(str(self.args.batch_size)))
            print('Time spent per BATCH: {0:10.4f} ms'.format(ttime / total_batches * 1000))
            print('Total samples/sec: {0:10.4f} samples/s'.format(
                total_batches * self.args.batch_size / ttime))

    def __get_input(self):
        """Build the COCO validation input ops in the current default graph.

        Also stores the dataset on ``self.dataset`` and a minimal params
        object on ``self.params``.

        Returns:
            Whatever ``COCOPreprocessor.minibatch`` returns for the
            'validation' subset.
        """
        preprocessor = COCOPreprocessor(
            batch_size=self.args.batch_size,
            output_shapes=[[self.args.batch_size, IMAGE_SIZE, IMAGE_SIZE, 3]],
            num_splits=1,
            dtype=tf.float32,
            train=False,
            distortions=True,
            resize_method=None,
            shift_ratio=0
        )

        # Stand-in params object exposing only this one attribute.
        class params:
            datasets_repeat_cached_sample = False

        self.params = params()
        self.dataset = datasets.create_dataset(self.args.data_location, 'coco')

        return preprocessor.minibatch(
            self.dataset,
            subset='validation',
            params=self.params,
            shift_ratio=0)

    def accuracy_check(self):
        """Run the frozen graph over the validation set and post-process it.

        A second session on the default graph produces the input batches; the
        inference session runs on the frozen graph.
        """
        print(self.args)
        input_list = self.__get_input()
        ds_init = tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS)

        ds_sess = tf.Session()
        try:
            params = benchmark_cnn.make_params(data_dir=self.args.data_location)
            self.model = ssd_model.SSD300Model(params=params)

            print("Inference for accuracy check.")
            with tf.Session(graph=self.freeze_graph, config=self.config) as sess:
                ds_sess.run(ds_init)
                global_step = 0

                for _ in range(self.num_batches):
                    results = {}
                    input_lists = ds_sess.run(input_list)
                    # NOTE(review): positions 0 / 3 / 4 are used as images,
                    # source ids and raw shapes - confirm against the tuple
                    # layout returned by COCOPreprocessor.minibatch.
                    input_images = input_lists[0][0]
                    input_ids = input_lists[3][0]
                    input_raw_shapes = input_lists[4][0]

                    result = sess.run(self.output_tensors,
                                      {self.input_tensor: input_images})
                    # Make global_step available in results for postprocessing.
                    results['global_step'] = global_step
                    results[ssd_constants.SOURCE_ID] = input_ids
                    results[ssd_constants.RAW_SHAPE] = input_raw_shapes

                    results[ssd_constants.PRED_BOXES] = result[0]
                    results[ssd_constants.PRED_SCORES] = result[1]

                    results = self.model.postprocess(results)
        finally:
            # The data session was previously never closed; release it even
            # when inference or post-processing raises.
            ds_sess.close()

    def run(self):
        """Dispatch to accuracy or benchmark mode based on the parsed flags."""
        if self.args.accuracy_only:
            self.accuracy_check()
        else:
            self.run_benchmark()


if __name__ == "__main__":
    infer = ssd_resnet34_infer()
    infer.run()
+# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/models/object_detection/tensorflow/ssd_vgg16/inference/__init__.py b/models/object_detection/tensorflow/ssd_vgg16/inference/__init__.py new file mode 100644 index 000000000..d9c4123de --- /dev/null +++ b/models/object_detection/tensorflow/ssd_vgg16/inference/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/models/object_detection/tensorflow/ssd_vgg16/inference/anchor_manipulator.py b/models/object_detection/tensorflow/ssd_vgg16/inference/anchor_manipulator.py new file mode 100644 index 000000000..f52acdc08 --- /dev/null +++ b/models/object_detection/tensorflow/ssd_vgg16/inference/anchor_manipulator.py @@ -0,0 +1,353 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# =============================================================================

#
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: EPL-2.0
#
import math

import tensorflow as tf
import numpy as np

from tensorflow.contrib.image.python.ops import image_ops

def areas(gt_bboxes):
    '''Return the area of every box as a column tensor.

    gt_bboxes is split into four columns along axis 1, read as
    (ymin, xmin, ymax, xmax); the result is (xmax - xmin) * (ymax - ymin)
    for each row.
    '''
    with tf.name_scope('bboxes_areas', values=[gt_bboxes]):
        ymin, xmin, ymax, xmax = tf.split(gt_bboxes, 4, axis=1)
        return (xmax - xmin) * (ymax - ymin)

def intersection(gt_bboxes, default_bboxes):
    '''Return the pairwise intersection areas of two sets of boxes.

    Both inputs are split into (ymin, xmin, ymax, xmax) columns along axis 1.
    The columns of default_bboxes are transposed so that broadcasting yields
    one row per box in gt_bboxes and one column per box in default_bboxes.
    Negative extents (non-overlapping boxes) are clamped to zero.
    '''
    with tf.name_scope('bboxes_intersection', values=[gt_bboxes, default_bboxes]):
        # one column per coordinate: rows of gt_bboxes x 1
        ymin, xmin, ymax, xmax = tf.split(gt_bboxes, 4, axis=1)
        # transposed columns: 1 x rows of default_bboxes
        # (the gt_ prefix here names coordinates taken from default_bboxes)
        gt_ymin, gt_xmin, gt_ymax, gt_xmax = [tf.transpose(b, perm=[1, 0]) for b in tf.split(default_bboxes, 4, axis=1)]
        # broadcast here to generate the full matrix
        int_ymin = tf.maximum(ymin, gt_ymin)
        int_xmin = tf.maximum(xmin, gt_xmin)
        int_ymax = tf.minimum(ymax, gt_ymax)
        int_xmax = tf.minimum(xmax, gt_xmax)
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)

        return h * w
def iou_matrix(gt_bboxes, default_bboxes):
    '''Return the pairwise intersection-over-union of two sets of boxes.

    Rows follow gt_bboxes and columns follow default_bboxes. Entries whose
    union is exactly zero are set to zero instead of being divided.
    '''
    with tf.name_scope('iou_matrix', values = [gt_bboxes, default_bboxes]):
        inter_vol = intersection(gt_bboxes, default_bboxes)
        # broadcast: column of gt areas + row of default areas - intersection
        union_vol = areas(gt_bboxes) + tf.transpose(areas(default_bboxes), perm=[1, 0]) - inter_vol

        return tf.where(tf.equal(union_vol, 0.0),
                        tf.zeros_like(inter_vol), tf.truediv(inter_vol, union_vol))

def do_dual_max_match(overlap_matrix, low_thres, high_thres, ignore_between=True, gt_max_first=True):
    '''Match anchors to ground-truth boxes from both sides of the overlap matrix.

    overlap_matrix: num_gt * num_anchors
    low_thres, high_thres: an anchor's best overlap below low_thres, or in
        [low_thres, high_thres), decides whether it is negative or ignored.
    ignore_between: when True, anchors below low_thres are negative (-1) and
        anchors in [low_thres, high_thres) are ignored (-2); when False the
        two roles are swapped.
    gt_max_first: when True, every ground truth claims its best anchor
        regardless of the anchor-side result; when False, a ground truth only
        claims its best anchor if no anchor was matched to it from the
        anchors' side.

    Returns a pair (match_indices, selected_scores), one entry per anchor:
    match_indices holds the matched ground-truth row, or -1 / -2 as described
    above; selected_scores holds the overlap gathered at the chosen
    (ground truth, anchor) position.
    '''
    with tf.name_scope('dual_max_match', values=[overlap_matrix]):
        # first match from anchors' side
        anchors_to_gt = tf.argmax(overlap_matrix, axis=0)
        # the matching degree
        match_values = tf.reduce_max(overlap_matrix, axis=0)

        #positive_mask = tf.greater(match_values, high_thres)
        less_mask = tf.less(match_values, low_thres)
        between_mask = tf.logical_and(tf.less(match_values, high_thres), tf.greater_equal(match_values, low_thres))
        negative_mask = less_mask if ignore_between else between_mask
        ignore_mask = between_mask if ignore_between else less_mask
        # fill all negative positions with -1 and all ignored positions with -2
        match_indices = tf.where(negative_mask, -1 * tf.ones_like(anchors_to_gt), anchors_to_gt)
        match_indices = tf.where(ignore_mask, -2 * tf.ones_like(match_indices), match_indices)

        # negative values have no effect in tf.one_hot, that means all zeros along that axis
        # so all positive match positions in anchors_to_gt_mask are 1, all others are 0
        anchors_to_gt_mask = tf.one_hot(tf.clip_by_value(match_indices, -1, tf.cast(tf.shape(overlap_matrix)[0], tf.int64)),
                                        tf.shape(overlap_matrix)[0], on_value=1, off_value=0, axis=0, dtype=tf.int32)
        # match from ground truth's side
        gt_to_anchors = tf.argmax(overlap_matrix, axis=1)

        if gt_max_first:
            # the max match from ground truth's side has higher priority
            left_gt_to_anchors_mask = tf.one_hot(gt_to_anchors, tf.shape(overlap_matrix)[1], on_value=1, off_value=0, axis=1, dtype=tf.int32)
        else:
            # the max match from anchors' side has higher priority
            # use the match result from ground truth's side only when the matching degree from anchors' side is lower than the positive threshold
            # NOTE(review): this branch casts the mask to int64 while the branch above builds int32,
            # and `keep_dims` is the older spelling of `keepdims` - confirm against the TensorFlow version in use.
            left_gt_to_anchors_mask = tf.cast(tf.logical_and(tf.reduce_max(anchors_to_gt_mask, axis=1, keep_dims=True) < 1,
                                                            tf.one_hot(gt_to_anchors, tf.shape(overlap_matrix)[1],
                                                                        on_value=True, off_value=False, axis=1, dtype=tf.bool)
                                                            ), tf.int64)
        # can not use left_gt_to_anchors_mask here, because many ground truths may match one anchor; we should pick the highest one even when we are merging matches from the ground truth side
        left_gt_to_anchors_scores = overlap_matrix * tf.to_float(left_gt_to_anchors_mask)
        # merge matching results from ground truth's side with the original matching results from anchors' side
        # then select all the overlap scores of those matching pairs
        selected_scores = tf.gather_nd(overlap_matrix, tf.stack([tf.where(tf.reduce_max(left_gt_to_anchors_mask, axis=0) > 0,
                                                                            tf.argmax(left_gt_to_anchors_scores, axis=0),
                                                                            anchors_to_gt),
                                                                    tf.range(tf.cast(tf.shape(overlap_matrix)[1], tf.int64))], axis=1))
        # return the matching results for both foreground anchors and background anchors, also with overlap scores
        return tf.where(tf.reduce_max(left_gt_to_anchors_mask, axis=0) > 0,
                        tf.argmax(left_gt_to_anchors_scores, axis=0),
                        match_indices), selected_scores

# def save_anchors(bboxes, labels, anchors_point):
#     if not hasattr(save_image_with_bbox, "counter"):
#         save_image_with_bbox.counter = 0  # it doesn't exist yet, so initialize it
#     save_image_with_bbox.counter += 1

#     np.save('./debug/bboxes_{}.npy'.format(save_image_with_bbox.counter), np.copy(bboxes))
#     np.save('./debug/labels_{}.npy'.format(save_image_with_bbox.counter), np.copy(labels))
#     np.save('./debug/anchors_{}.npy'.format(save_image_with_bbox.counter), np.copy(anchors_point))
#     return
class AnchorEncoder(object):
    """Encode ground-truth boxes against anchors and decode box predictions.

    Boxes in corner form are ordered (ymin, xmin, ymax, xmax); boxes in center
    form are ordered (center_y, center_x, height, width).
    """

    def __init__(self, allowed_borders, positive_threshold, ignore_threshold, prior_scaling, clip=False):
        """Store the matching configuration.

        Args:
            allowed_borders: per-layer margin by which an anchor may extend
                outside the [0, 1] image range and still take part in matching.
            positive_threshold: passed to ``do_dual_max_match`` as its upper
                threshold.
            ignore_threshold: passed to ``do_dual_max_match`` as its lower
                threshold.
            prior_scaling: four divisors applied to the encoded
                (cy, cx, h, w) targets, in that order.
            clip: when true, anchor corners are clipped to [0, 1] before
                matching.
        """
        super(AnchorEncoder, self).__init__()
        # Anchors in center form; filled in by encode_all_anchors() and
        # required by decode_all_anchors().
        self._all_anchors = None
        self._allowed_borders = allowed_borders
        self._positive_threshold = positive_threshold
        self._ignore_threshold = ignore_threshold
        self._prior_scaling = prior_scaling
        self._clip = clip

    def center2point(self, center_y, center_x, height, width):
        """Convert center form to corner form (ymin, xmin, ymax, xmax)."""
        return (center_y - height / 2., center_x - width / 2.,
                center_y + height / 2., center_x + width / 2.)

    def point2center(self, ymin, xmin, ymax, xmax):
        """Convert corner form to center form (center_y, center_x, height, width)."""
        height, width = (ymax - ymin), (xmax - xmin)
        return ymin + height / 2., xmin + width / 2., height, width

    def _flatten_anchor_corners(self, all_anchors):
        """Return flat (ymin, xmin, ymax, xmax) tensors covering every layer's anchors."""
        list_anchors_ymin = []
        list_anchors_xmin = []
        list_anchors_ymax = []
        list_anchors_xmax = []
        for anchor in all_anchors:
            anchors_ymin_, anchors_xmin_, anchors_ymax_, anchors_xmax_ = self.center2point(
                anchor[0], anchor[1], anchor[2], anchor[3])

            list_anchors_ymin.append(tf.reshape(anchors_ymin_, [-1]))
            list_anchors_xmin.append(tf.reshape(anchors_xmin_, [-1]))
            list_anchors_ymax.append(tf.reshape(anchors_ymax_, [-1]))
            list_anchors_xmax.append(tf.reshape(anchors_xmax_, [-1]))

        anchors_ymin = tf.concat(list_anchors_ymin, 0, name='concat_ymin')
        anchors_xmin = tf.concat(list_anchors_xmin, 0, name='concat_xmin')
        anchors_ymax = tf.concat(list_anchors_ymax, 0, name='concat_ymax')
        anchors_xmax = tf.concat(list_anchors_xmax, 0, name='concat_xmax')
        return anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax

    def _decode_boxes(self, pred_location, anchors, num_anchors_per_layer):
        """Invert the target encoding and split the corner-form boxes per layer."""
        anchor_cy, anchor_cx, anchor_h, anchor_w = anchors

        pred_h = tf.exp(pred_location[:, -2] * self._prior_scaling[2]) * anchor_h
        pred_w = tf.exp(pred_location[:, -1] * self._prior_scaling[3]) * anchor_w
        pred_cy = pred_location[:, 0] * self._prior_scaling[0] * anchor_h + anchor_cy
        pred_cx = pred_location[:, 1] * self._prior_scaling[1] * anchor_w + anchor_cx

        return tf.split(tf.stack(self.center2point(pred_cy, pred_cx, pred_h, pred_w), axis=-1),
                        num_anchors_per_layer, axis=0)

    def encode_all_anchors(self, labels, bboxes, all_anchors, all_num_anchors_depth, all_num_anchors_spatial, debug=False):
        """Match ground-truth boxes to anchors and build regression targets.

        As a side effect the anchors, in center form, are cached on the
        instance for a later ``decode_all_anchors`` call.

        Args:
            labels: per-box class labels; gathered by match index and
                multiplied by an int64 mask.
            bboxes: ground-truth boxes in corner form, relative to the image.
            all_anchors: one (y, x, h, w) tuple per layer; y, x, h, w are all
                relative to the original image size.
            all_num_anchors_depth: per-layer number of anchors per location.
            all_num_anchors_spatial: per-layer number of locations.
            debug: when true, the returned targets carry the raw anchor
                corners instead of the encoded offsets.

        Returns:
            ``(gt_targets, gt_labels, gt_scores)``. ``gt_targets`` is zeroed
            at every non-positive anchor; ``gt_labels`` is 0 at negative
            anchors and -1 at ignored anchors; ``gt_scores`` is the overlap
            score returned by ``do_dual_max_match``.
        """
        assert (len(all_num_anchors_depth)==len(all_num_anchors_spatial)) and (len(all_num_anchors_depth)==len(all_anchors)), 'inconsist num layers for anchors.'
        with tf.name_scope('encode_all_anchors'):
            anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax = self._flatten_anchor_corners(all_anchors)

            if self._clip:
                anchors_ymin = tf.clip_by_value(anchors_ymin, 0., 1.)
                anchors_xmin = tf.clip_by_value(anchors_xmin, 0., 1.)
                anchors_ymax = tf.clip_by_value(anchors_ymax, 0., 1.)
                anchors_xmax = tf.clip_by_value(anchors_xmax, 0., 1.)

            # Repeat each layer's border once per anchor of that layer so the
            # border vector lines up with the flattened anchors.
            tiled_allowed_borders = []
            for ind in range(len(all_anchors)):
                tiled_allowed_borders.extend(
                    [self._allowed_borders[ind]] * all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])
            anchor_allowed_borders = tf.stack(tiled_allowed_borders, 0, name='concat_allowed_borders')

            # Anchors reaching further outside the image than their allowed
            # border are excluded from matching.
            inside_mask = tf.logical_and(tf.logical_and(anchors_ymin > -anchor_allowed_borders * 1.,
                                                        anchors_xmin > -anchor_allowed_borders * 1.),
                                         tf.logical_and(anchors_ymax < (1. + anchor_allowed_borders * 1.),
                                                        anchors_xmax < (1. + anchor_allowed_borders * 1.)))

            anchors_point = tf.stack([anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)

            overlap_matrix = iou_matrix(bboxes, anchors_point) * tf.cast(tf.expand_dims(inside_mask, 0), tf.float32)
            matched_gt, gt_scores = do_dual_max_match(overlap_matrix, self._ignore_threshold, self._positive_threshold)
            # get all positive matching positions
            matched_gt_mask = matched_gt > -1
            matched_indices = tf.clip_by_value(matched_gt, 0, tf.int64.max)
            # labels gathered here are arbitrary at non-positive positions
            gt_labels = tf.gather(labels, matched_indices)
            # filter the invalid labels
            gt_labels = gt_labels * tf.cast(matched_gt_mask, tf.int64)
            # set the ignored positions to -1
            gt_labels = gt_labels + (-1 * tf.cast(matched_gt < -1, tf.int64))

            gt_ymin, gt_xmin, gt_ymax, gt_xmax = tf.unstack(tf.gather(bboxes, matched_indices), 4, axis=-1)

            # transform to center / size.
            gt_cy, gt_cx, gt_h, gt_w = self.point2center(gt_ymin, gt_xmin, gt_ymax, gt_xmax)
            anchor_cy, anchor_cx, anchor_h, anchor_w = self.point2center(anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax)
            # encode features.
            # prior_scaling balances the regression loss of the center
            # against that of the width (or height)
            gt_cy = (gt_cy - anchor_cy) / anchor_h / self._prior_scaling[0]
            gt_cx = (gt_cx - anchor_cx) / anchor_w / self._prior_scaling[1]
            gt_h = tf.log(gt_h / anchor_h) / self._prior_scaling[2]
            gt_w = tf.log(gt_w / anchor_w) / self._prior_scaling[3]
            # these are the regression targets, still arbitrary at
            # non-positive positions
            if debug:
                gt_targets = tf.stack([anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)
            else:
                gt_targets = tf.stack([gt_cy, gt_cx, gt_h, gt_w], axis=-1)
            # set all targets of non-positive positions to 0
            gt_targets = tf.expand_dims(tf.cast(matched_gt_mask, tf.float32), -1) * gt_targets
            self._all_anchors = (anchor_cy, anchor_cx, anchor_h, anchor_w)
            return gt_targets, gt_labels, gt_scores

    def decode_all_anchors(self, pred_location, num_anchors_per_layer):
        """Decode predictions using the anchors cached by ``encode_all_anchors``.

        Args:
            pred_location: predicted offsets, indexed as ``[:, 0]`` (cy),
                ``[:, 1]`` (cx), ``[:, -2]`` (h) and ``[:, -1]`` (w).
            num_anchors_per_layer: sizes used to split the result along axis 0.

        Returns:
            A list with one tensor per layer; the last axis is ordered
            ymin, xmin, ymax, xmax.
        """
        assert self._all_anchors is not None, 'no anchors to decode.'
        with tf.name_scope('decode_all_anchors', values=[pred_location]):
            return self._decode_boxes(pred_location, self._all_anchors, num_anchors_per_layer)

    def ext_decode_all_anchors(self, pred_location, all_anchors, all_num_anchors_depth, all_num_anchors_spatial):
        """Decode predictions against explicitly supplied anchors.

        Unlike ``decode_all_anchors`` this does not rely on a previous
        ``encode_all_anchors`` call; the anchors are rebuilt from
        ``all_anchors`` and are never clipped.

        Returns:
            A list with one tensor per layer; the last axis is ordered
            ymin, xmin, ymax, xmax.
        """
        assert (len(all_num_anchors_depth)==len(all_num_anchors_spatial)) and (len(all_num_anchors_depth)==len(all_anchors)), 'inconsist num layers for anchors.'
        with tf.name_scope('ext_decode_all_anchors', values=[pred_location]):
            num_anchors_per_layer = [all_num_anchors_depth[ind] * all_num_anchors_spatial[ind]
                                     for ind in range(len(all_anchors))]

            anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax = self._flatten_anchor_corners(all_anchors)
            anchors_center = self.point2center(anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax)

            return self._decode_boxes(pred_location, anchors_center, num_anchors_per_layer)


class AnchorCreator(object):
    """Generate the anchors of every feature layer from a static configuration."""

    def __init__(self, img_shape, layers_shapes, anchor_scales, extra_anchor_scales, anchor_ratios, layer_steps):
        """Store the per-layer anchor configuration.

        Args:
            img_shape: (height, width) of the input image.
            layers_shapes: per-layer (height, width) of the feature map.
            anchor_scales: per-layer scales combined with every aspect ratio.
            extra_anchor_scales: per-layer scales of the extra square anchors.
            anchor_ratios: per-layer aspect ratios.
            layer_steps: per-layer step used to map a feature-map cell onto
                the image.
        """
        super(AnchorCreator, self).__init__()
        # img_shape -> (height, width)
        self._img_shape = img_shape
        self._layers_shapes = layers_shapes
        self._anchor_scales = anchor_scales
        self._extra_anchor_scales = extra_anchor_scales
        self._anchor_ratios = anchor_ratios
        self._layer_steps = layer_steps
        # Every layer places its anchor centers half a cell into the cell.
        self._anchor_offset = [0.5] * len(self._layers_shapes)

    @staticmethod
    def _anchor_sizes(anchor_scale, extra_anchor_scale, anchor_ratio):
        """Return (heights, widths): square extra anchors first, then scale x ratio anchors."""
        # square anchors
        heights = list(extra_anchor_scale)
        widths = list(extra_anchor_scale)
        # other aspect-ratio anchors
        for scale in anchor_scale:
            for ratio in anchor_ratio:
                heights.append(scale / math.sqrt(ratio))
                widths.append(scale * math.sqrt(ratio))
        return heights, widths

    def get_layer_anchors(self, layer_shape, anchor_scale, extra_anchor_scale, anchor_ratio, layer_step, offset=0.5):
        ''' Build the anchors of one feature layer.

        assume layer_shape[0] = 6, layer_shape[1] = 5
        x_on_layer = [[0, 1, 2, 3, 4],
                      [0, 1, 2, 3, 4],
                      [0, 1, 2, 3, 4],
                      [0, 1, 2, 3, 4],
                      [0, 1, 2, 3, 4],
                      [0, 1, 2, 3, 4]]
        y_on_layer = [[0, 0, 0, 0, 0],
                      [1, 1, 1, 1, 1],
                      [2, 2, 2, 2, 2],
                      [3, 3, 3, 3, 3],
                      [4, 4, 4, 4, 4],
                      [5, 5, 5, 5, 5]]

        Returns a 6-tuple: the center y and center x grids (each with a
        trailing axis of size 1), the anchor heights and widths as float32
        constants, the number of anchors per location and the number of
        locations on the layer.
        '''
        with tf.name_scope('get_layer_anchors'):
            x_on_layer, y_on_layer = tf.meshgrid(tf.range(layer_shape[1]), tf.range(layer_shape[0]))

            y_on_image = (tf.cast(y_on_layer, tf.float32) + offset) * layer_step / self._img_shape[0]
            x_on_image = (tf.cast(x_on_layer, tf.float32) + offset) * layer_step / self._img_shape[1]

            num_anchors_along_depth = len(anchor_scale) * len(anchor_ratio) + len(extra_anchor_scale)
            num_anchors_along_spatial = layer_shape[1] * layer_shape[0]

            list_h_on_image, list_w_on_image = self._anchor_sizes(anchor_scale, extra_anchor_scale, anchor_ratio)
            # shape info:
            # y_on_image, x_on_image: layers_shapes[0] * layers_shapes[1]
            # h_on_image, w_on_image: num_anchors_along_depth
            return tf.expand_dims(y_on_image, axis=-1), tf.expand_dims(x_on_image, axis=-1), \
                    tf.constant(list_h_on_image, dtype=tf.float32), \
                    tf.constant(list_w_on_image, dtype=tf.float32), num_anchors_along_depth, num_anchors_along_spatial

    def get_all_anchors(self):
        """Build the anchors of every configured layer.

        Returns:
            ``(all_anchors, all_num_anchors_depth, all_num_anchors_spatial)``:
            three lists with one entry per layer. Each ``all_anchors`` entry is
            the (y, x, h, w) part of ``get_layer_anchors``' result.
        """
        all_anchors = []
        all_num_anchors_depth = []
        all_num_anchors_spatial = []
        for layer_index, layer_shape in enumerate(self._layers_shapes):
            anchors_this_layer = self.get_layer_anchors(layer_shape,
                                                        self._anchor_scales[layer_index],
                                                        self._extra_anchor_scales[layer_index],
                                                        self._anchor_ratios[layer_index],
                                                        self._layer_steps[layer_index],
                                                        self._anchor_offset[layer_index])
            all_anchors.append(anchors_this_layer[:-2])
            all_num_anchors_depth.append(anchors_this_layer[-2])
            all_num_anchors_spatial.append(anchors_this_layer[-1])
        return all_anchors, all_num_anchors_depth, all_num_anchors_spatial
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import numpy as np +import time +from argparse import ArgumentParser +import sys +from google.protobuf import text_format +import tensorflow as tf + +from dataset import dataset_common +from preprocessing import ssd_preprocessing +import anchor_manipulator + +SSD_VGG16_IMAGE_SIZE = 300 +NUM_CLASSES = 81 +NEGATIVE_RATIO = 1.0 +SELECT_THRESHOLD = 0.1 +MATCH_THRESHOLD = 0.5 +NEG_THRESHOLD = 0.5 +DATA_FORMAT = 'channels_last' +NUM_READERS = 10 +NUM_PREPROCESSING_THREADS = 28 + + +def input_fn(dataset_pattern='val-*', batch_size=1, data_location=None): + out_shape = [SSD_VGG16_IMAGE_SIZE] * 2 + anchor_creator = anchor_manipulator.AnchorCreator(out_shape, + layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), + (1, 1)], + anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), + (0.9,)], + extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), + (0.8078,), (0.9836,)], + anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), + (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), + (1., 2., .5), (1., 2., .5)], + layer_steps=[8, 16, 32, 64, 100, 300]) + all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors() + + num_anchors_per_layer = [] + for ind in range(len(all_anchors)): + num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind]) + + anchor_encoder_decoder = 
anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6, + positive_threshold=MATCH_THRESHOLD, + ignore_threshold=NEG_THRESHOLD, + prior_scaling=[0.1, 0.1, 0.2, 0.2]) + + image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(image_, labels_, + bboxes_, out_shape, + is_training=False, + data_format=DATA_FORMAT, + output_rgb=False) + anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, + all_anchors, + all_num_anchors_depth, + all_num_anchors_spatial) + + image, filename, shape, loc_targets, cls_targets, match_scores = \ + dataset_common.slim_get_batch(NUM_CLASSES, + batch_size, + 'val', + os.path.join( + data_location, + dataset_pattern), + NUM_READERS, + NUM_PREPROCESSING_THREADS, + image_preprocessing_fn, + anchor_encoder_fn, + num_epochs=1, + is_training=False) + return image, filename, shape + + +class EvaluateSSDModel(): + def __init__(self): + + arg_parser = ArgumentParser(description='Parse args') + + arg_parser.add_argument('-b', "--batch-size", + help="Specify the batch size. 
If this " \ + "parameter is not specified or is -1, the " \ + "largest ideal batch size for the model will " \ + "be used.", + dest="batch_size", type=int, default=1) + + arg_parser.add_argument('-e', "--num-inter-threads", + help='The number of inter-thread.', + dest='num_inter_threads', type=int, default=0) + + arg_parser.add_argument('-a', "--num-intra-threads", + help='The number of intra-thread.', + dest='num_intra_threads', type=int, default=0) + + arg_parser.add_argument('--data-num-inter-threads', dest='data_num_inter_threads', + help='number threads across operators', + type=int, default=21) + + arg_parser.add_argument('--data-num-intra-threads', dest='data_num_intra_threads', + help='number threads for data layer operator', + type=int, default=28) + + arg_parser.add_argument('--kmp-blocktime', dest='kmp_blocktime', + help='number of kmp blocktime', + type=int, default=1) + + arg_parser.add_argument('-g', "--input-graph", + help='Specify the input graph for the transform tool', + dest='input_graph') + + arg_parser.add_argument('-d', "--data-location", + help='Specify the location of the data. 
' + 'If this parameter is not specified, ' + 'the benchmark will use random/dummy data.', + dest="data_location", default=None) + + arg_parser.add_argument('-r', "--accuracy-only", + help='For accuracy measurement only.', + dest='accuracy_only', action='store_true') + + arg_parser.add_argument("--warmup-steps", type=int, default=10, + help="number of warmup steps") + + arg_parser.add_argument("--steps", type=int, default=50, + help="number of steps") + + self.args = arg_parser.parse_args() + + os.environ["KMP_BLOCKTIME"] = str(self.args.kmp_blocktime) + + def eval(self): + + data_config = tf.ConfigProto() + data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads + data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads + data_config.use_per_session_threads = 1 + + infer_config = tf.ConfigProto() + infer_config.inter_op_parallelism_threads = self.args.num_inter_threads # self.args.num_inter_threads + infer_config.intra_op_parallelism_threads = self.args.num_intra_threads # self.args.num_intra_threads + infer_config.use_per_session_threads = 1 + + data_graph = tf.Graph() + with data_graph.as_default(): + if self.args.data_location: # real data + image, filename, shape = \ + input_fn(dataset_pattern='val-*', batch_size=self.args.batch_size, data_location=self.args.data_location) + else: # dummy data + input_shape = [self.args.batch_size, SSD_VGG16_IMAGE_SIZE, SSD_VGG16_IMAGE_SIZE, 3] + image = tf.random.uniform(input_shape, -123.68, 151.06, dtype=tf.float32, name='synthetic_images') + + infer_graph = tf.Graph() + model_file = self.args.input_graph + with infer_graph.as_default(): + graph_def = tf.GraphDef() + file_ext = os.path.splitext(model_file)[1] + with open(model_file, "rb") as f: + if file_ext == '.pbtxt': + text_format.Merge(f.read(), graph_def) + else: + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name='') + + # Define input and output Tensors for inference graph + output_names = ["ExpandDims"] 
+ for i in range(1, 160): + output_names.append("ExpandDims_" + str(i)) + + input_operation = infer_graph.get_operation_by_name("input") + output_operations = [] + for name in output_names: + output_operations.append(infer_graph.get_operation_by_name(name).outputs[0]) + + infer_sess = tf.Session(graph=infer_graph, config=infer_config) + + if not self.args.accuracy_only: # benchmark + step = 0 + total_steps = self.args.warmup_steps + self.args.steps + + total_images = 0 + total_duration = 0 + + if not self.args.data_location: # inference with dummy data + print("Inference with dummy data") + data_sess = tf.Session(graph=data_graph, config=data_config) + + while step < total_steps: + step += 1 + image_np = data_sess.run(image) + start_time = time.time() + + infer_sess.run(output_operations, {input_operation.outputs[0]: image_np}) + duration = time.time() - start_time + + if step > self.args.warmup_steps: + total_duration += duration + total_images += self.args.batch_size + print('Iteration %d: %.6f sec' % (step, duration)) + sys.stdout.flush() + + else: # benchmark with real data + print("Inference with real data") + with data_graph.as_default(): + with tf.train.MonitoredTrainingSession(config=data_config) as data_sess: + while not data_sess.should_stop() and step < total_steps: + step += 1 + start_time = time.time() + image_np, _, _ = data_sess.run([image, filename, shape]) + infer_sess.run(output_operations, {input_operation.outputs[0]: image_np}) + duration = time.time() - start_time + + if step > self.args.warmup_steps: + total_duration += duration + total_images += self.args.batch_size + print('Iteration %d: %.6f sec' % (step, duration)) + sys.stdout.flush() + + print('Batch size = %d' % self.args.batch_size) + print('Throughput: %.3f images/sec' % (total_images / total_duration)) + if (self.args.batch_size == 1): + latency = (total_duration / total_images) * 1000 + print('Latency: %.3f ms' % (latency)) + + else: # accuracy only + results = [] + filenames = [] + 
shapes = [] + total_processed_images = 0 + with data_graph.as_default(): + with tf.train.MonitoredTrainingSession(config=data_config) as data_sess: + while not data_sess.should_stop(): + image_np, filename_np, shape_np = data_sess.run([image, filename, shape]) + total_processed_images += self.args.batch_size + predict = infer_sess.run(output_operations, {input_operation.outputs[0]: image_np}) + if (total_processed_images % 30 == 0): + print("Predicting results for {} images...".format(total_processed_images)) + sys.stdout.flush() + results.append(predict) + filenames.append(filename_np[0]) + shapes.append(shape_np[0]) + + log_dir = os.path.join('./', 'logs') + # if it doesn't exist, create. + if not os.path.exists(log_dir): + os.makedirs(log_dir) + for class_ind in range(1, NUM_CLASSES): + with open(os.path.join(log_dir, 'results_{}.txt'.format(class_ind)), 'wt') as f: + for image_ind, pred in enumerate(results): + shape = shapes[image_ind] + filename = filenames[image_ind] + # parsing prediction results and calculate bbox + scores = pred[(class_ind * 2) - 2][0] + bboxes = pred[(class_ind * 2) - 1][0] + bboxes[:, 0] = (bboxes[:, 0] * shape[0]).astype(np.int32, copy=False) + 1 + bboxes[:, 1] = (bboxes[:, 1] * shape[1]).astype(np.int32, copy=False) + 1 + bboxes[:, 2] = (bboxes[:, 2] * shape[0]).astype(np.int32, copy=False) + 1 + bboxes[:, 3] = (bboxes[:, 3] * shape[1]).astype(np.int32, copy=False) + 1 + + valid_mask = np.logical_and((bboxes[:, 2] - bboxes[:, 0] > 0), + (bboxes[:, 3] - bboxes[:, 1] > 0)) + + for det_ind in range(valid_mask.shape[0]): + if not valid_mask[det_ind]: + continue + f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
+ format(filename.decode('utf8')[:-4], scores[det_ind], + bboxes[det_ind, 1], bboxes[det_ind, 0], + bboxes[det_ind, 3], bboxes[det_ind, 2])) + + coco_eval = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "validate_ssd_vgg16.py") + cmd_prefix = "python " + coco_eval + cmd_prefix += " --detections_path ./logs" + cmd_prefix += " --annotations_file {}/instances_val2017.json".format(self.args.data_location) + cmd = cmd_prefix + os.system(cmd) + +if __name__ == "__main__": + obj = EvaluateSSDModel() + obj.eval() diff --git a/models/object_detection/tensorflow/ssd_vgg16/inference/generate_coco_records.py b/models/object_detection/tensorflow/ssd_vgg16/inference/generate_coco_records.py new file mode 100755 index 000000000..5cc72cf7a --- /dev/null +++ b/models/object_detection/tensorflow/ssd_vgg16/inference/generate_coco_records.py @@ -0,0 +1,205 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: EPL-2.0 +# + +import argparse +import os +import json +import numpy as np +from tqdm import tqdm +import tensorflow as tf +from convert_tfrecords import ImageCoder, _process_image, _int64_feature, _float_feature, _bytes_feature, _bytes_list_feature + + +def load_annotation_data(annotations_filename): + + # Load annotation data + with open(annotations_filename, 'r') as annotations_file: + data = json.load(annotations_file) + + # Create map of category IDs to category names + category_map = {} + for category_datum in data['categories']: + category_map[category_datum['id']] = category_datum['name'] + + # Create map of file IDs to annotation data + annotation_map = {} + for annotation_datum in data['annotations']: + image_id = annotation_datum['image_id'] + if (image_id not in annotation_map): + annotation_map[image_id] = [] + + # Add annotation datum for current image ID + annotation_map[image_id].append(annotation_datum) + + # Create map of file IDs to image data + image_map = {} + for image_datum in data['images']: + image_id = image_datum['id'] + if (image_id in annotation_map): + image_map[image_id] = image_datum + + return image_map, annotation_map, category_map + + +def get_annotation_data(image_data, annotation_data, category_map): + + LABEL_MAP = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, + 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, + 23: 22, 24: 23, 25: 24, 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, + 36: 32, 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 46: 41, + 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, 54: 49, 55: 50, 56: 51, + 57: 52, 58: 53, 59: 54, 60: 55, 61: 56, 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, + 70: 62, 72: 63, 73: 64, 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, + 81: 72, 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80} + + # Retrieve image width and height + image_width = 
image_data['width'] + image_height = image_data['height'] + + bboxes = [] + labels = [] + label_names = [] + difficult = [] + truncated = [] + for annotation_datum in annotation_data: + # Scale bounding box coordinates + # COCO bounding boxes are [x, y, width, height] but https://github.com/HiKapok/SSD.TensorFlow.git expects [ymin, xmin, ymax, xmax] + bbox = annotation_datum['bbox'] + ymin = bbox[1] / image_height + xmin = bbox[0] / image_width + ymax = (bbox[1] + bbox[3]) / image_height + xmax = (bbox[0] + bbox[2]) / image_width + bboxes.append([ymin, xmin, ymax, xmax]) + + labels.append(LABEL_MAP[annotation_datum['category_id']]) + label_names.append(category_map[annotation_datum['category_id']].encode('ascii')) + + # Append difficult and truncated flags + difficult.append(0) + truncated.append(0) + + return bboxes, labels, label_names, difficult, truncated + + +def get_record(filename, buffer, width, height, bboxes, labels, label_names, difficult, truncated): + + CHANNEL_COUNT = 3 + IMAGE_FORMAT = 'JPEG' + + # Extract bounding box coordinates + ymin = [] + xmin = [] + ymax = [] + xmax = [] + for bbox in bboxes: + ymin.append(bbox[0]) + xmin.append(bbox[1]) + ymax.append(bbox[2]) + xmax.append(bbox[3]) + + # Create record features + features = { + 'image/width': _int64_feature(width), + 'image/height': _int64_feature(height), + 'image/channels': _int64_feature(CHANNEL_COUNT), + 'image/shape': _int64_feature([height, width, CHANNEL_COUNT]), + 'image/object/bbox/xmin': _float_feature(xmin), + 'image/object/bbox/xmax': _float_feature(xmax), + 'image/object/bbox/ymin': _float_feature(ymin), + 'image/object/bbox/ymax': _float_feature(ymax), + 'image/object/bbox/label': _int64_feature(labels), + 'image/object/bbox/label_text': _bytes_list_feature(label_names), + 'image/object/bbox/difficult': _int64_feature(difficult), + 'image/object/bbox/truncated': _int64_feature(truncated), + 'image/format': _bytes_feature(IMAGE_FORMAT), + 'image/filename': 
_bytes_feature(filename.encode('utf8')), + 'image/encoded': _bytes_feature(buffer)} + + return tf.train.Example(features = tf.train.Features(feature = features)) + + +def check_for_link(value): + """ + Throws an error if the specified path is a link. os.islink returns + True for sym links. For files, we also look at the number of links in + os.stat() to determine if it's a hard link. + """ + if os.path.islink(value) or \ + (os.path.isfile(value) and os.stat(value).st_nlink > 1): + raise argparse.ArgumentTypeError("{} cannot be a link.".format(value)) + +def check_valid_file_or_folder(value): + """verifies filename exists and isn't a link""" + if value is not None: + if not os.path.isfile(value) and not os.path.isdir(value): + raise argparse.ArgumentTypeError("{} does not exist or is not a file/folder.". + format(value)) + check_for_link(value) + return value + + +def main(): + + RECORDS_PER_FILE = 1024 + RECORD_FILENAME_FORMAT = '%s-%.5d-of-%.5d' + + parser = argparse.ArgumentParser() + parser.add_argument('--image_path', type=check_valid_file_or_folder, required=True, help='path to the input validation image files') + parser.add_argument('--annotations_file', type=check_valid_file_or_folder, required=True, help='name of the input validation annotations file') + parser.add_argument('--output_prefix', type=str, required=True, help='prefix of the output TensorFlow record files') + parser.add_argument('--output_path', type=check_valid_file_or_folder, required=True, help='path to the output TensorFlow record files') + + args = parser.parse_args() + + # Load annotation data + image_map, annotation_map, category_map = load_annotation_data(args.annotations_file) + + # Create output path if necessary + if (not os.path.exists(args.output_path)): + os.makedirs(args.output_path) + + # Create image coder + image_coder = ImageCoder() + + record_file_index = 0 + record_file_count = np.ceil(len(image_map) / RECORDS_PER_FILE).astype(int) + for index, image_id in 
tqdm(enumerate(image_map), desc = 'Generating', total = len(image_map), unit = ' file'): + # Create record writer + if (index % RECORDS_PER_FILE == 0): + output_filename = os.path.join(args.output_path, RECORD_FILENAME_FORMAT % (args.output_prefix, record_file_index, record_file_count)) + writer = tf.python_io.TFRecordWriter(output_filename) + record_file_index += 1 + + # Extract image data from current image file + image_filename = image_map[image_id]['file_name'] + image_buffer, _, _ = _process_image(os.path.join(args.image_path, image_filename), image_coder) + + # Retrieve annotation data associated with current image file + bboxes, labels, label_names, difficult, truncated = get_annotation_data(image_map[image_id], annotation_map[image_id], category_map) + + # Write TF record for current image file + image_width, image_height = image_map[image_id]['width'], image_map[image_id]['height'] + record = get_record(image_filename, image_buffer, image_width, image_height, bboxes, labels, label_names, difficult, truncated) + writer.write(record.SerializeToString()) + + +if __name__ == '__main__': + + main() \ No newline at end of file diff --git a/models/object_detection/tensorflow/ssd_vgg16/inference/validate_ssd_vgg16.py b/models/object_detection/tensorflow/ssd_vgg16/inference/validate_ssd_vgg16.py new file mode 100644 index 000000000..c580fc022 --- /dev/null +++ b/models/object_detection/tensorflow/ssd_vgg16/inference/validate_ssd_vgg16.py @@ -0,0 +1,111 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# +import argparse +import os +import json +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval + + +def convert_detection(label, detection): + + ID_INDEX = 0 + SCORE_INDEX = 1 + XMIN_INDEX = 2 + YMIN_INDEX = 3 + XMAX_INDEX = 4 + YMAX_INDEX = 5 + LABEL_MAP = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, + 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, + 22: 23, 23: 24, 24: 25, 25: 27, 26: 28, 27: 31, 28: 32, 29: 33, 30: 34, 31: 35, + 32: 36, 33: 37, 34: 38, 35: 39, 36: 40, 37: 41, 38: 42, 39: 43, 40: 44, 41: 46, + 42: 47, 43: 48, 44: 49, 45: 50, 46: 51, 47: 52, 48: 53, 49: 54, 50: 55, 51: 56, + 52: 57, 53: 58, 54: 59, 55: 60, 56: 61, 57: 62, 58: 63, 59: 64, 60: 65, 61: 67, + 62: 70, 63: 72, 64: 73, 65: 74, 66: 75, 67: 76, 68: 77, 69: 78, 70: 79, 71: 80, + 72: 81, 73: 82, 74: 84, 75: 85, 76: 86, 77: 87, 78: 88, 79: 89, 80: 90} + + # Extract image ID and bounding box score from detection + image_id = int(detection[ID_INDEX]) + score = float(detection[SCORE_INDEX]) + + # Convert bounding box coordinates [xmin, ymin, xmax, ymax] to [x, y, width, height] + x = float(detection[XMIN_INDEX]) + y = float(detection[YMIN_INDEX]) + width = float(detection[XMAX_INDEX]) - x + height = float(detection[YMAX_INDEX]) - y + bbox = [x, y, width, height] + + return {'category_id': LABEL_MAP[label], 'image_id': image_id, 'score': score, 'bbox': bbox} + + +def generate_results_file(detections_path, results_filename): + + DETECTIONS_EXTENSION = '.txt' + + # Retrieve detections filenames + filenames = [filename for filename in os.listdir(detections_path) if filename.endswith(DETECTIONS_EXTENSION)] + + results = [] + for filename in filenames: + # Read detections from current file + with open(os.path.join(detections_path, filename), 'r') as detections_file: + lines = 
detections_file.readlines() + + # Convert detections from current file + label = int(os.path.splitext(filename)[0].split('_')[1]) + for line in lines: + results.append(convert_detection(label, line.strip().split())) + + # Write results to file + with open(os.path.join(detections_path, results_filename), 'w') as results_file: + json.dump(results, results_file) + + +def main(): + + RESULTS_FILENAME = 'results.json' + ANNOTATION_TYPE = 'bbox' + + parser = argparse.ArgumentParser() + parser.add_argument('--detections_path', type = str, required = True, help = 'path to the input detected bounding box files') + parser.add_argument('--annotations_file', type = str, required = True, help = 'name of the input validation annotations file') + + args = parser.parse_args() + + # Generate COCO results file + print('Generating COCO results...') + generate_results_file(args.detections_path, RESULTS_FILENAME) + + # Create COCO instance + cocoGt = COCO(args.annotations_file) + + # Load COCO results + cocoDt = cocoGt.loadRes(os.path.join(args.detections_path, RESULTS_FILENAME)) + + # Evaluate results + cocoEval = COCOeval(cocoGt, cocoDt, ANNOTATION_TYPE) + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + + +if __name__ == '__main__': + + main() \ No newline at end of file diff --git a/models_directory_structure.png b/models_directory_structure.png new file mode 100644 index 000000000..906cfdf02 Binary files /dev/null and b/models_directory_structure.png differ diff --git a/requirements-test.txt b/requirements-test.txt index 5102c19b3..fe0bf31ab 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,6 +1,6 @@ conditional flake8==3.7.5 -pytest +pytest==4.6.3 pytest-cov pytest-xdist mock diff --git a/tests/test_utils/io.py b/tests/test_utils/io.py index 50f8e5e61..5ec580f94 100644 --- a/tests/test_utils/io.py +++ b/tests/test_utils/io.py @@ -18,19 +18,21 @@ # SPDX-License-Identifier: EPL-2.0 # -import csv +import os +import json -def 
parse_csv_file(file_path, expected_num_columns): +def parse_json_files(json_dir_path): """ - Reads the specified csv file. Checks for a value number of columns in - each row. Returns the csv file values as a list of tuples. + Reads the JSON files in the specified directory. Checks for a value number of columns in + each row. Returns the JSON files values as a list of tuples. """ values = [] - with open(file_path) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',', - skipinitialspace=True) - for row in csv_reader: - assert len(row) == expected_num_columns - values.append(tuple(row)) + for model_file in os.listdir(json_dir_path): + file_path = os.path.join(json_dir_path, model_file) + with open(file_path) as f: + data = json.load(f) + for x in data: + values.append( + tuple((x['input'], x['output'], model_file + " :: " + x['_comment']))) return values diff --git a/tests/unit/common/tensorflow/test_run_tf_benchmarks.py b/tests/unit/common/tensorflow/test_run_tf_benchmarks.py index b5407f00e..0b58ac411 100644 --- a/tests/unit/common/tensorflow/test_run_tf_benchmarks.py +++ b/tests/unit/common/tensorflow/test_run_tf_benchmarks.py @@ -28,19 +28,19 @@ from benchmarks.common.tensorflow.run_tf_benchmark import ModelBenchmarkUtil from test_utils import platform_config -from test_utils.io import parse_csv_file +from test_utils.io import parse_json_files def parse_model_args_file(): """ - Gets test args from the tf_model_args.txt file to use as parameters + Gets test args from the models files in the specified directory to use as parameters for testing model benchmarking scripts. 
The file has a run_tf_benchmarks.py command with args with the corresponding run command that should get called from model_init.py """ current_dir = os.path.dirname(os.path.realpath(__file__)) - csv_file_path = os.path.join(current_dir, "tf_model_args.txt") - return parse_csv_file(csv_file_path, 2) + models_args_path = os.path.join(current_dir, "tf_model_args") + return parse_json_files(models_args_path) def delete_env_var(env_var): @@ -63,7 +63,7 @@ def clear_kmp_env_vars(): test_arg_values = parse_model_args_file() -@pytest.mark.parametrize("test_args,expected_cmd", test_arg_values) +@pytest.mark.parametrize("test_args,expected_cmd,comment", test_arg_values) @patch("os.mkdir") @patch("shutil.rmtree") @patch("os.listdir") @@ -73,18 +73,21 @@ def clear_kmp_env_vars(): @patch("os.stat") @patch("os.chdir") @patch("os.remove") +@patch("glob.glob") @patch("common.platform_util.os") @patch("common.platform_util.system_platform") @patch("common.platform_util.subprocess") @patch("common.base_model_init.BaseModelInitializer.run_command") -def test_run_benchmark(mock_run_command, mock_subprocess, mock_platform, - mock_os, mock_remove, mock_chdir, mock_stat, mock_path_exists, mock_is_file, mock_is_dir, - mock_listdir, mock_rmtree, mock_mkdir, test_args, expected_cmd): +def test_run_benchmark(mock_run_command, mock_subprocess, mock_platform, mock_os, + mock_glob, mock_remove, mock_chdir, mock_stat, mock_path_exists, + mock_is_file, mock_is_dir, mock_listdir, mock_rmtree, mock_mkdir, + test_args, expected_cmd, comment): """ Runs through executing the specified run_tf_benchmarks.py command from the test_args and verifying that the model_init file calls run_command with the expected_cmd string. 
""" + print("****** Running The {} test ******".format(comment)) os.environ["PYTHON_EXE"] = "python" mock_path_exists.return_value = True mock_is_dir.return_value = True @@ -92,6 +95,7 @@ def test_run_benchmark(mock_run_command, mock_subprocess, mock_platform, mock_stat.return_value = MagicMock(st_nlink=0) parse_model_args_file() mock_listdir.return_value = True + mock_glob.return_value = ["/usr/lib/libtcmalloc.so.4.2.6"] clear_kmp_env_vars() platform_config.set_mock_system_type(mock_platform) platform_config.set_mock_os_access(mock_os) diff --git a/tests/unit/common/tensorflow/tf_model_args.txt b/tests/unit/common/tensorflow/tf_model_args.txt deleted file mode 100755 index 6381db35a..000000000 --- a/tests/unit/common/tensorflow/tf_model_args.txt +++ /dev/null @@ -1,82 +0,0 @@ -run_tf_benchmark.py --framework tensorflow --use-case recommendation --precision fp32 --mode inference --model-name wide_deep --batch-size 1024 --data-location /dataset --checkpoint /checkpoints --intelai-models . --verbose, OMP_NUM_THREADS=1 numactl --cpunodebind=0 --membind=0 python inference/fp32/wide_deep_inference.py --data_dir=/dataset --model_dir=/checkpoints --batch_size=1024 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 100 --in-graph /final_int8_inceptionv3.pb --intelai-models . --accuracy-only --verbose,python ./int8/accuracy.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/final_int8_inceptionv3.pb -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 1 --in-graph /final_int8_inceptionv3.pb --intelai-models . 
--benchmark-only --socket-id 0 --verbose,numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /final_int8_inceptionv3.pb --intelai-models . --benchmark-only --socket-id 0 --verbose,numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inceptionv3 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/inception_frozen_max_min.pb --steps=200 --warmup-steps=20,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/int8/benchmark.py --warmup_steps=20 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/in_graph/inception_frozen_max_min.pb --steps=200 --num_cores=28 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inceptionv3 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/inception_frozen_max_min.pb --steps=200 --warmup-steps=20,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/int8/benchmark.py --warmup_steps=20 --num_intra_threads=28 
--num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/inception_frozen_max_min.pb --steps=200 --num_cores=28 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 1 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . --socket-id 0 --verbose,numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . --socket-id 0 --verbose,numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . --socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16,numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 100 --accuracy-only --data-location /dataset --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . 
--verbose,python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/accuracy.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28 --data_location=/dataset -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=fp32 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/accuracy.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --num_intra_threads=28 --data_location=/dataset -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=240 --num_inter_threads=2 
--input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=1 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --accuracy-only --verbose --in-graph=/in_graph/inception_resnet_v2_fp32_pretrained_model.pb --data-location=/dataset,python /workspace/intelai_models/eval_image_classifier_accuracy.py --input_graph=/in_graph/inception_resnet_v2_fp32_pretrained_model.pb --data_location=/dataset --input_height=299 --input_width=299 --num_inter_threads=2 --num_intra_threads=56 --output_layer=InceptionResnetV2/Logits/Predictions --batch_size=100 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/eval_image_classifier.py --dataset_name=imagenet --checkpoint_path=/checkpoints --eval_dir=/checkpoints --dataset_dir=/dataset --dataset_split_name=validation --clone_on_cpu=True --model_name=inception_resnet_v2 
--inter_op_parallelism_threads=2 --intra_op_parallelism_threads=28 --batch_size=1 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/eval_image_classifier.py --dataset_name=imagenet --checkpoint_path=/checkpoints --eval_dir=/checkpoints --dataset_dir=/dataset --dataset_split_name=validation --clone_on_cpu=True --model_name=inception_resnet_v2 --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=28 --batch_size=128 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --accuracy-only --verbose --in-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --data-location=/dataset,python /workspace/intelai_models/eval_image_classifier_accuracy.py --input_graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --data_location=/dataset --input_height=299 --input_width=299 --num_inter_threads=2 --num_intra_threads=56 --output_layer=InceptionResnetV2/Logits/Predictions --batch_size=100 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --in-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/eval_image_classifier_benchmark.py 
--input-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --inter-op-parallelism-threads=1 --intra-op-parallelism-threads=28 --batch-size=1 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --benchmark-only --verbose --in-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/eval_image_classifier_benchmark.py --input-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --inter-op-parallelism-threads=1 --intra-op-parallelism-threads=28 --batch-size=128 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50_int8_pretrained_model.pb --data-location=/dataset --calibration-only,python /workspace/intelai_models/int8/generate_calibration_data.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/in_graph/resnet50_int8_pretrained_model.pb --data_location=/dataset -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name resnet50 --batch-size 100 --data-location /dataset --in-graph /final_int8_resnet50.pb --intelai-models . 
--accuracy-only --verbose,python ./inference/eval_image_classifier_inference.py --input-graph=/final_int8_resnet50.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name resnet50 --batch-size 128 --in-graph /final_int8_resnet50.pb --intelai-models . --benchmark-only --verbose,python ./inference/eval_image_classifier_inference.py --input-graph=/final_int8_resnet50.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=128 --warmup-steps=10 --steps=50 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50_int8_pretrained_model.pb --steps=200 --warmup-steps=20,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 1 --in-graph /freezed_resnet50.pb --intelai-models . 
--socket-id 0 --verbose,numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 128 --in-graph /freezed_resnet50.pb --intelai-models . --socket-id 0 --verbose,numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28 -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 1 --in-graph /freezed_resnet50.pb --intelai-models . --socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16,numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size 100 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/freezed_resnet50.pb --accuracy-only --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=100 --data-location=/dataset --accuracy-only --num-cores=28 --warmup-steps=10 --steps=50 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=fp32 --mode=inference 
--benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --output-results --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50_fp32_pretrained_model.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --results-file-path /workspace/benchmarks/common/tensorflow/logs/resnet50_fp32_inference_results*.txt -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name squeezenet --batch-size 64 --checkpoint /checkpoints --intelai-models . --socket-id 0 --verbose,taskset -c 0-27 python ./fp32/train_squeezenet.py --data_location None --batch_size 64 --num_inter_threads 1 --num_intra_threads 28 --model_dir /checkpoints --inference-only --verbose -run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name squeezenet --batch-size 1 --checkpoint /checkpoints --intelai-models . --socket-id 0 --verbose,taskset -c 0-27 python ./fp32/train_squeezenet.py --data_location None --batch_size 1 --num_inter_threads 1 --num_intra_threads 28 --model_dir /checkpoints --inference-only --verbose -run_tf_benchmark.py --framework tensorflow --use-case object_detection --precision fp32 --mode inference --model-name faster_rcnn --checkpoint /checkpoints --intelai-models . --model-source-dir . 
--socket-id 0 --verbose --config_file=pipeline.config,numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --num_inter_threads 1 --num_intra_threads 28 --pipeline_config_path /checkpoints/pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/log/eval -run_tf_benchmark.py --framework tensorflow --use-case object_detection --precision fp32 --mode inference --model-name faster_rcnn --checkpoint /checkpoints --intelai-models . --model-source-dir . --socket-id 0 --verbose --config_file=pipeline.config --num-inter-threads 4 --num-intra-threads 16,numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --num_inter_threads 4 --num_intra_threads 16 --pipeline_config_path /checkpoints/pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/log/eval -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=-1 --accuracy-only --verbose --checkpoint=/checkpoints --data-location=/dataset --in-graph=/in_graph/frozen_inference_graph.pb,sh /workspace/intelai_models/inference/fp32/coco_accuracy.sh /in_graph/frozen_inference_graph.pb /dataset/coco_val.record /workspace/models -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --data-location=/dataset --verbose --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb --accuracy-only --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb,sh /workspace/intelai_models/inference/int8/coco_int8.sh /in_graph/ssdmobilenet_int8_pretrained_model.pb /dataset/coco_val.record -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet 
--precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size=1 --socket-id 0 --data-location=/dataset --verbose --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb --benchmark-only --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/run_frozen_graph_ssdmob.py -g /in_graph/ssdmobilenet_int8_pretrained_model.pb -n 5000 -d /dataset -x --num-inter-threads 2 --num-intra-threads 28 -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --benchmark-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/infer_detections.py --input_tfrecord_paths=/dataset --output_tfrecord_path=/SSD-mobilenet-out.tfrecord --inference_graph=/in_graph/frozen_inference_graph.pb --discard_image_pixels=True --num_inter_threads=2 --num_intra_threads=28 -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --accuracy-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --benchmark-dir=/workspace/benchmarks --data-location=/dataset,sh /workspace/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/ssdmobilenet_accuracy.sh /in_graph/frozen_inference_graph.pb /dataset -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=inference 
--benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --accuracy-only --data-location /dataset -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 -run_tf_benchmark.py --framework tensorflow --use-case object_detection --precision fp32 --mode inference --model-name rfcn --checkpoint /checkpoints --intelai-models . --model-source-dir . 
--socket-id 0 --verbose --config_file=rfcn_pipeline.config,numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --inter_op 1 --intra_op 28 --omp 28 --pipeline_config_path /checkpoints/rfcn_pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/models/rfcn/eval --logtostderr --blocktime=0 --run_once=True -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --verbose --in-graph=/in_graph/frozen_inference_graph.pb --data-location=/dataset --accuracy-only --split=accuracy_message,FROZEN_GRAPH=/in_graph/frozen_inference_graph.pb TF_RECORD_FILE=/dataset SPLIT=accuracy_message TF_MODELS_ROOT=/workspace/models /workspace/intelai_models/inference/fp32/coco_mAP.sh -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --verbose --in-graph=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --data-location=/dataset --benchmark-only --number_of_steps=500,python /workspace/intelai_models/inference/int8/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb -x 500 -d /dataset --num-inter-threads 2 --num-intra-threads 56 -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --verbose --in-graph=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --data-location=/dataset --accuracy-only --split=accuracy_message,FROZEN_GRAPH=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb TF_RECORD_FILE=/dataset SPLIT=accuracy_message 
TF_MODELS_ROOT=/workspace/models /workspace/intelai_models/inference/int8/coco_mAP.sh -run_tf_benchmark.py --framework tensorflow --use-case text_to_speech --precision fp32 --mode inference --model-name wavenet --num-cores 1 --checkpoint /checkpoints --intelai-models . --model-source-dir . --socket-id 0 --verbose --checkpoint_name=model.ckpt-99 --sample=8510,numactl --physcpubind=0-0 --membind=0 python generate.py /checkpoints/model.ckpt-99 --num_inter_threads=1 --num_intra_threads=1 --sample=8510 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=28 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --accuracy-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb --data-location=/dataset,python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=2 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=56 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id 
0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=28 --warmup-steps=10 --steps=50 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=1 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=28 --warmup-steps=10 --steps=50 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_int8_model.pb,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=1 --num-intra-threads=28 --num-inter-threads=1 --input-graph=/in_graph/resnet101_int8_model.pb --warmup-steps=40 --steps=100 -"run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_language --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --decode_from_file=/checkpoints/newstest2015.en --reference=/checkpoints/newstest2015.de","numactl --cpunodebind=0 --membind=0 
python /workspace/models/tensor2tensor/bin/t2t_decoder.py --problem=translate_ende_wmt32k --model=transformer --hparams_set=transformer_base_single_gpu --decode_hparams=beam_size=4,alpha=0.6,batch_size=1 --data_dir=/dataset --output_dir=/checkpoints --decode_from_file=/checkpoints/newstest2015.en --decode_to_file=/workspace/models/out_dir/output_infer --reference=/checkpoints/newstest2015.de --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28" -"run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_language --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --decode_from_file=/checkpoints/newstest2015.en --reference=/checkpoints/newstest2015.de","numactl --cpunodebind=0 --membind=0 python /workspace/models/tensor2tensor/bin/t2t_decoder.py --problem=translate_ende_wmt32k --model=transformer --hparams_set=transformer_base_single_gpu --decode_hparams=beam_size=4,alpha=0.6,batch_size=32 --data_dir=/dataset --output_dir=/checkpoints --decode_from_file=/checkpoints/newstest2015.en --decode_to_file=/workspace/models/out_dir/output_infer --reference=/checkpoints/newstest2015.de --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28" -run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_lt_official --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --file=/dataset/newstest2014.en --reference=/dataset/newstest2014.de --vocab_file=/dataset/vocab.txt --in_graph=/in_graph/fp32_graphdef.pb,numactl --cpunodebind=0 --membind=0 python 
/workspace/models/official/transformer/infer_ab.py --param_set=big --in_graph=/in_graph/fp32_graphdef.pb --batch_size=1 --file=/dataset/newstest2014.en --file_out=/models/benchmarks/common/tensorflow/logs/translate.txt --vocab_file=/dataset/vocab.txt -run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_lt_official --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=64 --socket-id=0 --benchmark-only --verbose --file=/dataset/newstest2014.en --reference=/dataset/newstest2014.de --vocab_file=/dataset/vocab.txt --in_graph=/in_graph/fp32_graphdef.pb,numactl --cpunodebind=0 --membind=0 python /workspace/models/official/transformer/infer_ab.py --param_set=big --in_graph=/in_graph/fp32_graphdef.pb --batch_size=64 --file=/dataset/newstest2014.en --file_out=/models/benchmarks/common/tensorflow/logs/translate.txt --vocab_file=/dataset/vocab.txt -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/fp32/eval_image_classifier.py --dataset_name imagenet --checkpoint_path /checkpoints --dataset_dir /dataset --dataset_split_name=validation --clone_on_cpu=True --model_name mobilenet_v1 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 28 --batch_size 1 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id 0 --benchmark-only --verbose 
--checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/fp32/eval_image_classifier.py --dataset_name imagenet --checkpoint_path /checkpoints --dataset_dir /dataset --dataset_split_name=validation --clone_on_cpu=True --model_name mobilenet_v1 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 28 --batch_size 100 -run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --accuracy-only --verbose --checkpoint=/checkpoints --in-graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --data-location=/dataset,python /workspace/intelai_models/inference/fp32/accuracy.py --batch_size=100 --num_inter_threads=2 --input_graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --num_intra_threads=56 --data_location=/dataset -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=256 --socket-id 0 --accuracy-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=256 --inference_only --accuracy_only -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=256 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py 
--data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=256 --inference_only --benchmark_only -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=1 --inference_only --benchmark_only -run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=maskrcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/coco.py evaluate --dataset=/dataset --num_inter_threads 1 --num_intra_threads 28 --nw 5 --nb 50 --model=coco --infbs 1 -run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=maskrcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --data-location=/dataset --num-inter-threads 4 --num-intra-threads 16,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/coco.py evaluate --dataset=/dataset --num_inter_threads 4 --num_intra_threads 16 --nw 5 --nb 50 --model=coco --infbs 1 -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models 
--num-cores=-1 --batch-size=-1 --socket-id=-1 --accuracy-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset,/workspace/intelai_models/inference/int8/coco_int8.sh /in_graph/pretrained_int8_faster_rcnn_model.pb /dataset /workspace/models -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=-1 --benchmark-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset, python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -d /dataset --num-inter-threads 2 --num-intra-threads 56 -run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=-1 --benchmark-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset --number-of-steps=500, python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -n 500 -d /dataset --num-inter-threads 2 --num-intra-threads 56 -run_tf_benchmark.py --framework=tensorflow --use-case=adversarial_networks --model-name=dcgan --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/inference_bench.py -ckpt /checkpoints -dl /dataset --num_inter_threads 1 --num_intra_threads 28 -nw 100 -nb 500 --bs 100 --kmp_blocktime 1 --kmp_settings 1 
-run_tf_benchmark.py --framework=tensorflow --use-case=content_creation --model-name=draw --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/draw_inf.py --cp /checkpoints --num_inter_threads 1 --num_intra_threads 28 --bs 1 --dl /dataset --nw 100 --nb 200 -run_tf_benchmark.py --framework=tensorflow --use-case=content_creation --model-name=draw --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/draw_inf.py --cp /checkpoints --num_inter_threads 1 --num_intra_threads 28 --bs 100 --dl /dataset --nw 100 --nb 200 -run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=gnmt --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --infer_mode=beam_search,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/nmt.py --src=de --tgt=en --hparams_path=/workspace/intelai_models/fp32/standard_hparams/wmt16_gnmt_4_layer_internal.json --out_dir=/workspace/benchmarks/common/tensorflow/logs --vocab_prefix=/dataset/vocab.bpe.32000 --ckpt=/checkpoints/translate.ckpt --infer_batch_size=1 --inference_input_file=/dataset/newstest2015.tok.bpe.32000.de --inference_output_file=/workspace/benchmarks/common/tensorflow/logs/output_infer 
--inference_ref_file=/dataset/newstest2015.tok.bpe.32000.en --num_inter_threads=1 --num_intra_threads=28 --infer_mode=beam_search -run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=gnmt --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --infer_mode=beam_search,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/nmt.py --src=de --tgt=en --hparams_path=/workspace/intelai_models/fp32/standard_hparams/wmt16_gnmt_4_layer_internal.json --out_dir=/workspace/benchmarks/common/tensorflow/logs --vocab_prefix=/dataset/vocab.bpe.32000 --ckpt=/checkpoints/translate.ckpt --infer_batch_size=32 --inference_input_file=/dataset/newstest2015.tok.bpe.32000.de --inference_output_file=/workspace/benchmarks/common/tensorflow/logs/output_infer --inference_ref_file=/dataset/newstest2015.tok.bpe.32000.en --num_inter_threads=1 --num_intra_threads=28 --infer_mode=beam_search -run_tf_benchmark.py --framework=tensorflow --use-case=content_creation --model-name=draw --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --num-inter-threads 4 --num-intra-threads 16,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/draw_inf.py --cp /checkpoints --num_inter_threads 4 --num_intra_threads 16 --bs 100 --dl /dataset --nw 100 --nb 200 -run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=unet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks 
--intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --checkpoint_name=model.ckpt,numactl --cpunodebind=0 --membind=0 python /workspace/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/unet_infer.py -bs 1 -cp /checkpoints/model.ckpt --num_inter_threads 1 --num_intra_threads 28 -nw 80 -nb 400 -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14 -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=1 --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14 -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 
--mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=512 --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14 -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14 -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 
--batch_size=1 --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14 -run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=512 --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14 -run_tf_benchmark.py --framework=tensorflow --use-case=face_detection_and_alignment --model-name=mtcc --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset,numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/one_image_test.py --num_inter_threads 1 --num_intra_threads 28 -ckpt /checkpoints -dl /dataset -run_tf_benchmark.py --framework=tensorflow --use-case=face_detection_and_alignment --model-name=facenet --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints --data-location=/dataset, numactl --cpunodebind=0 --membind=0 python 
/workspace/models/src/validate_on_lfw.py /dataset /checkpoints --distance_metric 1 --use_flipped_images --subtract_mean --use_fixed_image_standardization --num_inter_threads=1 --num_intra_threads=28 --lfw_batch_size=1 --lfw_pairs=/workspace/models/data/pairs.txt --warmup_steps=200 --max_steps=1000 -run_tf_benchmark.py --framework=tensorflow --use-case=face_detection_and_alignment --model-name=facenet --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints --data-location=/dataset, numactl --cpunodebind=0 --membind=0 python /workspace/models/src/validate_on_lfw.py /dataset /checkpoints --distance_metric 1 --use_flipped_images --subtract_mean --use_fixed_image_standardization --num_inter_threads=2 --num_intra_threads=28 --lfw_batch_size=100 --lfw_pairs=/workspace/models/data/pairs.txt --warmup_steps=40 --max_steps=1000 -run_tf_benchmark.py --framework=tensorflow --use-case=face_detection_and_alignment --model-name=facenet --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints --data-location=/dataset, numactl --cpunodebind=0 --membind=0 python /workspace/models/src/validate_on_lfw.py /dataset /checkpoints --distance_metric 1 --use_flipped_images --subtract_mean --use_fixed_image_standardization --num_inter_threads=2 --num_intra_threads=28 --lfw_batch_size=100 --lfw_pairs=/workspace/models/data/pairs.txt --warmup_steps=40 --max_steps=1000 diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_dcgan_args.json 
b/tests/unit/common/tensorflow/tf_model_args/tf_dcgan_args.json new file mode 100644 index 000000000..e5802f700 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_dcgan_args.json @@ -0,0 +1,5 @@ +[ + { "_comment": "FP32 benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=adversarial_networks --model-name=dcgan --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/inference_bench.py -ckpt /checkpoints -dl /dataset --num_inter_threads 1 --num_intra_threads 28 -nw 100 -nb 500 --bs 100 --kmp_blocktime 1 --kmp_settings 1"} +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_densenet169_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_densenet169_args.json new file mode 100644 index 000000000..a5d665547 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_densenet169_args.json @@ -0,0 +1,15 @@ +[ + { "_comment": "FP32 latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=densenet169 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/densenet169_fp32_pretrained_model.pb", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/benchmark.py --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb"}, + + { "_comment": "Fp32 accuracy", + "input": "run_tf_benchmark.py --framework=tensorflow 
--use-case=image_recognition --model-name=densenet169 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/densenet169_fp32_pretrained_model.pb --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/accuracy.py --num_intra_threads=28 --num_inter_threads=1 --batch_size=100 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb --data_location=/dataset"}, + + { "_comment": "FP32 Throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=densenet169 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/densenet169_fp32_pretrained_model.pb", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/benchmark.py --num_intra_threads=28 --num_inter_threads=1 --batch_size=100 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_draw_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_draw_args.json new file mode 100644 index 000000000..d638d7492 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_draw_args.json @@ -0,0 +1,15 @@ +[ + { "_comment": "FP32 latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=content_creation --model-name=draw --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 
--socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/draw_inf.py --cp /checkpoints --num_inter_threads 1 --num_intra_threads 28 --bs 1 --dl /dataset --nw 100 --nb 200"}, + + { "_comment": "FP32 throughput benchmark with --num-inter-threads 4 --num-intra-threads 16", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=content_creation --model-name=draw --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --num-inter-threads 4 --num-intra-threads 16", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/draw_inf.py --cp /checkpoints --num_inter_threads 4 --num_intra_threads 16 --bs 100 --dl /dataset --nw 100 --nb 200"}, + + { "_comment": "FP32 Throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=content_creation --model-name=draw --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/draw_inf.py --cp /checkpoints --num_inter_threads 1 --num_intra_threads 28 --bs 100 --dl /dataset --nw 100 --nb 200"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_facenet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_facenet_args.json new file mode 100644 index 000000000..34b5af1fe --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_facenet_args.json @@ -0,0 +1,13 @@ +[ + { "_comment": "FP32 latency benchmark", + "input": "run_tf_benchmark.py 
--framework=tensorflow --use-case=face_detection_and_alignment --model-name=facenet --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/models/src/validate_on_lfw.py /dataset /checkpoints --distance_metric 1 --use_flipped_images --subtract_mean --use_fixed_image_standardization --num_inter_threads=1 --num_intra_threads=28 --lfw_batch_size=1 --lfw_pairs=/workspace/models/data/pairs.txt --warmup_steps=200 --max_steps=1000"}, + + { "_comment": "Fp32 accuracy", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=face_detection_and_alignment --model-name=facenet --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/models/src/validate_on_lfw.py /dataset /checkpoints --distance_metric 1 --use_flipped_images --subtract_mean --use_fixed_image_standardization --num_inter_threads=2 --num_intra_threads=28 --lfw_batch_size=100 --lfw_pairs=/workspace/models/data/pairs.txt --warmup_steps=40 --max_steps=1000"}, + + { "_comment": "FP32 Throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=face_detection_and_alignment --model-name=facenet --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 
--output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/models/src/validate_on_lfw.py /dataset /checkpoints --distance_metric 1 --use_flipped_images --subtract_mean --use_fixed_image_standardization --num_inter_threads=2 --num_intra_threads=28 --lfw_batch_size=100 --lfw_pairs=/workspace/models/data/pairs.txt --warmup_steps=40 --max_steps=1000"} +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_faster_rcnn_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_faster_rcnn_args.json new file mode 100644 index 000000000..ea6c0a75a --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_faster_rcnn_args.json @@ -0,0 +1,28 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=-1 --accuracy-only --verbose --checkpoint=/checkpoints --data-location=/dataset --in-graph=/in_graph/frozen_inference_graph.pb", + "output": "sh /workspace/intelai_models/inference/fp32/coco_accuracy.sh /in_graph/frozen_inference_graph.pb /dataset/coco_val.record /workspace/models"}, + + { "_comment": "FP32 benchmark command", + "input": "run_tf_benchmark.py --framework tensorflow --use-case object_detection --precision fp32 --mode inference --model-name faster_rcnn --checkpoint /checkpoints --intelai-models . --model-source-dir . 
--socket-id 0 --verbose --config_file=pipeline.config", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --num_inter_threads 1 --num_intra_threads 28 --pipeline_config_path /checkpoints/pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/log/eval"}, + + { "_comment": "FP32 benchmark command with custom --num_inter_threads 4 --num_intra_threads 16", + "input": "run_tf_benchmark.py --framework tensorflow --use-case object_detection --precision fp32 --mode inference --model-name faster_rcnn --checkpoint /checkpoints --intelai-models . --model-source-dir . --socket-id 0 --verbose --config_file=pipeline.config --num-inter-threads 4 --num-intra-threads 16", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --num_inter_threads 4 --num_intra_threads 16 --pipeline_config_path /checkpoints/pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/log/eval"}, + + { "_comment": "Int8 command for throughput benchmark with --number-of-steps enabled.", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=-1 --benchmark-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset --number-of-steps=500", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -n 500 -d /dataset --num-inter-threads 2 --num-intra-threads 56"}, + + { "_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models 
--num-cores=-1 --batch-size=-1 --socket-id=-1 --accuracy-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 /workspace/intelai_models/inference/int8/coco_int8.sh /in_graph/pretrained_int8_faster_rcnn_model.pb /dataset /workspace/models"}, + + { "_comment": "Int8 benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=-1 --benchmark-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -d /dataset --num-inter-threads 2 --num-intra-threads 56" + } +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_gnmt_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_gnmt_args.json new file mode 100644 index 000000000..7fe7db376 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_gnmt_args.json @@ -0,0 +1,11 @@ +[ + { "_comment": "FP32 latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=gnmt --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --infer_mode=beam_search", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/nmt.py --src=de --tgt=en 
--hparams_path=/workspace/intelai_models/fp32/standard_hparams/wmt16_gnmt_4_layer_internal.json --out_dir=/workspace/benchmarks/common/tensorflow/logs --vocab_prefix=/dataset/vocab.bpe.32000 --ckpt=/checkpoints/translate.ckpt --infer_batch_size=1 --inference_input_file=/dataset/newstest2015.tok.bpe.32000.de --inference_output_file=/workspace/benchmarks/common/tensorflow/logs/output_infer --inference_ref_file=/dataset/newstest2015.tok.bpe.32000.en --num_inter_threads=1 --num_intra_threads=28 --infer_mode=beam_search"}, + + { "_comment": "FP32 Throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=gnmt --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --infer_mode=beam_search", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/nmt.py --src=de --tgt=en --hparams_path=/workspace/intelai_models/fp32/standard_hparams/wmt16_gnmt_4_layer_internal.json --out_dir=/workspace/benchmarks/common/tensorflow/logs --vocab_prefix=/dataset/vocab.bpe.32000 --ckpt=/checkpoints/translate.ckpt --infer_batch_size=32 --inference_input_file=/dataset/newstest2015.tok.bpe.32000.de --inference_output_file=/workspace/benchmarks/common/tensorflow/logs/output_infer --inference_ref_file=/dataset/newstest2015.tok.bpe.32000.en --num_inter_threads=1 --num_intra_threads=28 --infer_mode=beam_search"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_inception_resnet_v2_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_inception_resnet_v2_args.json new file mode 100644 index 000000000..c1a59e0b5 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_inception_resnet_v2_args.json @@ -0,0 
+1,27 @@ +[ + { "_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --accuracy-only --verbose --in-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/eval_image_classifier_accuracy.py --input_graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --data_location=/dataset --input_height=299 --input_width=299 --num_inter_threads=2 --num_intra_threads=56 --output_layer=InceptionResnetV2/Logits/Predictions --batch_size=100"}, + + { "_comment": "Int8 command for latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --in-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/eval_image_classifier_benchmark.py --input-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --inter-op-parallelism-threads=1 --intra-op-parallelism-threads=28 --batch-size=1"}, + + { "_comment": "Int8 command for throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --benchmark-only --verbose 
--in-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/eval_image_classifier_benchmark.py --input-graph=/in_graph/inception_resnet_v2_int8_pretrained_model.pb --inter-op-parallelism-threads=1 --intra-op-parallelism-threads=28 --batch-size=128"}, + + { "_comment": "Fp32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --accuracy-only --verbose --in-graph=/in_graph/inception_resnet_v2_fp32_pretrained_model.pb --data-location=/dataset", + "output": "python /workspace/intelai_models/eval_image_classifier_accuracy.py --input_graph=/in_graph/inception_resnet_v2_fp32_pretrained_model.pb --data_location=/dataset --input_height=299 --input_width=299 --num_inter_threads=2 --num_intra_threads=56 --output_layer=InceptionResnetV2/Logits/Predictions --batch_size=100"}, + + { "_comment": "FP32 command for latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/inception_resnet_v2_fp32_pretrained_model.pb", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/eval_image_classifier_benchmark.py --input-graph=/in_graph/inception_resnet_v2_fp32_pretrained_model.pb --inter-op-parallelism-threads=2 --intra-op-parallelism-threads=28 --batch-size=1"}, + + { "_comment": "FP32 command 
for throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inception_resnet_v2 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/inception_resnet_v2_fp32_pretrained_model.pb", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/eval_image_classifier_benchmark.py --input-graph=/in_graph/inception_resnet_v2_fp32_pretrained_model.pb --inter-op-parallelism-threads=2 --intra-op-parallelism-threads=28 --batch-size=128"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv3_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv3_args.json new file mode 100644 index 000000000..733b691ee --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv3_args.json @@ -0,0 +1,44 @@ +[ + { "_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 100 --in-graph /final_int8_inceptionv3.pb --intelai-models . --accuracy-only --verbose", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./int8/accuracy.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/final_int8_inceptionv3.pb"}, + + { "_comment": "Int8 command for latency benchmark with default --num-inter-threads, --num-intra-threads.", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 1 --in-graph /final_int8_inceptionv3.pb --intelai-models . 
--benchmark-only --socket-id 0 --verbose", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28"}, + + { "_comment": "Int8 command for throughput benchmark", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /final_int8_inceptionv3.pb --intelai-models . --benchmark-only --socket-id 0 --verbose", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28"}, + + { "_comment": "Int8 command for throughput benchmark with --steps=200 --warmup-steps=20", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inceptionv3 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/inception_frozen_max_min.pb --steps=200 --warmup-steps=20", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/int8/benchmark.py --warmup_steps=20 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/in_graph/inception_frozen_max_min.pb --steps=200 --num_cores=28"}, + + { "_comment": "Int8 command for latency benchmark with --steps=200 --warmup-steps=20", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inceptionv3 --precision=int8 --mode=inference 
--model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/inception_frozen_max_min.pb --steps=200 --warmup-steps=20", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/int8/benchmark.py --warmup_steps=20 --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/inception_frozen_max_min.pb --steps=200 --num_cores=28"}, + + { "_comment": "Int8 command for throughput benchmark with --disable-tcmalloc=True", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /final_int8_inceptionv3.pb --intelai-models . --benchmark-only --socket-id 0 --disable-tcmalloc=True", + "output": "numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28"}, + + { "_comment": "Fp32 accuracy command", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 100 --accuracy-only --data-location /dataset --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . 
--verbose", + "output": "python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + + { "_comment": "FP32 command for latency benchmark", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 1 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . --socket-id 0 --verbose", + "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28" + }, + + { "_comment": "FP32 command for throughput benchmark", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . --socket-id 0 --verbose", + "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + + { "_comment": "FP32 command for throughput benchmark with --num-inter-threads 4 --num-intra-threads 16", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . 
--socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16", + "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv4_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv4_args.json new file mode 100644 index 000000000..0535c2eef --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv4_args.json @@ -0,0 +1,19 @@ +[ + { "_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/accuracy.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28 --data_location=/dataset"}, + + { "_comment": "Int8 command for latency benchmark with default --num-inter-threads, --num-intra-threads.", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py 
--batch_size=1 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28"}, + + { "_comment": "Int8 command for throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28"}, + + { "_comment": "Fp32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=fp32 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/accuracy.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --num_intra_threads=28 --data_location=/dataset"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_lm_1b_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_lm_1b_args.json new file mode 100644 index 000000000..26d11e1c3 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_lm_1b_args.json @@ -0,0 +1,7 @@ +[ + { "_comment": "FP32 benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=lm-1b --precision=fp32 --mode=inference 
--benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/models/benchmark.py -b=1 -I=100 --inter=1 --intra=28"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_maskrcnn_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_maskrcnn_args.json new file mode 100644 index 000000000..5900877d3 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_maskrcnn_args.json @@ -0,0 +1,11 @@ +[ + { "_comment": "FP32 benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=maskrcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/coco.py evaluate --dataset=/dataset --num_inter_threads 1 --num_intra_threads 28 --nw 5 --nb 50 --model=coco --infbs 1"}, + + { "_comment": "FP32 benchmark with --num-inter-threads 4 --num-intra-threads 16", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=maskrcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --data-location=/dataset --num-inter-threads 4 --num-intra-threads 16", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/coco.py evaluate --dataset=/dataset --num_inter_threads 4 --num_intra_threads 16 --nw 5 --nb 50 --model=coco --infbs 1"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json 
b/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json new file mode 100644 index 000000000..c98ada086 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json @@ -0,0 +1,36 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --accuracy-only --verbose --checkpoint=/checkpoints --in-graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --data-location=/dataset", + "output": "python /workspace/intelai_models/inference/fp32/accuracy.py --batch_size=100 --num_inter_threads=2 --input_graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --num_intra_threads=56 --data_location=/dataset"}, + + { "_comment": "FP32 latency benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/fp32/eval_image_classifier.py --dataset_name imagenet --checkpoint_path /checkpoints --dataset_dir /dataset --dataset_split_name=validation --clone_on_cpu=True --model_name mobilenet_v1 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 28 --batch_size 1"}, + + { "_comment": "FP32 throughput benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id 0 --benchmark-only --verbose 
--checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/fp32/eval_image_classifier.py --dataset_name imagenet --checkpoint_path /checkpoints --dataset_dir /dataset --dataset_split_name=validation --clone_on_cpu=True --model_name mobilenet_v1 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 28 --batch_size 100"}, + + { "_comment": "FP32 benchmark command with dummy data and --output-dir specified", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints", + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/fp32/eval_image_classifier.py --dataset_name imagenet --checkpoint_path /checkpoints --dataset_split_name=validation --clone_on_cpu=True --model_name mobilenet_v1 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 28 --batch_size 100"}, + + { "_comment": "Int8 command for throughput benchmark with --number-of-steps enabled.", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=-1 --benchmark-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset --number-of-steps=500", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -n 500 -d /dataset --num-inter-threads 2 --num-intra-threads 56"}, + + { 
"_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --data-location=/dataset --input_height=224 --input_width=224", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/accuracy.py --input_height=224 --input_width=224 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=100 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --data_location=/dataset --input_layer=input"}, + + { "_comment": "Int8 latency benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_height=224 --input_width=224 --warmup_steps=10 --steps=50", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/benchmark.py --input_height=224 --input_width=224 --warmup_steps=10 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_layer=input --steps=50"}, + + + { "_comment": "Int8 
throughput benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_height=224 --input_width=224 --warmup_steps=10 --steps=50", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/benchmark.py --input_height=224 --input_width=224 --warmup_steps=10 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=240 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_layer=input --steps=50"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_mtcc_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_mtcc_args.json new file mode 100644 index 000000000..b0093db93 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_mtcc_args.json @@ -0,0 +1,5 @@ +[ + { "_comment": "FP32 benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=face_detection_and_alignment --model-name=mtcc --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/one_image_test.py --num_inter_threads 1 --num_intra_threads 28 -ckpt /checkpoints -dl /dataset"} +] diff --git 
a/tests/unit/common/tensorflow/tf_model_args/tf_ncf_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_ncf_args.json new file mode 100644 index 000000000..67fa8402c --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_ncf_args.json @@ -0,0 +1,15 @@ +[ + { "_comment": "FP32 latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=1 --inference_only --benchmark_only"}, + + { "_comment": "Fp32 accuracy", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=256 --socket-id 0 --accuracy-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=256 --inference_only --accuracy_only"}, + + { "_comment": "FP32 Throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=256 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", + "output": "numactl --cpunodebind=0 
--membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=256 --inference_only --benchmark_only"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_resnet101_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_resnet101_args.json new file mode 100644 index 000000000..4c9132a79 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_resnet101_args.json @@ -0,0 +1,17 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --accuracy-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb --data-location=/dataset", + "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=2 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=56 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + + { "_comment": "FP32 command for throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=28 --warmup-steps=10 --steps=50"}, + + { "_comment": "Int8 latency benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow 
--use-case=image_recognition --model-name=resnet101 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_int8_model.pb", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=1 --num-intra-threads=28 --num-inter-threads=1 --input-graph=/in_graph/resnet101_int8_model.pb --warmup-steps=40 --steps=100"}, + + { "_comment": "FP32 command for latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=1 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=28 --warmup-steps=10 --steps=50"} +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_resnet50_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_resnet50_args.json new file mode 100644 index 000000000..199ae2c0f --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_resnet50_args.json @@ -0,0 +1,40 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size 100 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/freezed_resnet50.pb --accuracy-only --data-location=/dataset", + "output": "numactl --cpunodebind=0 
--membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=100 --data-location=/dataset --accuracy-only --num-cores=28 --warmup-steps=10 --steps=50"}, + + { "_comment": "FP32 command for throughput benchmark with default --num-inter-threads, --num-intra-threads.", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 128 --in-graph /freezed_resnet50.pb --intelai-models . --socket-id 0 --verbose", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + + { "_comment": "FP32 command for latency benchmark with --num-inter-threads 4 --num-intra-threads 16", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 1 --in-graph /freezed_resnet50.pb --intelai-models . --socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28"}, + + { "_comment": "FP32 command for throughput benchmark with --num-inter-threads=1 --num-intra-threads=28", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 128 --in-graph /freezed_resnet50.pb --intelai-models . 
--socket-id 0 --verbose", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + + { "_comment": "Int8 command for throughput benchmark with --output-dir enabled.", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50_int8_pretrained_model.pb --steps=200 --warmup-steps=20", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200"}, + + { "_comment": "Int8 command for data calibration with --calibration-only", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50_int8_pretrained_model.pb --data-location=/dataset --calibration-only", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/int8/generate_calibration_data.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/in_graph/resnet50_int8_pretrained_model.pb --data_location=/dataset"}, + + { 
"_comment": "Fp32 command for throughput benchmark with --output-results enabled.", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --output-results --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50_fp32_pretrained_model.pb --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --results-file-path /workspace/benchmarks/common/tensorflow/logs/resnet50_fp32_inference_results*.txt"}, + + { "_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name resnet50 --batch-size 100 --data-location /dataset --in-graph /final_int8_resnet50.pb --intelai-models . 
--accuracy-only --verbose", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./inference/eval_image_classifier_inference.py --input-graph=/final_int8_resnet50.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + + { "_comment": "Int8 command for throughput benchmark with --steps=200 --warmup-steps=20", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50_int8_pretrained_model.pb --steps=200 --warmup-steps=20", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200" + } +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_resnet50v1_5_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_resnet50v1_5_args.json new file mode 100644 index 000000000..271813ed7 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_resnet50v1_5_args.json @@ -0,0 +1,40 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size 100 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/freezed_resnet50v1_5.pb --accuracy-only --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python 
/workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=100 --data-location=/dataset --accuracy-only --num-cores=28 --warmup-steps=10 --steps=50"}, + + { "_comment": "FP32 command for throughput benchmark with default --num-inter-threads, --num-intra-threads.", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50v1_5 --batch-size 128 --in-graph /freezed_resnet50v1_5.pb --intelai-models . --socket-id 0 --verbose", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + + { "_comment": "FP32 command for latency benchmark with --num-inter-threads 4 --num-intra-threads 16", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50v1_5 --batch-size 1 --in-graph /freezed_resnet50v1_5.pb --intelai-models . --socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28"}, + + { "_comment": "FP32 command for throughput benchmark with --num-inter-threads=1 --num-intra-threads=28", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50v1_5 --batch-size 128 --in-graph /freezed_resnet50v1_5.pb --intelai-models . 
--socket-id 0 --verbose", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + + { "_comment": "Int8 command for throughput benchmark with --output-dir enabled.", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --steps=200 --warmup-steps=20", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200"}, + + { "_comment": "Int8 command for data calibration with --calibration-only", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --data-location=/dataset --calibration-only", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/int8/generate_calibration_data.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb 
--data_location=/dataset"}, + + { "_comment": "Fp32 command for throughput benchmark with --output-results enabled.", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --output-results --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50v1_5_fp32_pretrained_model.pb --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50v1_5_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --results-file-path /workspace/benchmarks/common/tensorflow/logs/resnet50v1_5_fp32_inference_results*.txt"}, + + { "_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name resnet50v1_5 --batch-size 100 --data-location /dataset --in-graph /final_int8_resnet50v1_5.pb --intelai-models . 
--accuracy-only --verbose", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./inference/eval_image_classifier_inference.py --input-graph=/final_int8_resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + + { "_comment": "Int8 command for throughput benchmark with --steps=200 --warmup-steps=20", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --steps=200 --warmup-steps=20", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200" + } +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_rfcn_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_rfcn_args.json new file mode 100644 index 000000000..f8dc9b0a0 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_rfcn_args.json @@ -0,0 +1,17 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --verbose --in-graph=/in_graph/frozen_inference_graph.pb --data-location=/dataset --accuracy-only --split=accuracy_message", + "output": "FROZEN_GRAPH=/in_graph/frozen_inference_graph.pb 
TF_RECORD_FILE=/dataset SPLIT=accuracy_message TF_MODELS_ROOT=/workspace/models /workspace/intelai_models/inference/fp32/coco_mAP.sh"}, + + { "_comment": "FP32 command for benchmark", + "input": "run_tf_benchmark.py --framework tensorflow --use-case object_detection --precision fp32 --mode inference --model-name rfcn --checkpoint /checkpoints --intelai-models . --model-source-dir . --socket-id 0 --verbose --config_file=rfcn_pipeline.config", + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --inter_op 1 --intra_op 28 --omp 28 --pipeline_config_path /checkpoints/rfcn_pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/models/rfcn/eval --logtostderr --blocktime=0 --run_once=True"}, + + { "_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --verbose --in-graph=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --data-location=/dataset --accuracy-only --split=accuracy_message", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 FROZEN_GRAPH=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb TF_RECORD_FILE=/dataset SPLIT=accuracy_message TF_MODELS_ROOT=/workspace/models /workspace/intelai_models/inference/int8/coco_mAP.sh"}, + + { "_comment": "Int8 benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --verbose --in-graph=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --data-location=/dataset --benchmark-only --number_of_steps=500", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/run_rfcn_inference.py 
-m /workspace/models -g /in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb -x 500 -d /dataset --num-inter-threads 2 --num-intra-threads 56"} +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_squeezenet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_squeezenet_args.json new file mode 100644 index 000000000..9232b10fe --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_squeezenet_args.json @@ -0,0 +1,11 @@ +[ + { "_comment": "FP32 command for latency benchmark", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name squeezenet --batch-size 1 --checkpoint /checkpoints --intelai-models . --socket-id 0 --verbose", + "output": "taskset -c 0-27 python ./fp32/train_squeezenet.py --data_location None --batch_size 1 --num_inter_threads 1 --num_intra_threads 28 --model_dir /checkpoints --inference-only --verbose"}, + + { "_comment": "FP32 command for throughput benchmark", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name squeezenet --batch-size 64 --checkpoint /checkpoints --intelai-models . 
--socket-id 0 --verbose", + "output": "taskset -c 0-27 python ./fp32/train_squeezenet.py --data_location None --batch_size 64 --num_inter_threads 1 --num_intra_threads 28 --model_dir /checkpoints --inference-only --verbose"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json new file mode 100644 index 000000000..fc4a7b1d9 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json @@ -0,0 +1,17 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --accuracy-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --benchmark-dir=/workspace/benchmarks --data-location=/dataset", + "output": "sh /workspace/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/ssdmobilenet_accuracy.sh /in_graph/frozen_inference_graph.pb /dataset"}, + + { "_comment": "FP32 benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --benchmark-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/fp32/infer_detections.py --input_tfrecord_paths=/dataset --output_tfrecord_path=/SSD-mobilenet-out.tfrecord --inference_graph=/in_graph/frozen_inference_graph.pb --discard_image_pixels=True --num_inter_threads=2 --num_intra_threads=28"}, + + { "_comment": "Int8 
accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb --data-location=/dataset", + "output": "sh /workspace/intelai_models/inference/int8/coco_int8.sh /in_graph/ssdmobilenet_int8_pretrained_model.pb /dataset"}, + + { "_comment": "Int8 benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size=1 --socket-id 0 --data-location=/dataset --verbose --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb --benchmark-only --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/run_frozen_graph_ssdmob.py -g /in_graph/ssdmobilenet_int8_pretrained_model.pb -n 5000 -d /dataset -x --num-inter-threads 2 --num-intra-threads 28"} +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_resnet34_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_resnet34_args.json new file mode 100644 index 000000000..0aa2ca495 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_resnet34_args.json @@ -0,0 +1,11 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 
--socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --accuracy-only --data-location /dataset"}, + + { "_comment": "FP32 benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_vgg16_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_vgg16_args.json new file mode 100644 index 000000000..37d478e1f --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_vgg16_args.json @@ -0,0 +1,17 @@ +[ + { "_comment": "FP32 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd_vgg16 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models 
--in-graph=/in_graph/ssdvgg16_fp32_pretrained_model.pb --data-location=/dataset", + "output": "python /workspace/intelai_models/inference/eval_ssd.py --input-graph=/in_graph/ssdvgg16_fp32_pretrained_model.pb --num-inter-threads=2 --num-intra-threads=56 --data-location=/dataset --accuracy-only"}, + + { "_comment": "FP32 benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd_vgg16 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssdvgg16_int8_pretrained_model.pb --data-location=/dataset --num-inter-threads=11 --num-intra-threads=21 --data-num-inter-threads=21 --data-num-intra-threads=28 --steps=500 --warmup-steps=100", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_ssd.py --input-graph=/in_graph/ssdvgg16_int8_pretrained_model.pb --num-inter-threads=11 --num-intra-threads=21 --data-num-inter-threads=21 --data-num-intra-threads=28 --warmup-steps=100 --steps=500 --data-location=/dataset"}, + + { "_comment": "Int8 accuracy command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd_vgg16 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssdvgg16_int8_pretrained_model.pb --data-location=/dataset", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/eval_ssd.py --input-graph=/in_graph/ssdvgg16_int8_pretrained_model.pb --num-inter-threads=2 --num-intra-threads=56 
--data-location=/dataset --accuracy-only"}, + + { "_comment": "Int8 benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd_vgg16 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssdvgg16_int8_pretrained_model.pb --data-location=/dataset --num-inter-threads=11 --num-intra-threads=21 --data-num-inter-threads=21 --data-num-intra-threads=28 --steps=500 --warmup-steps=100", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_ssd.py --input-graph=/in_graph/ssdvgg16_int8_pretrained_model.pb --num-inter-threads=11 --num-intra-threads=21 --data-num-inter-threads=21 --data-num-intra-threads=28 --warmup-steps=100 --steps=500 --data-location=/dataset"} +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_transformer_language_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_transformer_language_args.json new file mode 100644 index 000000000..bf5759531 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_transformer_language_args.json @@ -0,0 +1,14 @@ +[ + { "_comment": "FP32 latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_language --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --decode_from_file=/checkpoints/newstest2015.en --reference=/checkpoints/newstest2015.de", + "output": "numactl --cpunodebind=0 --membind=0 python 
/workspace/models/tensor2tensor/bin/t2t_decoder.py --problem=translate_ende_wmt32k --model=transformer --hparams_set=transformer_base_single_gpu --decode_hparams=beam_size=4,alpha=0.6,batch_size=1 --data_dir=/dataset --output_dir=/checkpoints --decode_from_file=/checkpoints/newstest2015.en --decode_to_file=/workspace/models/out_dir/output_infer --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28"}, + + { "_comment": "Fp32 throughput", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_language --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --decode_from_file=/checkpoints/newstest2015.en --reference=/checkpoints/newstest2015.de", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/models/tensor2tensor/bin/t2t_decoder.py --problem=translate_ende_wmt32k --model=transformer --hparams_set=transformer_base_single_gpu --decode_hparams=beam_size=4,alpha=0.6,batch_size=32 --data_dir=/dataset --output_dir=/checkpoints --decode_from_file=/checkpoints/newstest2015.en --decode_to_file=/workspace/models/out_dir/output_infer --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28" + }, + { "_comment": "Fp32 benchmarking with no reference file", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_language --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset --decode_from_file=/checkpoints/newstest2015.en", + "output": "numactl --cpunodebind=0 --membind=0 python 
/workspace/models/tensor2tensor/bin/t2t_decoder.py --problem=translate_ende_wmt32k --model=transformer --hparams_set=transformer_base_single_gpu --decode_hparams=beam_size=4,alpha=0.6,batch_size=32 --data_dir=/dataset --output_dir=/checkpoints --decode_from_file=/checkpoints/newstest2015.en --decode_to_file=/workspace/models/out_dir/output_infer --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28" + } +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_transformer_lt_official_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_transformer_lt_official_args.json new file mode 100644 index 000000000..079f99abd --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_transformer_lt_official_args.json @@ -0,0 +1,9 @@ +[ + { "_comment": "FP32 latency benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_lt_official --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --file=/dataset/newstest2014.en --reference=/dataset/newstest2014.de --vocab_file=/dataset/vocab.txt --in_graph=/in_graph/fp32_graphdef.pb", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/models/official/transformer/infer_ab.py --param_set=big --in_graph=/in_graph/fp32_graphdef.pb --batch_size=1 --file=/dataset/newstest2014.en --file_out=/models/benchmarks/common/tensorflow/logs/translate.txt --vocab_file=/dataset/vocab.txt --num_inter=1 --num_intra=28"}, + + { "_comment": "FP32 throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_lt_official --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=64 
--socket-id=0 --benchmark-only --verbose --file=/dataset/newstest2014.en --reference=/dataset/newstest2014.de --vocab_file=/dataset/vocab.txt --in_graph=/in_graph/fp32_graphdef.pb", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/models/official/transformer/infer_ab.py --param_set=big --in_graph=/in_graph/fp32_graphdef.pb --batch_size=64 --file=/dataset/newstest2014.en --file_out=/models/benchmarks/common/tensorflow/logs/translate.txt --vocab_file=/dataset/vocab.txt --num_inter=1 --num_intra=28"} +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_unet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_unet_args.json new file mode 100644 index 000000000..cbbe2f3f4 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_unet_args.json @@ -0,0 +1,7 @@ +[ + { "_comment": "FP32 benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=unet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --checkpoint_name=model.ckpt", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/unet_infer.py -bs 1 -cp /checkpoints/model.ckpt --num_inter_threads 1 --num_intra_threads 28 -nw 80 -nb 400"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_wavenet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_wavenet_args.json new file mode 100644 index 000000000..49ea2e09e --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_wavenet_args.json @@ -0,0 +1,7 @@ +[ + { "_comment": "FP32 benchmark command", + "input": "run_tf_benchmark.py --framework tensorflow --use-case text_to_speech --precision fp32 --mode inference --model-name wavenet --num-cores 1 --checkpoint /checkpoints --intelai-models 
. --model-source-dir . --socket-id 0 --verbose --checkpoint_name=model.ckpt-99 --sample=8510", + "output": "numactl --physcpubind=0-0 --membind=0 python generate.py /checkpoints/model.ckpt-99 --num_inter_threads=1 --num_intra_threads=1 --sample=8510"} +] + + diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_args.json new file mode 100644 index 000000000..64fddac5b --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_args.json @@ -0,0 +1,5 @@ +[ + { "_comment": "FP32 benchmark", + "input": "run_tf_benchmark.py --framework tensorflow --use-case recommendation --precision fp32 --mode inference --model-name wide_deep --batch-size 1024 --data-location /dataset --checkpoint /checkpoints --intelai-models . --verbose", + "output": "OMP_NUM_THREADS=1 numactl --cpunodebind=0 --membind=0 python inference/fp32/wide_deep_inference.py --data_dir=/dataset --model_dir=/checkpoints --batch_size=1024"} +] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json new file mode 100644 index 000000000..3d2297515 --- /dev/null +++ b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json @@ -0,0 +1,27 @@ +[ + { "_comment": "Int8 benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python 
/workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14"}, + + { "_comment": "Int8 latency benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=1 --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14"}, + + { "_comment": "Int8 command for throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=512 --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14"}, + + { "_comment": "FP32 benchmark 
command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14"}, + + { "_comment": "Fp32 command for throughput benchmark", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=512 --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14"}, + + { "_comment": "Fp32 latency benchmark command", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 
--output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset --num-parallel-batches=14", + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=1 --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data_location=/dataset --num_parallel_batches=14"} +] + + diff --git a/tests/unit/common/test_base_model_init.py b/tests/unit/common/test_base_model_init.py index 43f3076f1..7a376fb35 100644 --- a/tests/unit/common/test_base_model_init.py +++ b/tests/unit/common/test_base_model_init.py @@ -17,8 +17,22 @@ # # SPDX-License-Identifier: EPL-2.0 # - +from contextlib import contextmanager import os +import pytest +import sys +import tempfile + +try: + # python 2 + from cStringIO import StringIO +except ImportError: + # python 3 + # only supports unicode so can't be used in python 2 for sys.stdout + # because (from `print` documentation) + # "All non-keyword arguments are converted to strings like str() does" + from io import StringIO + from mock import MagicMock, patch @@ -26,6 +40,27 @@ from benchmarks.common.base_model_init import set_env_var +@contextmanager +def catch_stdout(): + _stdout = sys.stdout + sys.stdout = caught_output = StringIO() + try: + yield caught_output + finally: + sys.stdout = _stdout + caught_output.close() + + +@pytest.fixture +def mock_json(patch): + return patch('json') + + +@pytest.fixture +def mock_glob(patch): + return patch('glob.glob') + + # Example args and output strings for testing mocks test_model_name = "resnet50" test_framework = "tensorflow" @@ -109,3 +144,101 @@ def test_env_var_not_already_set(): finally: if os.environ.get(env_var): del os.environ[env_var] + + +def test_set_kmp_vars_config_json_does_not_exists(): + """Test config.json does not exist""" + # Setup base model init with test settings + platform_util = 
MagicMock() + args = MagicMock(verbose=True, model_name=test_model_name) + os.environ["PYTHON_EXE"] = "python" + base_model_init = BaseModelInitializer(args, [], platform_util) + + config_file_path = '/test/foo/config.json' + + with catch_stdout() as caught_output: + base_model_init.set_kmp_vars(config_file_path) + output = caught_output.getvalue() + + assert "Warning: File {} does not exist and \ + cannot be used to set KMP environment variables".format(config_file_path) == output.strip() + + +def test_set_kmp_vars_config_json_exists(mock_json): + """Test config.json when exists""" + # Setup base model init with test settings + platform_util = MagicMock() + args = MagicMock(verbose=True, model_name=test_model_name) + os.environ["PYTHON_EXE"] = "python" + base_model_init = BaseModelInitializer(args, [], platform_util) + + file_descriptor, config_file_path = tempfile.mkstemp(suffix=".json") + + base_model_init.set_kmp_vars(config_file_path) + + +@pytest.mark.parametrize('precision', ['int8']) +def test_command_prefix_tcmalloc_int8(precision, mock_glob): + """ For Int8 models, TCMalloc should be enabled by default and models should include + LD_PRELOAD in the command prefix, unless disable_tcmalloc=True is set """ + platform_util = MagicMock() + args = MagicMock(verbose=True, model_name=test_model_name) + test_tcmalloc_lib = "/usr/lib/libtcmalloc.so.4.2.6" + mock_glob.return_value = [test_tcmalloc_lib] + os.environ["PYTHON_EXE"] = "python" + args.socket_id = 0 + args.precision = precision + + # If tcmalloc is not disabled, we should have LD_PRELOAD in the prefix + args.disable_tcmalloc = False + base_model_init = BaseModelInitializer(args, [], platform_util) + command_prefix = base_model_init.get_command_prefix(args.socket_id) + assert "LD_PRELOAD={}".format(test_tcmalloc_lib) in command_prefix + assert "numactl --cpunodebind=0 --membind=0" in command_prefix + + # If tcmalloc is disabled, LD_PRELOAD should not be in the prefix + args.disable_tcmalloc = True +
@pytest.mark.parametrize('precision', ['fp32'])
def test_command_prefix_tcmalloc_fp32(precision, mock_glob):
    """ FP32 models are expected to leave TCMalloc off by default, and to include
    LD_PRELOAD in the command prefix only when disable_tcmalloc=False is set explicitly. """
    tcmalloc_lib = "/usr/lib/libtcmalloc.so.4.2.6"
    preload_str = "LD_PRELOAD={}".format(tcmalloc_lib)
    numactl_str = "numactl --cpunodebind=0 --membind=0"

    platform_util = MagicMock()
    args = MagicMock(verbose=True, model_name=test_model_name)
    mock_glob.return_value = [tcmalloc_lib]
    os.environ["PYTHON_EXE"] = "python"
    args.socket_id = 0
    args.precision = precision

    def build_prefix(**prefix_kwargs):
        # A new initializer per scenario, built from the current state of args
        initializer = BaseModelInitializer(args, [], platform_util)
        return initializer.get_command_prefix(args.socket_id, **prefix_kwargs)

    # disable_tcmalloc not set by the test: no LD_PRELOAD, numactl still present
    prefix = build_prefix()
    assert preload_str not in prefix
    assert numactl_str in prefix

    # disable_tcmalloc explicitly False: LD_PRELOAD should be in the prefix
    args.disable_tcmalloc = False
    prefix = build_prefix()
    assert preload_str in prefix
    assert numactl_str in prefix

    # disable_tcmalloc True and numactl=False: neither should be in the prefix
    args.disable_tcmalloc = True
    prefix = build_prefix(numactl=False)
    assert preload_str not in prefix
    assert "numactl" not in prefix
@pytest.fixture
def mock_stat(patch):
    """Replace ``os.stat`` (via the ``patch`` fixture) with a mock whose
    return value reports ``st_nlink`` as 0."""
    stat_stub = MagicMock(**{"return_value.st_nlink": 0})
    return patch("os.stat", stat_stub)
@pytest.mark.parametrize('launch_benchmark', [["--in-graph", test_input_graph]], indirect=True)
def test_launch_benchmark_tensorflow_serving_framework(launch_benchmark, mock_popen):
    """
    Checks a bare metal run of the launch script with the tensorflow serving framework
    """
    expected_env = {"TEST_ENV_VAR_1": "a", "TEST_ENV_VAR_2": "b"}

    # Switch to the serving framework and drop the docker image before running
    launch_benchmark.args.framework = test_tfserving_framework
    launch_benchmark.args.docker_image = None
    launch_benchmark.run_bare_metal("/foo", "/bar", expected_env)

    assert mock_popen.called
    popen_args, _ = mock_popen.call_args
    command = popen_args[0]

    assert launch_benchmark.args.input_graph == test_input_graph
    assert launch_benchmark.args.framework == test_tfserving_framework

    # the command handed to popen should be the start script run through bash
    assert "bash" == command[0]
    assert "start.sh" in command[1]

    # every env var passed in should now be set in the environment
    for var_name, var_value in expected_env.items():
        assert os.environ[var_name] == var_value
@pytest.mark.parametrize("precision,expected_disable_tcmalloc", [["int8", "False"],
                                                                 ["fp32", "True"]])
def test_disable_tcmalloc(launch_benchmark, mock_popen, precision, expected_disable_tcmalloc):
    """
    Verifies the run command sets DISABLE_TCMALLOC to the value expected for the precision
    """
    launch_benchmark.args.precision = precision
    launch_benchmark.main()
    assert mock_popen.called
    args, _ = mock_popen.call_args
    # convert the run command args to a string and then check the DISABLE_TCMALLOC env var
    docker_run_cmd = " ".join(args[0])
    # The "{}" placeholder is required: without it format() drops the expected
    # value and the assertion would pass for any DISABLE_TCMALLOC setting.
    assert "--env DISABLE_TCMALLOC={}".format(expected_disable_tcmalloc) in docker_run_cmd