Skip to content
This repository has been archived by the owner on Aug 5, 2022. It is now read-only.

Commit

Permalink
enable INT8 InnerProduct and support memory optimization for inference
Browse files Browse the repository at this point in the history
  • Loading branch information
daisyden committed Mar 4, 2019
1 parent d554cbf commit 2135dfb
Show file tree
Hide file tree
Showing 42 changed files with 71,261 additions and 92 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ endif()
project(Caffe C CXX)

# ---[ Caffe version
set(CAFFE_TARGET_VERSION "1.1.4" CACHE STRING "Caffe logical version")
set(CAFFE_TARGET_SOVERSION "1.1.4" CACHE STRING "Caffe soname version")
set(CAFFE_TARGET_VERSION "1.1.5" CACHE STRING "Caffe logical version")
set(CAFFE_TARGET_SOVERSION "1.1.5" CACHE STRING "Caffe soname version")
add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})

# ---[ Using cmake scripts and modules
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Intel® Distribution of Caffe*
This fork Caffe is dedicated to improving Caffe performance when running on CPU, in particular Intel® Xeon processors (HSW, BDW, Xeon Phi)
This fork is dedicated to improving Caffe performance when running on CPU, in particular Intel® Xeon processors.

## Building
Build procedure is the same as on bvlc-caffe-master branch, see section "Caffe". Both Make and CMake can be used.
Expand Down
2 changes: 1 addition & 1 deletion cmake/MKLDNN.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ function(Download_MKLDNN)
)

set(MKLDNN_INCLUDE_DIR ${MKLDNN_INSTALL_DIR}/include CACHE PATH "Include files for MKLDNN")
set(MKLDNN_LIB_DIR ${MKLDNN_INSTALL_DIR}/lib)
set(MKLDNN_LIB_DIR ${MKLDNN_INSTALL_DIR}/lib64)
add_library(mkldnn SHARED IMPORTED ${MKLDNN_INSTALL_DIR})
set_property(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB_DIR}/libmkldnn.so)
add_dependencies(mkldnn MKLDNN_Build)
Expand Down
2 changes: 1 addition & 1 deletion docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ docker run -ti caffe:cpu caffe --version
```
which should show a message like:
```
caffe version 1.1.4
caffe version 1.1.5
```

One can also build and run the Caffe tests in the image using:
Expand Down
2 changes: 1 addition & 1 deletion docs/release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ All modification made by Intel Corporation: © 2016 Intel Corporation

## Introduction

This fork is dedicated to improving Caffe performance when running on CPU, in particular Intel® Xeon processors (Haswell, Broadwell, Xenon Phi)
This fork is dedicated to improving Caffe performance when running on CPU, in particular Intel® Xeon processors.

## Installation

Expand Down
62 changes: 62 additions & 0 deletions include/caffe/layers/axpy_layer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
 * Axpy Layer
 *
 * Created on: May 1, 2017
 * Author: hujie
 */

#ifndef CAFFE_AXPY_LAYER_HPP_
#define CAFFE_AXPY_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"

namespace caffe {

/**
 * @brief Fuses a channel-wise scale and an element-wise addition,
 *        F = a * X + Y, into a single "Axpy" layer, reducing memory
 *        traffic and time during both training and testing compared
 *        with running Scale + Eltwise as separate layers.
 */
template <typename Dtype>
class AxpyLayer: public Layer<Dtype> {
 public:
  explicit AxpyLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  /// No layer-specific setup is required; all shape work happens in Reshape.
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {}
  /// Reshapes top[0] and internal buffers to match the bottom blobs.
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "Axpy"; }
  // Exactly three inputs (a, X, Y) and one output (F); see shape info below.
  virtual inline int ExactNumBottomBlobs() const { return 3; }
  virtual inline int ExactNumTopBlobs() const { return 1; }

 protected:
  /**
   * Formulation:
   *   F = a * X + Y
   * Shape info:
   *   a: N x C         --> bottom[0]  (per-channel scale)
   *   X: N x C x H x W --> bottom[1]
   *   Y: N x C x H x W --> bottom[2]
   *   F: N x C x H x W --> top[0]
   */
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  // Helper blob; presumably a ones-vector used to sum gradients over the
  // H*W spatial extent in the backward pass -- TODO confirm against the .cpp.
  Blob<Dtype> spatial_sum_multiplier_;
};

}  // namespace caffe

#endif  // CAFFE_AXPY_LAYER_HPP_
6 changes: 4 additions & 2 deletions include/caffe/layers/mkldnn_layers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,8 +426,8 @@ class MKLDNNReLULayer : public MKLDNNLayer<Dtype> , public NeuronLayer<Dtype> {

shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data, bwd_bottom_data;
shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
shared_ptr<relu_forward::primitive_desc> reluFwd_pd;
shared_ptr<relu_backward::primitive_desc> reluBwd_pd;
shared_ptr<eltwise_forward::primitive_desc> reluFwd_pd;
shared_ptr<eltwise_backward::primitive_desc> reluBwd_pd;
MKLDNNPrimitive<Dtype> reluFwd, reluBwd;
shared_ptr<memory> fwd_top_data_memory, bwd_bottom_diff_memory;
shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive, bwd_bottom_data_primitive;
Expand Down Expand Up @@ -525,6 +525,8 @@ class MKLDNNSplitLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype> {
vector<primitive::at> bwd_top_diffs_primitives_at_;
vector<shared_ptr<MKLDNNDiff<Dtype> > > bwd_top_diffs_;

bool first;

PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
};

Expand Down
18 changes: 17 additions & 1 deletion include/caffe/mkldnn_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "caffe/util/math_functions.hpp"
#include "mkldnn.hpp"
#include "mkldnn_base.hpp"
#include "caffe/syncedmem.hpp"
#include "caffe/net.hpp"

using namespace mkldnn;

Expand Down Expand Up @@ -135,7 +137,19 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr
new memory(*_prv_memory_pd, (void*)_mlsl_memory.get()));
} else {
#endif
_prv_memory = shared_ptr<memory>(new memory(*_prv_memory_pd));
// BufSize is the switch of whether enabling circle buffer mechanism to
// boost up mkldnn primitive execution on inference path.
if (CircleBuf::Instance()->GetBufSize()) {
if (!_is_weight) {
// find out a free buf in the circleBuf queue
_m_memory = CircleBuf::Instance()->GetFreeBuf();
} else {
bool cuda;
CaffeMallocHost(&_m_memory, _prv_memory_pd->get_size(), &cuda);
}
_prv_memory = shared_ptr<memory>(new memory(*_prv_memory_pd, _m_memory));
} else
_prv_memory = shared_ptr<memory>(new memory(*_prv_memory_pd));
#ifdef USE_MLSL
}
#endif
Expand Down Expand Up @@ -195,6 +209,8 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr
#ifdef USE_MLSL
shared_ptr<char> _mlsl_memory;
#endif
void* _m_memory;
bool _is_weight;
};

template <typename Dtype, bool is_diff>
Expand Down
59 changes: 59 additions & 0 deletions include/caffe/net.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,60 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

namespace caffe {

/**
 * @brief Singleton pool of equally-sized, reference-counted scratch buffers
 *        ("circle buffers") shared by MKLDNN layers on the inference path to
 *        reduce memory footprint and cache misses.
 *
 * All buffers in the pool share one size, BufSize, which is computed during
 * Net::Init from the largest blob in the net (with a 4x margin because the
 * mkldnn winograd convolution algorithm may need up to 4x the blob size).
 * BufSize == 0 means the feature is disabled.
 */
class CircleBuf {
 private:
  /* Each entry pairs a buffer pointer with its reference count
   * (0 == free, > 0 == in use). */
  std::vector<std::pair<void*, int> > circleBuf;
  /* Common size of every buffer in the pool; 0 disables the feature. */
  size_t BufSize;

 public:
  /* Static access method. The instance is intentionally heap-allocated and
   * never deleted so it survives static-destruction ordering at exit. */
  static CircleBuf* Instance() {
    static CircleBuf* instance = new CircleBuf();
    return instance;
  }

  /* Returns a free buffer (refcount 0) from the pool, marking it in use;
   * allocates and enqueues a new buffer when none is free. Returns NULL
   * when the feature is disabled.
   *
   * NOTE(review): the class comment says BufSize already includes the 4x
   * winograd margin, yet the allocation below multiplies by 4 again --
   * confirm whether `BufSize * 4` is intentional or a double-count. */
  inline void* GetFreeBuf() {
    if (BufSize == 0) return NULL;

    for (auto& entry : circleBuf) {
      if (entry.second == 0) {
        entry.second = 1;
        return entry.first;
      }
    }

    // No free buffer (or empty pool): allocate a new one and track it.
    void* buf = NULL;
    bool cuda;
    CaffeMallocHost(&buf, BufSize * 4, &cuda);
    circleBuf.push_back(std::make_pair(buf, 1));
    return buf;
  }

  /* Increases the reference count of the given buffer by refcnt.
   * No-op when buf is not in the pool. */
  inline void IncRefCnt(const void* buf, size_t refcnt) {
    for (auto& entry : circleBuf) {
      if (entry.first == buf) {
        entry.second += refcnt;
        return;  // pointers are unique in the pool; stop at first match
      }
    }
  }

  /* Decreases the reference count of the given buffer by one, never going
   * below zero. No-op when buf is not in the pool. */
  inline void DecRefCnt(const void* buf) {
    for (auto& entry : circleBuf) {
      if (entry.first == buf) {
        if (entry.second > 0) entry.second -= 1;
        return;  // pointers are unique in the pool; stop at first match
      }
    }
  }

  inline void SetBufSize(size_t size) { BufSize = size; }
  inline size_t GetBufSize() const { return BufSize; }
  inline size_t GetQueueSize() const { return circleBuf.size(); }

 private:
  /* Private constructor to prevent direct instancing; use Instance(). */
  CircleBuf() : BufSize(0) {}
  /* A singleton must not be copied. */
  CircleBuf(const CircleBuf&) = delete;
  CircleBuf& operator=(const CircleBuf&) = delete;
};

/**
* @brief Connects Layer%s together into a directed acyclic graph (DAG)
* specified by a NetParameter.
Expand All @@ -69,6 +123,9 @@ class Net {
const Net* root_net = NULL, std::string engine = "");
virtual ~Net() {}

/// @brief Buffer Queue for reducing cache missing and saving memory footprint of MKLDNN layer.
static vector<struct CircleBuf> circleBuf;

/// @brief Initialize a network with a NetParameter.
void Init(const NetParameter& param);

Expand Down Expand Up @@ -523,13 +580,15 @@ class Net {
vector<bool> has_params_decay_;
/// The bytes of memory used by this net
size_t memory_used_;
size_t max_blob_count;
/// Whether to compute and display debug info for the net.
bool debug_info_;
/// The root net that actually holds the shared layers in data parallelism
const Net* const root_net_;
DISABLE_COPY_AND_ASSIGN(Net);
};

template<typename Dtype> vector<struct CircleBuf> Net<Dtype>::circleBuf;

} // namespace caffe

Expand Down
7 changes: 5 additions & 2 deletions include/caffe/syncedmem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
#ifdef USE_MKL
*ptr = mkl_malloc(size ? size : 1, 64);
#else
*ptr = malloc(size);
//*ptr = malloc(size);
int rc = ::posix_memalign(ptr, 64, size);
assert(rc == 0);
#endif

#ifdef USE_MLSL
Expand Down Expand Up @@ -103,7 +105,8 @@ inline void CaffeFreeHost(void* ptr, bool use_cuda) {
#ifdef USE_MKL
mkl_free(ptr);
#else
free(ptr);
//free(ptr);
::free(ptr);
#endif

#ifdef USE_MLSL
Expand Down
2 changes: 1 addition & 1 deletion models/intel_optimized_models/int8/resnet50_int8.prototxt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ layer {
value: 0.01
}
shape {
dim: 64
dim: 1
dim: 3
dim: 224
dim: 224
Expand Down
Loading

0 comments on commit 2135dfb

Please sign in to comment.