Skip to content
This repository has been archived by the owner on Aug 5, 2022. It is now read-only.

Commit

Permalink
enable INT8 InnerProduct and support memory optimization for inference
Browse files Browse the repository at this point in the history
  • Loading branch information
daisyden committed Mar 4, 2019
1 parent d554cbf commit 2135dfb
Show file tree
Hide file tree
Showing 42 changed files with 71,261 additions and 92 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ endif()
project(Caffe C CXX)

# ---[ Caffe version
set(CAFFE_TARGET_VERSION "1.1.4" CACHE STRING "Caffe logical version")
set(CAFFE_TARGET_SOVERSION "1.1.4" CACHE STRING "Caffe soname version")
set(CAFFE_TARGET_VERSION "1.1.5" CACHE STRING "Caffe logical version")
set(CAFFE_TARGET_SOVERSION "1.1.5" CACHE STRING "Caffe soname version")
add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})

# ---[ Using cmake scripts and modules
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Intel® Distribution of Caffe*
This fork Caffe is dedicated to improving Caffe performance when running on CPU, in particular Intel® Xeon processors (HSW, BDW, Xeon Phi)
This fork is dedicated to improving Caffe performance when running on CPU, in particular Intel® Xeon processors.

## Building
Build procedure is the same as on bvlc-caffe-master branch, see section "Caffe". Both Make and CMake can be used.
Expand Down
2 changes: 1 addition & 1 deletion cmake/MKLDNN.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ function(Download_MKLDNN)
)

set(MKLDNN_INCLUDE_DIR ${MKLDNN_INSTALL_DIR}/include CACHE PATH "Include files for MKLDNN")
set(MKLDNN_LIB_DIR ${MKLDNN_INSTALL_DIR}/lib)
set(MKLDNN_LIB_DIR ${MKLDNN_INSTALL_DIR}/lib64)
add_library(mkldnn SHARED IMPORTED ${MKLDNN_INSTALL_DIR})
set_property(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB_DIR}/libmkldnn.so)
add_dependencies(mkldnn MKLDNN_Build)
Expand Down
2 changes: 1 addition & 1 deletion docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ docker run -ti caffe:cpu caffe --version
```
which should show a message like:
```
caffe version 1.1.4
caffe version 1.1.5
```

One can also build and run the Caffe tests in the image using:
Expand Down
2 changes: 1 addition & 1 deletion docs/release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ All modification made by Intel Corporation: © 2016 Intel Corporation

## Introduction

This fork is dedicated to improving Caffe performance when running on CPU, in particular Intel® Xeon processors (Haswell, Broadwell, Xenon Phi)
This fork is dedicated to improving Caffe performance when running on CPU, in particular Intel® Xeon processors.

## Installation

Expand Down
62 changes: 62 additions & 0 deletions include/caffe/layers/axpy_layer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
 * Axpy Layer
 *
 * Created on: May 1, 2017
 * Author: hujie
 */

#ifndef CAFFE_AXPY_LAYER_HPP_
#define CAFFE_AXPY_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"

namespace caffe {

/**
 * @brief Fuses a channel-wise scale and an element-wise addition,
 *        F = a * X + Y, into a single "Axpy" layer, reducing memory
 *        traffic and time during both training and testing compared
 *        with running Scale + Eltwise as separate layers.
 */
template <typename Dtype>
class AxpyLayer: public Layer<Dtype> {
 public:
  explicit AxpyLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  /// No layer-specific setup is required; all shape work happens in Reshape.
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {}
  /// Reshapes top[0] and internal buffers to match the bottom blobs.
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "Axpy"; }
  // Exactly three inputs (a, X, Y) and one output (F); see shape info below.
  virtual inline int ExactNumBottomBlobs() const { return 3; }
  virtual inline int ExactNumTopBlobs() const { return 1; }

 protected:
  /**
   * Formulation:
   *   F = a * X + Y
   * Shape info:
   *   a: N x C         --> bottom[0]  (per-channel scale)
   *   X: N x C x H x W --> bottom[1]
   *   Y: N x C x H x W --> bottom[2]
   *   F: N x C x H x W --> top[0]
   */
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  // Helper blob; presumably a ones-vector used to sum gradients over the
  // H*W spatial extent in the backward pass -- TODO confirm against the .cpp.
  Blob<Dtype> spatial_sum_multiplier_;
};

}  // namespace caffe

#endif  // CAFFE_AXPY_LAYER_HPP_
6 changes: 4 additions & 2 deletions include/caffe/layers/mkldnn_layers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,8 +426,8 @@ class MKLDNNReLULayer : public MKLDNNLayer<Dtype> , public NeuronLayer<Dtype> {

shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data, bwd_bottom_data;
shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
shared_ptr<relu_forward::primitive_desc> reluFwd_pd;
shared_ptr<relu_backward::primitive_desc> reluBwd_pd;
shared_ptr<eltwise_forward::primitive_desc> reluFwd_pd;
shared_ptr<eltwise_backward::primitive_desc> reluBwd_pd;
MKLDNNPrimitive<Dtype> reluFwd, reluBwd;
shared_ptr<memory> fwd_top_data_memory, bwd_bottom_diff_memory;
shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive, bwd_bottom_data_primitive;
Expand Down Expand Up @@ -525,6 +525,8 @@ class MKLDNNSplitLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype> {
vector<primitive::at> bwd_top_diffs_primitives_at_;
vector<shared_ptr<MKLDNNDiff<Dtype> > > bwd_top_diffs_;

bool first;

PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
};

Expand Down
18 changes: 17 additions & 1 deletion include/caffe/mkldnn_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "caffe/util/math_functions.hpp"
#include "mkldnn.hpp"
#include "mkldnn_base.hpp"
#include "caffe/syncedmem.hpp"
#include "caffe/net.hpp"

using namespace mkldnn;

Expand Down Expand Up @@ -135,7 +137,19 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr
new memory(*_prv_memory_pd, (void*)_mlsl_memory.get()));
} else {
#endif
_prv_memory = shared_ptr<memory>(new memory(*_prv_memory_pd));
// BufSize is the switch of whether enabling circle buffer mechanism to
// boost up mkldnn primitive execution on inference path.
if (CircleBuf::Instance()->GetBufSize()) {
if (!_is_weight) {
// find out a free buf in the circleBuf queue
_m_memory = CircleBuf::Instance()->GetFreeBuf();
} else {
bool cuda;
CaffeMallocHost(&_m_memory, _prv_memory_pd->get_size(), &cuda);
}
_prv_memory = shared_ptr<memory>(new memory(*_prv_memory_pd, _m_memory));
} else
_prv_memory = shared_ptr<memory>(new memory(*_prv_memory_pd));
#ifdef USE_MLSL
}
#endif
Expand Down Expand Up @@ -195,6 +209,8 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr
#ifdef USE_MLSL
shared_ptr<char> _mlsl_memory;
#endif
void* _m_memory;
bool _is_weight;
};

template <typename Dtype, bool is_diff>
Expand Down
59 changes: 59 additions & 0 deletions include/caffe/net.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,60 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

namespace caffe {

/**
 * @brief Singleton pool of equally-sized, reference-counted scratch buffers
 *        ("circle buffers") shared by MKLDNN layers on the inference path to
 *        reduce memory footprint and cache misses.
 *
 * All buffers in the pool share one size, BufSize, which is computed during
 * Net::Init from the largest blob in the net (with a 4x margin because the
 * mkldnn winograd convolution algorithm may need up to 4x the blob size).
 * BufSize == 0 means the feature is disabled.
 */
class CircleBuf {
 private:
  /* Each entry pairs a buffer pointer with its reference count
   * (0 == free, > 0 == in use). */
  std::vector<std::pair<void*, int> > circleBuf;
  /* Common size of every buffer in the pool; 0 disables the feature. */
  size_t BufSize;

 public:
  /* Static access method. The instance is intentionally heap-allocated and
   * never deleted so it survives static-destruction ordering at exit. */
  static CircleBuf* Instance() {
    static CircleBuf* instance = new CircleBuf();
    return instance;
  }

  /* Returns a free buffer (refcount 0) from the pool, marking it in use;
   * allocates and enqueues a new buffer when none is free. Returns NULL
   * when the feature is disabled.
   *
   * NOTE(review): the class comment says BufSize already includes the 4x
   * winograd margin, yet the allocation below multiplies by 4 again --
   * confirm whether `BufSize * 4` is intentional or a double-count. */
  inline void* GetFreeBuf() {
    if (BufSize == 0) return NULL;

    for (auto& entry : circleBuf) {
      if (entry.second == 0) {
        entry.second = 1;
        return entry.first;
      }
    }

    // No free buffer (or empty pool): allocate a new one and track it.
    void* buf = NULL;
    bool cuda;
    CaffeMallocHost(&buf, BufSize * 4, &cuda);
    circleBuf.push_back(std::make_pair(buf, 1));
    return buf;
  }

  /* Increases the reference count of the given buffer by refcnt.
   * No-op when buf is not in the pool. */
  inline void IncRefCnt(const void* buf, size_t refcnt) {
    for (auto& entry : circleBuf) {
      if (entry.first == buf) {
        entry.second += refcnt;
        return;  // pointers are unique in the pool; stop at first match
      }
    }
  }

  /* Decreases the reference count of the given buffer by one, never going
   * below zero. No-op when buf is not in the pool. */
  inline void DecRefCnt(const void* buf) {
    for (auto& entry : circleBuf) {
      if (entry.first == buf) {
        if (entry.second > 0) entry.second -= 1;
        return;  // pointers are unique in the pool; stop at first match
      }
    }
  }

  inline void SetBufSize(size_t size) { BufSize = size; }
  inline size_t GetBufSize() const { return BufSize; }
  inline size_t GetQueueSize() const { return circleBuf.size(); }

 private:
  /* Private constructor to prevent direct instancing; use Instance(). */
  CircleBuf() : BufSize(0) {}
  /* A singleton must not be copied. */
  CircleBuf(const CircleBuf&) = delete;
  CircleBuf& operator=(const CircleBuf&) = delete;
};

/**
* @brief Connects Layer%s together into a directed acyclic graph (DAG)
* specified by a NetParameter.
Expand All @@ -69,6 +123,9 @@ class Net {
const Net* root_net = NULL, std::string engine = "");
virtual ~Net() {}

/// @brief Buffer Queue for reducing cache missing and saving memory footprint of MKLDNN layer.
static vector<struct CircleBuf> circleBuf;

/// @brief Initialize a network with a NetParameter.
void Init(const NetParameter& param);

Expand Down Expand Up @@ -523,13 +580,15 @@ class Net {
vector<bool> has_params_decay_;
/// The bytes of memory used by this net
size_t memory_used_;
size_t max_blob_count;
/// Whether to compute and display debug info for the net.
bool debug_info_;
/// The root net that actually holds the shared layers in data parallelism
const Net* const root_net_;
DISABLE_COPY_AND_ASSIGN(Net);
};

template<typename Dtype> vector<struct CircleBuf> Net<Dtype>::circleBuf;

} // namespace caffe

Expand Down
7 changes: 5 additions & 2 deletions include/caffe/syncedmem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
#ifdef USE_MKL
*ptr = mkl_malloc(size ? size : 1, 64);
#else
*ptr = malloc(size);
//*ptr = malloc(size);
int rc = ::posix_memalign(ptr, 64, size);
assert(rc == 0);
#endif

#ifdef USE_MLSL
Expand Down Expand Up @@ -103,7 +105,8 @@ inline void CaffeFreeHost(void* ptr, bool use_cuda) {
#ifdef USE_MKL
mkl_free(ptr);
#else
free(ptr);
//free(ptr);
::free(ptr);
#endif

#ifdef USE_MLSL
Expand Down
2 changes: 1 addition & 1 deletion models/intel_optimized_models/int8/resnet50_int8.prototxt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ layer {
value: 0.01
}
shape {
dim: 64
dim: 1
dim: 3
dim: 224
dim: 224
Expand Down
Loading

0 comments on commit 2135dfb

Please sign in to comment.