diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0856984..c84cfed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,15 @@ project(adaboost CXX)
 # First, define all the compilation options.
 option(BUILD_TESTS "Build tests." OFF)
 option(INSTALL_GOOGLETEST "For installing GoogleTest on your system along with the build." OFF)
+option(BUILD_CUDA "Build with CUDA support." OFF)
+
+include(CheckLanguage)
+check_language(CUDA)
+if(CMAKE_CUDA_COMPILER)
+  enable_language(CUDA)
+else()
+  message(STATUS "Building without CUDA support as no CUDA compiler is found on this system.")
+endif()
 
 
 # Set required standard to C++11.
@@ -37,4 +46,6 @@ install(DIRECTORY adaboost DESTINATION ${CMAKE_INSTALL_PREFIX}/include
 install(FILES
         ${CMAKE_BINARY_DIR}/libs/libadaboost_core.so
         ${CMAKE_BINARY_DIR}/libs/libadaboost_utils.so
+        ${CMAKE_BINARY_DIR}/libs/libadaboost_cuda.so
+        ${CMAKE_BINARY_DIR}/libs/libadaboost_cuda_wrappers.so
         DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
diff --git a/adaboost/CMakeLists.txt b/adaboost/CMakeLists.txt
index c55f6f4..3a96aab 100644
--- a/adaboost/CMakeLists.txt
+++ b/adaboost/CMakeLists.txt
@@ -3,10 +3,22 @@ add_library(adaboost_utils SHARED
             utils/utils_impl.cpp)
 add_library(adaboost_core SHARED
             core/data_structures_impl.cpp
-            core/operations_impl.cpp)
+            core/operations_impl.cpp
+            utils/utils_impl.cpp)
+if(BUILD_CUDA)
+    add_library(adaboost_cuda_wrappers SHARED
+                utils/cuda_wrappers_impl.cu)
+    add_library(adaboost_cuda SHARED
+                cuda/cuda_data_structures_impl.cu
+                utils/cuda_wrappers_impl.cu)
+endif()
 if(BUILD_TESTS)
     set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin)
-    set(LINK_LIBRARIES gtest gtest_main pthread adaboost_core adaboost_utils)
+    set(LINK_LIBRARIES gtest gtest_main pthread adaboost_core)
     add_executable(test_core tests/test_core.cpp)
     target_link_libraries(test_core ${LINK_LIBRARIES})
+    if(BUILD_CUDA)
+        add_executable(test_cuda tests/test_cuda.cpp)
+        target_link_libraries(test_cuda ${LINK_LIBRARIES} adaboost_cuda)
+    endif()
 endif()
diff --git a/adaboost/core/data_structures.hpp b/adaboost/core/data_structures.hpp
index 64d0585..a7c67be 100644
--- a/adaboost/core/data_structures.hpp
+++ b/adaboost/core/data_structures.hpp
@@ -79,6 +79,8 @@ namespace adaboost
                 */
                 unsigned int get_size() const;
 
+                data_type_vector* get_data_pointer() const;
+
                 /*
                 * Used for freeing memory.
                 */
@@ -102,7 +104,7 @@ namespace adaboost
                 unsigned int rows, cols;
 
                 //! Array for storing data internally in matrices.
-                data_type_matrix **data;
+                data_type_matrix* data;
 
                 /*
                 * For reserving space in memory accoring to a given rows and columns.
@@ -112,7 +114,7 @@ namespace adaboost
                 * @param _rows Number of rows for which the space is to be reserved.
                 * @param _cols Number of columns for which the space is to be reserved.
                 */
-                static data_type_matrix**
+                static data_type_matrix*
                 _reserve_space(unsigned int _rows,
                                unsigned int _cols);
 
@@ -174,6 +176,8 @@ namespace adaboost
                 */
                 unsigned int get_cols() const;
 
+                data_type_matrix* get_data_pointer() const;
+
                 /*
                 * Used for freeing memory.
                 */
diff --git a/adaboost/core/data_structures_impl.cpp b/adaboost/core/data_structures_impl.cpp
index 692ce78..37bf6f5 100644
--- a/adaboost/core/data_structures_impl.cpp
+++ b/adaboost/core/data_structures_impl.cpp
@@ -3,6 +3,7 @@
 
 #include<adaboost/core/data_structures.hpp>
 #include<adaboost/utils/utils.hpp>
+#include<iostream>
 
 namespace adaboost
 {
@@ -66,6 +67,12 @@ namespace adaboost
             return this->size;
         }
 
+        template <class data_type_vector>
+        data_type_vector* Vector<data_type_vector>::get_data_pointer() const
+        {
+            return this->data;
+        }
+
         template <class data_type_vector>
         Vector<data_type_vector>::
         ~Vector()
@@ -84,16 +91,12 @@ namespace adaboost
         }
 
         template <class data_type_matrix>
-        data_type_matrix**  Matrix<data_type_matrix>::
+        data_type_matrix*  Matrix<data_type_matrix>::
         _reserve_space(unsigned int _rows, unsigned int _cols)
         {
             adaboost::utils::check(_rows > 0, "Number of rows should be positive.");
             adaboost::utils::check(_cols > 0, "Number of cols should be positive.");
-            data_type_matrix** new_data = new data_type_matrix*[_rows];
-            for(unsigned int i = 0; i < _rows; i++)
-            {
-                new_data[i] = new data_type_matrix[_cols];
-            }
+            data_type_matrix* new_data = new data_type_matrix[_rows*_cols];
             return new_data;
         }
 
@@ -114,7 +117,7 @@ namespace adaboost
                                   "Row index out of range.");
             adaboost::utils::check(y >= 0 && y < this->get_cols(),
                                  "Column index out of range.");
-            return this->data[x][y];
+            return this->data[x*this->cols + y];
         }
 
         template <class data_type_matrix>
@@ -127,7 +130,7 @@ namespace adaboost
                                   "Row index out of range.");
             adaboost::utils::check(y >= 0 && y < this->get_cols(),
                                   "Column index out of range.");
-            this->data[x][y] = value;
+            this->data[x*this->cols + y] = value;
         }
 
         template <class data_type_matrix>
@@ -138,7 +141,7 @@ namespace adaboost
             {
                 for(unsigned int j = 0; j < this->cols; j++)
                 {
-                    this->data[i][j] = value;
+                    this->data[i*this->cols + j] = value;
                 }
             }
         }
@@ -157,16 +160,19 @@ namespace adaboost
             return this->cols;
         }
 
+        template <class data_type_matrix>
+        data_type_matrix* Matrix<data_type_matrix>::get_data_pointer() const
+        {
+            return this->data;
+        }
+
         template <class data_type_matrix>
         Matrix<data_type_matrix>::
         ~Matrix()
         {
             if(this->data != NULL)
             {
-                for(unsigned int i = 0; i < this->rows; i++)
-                {
-                    delete [] this->data[i];
-                }
+                delete [] this->data;
             }
         }
 
diff --git a/adaboost/cuda/cuda_data_structures.hpp b/adaboost/cuda/cuda_data_structures.hpp
new file mode 100644
index 0000000..b6f67c9
--- /dev/null
+++ b/adaboost/cuda/cuda_data_structures.hpp
@@ -0,0 +1,158 @@
+#ifndef CUDA_ADABOOST_CORE_DATA_STRUCTURES_HPP
+#define CUDA_ADABOOST_CORE_DATA_STRUCTURES_HPP
+
+#include<adaboost/core/data_structures.hpp>
+
+namespace adaboost
+{
+    namespace cuda
+    {
+        namespace core
+        {
+            /*
+            * This class represents the GPU version
+            * of adaboost::core::Vector.
+            *
+            * @tparam data_type_vector Data type of the elements
+            * supported by C++.
+            */
+            template <class data_type_vector>
+            class VectorGPU: public adaboost::core::Vector<data_type_vector>
+            {
+                private:
+
+                    //! Array for storing data on GPU.
+                    data_type_vector* data_gpu;
+
+                    //! The size of the vector stored on GPU.
+                    unsigned size_gpu;
+
+                    /*
+                    * For reserving space in GPU memory accoring to a given size.
+                    * Used in initializer list of parameterized constructors.
+                    * Returns a new pointer.
+                    *
+                    * @param _size The size for which the space is to be reserved
+                    * on GPU.
+                    */
+                    static data_type_vector*
+                    _reserve_space_gpu(unsigned _size_gpu);
+
+                public:
+
+                    /*
+                    * Default constructor.
+                    * Sets VectorGPU::data_gpu to NULL and size_gpu to 0.
+                    */
+                    VectorGPU();
+
+                    /*
+                    * Prameterized constructor.
+                    *
+                    * @param _size The size of the vector on GPU.
+                    *    Must be positive.
+                    */
+                    VectorGPU(unsigned _size);
+
+                    /*
+                    * Used for filling the vector with a given value.
+                    * If block size is passed 0 then the values are
+                    * filled on the CPU otherwise they are filled on
+                    * GPU.
+                    *
+                    * @param value The value with which the vector is
+                    *    to be populated.
+                    * @param block_size The number of threads to be
+                    *    launched per block on GPU.
+                    */
+                    void fill(data_type_vector value,
+                              unsigned block_size=0);
+
+                    /*
+                    * Copies the data from GPU to CPU.
+                    */
+                    void copy_to_host();
+
+                    /*
+                    * Copies the  data from CPU to GPU.
+                    */
+                    void copy_to_device();
+
+                    /*
+                    * Returns the size of the vector.
+                    * By default returns the size of the
+                    * vector on GPU.
+                    *
+                    * @param gpu If true then size of the
+                    *   vector on GPU otherwise size of the
+                    *   vector on CPU is returned.
+                    */
+                    unsigned get_size(bool gpu=true) const;
+
+                    /*
+                    * Returns the data pointer, by default, of
+                    * the vector on GPU.
+                    *
+                    * @param gpu If true then data pointer on GPU
+                    *   is returned otherwise the one on CPU
+                    *   is returned.
+                    */
+                    data_type_vector* get_data_pointer(bool gpu=true) const;
+
+                    /*
+                    * Frees the memory from both CPU and GPU.
+                    */
+                    ~VectorGPU();
+            };
+
+            template <class data_type_vector>
+            void product_gpu(const VectorGPU<data_type_vector>& vec1,
+                             const VectorGPU<data_type_vector>& vec2,
+                             data_type_vector& result,
+                             unsigned block_size=0);
+
+            template <class data_type_matrix>
+            class MatrixGPU: public adaboost::core::Matrix<data_type_matrix>
+            {
+                private:
+
+                    data_type_matrix* data_gpu;
+
+                    unsigned rows_gpu, cols_gpu;
+
+                    static data_type_matrix* _reserve_space_gpu
+                    (unsigned _rows_gpu, unsigned _cols_gpu);
+
+                public:
+
+                    MatrixGPU();
+
+                    MatrixGPU(unsigned _rows, unsigned _cols);
+
+                    void fill(data_type_matrix value,
+                              unsigned block_size_x=0,
+                              unsigned block_size_y=0);
+
+                    void copy_to_host();
+
+                    void copy_to_device();
+
+                    unsigned get_rows(bool gpu=true) const;
+
+                    unsigned get_cols(bool gpu=true) const;
+
+                    data_type_matrix* get_data_pointer(bool gpu=true) const;
+
+                    ~MatrixGPU();
+            };
+
+            template <class data_type_matrix>
+            void multiply_gpu(const MatrixGPU<data_type_matrix>& mat1,
+                              const MatrixGPU<data_type_matrix>& mat2,
+                              MatrixGPU<data_type_matrix>& result);
+
+        } // namespace core
+    } // namespace cuda
+} // namespace adaboost
+
+#endif
diff --git a/adaboost/cuda/cuda_data_structures_impl.cu b/adaboost/cuda/cuda_data_structures_impl.cu
new file mode 100644
index 0000000..a884b8c
--- /dev/null
+++ b/adaboost/cuda/cuda_data_structures_impl.cu
@@ -0,0 +1,6 @@
+#ifndef CUDA_ADABOOST_CORE_DATA_STRUCTURES_IMPL_CU
+#define CUDA_ADABOOST_CORE_DATA_STRUCTURES_IMPL_CU
+
+#include "cuda_data_structures_impl.hpp"
+
+#endif
diff --git a/adaboost/cuda/cuda_data_structures_impl.hpp b/adaboost/cuda/cuda_data_structures_impl.hpp
new file mode 100644
index 0000000..e48f3ce
--- /dev/null
+++ b/adaboost/cuda/cuda_data_structures_impl.hpp
@@ -0,0 +1,397 @@
+#ifndef CUDA_ADABOOST_CORE_DATA_STRUCTURES_IMPL_HPP
+#define CUDA_ADABOOST_CORE_DATA_STRUCTURES_IMPL_HPP
+
+#include<adaboost/utils/utils.hpp>
+#include<adaboost/utils/cuda_wrappers.hpp>
+#include<adaboost/cuda/cuda_data_structures.hpp>
+#include<cmath>
+#include<iostream>
+
+#define MAX_BLOCK_SIZE 1024
+#define BLOCK_SIZE 16
+
+namespace adaboost
+{
+    namespace cuda
+    {
+        namespace core
+        {
+            template <class data_type_vector>
+            data_type_vector*
+            VectorGPU<data_type_vector>::
+            _reserve_space_gpu(unsigned _size_gpu)
+            {
+                adaboost::utils::check(_size_gpu > 0,
+                "The size of the vector should be positive.");
+                unsigned bytes = _size_gpu*sizeof(data_type_vector);
+                data_type_vector* new_pointer;
+                adaboost::utils::cuda::cuda_malloc((void**)&new_pointer, bytes);
+                return new_pointer;
+            }
+
+            template <class data_type_vector>
+            VectorGPU<data_type_vector>::
+            VectorGPU():
+            adaboost::core::Vector<data_type_vector>(),
+            size_gpu(0),
+            data_gpu(NULL)
+            {
+            }
+
+            template <class data_type_vector>
+            VectorGPU<data_type_vector>::
+            VectorGPU(unsigned _size):
+            adaboost::core::Vector<data_type_vector>(_size),
+            data_gpu(_reserve_space_gpu(_size)),
+            size_gpu(_size)
+            {
+            }
+
+            template <class data_type_vector>
+            __global__ void fill_vector_kernel
+            (data_type_vector* data,
+             unsigned size,
+             data_type_vector value)
+            {
+                unsigned index = threadIdx.x;
+                unsigned stride = blockDim.x;
+                for(unsigned i = index; i < size; i += stride)
+                {
+                    data[i] = value;
+                }
+            }
+
+            template <class data_type_vector>
+            void VectorGPU<data_type_vector>::
+            fill(data_type_vector value,
+                 unsigned block_size)
+            {
+                if(block_size == 0)
+                {
+                    this->adaboost::core::Vector<data_type_vector>::fill(value);
+                }
+                else
+                {
+                    fill_vector_kernel<data_type_vector>
+                    <<<
+                    (this->size_gpu + block_size - 1)/block_size,
+                    block_size
+                    >>>(this->data_gpu, this->size_gpu, value);
+                }
+            }
+
+            template <class data_type_vector>
+            void
+            VectorGPU<data_type_vector>::copy_to_host()
+            {
+                adaboost::utils::cuda::cuda_memcpy(
+                this->get_data_pointer(false),
+                this->data_gpu,
+                this->size_gpu*sizeof(data_type_vector),
+                adaboost::utils::cuda::DeviceToHost);
+            }
+
+            template <class data_type_vector>
+            void
+            VectorGPU<data_type_vector>::copy_to_device()
+            {
+                adaboost::utils::cuda::cuda_memcpy(
+                this->data_gpu,
+                this->get_data_pointer(false),
+                this->size_gpu*sizeof(data_type_vector),
+                adaboost::utils::cuda::HostToDevice);
+            }
+
+            template <class data_type_vector>
+            unsigned VectorGPU<data_type_vector>::
+            get_size(bool gpu) const
+            {
+                if(gpu)
+                {
+                    return this->size_gpu;
+                }
+                else
+                {
+                    return this->adaboost::core::Vector<data_type_vector>::get_size();
+                }
+            }
+
+            template <class data_type_vector>
+            data_type_vector* VectorGPU<data_type_vector>::
+            get_data_pointer(bool gpu) const
+            {
+                if(gpu)
+                {
+                    return this->data_gpu;
+                }
+                else
+                {
+                    return this->adaboost::core::Vector<data_type_vector>::get_data_pointer();
+                }
+            }
+
+            template <class data_type_vector>
+            VectorGPU<data_type_vector>::
+            ~VectorGPU()
+            {
+                adaboost::utils::cuda::cuda_free(this->data_gpu);
+            }
+
+            template <class data_type_vector>
+            __global__
+            void product_kernel
+            (data_type_vector* v1, data_type_vector* v2, data_type_vector* v3,
+            unsigned size)
+            {
+                __shared__ data_type_vector cache[MAX_BLOCK_SIZE];
+                data_type_vector temp = 0;
+                unsigned thread_i = threadIdx.x + blockDim.x*blockIdx.x;
+                unsigned cache_i = threadIdx.x;
+                while(thread_i < size)
+                {
+                    temp += v1[thread_i]*v2[thread_i];
+                    thread_i = blockDim.x*gridDim.x;
+                }
+                cache[cache_i] = temp;
+                __syncthreads();
+
+                unsigned i = blockDim.x/2;
+                while(i != 0)
+                {
+                    if(cache_i < i)
+                    {
+                        cache[cache_i] += cache[cache_i + i];
+                    }
+                    __syncthreads();
+                    i /= 2;
+                }
+
+                if(cache_i == 0)
+                    v3[blockIdx.x] = cache[0];
+            }
+
+            template <class data_type_vector>
+            void product_gpu(const VectorGPU<data_type_vector>& vec1,
+                             const VectorGPU<data_type_vector>& vec2,
+                             data_type_vector& result,
+                             unsigned block_size)
+            {
+                if(block_size == 0)
+                {
+                    return adaboost::core::product(vec1, vec2, result);
+                }
+                else
+                {
+                    adaboost::utils::check(vec1.get_size() == vec2.get_size(),
+                                           "Size of vectors don't match.");
+                    adaboost::utils::check(block_size > 0,
+                    "Size of the block should be a positive multiple of 32.");
+                    unsigned num_blocks = (vec1.get_size() + block_size - 1)/block_size;
+                    VectorGPU<data_type_vector> temp_result(num_blocks);
+                    product_kernel
+                    <<<
+                    num_blocks,
+                    block_size
+                    >>>(vec1.get_data_pointer(), vec2.get_data_pointer(),
+                        temp_result.get_data_pointer(), vec1.get_size());
+                    temp_result.copy_to_host();
+                    result = 0;
+                    for(unsigned i = 0; i < num_blocks; i++)
+                    {
+                        result += temp_result.at(i);
+                    }
+                }
+            }
+
+            template <class data_type_matrix>
+            data_type_matrix*
+            MatrixGPU<data_type_matrix>::
+            _reserve_space_gpu
+            (unsigned _rows_gpu, unsigned _cols_gpu)
+            {
+                adaboost::utils::check(_rows_gpu > 0,
+                "The number of rows in matrix should be positive.");
+                adaboost::utils::check(_cols_gpu > 0,
+                "The number of cols in matrix should be positive.");
+                unsigned bytes = _rows_gpu*_cols_gpu*sizeof(data_type_matrix);
+                data_type_matrix* new_pointer;
+                adaboost::utils::cuda::cuda_malloc((void**)&new_pointer, bytes);
+                return new_pointer;
+            }
+
+            template <class data_type_matrix>
+            MatrixGPU<data_type_matrix>::
+            MatrixGPU():
+            adaboost::core::Matrix<data_type_matrix>(),
+            rows_gpu(0),
+            cols_gpu(0)
+            {
+            }
+
+            template <class data_type_matrix>
+            MatrixGPU<data_type_matrix>::
+            MatrixGPU(unsigned _rows, unsigned _cols):
+            adaboost::core::Matrix<data_type_matrix>(_rows, _cols),
+            data_gpu(_reserve_space_gpu(_rows, _cols)),
+            rows_gpu(_rows),
+            cols_gpu(_cols)
+            {
+            }
+
+            template <class data_type_matrix>
+            __global__
+            void fill_matrix_kernel
+            (data_type_matrix* data,
+             unsigned cols,
+             data_type_matrix value)
+            {
+                unsigned row = blockDim.y*blockIdx.y + threadIdx.y;
+                unsigned col = blockDim.x*blockIdx.x + threadIdx.x;
+                data[row*cols + col] = value;
+            }
+
+            template <class data_type_matrix>
+            void MatrixGPU<data_type_matrix>::
+            fill(data_type_matrix value,
+                 unsigned block_size_x,
+                 unsigned block_size_y)
+            {
+                if(block_size_x == 0 || block_size_y == 0)
+                {
+                    this->adaboost::core::Matrix<data_type_matrix>::fill(value);
+                }
+                else
+                {
+                    dim3 gridDim((this->cols_gpu + block_size_x - 1)/block_size_x,
+                                  (this->rows_gpu + block_size_y - 1)/block_size_y);
+                    dim3 blockDim(block_size_x, block_size_y);
+                    fill_matrix_kernel<data_type_matrix>
+                    <<<gridDim, blockDim>>>
+                    (this->data_gpu,
+                     this->cols_gpu,
+                     value);
+                }
+            }
+
+            template <class data_type_matrix>
+            void MatrixGPU<data_type_matrix>::
+            copy_to_host()
+            {
+                adaboost::utils::cuda::cuda_memcpy(
+                this->get_data_pointer(false),
+                this->data_gpu,
+                this->rows_gpu*this->cols_gpu*sizeof(data_type_matrix),
+                adaboost::utils::cuda::DeviceToHost);
+            }
+
+            template <class data_type_matrix>
+            void MatrixGPU<data_type_matrix>::
+            copy_to_device()
+            {
+                adaboost::utils::cuda::cuda_memcpy(
+                this->data_gpu,
+                this->get_data_pointer(false),
+                this->rows_gpu*this->cols_gpu*sizeof(data_type_matrix),
+                adaboost::utils::cuda::HostToDevice);
+            }
+
+            template <class data_type_matrix>
+            unsigned MatrixGPU<data_type_matrix>::
+            get_rows(bool gpu) const
+            {
+                if(gpu)
+                {
+                    return this->rows_gpu;
+                }
+                else
+                {
+                    return this->adaboost::core::
+                           Matrix<data_type_matrix>::get_rows();
+                }
+            }
+
+            template <class data_type_matrix>
+            unsigned MatrixGPU<data_type_matrix>::
+            get_cols(bool gpu) const
+            {
+                if(gpu)
+                {
+                    return this->cols_gpu;
+                }
+                else
+                {
+                    return this->adaboost::core::
+                           Matrix<data_type_matrix>::get_cols();
+                }
+            }
+
+            template <class data_type_matrix>
+            data_type_matrix* MatrixGPU<data_type_matrix>::
+            get_data_pointer(bool gpu) const
+            {
+                if(gpu)
+                {
+                    return this->data_gpu;
+                }
+                else
+                {
+                    return this->adaboost::core::Matrix<data_type_matrix>::get_data_pointer();
+                }
+            }
+
+            template <class data_type_matrix>
+            MatrixGPU<data_type_matrix>::
+            ~MatrixGPU()
+            {
+                adaboost::utils::cuda::cuda_free(this->data_gpu);
+            }
+
+            template <class data_type_matrix>
+            __global__
+            void multiply_kernel(
+            data_type_matrix* mat1,
+            data_type_matrix* mat2,
+            data_type_matrix* result,
+            unsigned mat1_rows,
+            unsigned mat1_cols,
+            unsigned mat2_rows,
+            unsigned mat2_cols)
+            {
+                data_type_matrix cvalue = 0.0;
+                unsigned row = blockIdx.y*blockDim.y + threadIdx.y;
+                unsigned col = blockIdx.x*blockDim.x + threadIdx.x;
+                if(row > mat1_rows || col > mat2_cols)
+                    return ;
+                for(unsigned e = 0; e < mat1_cols; e++)
+                    cvalue += mat1[row*mat1_cols+e] * mat2[e*mat2_cols+col];
+                result[row*mat2_cols+col] = cvalue;
+            }
+
+            template <class data_type_matrix>
+            void multiply_gpu(const MatrixGPU<data_type_matrix>& mat1,
+                              const MatrixGPU<data_type_matrix>& mat2,
+                              MatrixGPU<data_type_matrix>& result)
+            {
+                adaboost::utils::check(mat1.get_cols() == mat2.get_rows(),
+                                       "Order of matrices don't match.");
+                dim3 gridDim((mat2.get_cols() + BLOCK_SIZE - 1)/BLOCK_SIZE,
+                             (mat1.get_rows() + BLOCK_SIZE - 1)/BLOCK_SIZE);
+                dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
+                multiply_kernel
+                <<<gridDim, blockDim>>>
+                (mat1.get_data_pointer(),
+                 mat2.get_data_pointer(),
+                 result.get_data_pointer(),
+                 mat1.get_rows(),
+                 mat1.get_cols(),
+                 mat2.get_rows(),
+                 mat2.get_cols());
+            }
+
+            #include "instantiated_templates_cuda_data_structures.hpp"
+
+        } // namespace core
+    } // namespace cuda
+} // namespace adaboost
+
+#endif
diff --git a/adaboost/cuda/instantiated_templates_cuda_data_structures.hpp b/adaboost/cuda/instantiated_templates_cuda_data_structures.hpp
new file mode 100644
index 0000000..0819923
--- /dev/null
+++ b/adaboost/cuda/instantiated_templates_cuda_data_structures.hpp
@@ -0,0 +1,96 @@
+template class VectorGPU<bool>;
+template class VectorGPU<short>;
+template class VectorGPU<unsigned short>;
+template class VectorGPU<int>;
+template class VectorGPU<unsigned int>;
+template class VectorGPU<long>;
+template class VectorGPU<unsigned long>;
+template class VectorGPU<long long>;
+template class VectorGPU<unsigned long long>;
+template class VectorGPU<float>;
+template class VectorGPU<double>;
+template class VectorGPU<long double>;
+template void product_gpu<bool>(
+const VectorGPU<bool>&, const VectorGPU<bool>&, bool& result, unsigned);
+template void product_gpu<short>(
+const VectorGPU<short>&, const VectorGPU<short>&, short& result, unsigned);
+template void product_gpu<unsigned short>(
+const VectorGPU<unsigned short>&, const VectorGPU<unsigned short>&, unsigned short& result, unsigned);
+template void product_gpu<int>(
+const VectorGPU<int>&, const VectorGPU<int>&, int& result, unsigned);
+template void product_gpu<unsigned int>(
+const VectorGPU<unsigned int>&, const VectorGPU<unsigned int>&, unsigned int& result, unsigned);
+template void product_gpu<long>(
+const VectorGPU<long>&, const VectorGPU<long>&, long& result, unsigned);
+template void product_gpu<unsigned long>(
+const VectorGPU<unsigned long>&, const VectorGPU<unsigned long>&, unsigned long& result, unsigned);
+template void product_gpu<long long>(
+const VectorGPU<long long>&, const VectorGPU<long long>&, long long& result, unsigned);
+template void product_gpu<unsigned long long>(
+const VectorGPU<unsigned long long>&, const VectorGPU<unsigned long long>&, unsigned long long& result, unsigned);
+template void product_gpu<float>(
+const VectorGPU<float>&, const VectorGPU<float>&, float& result, unsigned);
+template void product_gpu<double>(
+const VectorGPU<double>&, const VectorGPU<double>&, double& result, unsigned);
+template void product_gpu<long double>(
+const VectorGPU<long double>&, const VectorGPU<long double>&, long double& result, unsigned);
+template class MatrixGPU<bool>;
+template class MatrixGPU<short>;
+template class MatrixGPU<unsigned short>;
+template class MatrixGPU<int>;
+template class MatrixGPU<unsigned int>;
+template class MatrixGPU<long>;
+template class MatrixGPU<unsigned long>;
+template class MatrixGPU<long long>;
+template class MatrixGPU<unsigned long long>;
+template class MatrixGPU<float>;
+template class MatrixGPU<double>;
+template class MatrixGPU<long double>;
+template void multiply_gpu
+(const MatrixGPU<bool>& mat1,
+const MatrixGPU<bool>& mat2,
+MatrixGPU<bool>& result);
+template void multiply_gpu
+(const MatrixGPU<short>& mat1,
+const MatrixGPU<short>& mat2,
+MatrixGPU<short>& result);
+template void multiply_gpu
+(const MatrixGPU<unsigned short>& mat1,
+const MatrixGPU<unsigned short>& mat2,
+MatrixGPU<unsigned short>& result);
+template void multiply_gpu
+(const MatrixGPU<int>& mat1,
+const MatrixGPU<int>& mat2,
+MatrixGPU<int>& result);
+template void multiply_gpu
+(const MatrixGPU<unsigned int>& mat1,
+const MatrixGPU<unsigned int>& mat2,
+MatrixGPU<unsigned int>& result);
+template void multiply_gpu
+(const MatrixGPU<long>& mat1,
+const MatrixGPU<long>& mat2,
+MatrixGPU<long>& result);
+template void multiply_gpu
+(const MatrixGPU<unsigned long>& mat1,
+const MatrixGPU<unsigned long>& mat2,
+MatrixGPU<unsigned long>& result);
+template void multiply_gpu
+(const MatrixGPU<long long>& mat1,
+const MatrixGPU<long long>& mat2,
+MatrixGPU<long long>& result);
+template void multiply_gpu
+(const MatrixGPU<unsigned long long>& mat1,
+const MatrixGPU<unsigned long long>& mat2,
+MatrixGPU<unsigned long long>& result);
+template void multiply_gpu
+(const MatrixGPU<float>& mat1,
+const MatrixGPU<float>& mat2,
+MatrixGPU<float>& result);
+template void multiply_gpu
+(const MatrixGPU<double>& mat1,
+const MatrixGPU<double>& mat2,
+MatrixGPU<double>& result);
+template void multiply_gpu
+(const MatrixGPU<long double>& mat1,
+const MatrixGPU<long double>& mat2,
+MatrixGPU<long double>& result);
diff --git a/adaboost/tests/test_cuda.cpp b/adaboost/tests/test_cuda.cpp
new file mode 100644
index 0000000..9e23610
--- /dev/null
+++ b/adaboost/tests/test_cuda.cpp
@@ -0,0 +1,7 @@
+#include<adaboost/tests/test_cuda_data_structures.hpp>
+
+int main(int ac, char* av[])
+{
+  testing::InitGoogleTest(&ac, av);
+  return RUN_ALL_TESTS();
+}
diff --git a/adaboost/tests/test_cuda_data_structures.hpp b/adaboost/tests/test_cuda_data_structures.hpp
new file mode 100644
index 0000000..0b3903c
--- /dev/null
+++ b/adaboost/tests/test_cuda_data_structures.hpp
@@ -0,0 +1,77 @@
+#include<gtest/gtest.h>
+#include<string>
+#include<adaboost/cuda/cuda_data_structures.hpp>
+#include<adaboost/utils/cuda_wrappers.hpp>
+#include<stdexcept>
+
+TEST(Cuda, VectorGPU)
+{
+    adaboost::utils::cuda::cuda_event_t has_happened;
+    adaboost::utils::cuda::cuda_event_create(&has_happened);
+    adaboost::cuda::core::VectorGPU<float> vec1(1000);
+    adaboost::cuda::core::VectorGPU<float> vec2(1000);
+    unsigned block_size = 32;
+    vec1.fill(1.0, block_size);
+    vec2.fill(1.0, block_size);
+    adaboost::utils::cuda::cuda_event_record(has_happened);
+    adaboost::utils::cuda::cuda_event_synchronize(has_happened);
+    float result_gpu;
+    product_gpu(vec1, vec2, result_gpu, block_size);
+    adaboost::utils::cuda::cuda_event_record(has_happened);
+    adaboost::utils::cuda::cuda_event_synchronize(has_happened);
+    EXPECT_EQ(result_gpu, 1000.0)<<"Result from product on GPU should be 1000.0";
+    vec1.copy_to_host();
+    vec2.copy_to_host();
+    adaboost::utils::cuda::cuda_event_record(has_happened);
+    adaboost::utils::cuda::cuda_event_synchronize(has_happened);
+    for(unsigned i = 0; i < 1000; i++)
+    {
+        std::string msg1 = "All entries of VectorGPU should be 1";
+        EXPECT_EQ(1, vec1.at(i))<<msg1;
+        EXPECT_EQ(1, vec2.at(i))<<msg1;
+    }
+    float result;
+    product_gpu(vec1, vec2, result);
+    EXPECT_EQ(result, 1000.0)<<"Result from product on CPU should be 1000.0";
+}
+
+TEST(Cuda, MatrixGPU)
+{
+    adaboost::utils::cuda::cuda_event_t has_happened;
+    adaboost::utils::cuda::cuda_event_create(&has_happened);
+    adaboost::cuda::core::MatrixGPU<float> mat_f;
+    EXPECT_EQ(0, mat_f.get_cols())<<"Number of columns should be 0";
+    EXPECT_EQ(0, mat_f.get_rows())<<"Number of rows should be 0.";
+    adaboost::cuda::core::MatrixGPU<float> mat1(3, 3), mat2(3, 3), mat3(2, 1);
+    mat1.fill(4.0);
+    mat2.fill(5.0);
+    mat1.copy_to_device();
+    mat2.copy_to_device();
+    adaboost::utils::cuda::cuda_event_record(has_happened);
+    adaboost::utils::cuda::cuda_event_synchronize(has_happened);
+    adaboost::cuda::core::MatrixGPU<float> result1(3, 3);
+    adaboost::cuda::core::multiply_gpu(mat1, mat2, result1);
+    adaboost::utils::cuda::cuda_event_record(has_happened);
+    adaboost::utils::cuda::cuda_event_synchronize(has_happened);
+    result1.copy_to_host();
+    for(unsigned int i = 0; i < 3; i++)
+    {
+        for(unsigned int j = 0; j < 3; j++)
+        {
+            EXPECT_EQ(60.0, result1.at(i, j));
+        }
+    }
+    mat3.set(0, 0, 6.0);
+    mat3.set(1, 0, 6.0);
+    EXPECT_THROW({
+        try
+        {
+            adaboost::cuda::core::multiply_gpu(mat1, mat3, result1);
+        }
+        catch(const std::logic_error& e)
+        {
+            EXPECT_STREQ("Order of matrices don't match.", e.what());
+            throw;
+        }
+    }, std::logic_error);
+}
diff --git a/adaboost/utils/cuda_wrappers.hpp b/adaboost/utils/cuda_wrappers.hpp
new file mode 100644
index 0000000..5e1227e
--- /dev/null
+++ b/adaboost/utils/cuda_wrappers.hpp
@@ -0,0 +1,33 @@
+#ifndef ADABOOST_UTILS_CUDA_WRAPPERS_HPP
+#define ADABOOST_UTILS_CUDA_WRAPPERS_HPP
+
+#include<cuda.h>
+#include<cuda_runtime.h>
+
+namespace adaboost
+{
+    namespace utils
+    {
+        namespace cuda
+        {
+            enum direction {HostToDevice, DeviceToHost};
+            typedef cudaEvent_t cuda_event_t;
+
+            void cuda_malloc(void** ptr, unsigned num_bytes);
+
+            void cuda_memcpy
+            (void* ptr_1, void* ptr_2, unsigned num_bytes, direction d);
+
+            void cuda_event_create(cuda_event_t* event_ptr);
+
+            void cuda_event_record(cuda_event_t event);
+
+            void cuda_event_synchronize(cuda_event_t event);
+
+            void cuda_free(void* ptr);
+
+        } // namspace cuda
+    } // namespace utils
+} // namespace adaboost
+
+#endif
diff --git a/adaboost/utils/cuda_wrappers_impl.cu b/adaboost/utils/cuda_wrappers_impl.cu
new file mode 100644
index 0000000..f4a855f
--- /dev/null
+++ b/adaboost/utils/cuda_wrappers_impl.cu
@@ -0,0 +1,6 @@
+#ifndef ADABOOST_UTILS_CUDA_WRAPPERS_IMPL_CU
+#define ADABOOST_UTILS_CUDA_WRAPPERS_IMPL_CU
+
+#include "cuda_wrappers_impl.hpp"
+
+#endif
diff --git a/adaboost/utils/cuda_wrappers_impl.hpp b/adaboost/utils/cuda_wrappers_impl.hpp
new file mode 100644
index 0000000..5787fbe
--- /dev/null
+++ b/adaboost/utils/cuda_wrappers_impl.hpp
@@ -0,0 +1,54 @@
+#ifndef ADABOOST_UTILS_CUDA_WRAPPERS_IMPL_HPP
+#define ADABOOST_UTILS_CUDA_WRAPPERS_IMPL_HPP
+
+#include<adaboost/utils/cuda_wrappers.hpp>
+
+namespace adaboost
+{
+    namespace utils
+    {
+        namespace cuda
+        {
+            void cuda_malloc(void** ptr, unsigned num_bytes)
+            {
+                cudaMalloc(ptr, num_bytes);
+            }
+
+            void cuda_memcpy
+            (void* ptr_1, void* ptr_2, unsigned num_bytes, direction d)
+            {
+                if(d == HostToDevice)
+                {
+                    cudaMemcpy(ptr_1, ptr_2, num_bytes, cudaMemcpyHostToDevice);
+                }
+                else if(d == DeviceToHost)
+                {
+                    cudaMemcpy(ptr_1, ptr_2, num_bytes, cudaMemcpyDeviceToHost);
+                }
+            }
+
+            void cuda_event_create(cuda_event_t* event_ptr)
+            {
+                cudaEventCreate(event_ptr);
+            }
+
+            void cuda_event_record(cuda_event_t event)
+            {
+                cudaEventRecord(event);
+            }
+
+            void cuda_event_synchronize(cuda_event_t event)
+            {
+                cudaEventSynchronize(event);
+            }
+
+            void cuda_free(void* ptr)
+            {
+                cudaFree(ptr);
+            }
+
+        } // namespace cuda
+    } // namespace utils
+} // namespace adaboost
+
+#endif