CPU Kernel Tests #1439

Open · wants to merge 20 commits into base: repo-refactor
1 change: 1 addition & 0 deletions lib/kernels/CMakeLists.txt
@@ -30,6 +30,7 @@ target_link_libraries(
cudnn
nccl
utils
pcg
)

define_ff_vars(${project_target})
127 changes: 117 additions & 10 deletions lib/kernels/include/kernels/accessor.h
@@ -5,11 +5,14 @@
#include "device.h"
#include "kernels/ff_handle.h"
#include "op-attrs/datatype.h"
#include "pcg/device_type.dtg.h"
#include "utils/exception.h"
#include "utils/required.h"

namespace FlexFlow {

struct Allocator;

class GenericTensorAccessorW {
public:
template <DataType DT>
@@ -28,15 +31,68 @@
double *get_double_ptr() const;
half *get_half_ptr() const;

GenericTensorAccessorW() = delete;

GenericTensorAccessorW(DataType data_type,
ArrayShape const &shape,
void *ptr,
DeviceType device_type);

bool operator==(GenericTensorAccessorW const &) const;
bool operator!=(GenericTensorAccessorW const &) const;

template <DataType DT, typename... Indices>
real_type_t<DT> &at(Indices... indices) {
if (this->device_type != DeviceType::CPU) {
throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
}
if (this->data_type != DT) {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));

Check warning on line 51 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L50-L51

Added lines #L50 - L51 were not covered by tests
}

using T = real_type_t<DT>;

T *data_ptr = static_cast<T *>(this->ptr);
size_t offset = calculate_index_offset({static_cast<size_t>(indices)...});

return data_ptr[offset];
}

template <DataType DT, typename... Indices>
real_type_t<DT> const &at(Indices... indices) const {
if (this->device_type != DeviceType::CPU) {
throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
}
if (this->data_type != DT) {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));
}

using T = real_type_t<DT>;

T const *data_ptr = static_cast<T const *>(this->ptr);
size_t offset = calculate_index_offset({static_cast<size_t>(indices)...});

return data_ptr[offset];
}

public:
DataType data_type;
ArrayShape shape;
req<void *> ptr;
void *ptr;
DeviceType device_type;

private:
std::tuple<decltype(data_type) const &,
decltype(shape) const &,
decltype(ptr) const &,
decltype(device_type) const &>
tie() const;

size_t calculate_index_offset(
std::initializer_list<size_t> const &indices) const;
};
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
data_type,
shape,
ptr);

std::string format_as(GenericTensorAccessorW const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
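
Note: a minimal usage sketch of the new at<DT>() accessor added above (hypothetical example; the DataType::FLOAT element type, the 2-D indexing, and the fill_first_row helper are illustrative assumptions, not part of this diff):

#include "kernels/accessor.h"

using namespace FlexFlow;

// Fill the first row of a CPU-resident float accessor. at<DT>() throws if
// the accessor is not CPU-allocated or if DT does not match data_type.
void fill_first_row(GenericTensorAccessorW &accessor, size_t num_cols) {
  for (size_t col = 0; col < num_cols; col++) {
    accessor.at<DataType::FLOAT>(0, col) = static_cast<float>(col);
  }
}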
@@ -59,15 +115,50 @@
double const *get_double_ptr() const;
half const *get_half_ptr() const;

GenericTensorAccessorR() = delete;

GenericTensorAccessorR(DataType data_type,
ArrayShape const &shape,
void const *ptr,
DeviceType device_type);

bool operator==(GenericTensorAccessorR const &) const;
bool operator!=(GenericTensorAccessorR const &) const;

template <DataType DT, typename... Indices>
real_type_t<DT> const &at(Indices... indices) const {
if (this->device_type != DeviceType::CPU) {
throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
}
if (this->data_type != DT) {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));

Check warning on line 135 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L134-L135

Added lines #L134 - L135 were not covered by tests
}

using T = real_type_t<DT>;

T const *data_ptr = static_cast<T const *>(this->ptr);
size_t offset = calculate_index_offset({static_cast<size_t>(indices)...});

return data_ptr[offset];
}

public:
DataType data_type;
ArrayShape shape;
req<void const *> ptr;
void const *ptr;
DeviceType device_type;

private:
std::tuple<decltype(data_type) const &,
decltype(shape) const &,
decltype(ptr) const &,
decltype(device_type) const &>
tie() const;

size_t calculate_index_offset(
std::initializer_list<size_t> const &indices) const;
};
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR,
data_type,
shape,
ptr);

std::string format_as(GenericTensorAccessorR const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
@@ -166,6 +257,22 @@
std::pair<ArrayShape, DataType>
get_shape_and_datatype(GenericTensorAccessorW const &accessor);

void transfer_data_between_accessors(
GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorR const &src_accessor);

void transfer_data_between_accessors(
GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorW const &src_accessor);

GenericTensorAccessorR
copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor,
Allocator &allocator);

GenericTensorAccessorW
copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor,
Allocator &allocator);

} // namespace FlexFlow

namespace FlexFlow {
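Note: a sketch of how the new copy helper composes with at<>() (hypothetical; assumes cpu_allocator is any Allocator whose get_allocation_device_type() returns DeviceType::CPU, and a float element type):

// Pull a device-resident tensor into host memory so its elements can be
// inspected with at<>(), which only works on CPU-allocated accessors.
float read_first_element(GenericTensorAccessorR const &device_accessor,
                         Allocator &cpu_allocator) {
  GenericTensorAccessorR host_accessor =
      copy_tensor_accessor_r(device_accessor, cpu_allocator);
  return host_accessor.at<DataType::FLOAT>(0, 0);
}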
7 changes: 6 additions & 1 deletion lib/kernels/include/kernels/allocation.h
@@ -1,7 +1,7 @@
#ifndef _FLEXFLOW_KERNELS_ALLOCATION_H
#define _FLEXFLOW_KERNELS_ALLOCATION_H

#include "accessor.h"
#include "kernels/accessor.h"
#include <cstddef>
#include <memory>

@@ -11,16 +11,21 @@ struct IAllocator {
virtual void *allocate(size_t) = 0;
virtual void deallocate(void *) = 0;

virtual DeviceType get_allocation_device_type() const = 0;

virtual ~IAllocator() = default;
};

struct Allocator {
Allocator() = delete;

GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape);

void *allocate(size_t mem_size);
void deallocate(void *ptr);

DeviceType get_allocation_device_type() const;

template <typename T, typename... Args>
static typename std::enable_if<std::is_base_of<IAllocator, T>::value,
Allocator>::type
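Note: a minimal sketch of the extended Allocator surface (hypothetical; assumes a 1-D float TensorShape, whose construction is outside this diff):

#include "kernels/allocation.h"
#include <cassert>

using namespace FlexFlow;

// Query where allocations will live before relying on CPU-only APIs
// such as at<>().
GenericTensorAccessorW make_host_tensor(Allocator &allocator,
                                        TensorShape const &shape) {
  assert(allocator.get_allocation_device_type() == DeviceType::CPU);
  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
  accessor.at<DataType::FLOAT>(0) = 1.0f; // safe: CPU-resident
  return accessor;
}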
6 changes: 2 additions & 4 deletions lib/kernels/include/kernels/attention_kernels.h
@@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState,
std::string format_as(MHAPerDeviceState const &x);
std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x);

namespace Kernels {
namespace MultiHeadAttention {
namespace Kernels::MultiHeadAttention {

MHAPerDeviceState init_kernel(PerDeviceFFHandle const &,
Allocator &,
@@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream,
void cleanup_kernel(Allocator &allocator,
MHAPerDeviceState const &device_state);

} // namespace MultiHeadAttention
} // namespace Kernels
} // namespace Kernels::MultiHeadAttention
} // namespace FlexFlow

#endif
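
Note: this PR consistently collapses nested namespace blocks into C++17 nested namespace definitions, as here and in the headers below; the two spellings declare the same entities:

// Before: three nested blocks.
namespace FlexFlow { namespace Kernels { namespace MultiHeadAttention {
void f();
}}} // namespace FlexFlow::Kernels::MultiHeadAttention

// After: C++17 nested namespace definition, identical meaning.
namespace FlexFlow::Kernels::MultiHeadAttention {
void g();
}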
8 changes: 2 additions & 6 deletions lib/kernels/include/kernels/batch_matmul_kernels.h
@@ -5,9 +5,7 @@
#include "kernels/allocation.h"
#include "kernels/ff_handle.h"

namespace FlexFlow {
namespace Kernels {
namespace BatchMatmul {
namespace FlexFlow::Kernels::BatchMatmul {

void forward_kernel(ffStream_t stream,
PerDeviceFFHandle const &handle,
@@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream,
int k,
int batch);

} // namespace BatchMatmul
} // namespace Kernels
} // namespace FlexFlow
} // namespace FlexFlow::Kernels::BatchMatmul

#endif
6 changes: 2 additions & 4 deletions lib/kernels/include/kernels/batch_norm_kernels.h
@@ -43,8 +43,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState,
output_w,
relu);

namespace Kernels {
namespace BatchNorm {
namespace Kernels::BatchNorm {

BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle,
Allocator allocator,
@@ -81,8 +80,7 @@ void cleanup_kernel(Allocator allocator,
bool relu,
float *runningMean);

} // namespace BatchNorm
} // namespace Kernels
} // namespace Kernels::BatchNorm
} // namespace FlexFlow

#endif
10 changes: 2 additions & 8 deletions lib/kernels/include/kernels/cast_kernels.h
@@ -3,12 +3,8 @@

#include "device.h"
#include "kernels/accessor.h"
#include "kernels/ff_handle.h"
#include "op-attrs/activation.dtg.h"

namespace FlexFlow {
namespace Kernels {
namespace Cast {
namespace FlexFlow::Kernels::Cast {

void forward_kernel(ffStream_t stream,
GenericTensorAccessorR const &input,
@@ -22,8 +18,6 @@ void backward_kernel(ffStream_t stream,
DataType input_type,
DataType output_type);

} // namespace Cast
} // namespace Kernels
} // namespace FlexFlow
} // namespace FlexFlow::Kernels::Cast

#endif
21 changes: 21 additions & 0 deletions lib/kernels/include/kernels/cast_kernels_cpu.h
@@ -0,0 +1,21 @@
#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H
#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H

#include "device.h"
#include "kernels/accessor.h"

namespace FlexFlow::Kernels::Cast {

void cpu_forward_kernel(GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
DataType input_type,
DataType output_type);

void cpu_backward_kernel(GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
DataType input_type,
DataType output_type);

} // namespace FlexFlow::Kernels::Cast

#endif
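
Note: a hypothetical usage sketch of the new CPU cast kernel (the float-to-int32 pairing and the helper name are illustrative assumptions):

#include "kernels/cast_kernels_cpu.h"

// Cast a CPU-resident float tensor to int32, element-wise.
void cast_on_host(FlexFlow::GenericTensorAccessorR const &input,
                  FlexFlow::GenericTensorAccessorW const &output) {
  FlexFlow::Kernels::Cast::cpu_forward_kernel(
      input, output, FlexFlow::DataType::FLOAT, FlexFlow::DataType::INT32);
}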
8 changes: 2 additions & 6 deletions lib/kernels/include/kernels/combine_kernels.h
@@ -4,9 +4,7 @@
#include "device.h"
#include "kernels/accessor.h"

namespace FlexFlow {
namespace Kernels {
namespace Combine {
namespace FlexFlow::Kernels::Combine {

void forward_kernel(ffStream_t stream,
GenericTensorAccessorR const &input,
@@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorW const &input_grad);

} // namespace Combine
} // namespace Kernels
} // namespace FlexFlow
} // namespace FlexFlow::Kernels::Combine

#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H
17 changes: 17 additions & 0 deletions lib/kernels/include/kernels/combine_kernels_cpu.h
@@ -0,0 +1,17 @@
#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H
#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H

#include "device.h"
#include "kernels/accessor.h"

namespace FlexFlow::Kernels::Combine {

void cpu_forward_kernel(GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);

void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
GenericTensorAccessorW const &input_grad);

} // namespace FlexFlow::Kernels::Combine

#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H
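
Note: a hypothetical sketch of the new Combine CPU kernels (Combine's forward pass is presumably an element-wise copy, which would explain why the call carries no extra parameters):

#include "kernels/combine_kernels_cpu.h"

// Forward: copy input into output on host; backward mirrors it for grads.
void combine_on_host(FlexFlow::GenericTensorAccessorR const &input,
                     FlexFlow::GenericTensorAccessorW const &output) {
  FlexFlow::Kernels::Combine::cpu_forward_kernel(input, output);
}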
8 changes: 2 additions & 6 deletions lib/kernels/include/kernels/concat_kernels.h
@@ -4,9 +4,7 @@
#include "device.h"
#include "kernels/accessor.h"

namespace FlexFlow {
namespace Kernels {
namespace Concat {
namespace FlexFlow::Kernels::Concat {

void forward_kernel(ffStream_t stream,
GenericTensorAccessorW const &output,
@@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream,
std::vector<GenericTensorAccessorW> const &input_grads,
ff_dim_t axis);

} // namespace Concat
} // namespace Kernels
} // namespace FlexFlow
} // namespace FlexFlow::Kernels::Concat

#endif
6 changes: 2 additions & 4 deletions lib/kernels/include/kernels/conv_2d_kernels.h
@@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState,
bwdFilterAlgo,
bwdDataAlgo);

namespace Kernels {
namespace Conv2D {
namespace Kernels::Conv2D {

Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
std::optional<Activation> activation,
@@ -70,8 +69,7 @@ void backward_kernel(ffStream_t stream,
float *bias_grad_ptr,
std::optional<Activation> activation);

} // namespace Conv2D
} // namespace Kernels
} // namespace Kernels::Conv2D
} // namespace FlexFlow

#endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H