diff --git a/README.md b/README.md
index f744fdce..157978cd 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ No installation is required. This will launch a Jupyter notebook with the quicks
 package.add(schedule, args=(A, B, C), base_name="matmul_relu_fusion_naive")
 
 # transform the schedule, add to the package
-f, i, j, k = schedule.get_indices()
+i, j, f, k = schedule.get_indices()
 ii, jj = schedule.tile({
     i: 16,
     j: 16
diff --git a/accera/acc-opt/test/vectorization.mlir b/accera/acc-opt/test/vectorization.mlir
new file mode 100644
index 00000000..27718b86
--- /dev/null
+++ b/accera/acc-opt/test/vectorization.mlir
@@ -0,0 +1,84 @@
+// RUN: acc-opt --verify-each=false --acc-vectorize %s | FileCheck %s
+
+module @test_accera_vectorization attributes {accv.target_device_features = "-avx512pf,-tsxldtrk,+cx16,+sahf,-tbm,-avx512ifma,-sha,+crc32,-fma4,-vpclmulqdq,-prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-amx-tile,-uintr,-gfni,+popcnt,-widekl,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512er,-avxvnni,-avx512fp16,-avx512vnni,-amx-bf16,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-xsavec,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,-rdseed,-waitpkg,-kl,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,-serialize,-hreset,+invpcid,-avx512cd,+avx,-vaes,-avx512bf16,+cx8,+fma,-rtm,+bmi,-enqcmd,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,+fxsr,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,-amx-int8,+movbe,-avx512vp2intersect,+xsaveopt,-avx512dq,+sse2,-adx,+sse"} {
+  accv.module "test_accera_vectorization" {
+
+    // Single-op cases:
+    // TODO : implement test cases for these
+    // mlir::memref::AllocaOp
+    // mlir::arith::ConstantOp
+    // mlir::memref::LoadOp sequential
+    // mlir::memref::LoadOp non-sequential
+    // mlir::memref::StoreOp sequential
+    // mlir::memref::StoreOp non-sequential
+    // mlir::affine::AffineLoadOp sequential
+    // mlir::affine::AffineLoadOp non-sequential
+    // mlir::affine::AffineStoreOp sequential
+    // mlir::affine::AffineStoreOp non-sequential
+    // mlir::SelectOp
+    // mlir::arith::ShLIOp
+    // mlir::arith::FPToSIOp
+    // mlir::arith::ExtSIOp
+    // mlir::math::AbsOp
+    // mlir::math::ExpOp
+    // value::CastOp
+    // value::RoundOp
+    // value::BitCastOp
+    // value::BinOp
+    // value::CmpOp
+    // value::ReferenceGlobalOp
+
+    // Special cases:
+    // TODO : implement test cases for these
+    // horizontal reduction
+    // multi-loop sequential cast
+    // two-row interleaved pack
+    // vpmaddwd avx2
+    // vpmaddwd avx512
+    // masked load
+    // two-row interleaved masked load and pack
+
+
+    // CHECK-LABEL: builtin.func nested @test_view_split_dim_interleaved_pack
+    builtin.func nested @test_view_split_dim_interleaved_pack(%arg0: memref<1885x256xui8> loc(unknown), %arg1: memref<483840xui8> loc(unknown)) attributes {accv.dyn_arg_size_refs = [[-1, -1], [-1]], accv.usages = [1 : i8, 0 : i8], args_name = ["", ""], args_size = ["1885*256", "483840"], args_symbol = ["args_symbol_name_0", "args_symbol_name_1"], exec_target = 0 : i64} {
+      %c1024 = arith.constant 1024 : index
+      %c1 = arith.constant 1 : index
+      %c482816 = arith.constant 482816 : index
+      %c98304 = arith.constant 98304 : index
+      %c2 = arith.constant 2 : index
+      %c16 = arith.constant 16 : index
+      %c192 = arith.constant 192 : index
+      affine.for %arg2 = 0 to 1536 step 384 {
+        %0 = "accv.view"(%arg1, %c482816, %c1024, %c1) {operand_segment_sizes = dense<1> : vector<4xi32>} : (memref<483840xui8>, index, index, index) -> memref<482816xui8, affine_map<(d0) -> (d0 + 1024)>>
+        %1 = affine.apply affine_map<(d0) -> 
(d0 * 256)>(%arg2) + %2 = "accv.view"(%0, %c98304, %1, %c1) {operand_segment_sizes = dense<1> : vector<4xi32>} : (memref<482816xui8, affine_map<(d0) -> (d0 + 1024)>>, index, index, index) -> memref<98304xui8, affine_map<(d0)[s0] -> (d0 + s0 + 1024)>> + %3 = "accv.split_dim"(%2, %c2) {dim = 0 : i64} : (memref<98304xui8, affine_map<(d0)[s0] -> (d0 + s0 + 1024)>>, index) -> memref<49152x2xui8, affine_map<(d0, d1)[s0] -> (d0 * 2 + d1 + s0 + 1024)>> + %4 = "accv.split_dim"(%3, %c16) {dim = 0 : i64} : (memref<49152x2xui8, affine_map<(d0, d1)[s0] -> (d0 * 2 + d1 + s0 + 1024)>>, index) -> memref<3072x16x2xui8, affine_map<(d0, d1, d2)[s0] -> ((d0 * 16 + d1) * 2 + d2 + s0 + 1024)>> + %5 = "accv.split_dim"(%4, %c192) {dim = 0 : i64} : (memref<3072x16x2xui8, affine_map<(d0, d1, d2)[s0] -> ((d0 * 16 + d1) * 2 + d2 + s0 + 1024)>>, index) -> memref<16x192x16x2xui8, affine_map<(d0, d1, d2, d3)[s0] -> (((d0 * 192 + d1) * 16 + d2) * 2 + d3 + s0 + 1024)>> + // CHECK: affine.for %arg3 = 0 to 256 step 16 { + affine.for %arg3 = 0 to 256 step 16 { + // CHECK-NEXT: affine.for %arg4 = 0 to 384 step 2 { + affine.for %arg4 = 0 to 384 step 2 { + affine.for %arg5 = 0 to 16 { + affine.for %arg6 = 0 to 2 { + %8 = affine.load %arg0[%arg6 + %arg4 + symbol(%arg2), %arg5 + %arg3] : memref<1885x256xui8> + affine.store %8, %5[symbol(%arg3) floordiv 16, symbol(%arg4) floordiv 2, %arg5, %arg6] : memref<16x192x16x2xui8, affine_map<(d0, d1, d2, d3)[s0] -> (((d0 * 192 + d1) * 16 + d2) * 2 + d3 + s0 + 1024)>> + } {beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (2)>, index = #accln<"index{j_i,245}">, kernels = ["_cache_fill"], operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, subdomainIndexOrder = [#accln<"index{i,240}">, #accln<"index{j,241}">], subdomainSize = [16, 2]} + } {accxp_vectorizationInfo = #accxp<"vectorizationinfo{32,16,0}">, beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (16)>, index = #accln<"index{i_i,243}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, scheduledIndex = #accln<"index{i_i,243}">, subdomainIndexOrder = [#accln<"index{i,240}">, #accln<"index{j,241}">], subdomainSize = [16, 2]} + // CHECK-NEXT: %6 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [482560], strides: [1] : memref<1885x256xui8> to memref<482560xui8> + // CHECK-NEXT: %7 = affine.apply #map6(%arg4, %c0, %arg3)[%arg2] + // CHECK-NEXT: %8 = vector.load %6[%7] : memref<482560xui8>, vector<16xui8> + // CHECK-NEXT: %9 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [482560], strides: [1] : memref<1885x256xui8> to memref<482560xui8> + // CHECK-NEXT: %10 = affine.apply #map7(%arg4, %c0, %arg3)[%arg2] + // CHECK-NEXT: %11 = vector.load %9[%10] : memref<482560xui8>, vector<16xui8> + // CHECK-NEXT: %12 = vector.shuffle %8, %11 [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] : vector<16xui8>, vector<16xui8> + // CHECK-NEXT: %13 = memref.reinterpret_cast %5 to offset: [0], sizes: [98304], strides: [1] : memref<16x192x16x2xui8, #map5> to memref<98304xui8> + // CHECK-NEXT: %14 = affine.apply #map8(%c0, %c0, %arg2)[%arg3, %arg4] + // CHECK-NEXT: vector.store %12, %13[%14] : memref<98304xui8>, vector<32xui8> + } {beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (384)>, index = #accln<"index{i_i_o,257}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, subdomainIndexOrder = [#accln<"index{i,254}">], subdomainSize = [-1]} + } {beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (256)>, index = 
#accln<"index{i_i_o,262}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, subdomainIndexOrder = [#accln<"index{i,259}">], subdomainSize = [-1]} + } {beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (1536)>, index = #accln<"index{i_o,268}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, subdomainIndexOrder = [#accln<"index{i,266}">, #accln<"index{j,267}">], subdomainSize = [1885, 256]} + return + } + } +} diff --git a/accera/accc/accc.py b/accera/accc/accc.py index 77b87d23..309c63ce 100644 --- a/accera/accc/accc.py +++ b/accera/accc/accc.py @@ -105,6 +105,8 @@ def bstr(val): OPT_DISABLE_LOOP_UNROLLING_ARGS = ["--disable-loop-unrolling"] +LLVM_KEEP_DEBUG_INFO_ARGS = ["--frame-pointer=all"] + LLVM_TOOLING_OPTS = { SystemTarget.HOST.value: ["-O3", "-mcpu=native"], SystemTarget.RPI4.value: [ @@ -137,9 +139,16 @@ def bstr(val): DEFAULT_LLC_ARGS = DEFAULT_LLVM_TOOLING_OPTS + ["-relocation-model=pic"] class Options(Flag): - NONE = auto() # (enable auto unroll | low precision float) + NONE = auto() # (enable auto unroll | low precision float | no debug info) DISABLE_AUTO_UNROLL = auto() HIGH_PRECISION_FLOATING_POINT_OPS = auto() + KEEP_DEBUG_INFO = auto() + +def _get_common_debug_info_options_args(options: Options): + if options & Options.KEEP_DEBUG_INFO: + return LLVM_KEEP_DEBUG_INFO_ARGS + else: + return [] def _get_common_fp_options_args(options: Options): if options & Options.HIGH_PRECISION_FLOATING_POINT_OPS: @@ -154,6 +163,7 @@ def _get_options_opt_args(options: Options): args += OPT_DISABLE_LOOP_UNROLLING_ARGS args += _get_common_fp_options_args(options) + args += _get_common_debug_info_options_args(options) return args @@ -161,6 +171,7 @@ def _get_options_llc_args(options: Options): args = [] args += _get_common_fp_options_args(options) + args += _get_common_debug_info_options_args(options) return args diff --git a/accera/ir/include/IRUtil.h b/accera/ir/include/IRUtil.h index a98e1588..c4e37f7e 100644 --- a/accera/ir/include/IRUtil.h +++ b/accera/ir/include/IRUtil.h @@ -461,5 +461,7 @@ namespace util bool IsTerminalOp(mlir::Operation* op); + std::vector GetDynamicOffsetSymbols(mlir::Value val); + } // namespace util } // namespace accera::ir diff --git a/accera/ir/src/IRUtil.cpp b/accera/ir/src/IRUtil.cpp index 0e72d392..31568dfa 100644 --- a/accera/ir/src/IRUtil.cpp +++ b/accera/ir/src/IRUtil.cpp @@ -1304,35 +1304,55 @@ namespace util return shape; } - // Currently this utility only supports dynamic memrefs that are function arguments with dimension size handles which are - // also function arguments - if (!memref.isa()) + // Currently this utility only supports dynamic memrefs that are alloc ops with shape args or + // function arguments with dimension size handles which are also function arguments + if (auto allocOp = memref.getDefiningOp()) { - throw LogicException(LogicExceptionErrors::notImplemented, "Currently only supports function arguments for dynamic memref shape resolution"); + // Assumes the operands to AllocOp are ordered in the logical shape order where they're needed + unsigned currentOperandIndex = 0; + auto allocOperands = allocOp.operands(); + for (unsigned dimIdx = 0; dimIdx < memrefType.getRank(); ++dimIdx) + { + if (memrefType.isDynamicDim(dimIdx)) + { + shape.push_back(allocOperands[currentOperandIndex++]); + } + else + { + shape.push_back(memrefType.getDimSize(dimIdx)); + } + } } - auto memrefBlockArg = memref.cast(); - auto memrefFuncArgIdx = memrefBlockArg.getArgNumber(); - auto blockParentOp = 
+        else if (memref.isa<mlir::BlockArgument>())
+        {
+            auto memrefBlockArg = memref.cast<mlir::BlockArgument>();
+            auto memrefFuncArgIdx = memrefBlockArg.getArgNumber();
+            auto blockParentOp = memrefBlockArg.getOwner()->getParentOp();
 
-        auto allFuncArgs = memrefBlockArg.getOwner()->getArguments();
-        std::vector<mlir::Type> allFuncArgTypes;
-        allFuncArgTypes.reserve(allFuncArgs.size());
-        std::transform(allFuncArgs.begin(), allFuncArgs.end(), std::back_inserter(allFuncArgTypes), [](mlir::Value val) { return val.getType(); });
+            auto allFuncArgs = memrefBlockArg.getOwner()->getArguments();
+            std::vector<mlir::Type> allFuncArgTypes;
+            allFuncArgTypes.reserve(allFuncArgs.size());
+            std::transform(allFuncArgs.begin(), allFuncArgs.end(), std::back_inserter(allFuncArgTypes), [](mlir::Value val) { return val.getType(); });
 
-        std::vector<std::vector<int64_t>> dynamicArgSizeRefs = ParseDynamicArgSizeReferences(blockParentOp, allFuncArgTypes);
+            std::vector<std::vector<int64_t>> dynamicArgSizeRefs = ParseDynamicArgSizeReferences(blockParentOp, allFuncArgTypes);
 
-        for (unsigned dimIdx = 0; dimIdx < memrefType.getRank(); ++dimIdx)
-        {
-            if (memrefType.isDynamicDim(dimIdx))
-            {
-                auto shapeRefArgIdx = dynamicArgSizeRefs[memrefFuncArgIdx][dimIdx];
-                shape.push_back(allFuncArgs[shapeRefArgIdx]);
-            }
-            else
+            for (unsigned dimIdx = 0; dimIdx < memrefType.getRank(); ++dimIdx)
             {
-                shape.push_back(memrefType.getDimSize(dimIdx));
+                if (memrefType.isDynamicDim(dimIdx))
+                {
+                    auto shapeRefArgIdx = dynamicArgSizeRefs[memrefFuncArgIdx][dimIdx];
+                    shape.push_back(allFuncArgs[shapeRefArgIdx]);
+                }
+                else
+                {
+                    shape.push_back(memrefType.getDimSize(dimIdx));
+                }
             }
         }
+        else
+        {
+            throw LogicException(LogicExceptionErrors::notImplemented, "Currently only supports local allocations or function arguments for dynamic memref shape resolution");
+        }
 
         return shape;
     }
@@ -1495,5 +1515,30 @@ namespace util
         return op->getNumResults() == 0;
     }
 
+    std::vector<mlir::Value> GetDynamicOffsetSymbols(mlir::Value val)
+    {
+        std::vector<mlir::Value> offsetSymbols;
+        // If there are dynamic offsets, get the source handle for those and incorporate them into the offsetSymbols
+        if (auto memrefSrcOp = val.getDefiningOp())
+        {
+            // Currently only handles value::SplitDimOp and value::ViewOp
+            while (auto splitDimOp = mlir::dyn_cast_or_null<value::SplitDimOp>(memrefSrcOp))
+            {
+                memrefSrcOp = splitDimOp.getViewSource().getDefiningOp();
+            }
+            if (auto viewOp = mlir::dyn_cast_or_null<value::ViewOp>(memrefSrcOp))
+            {
+                for (auto offset : viewOp.offsets())
+                {
+                    // Offsets with no defining op are block arguments, i.e. dynamic offsets to track as symbols
+                    if (!offset.getDefiningOp())
+                    {
+                        offsetSymbols.push_back(offset);
+                    }
+                }
+            }
+        }
+        return offsetSymbols;
+    }
+
 } // namespace util
 } // namespace accera::ir
diff --git a/accera/ir/src/value/ValueCanonicalization.cpp b/accera/ir/src/value/ValueCanonicalization.cpp
index 69273afc..e799f0ee 100644
--- a/accera/ir/src/value/ValueCanonicalization.cpp
+++ b/accera/ir/src/value/ValueCanonicalization.cpp
@@ -245,6 +245,55 @@ mlir::Value constantBuildHelper(mlir::OpBuilder& b
 
 struct ValueBinOpSimplification : public mlir::OpRewritePattern<v::BinOp>
 {
+    mlir::Value handlePartiallyConstantBoolOp(mlir::PatternRewriter& rewriter, v::BinaryOpPredicate pred, mlir::Value lhs, mlir::Value rhs) const
+    {
+        auto lhsCast = lhs.getDefiningOp<mlir::arith::ConstantIntOp>();
+        auto rhsCast = rhs.getDefiningOp<mlir::arith::ConstantIntOp>();
+        if ((lhsCast == nullptr && rhsCast == nullptr) ||
+            (lhsCast != nullptr && rhsCast != nullptr))
+        {
+            return nullptr;
+        }
+        // Only one is non-null
+        auto constOperand = lhsCast == nullptr ? rhsCast : lhsCast;
+        auto otherOperand = lhsCast == nullptr ? lhs : rhs;
+        auto type = constOperand.getType();
+        if (!type.isa<mlir::IntegerType>() || type.cast<mlir::IntegerType>().getWidth() != 1)
+        {
+            return nullptr;
+        }
+
+        auto boolResult = constOperand.value() != 0;
+        auto loc = lhs.getLoc();
+        switch (pred)
+        {
+        case v::BinaryOpPredicate::LOGICAL_AND:
+            if (boolResult)
+            {
+                // (arg AND true) == (arg)
+                return otherOperand;
+            }
+            else
+            {
+                // (arg AND false) == (false)
+                return rewriter.create<mlir::arith::ConstantIntOp>(loc, static_cast<int64_t>(boolResult), 1 /* bitwidth = i1 for boolean values */);
+            }
+        case v::BinaryOpPredicate::LOGICAL_OR:
+            if (boolResult)
+            {
+                // (arg OR true) == (true)
+                return rewriter.create<mlir::arith::ConstantIntOp>(loc, static_cast<int64_t>(boolResult), 1 /* bitwidth = i1 for boolean values */);
+            }
+            else
+            {
+                // (arg OR false) == (arg)
+                return otherOperand;
+            }
+        default:
+            return nullptr;
+        }
+    }
+
     using OpRewritePattern::OpRewritePattern;
 
     mlir::arith::ConstantOp handleConstantBoolOp(mlir::PatternRewriter& rewriter, v::BinaryOpPredicate pred, mlir::arith::ConstantOp lhs, mlir::arith::ConstantOp rhs) const
@@ -314,6 +363,7 @@ struct ValueBinOpSimplification : public mlir::OpRewritePattern<v::BinOp>
         // TODO : if we lowered BinOps to MLIR earlier than other value dialect ops, the built-in arithmetic canonicalizations and lowerings would handle this
         auto lhs = op.lhs();
         auto rhs = op.rhs();
+        auto pred = op.getPredicate();
         auto resultElementType = accera::ir::util::GetElementType(op.result().getType());
         auto lhsElementType = accera::ir::util::GetElementType(lhs.getType());
         auto rhsElementType = accera::ir::util::GetElementType(rhs.getType());
@@ -327,28 +377,33 @@ struct ValueBinOpSimplification : public mlir::OpRewritePattern<v::BinOp>
         {
             if (auto rhsConstantOp = rhs.getDefiningOp<mlir::arith::ConstantOp>())
             {
-                if (mlir::Value intOp = handleConstantOp<mlir::IntegerType>(rewriter, op.getPredicate(), lhsConstantOp, rhsConstantOp))
+                if (mlir::Value intOp = handleConstantOp<mlir::IntegerType>(rewriter, pred, lhsConstantOp, rhsConstantOp))
                 {
                     rewriter.replaceOp(op, { intOp });
                     return mlir::success();
                 }
-                else if (mlir::Value indexOp = handleConstantOp<mlir::IndexType>(rewriter, op.getPredicate(), lhsConstantOp, rhsConstantOp))
+                else if (mlir::Value indexOp = handleConstantOp<mlir::IndexType>(rewriter, pred, lhsConstantOp, rhsConstantOp))
                 {
                     rewriter.replaceOp(op, { indexOp });
                     return mlir::success();
                 }
-                else if (mlir::Value floatOp = handleConstantOp<mlir::FloatType>(rewriter, op.getPredicate(), lhsConstantOp, rhsConstantOp))
+                else if (mlir::Value floatOp = handleConstantOp<mlir::FloatType>(rewriter, pred, lhsConstantOp, rhsConstantOp))
                 {
                     rewriter.replaceOp(op, { floatOp });
                     return mlir::success();
                 }
-                else if (mlir::Value boolOp = handleConstantBoolOp(rewriter, op.getPredicate(), lhsConstantOp, rhsConstantOp))
+                else if (mlir::Value boolOp = handleConstantBoolOp(rewriter, pred, lhsConstantOp, rhsConstantOp))
                 {
                     rewriter.replaceOp(op, { boolOp });
                     return mlir::success();
                 }
             }
         }
+        if (auto partiallyConstReplaceVal = handlePartiallyConstantBoolOp(rewriter, pred, lhs, rhs))
+        {
+            rewriter.replaceOp(op, { partiallyConstReplaceVal });
+            return mlir::success();
+        }
         return mlir::failure();
     }
diff --git a/accera/python/accera/lang/Nest.py b/accera/python/accera/lang/Nest.py
index fdb27654..851c7383 100644
--- a/accera/python/accera/lang/Nest.py
+++ b/accera/python/accera/lang/Nest.py
@@ -84,10 +84,7 @@ def get_indices(self) -> Union[List[LoopIndex], LoopIndex]:
         else:
             self._shape[0][1].name = names[0]
 
-        if len(self._shape) > 1:
-            return [idx for _, idx in self._shape]
-        else:
-            return self._shape[0][1]
+        return [idx for _, idx in self._shape]
 
     def iteration_logic(self, logic: Callable = None, predicate=None, placement=None):
         """Adds iteration logic to the nest
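(A note on the two index-ordering conventions this patch changes, with a minimal usage sketch; the nests and schedules below are hypothetical and assume the public accera namespace. Nest.get_indices() now always returns a list, so 1-D nests unpack with a trailing comma, and fused schedules now present common indices first, then the fusing index, then unfused indices, which is why the explicit reorder calls disappear from the tests that follow.)

    i, = Nest(shape=(64, )).get_indices()             # 1-D nest: trailing comma unpacks the single index
    i, j, k = Nest(shape=(16, 16, 16)).get_indices()  # N-D nests are unchanged

    schedule = fuse((schedule0, schedule1), partial=2)
    i, j, f, k = schedule.get_indices()               # common (i, j), fusing index (f), unfused (k)

diff --git 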
a/accera/python/accera/lang/Schedule.py b/accera/python/accera/lang/Schedule.py index 1ec129a4..5e131131 100644 --- a/accera/python/accera/lang/Schedule.py +++ b/accera/python/accera/lang/Schedule.py @@ -46,13 +46,7 @@ def __init__(self, nest: Nest): self._nest = nest self._delayed_calls = {} self._parameterized_index_map = {} - - # nest.get_indices gives us a single index if there's only one index self._indices = nest.get_indices() - try: - _ = iter(self._indices) - except TypeError: - self._indices: List[LoopIndex] = [self._indices] shape = nest.get_shape() if any([isinstance(s, DelayedParameter) for s in shape]): @@ -632,6 +626,8 @@ def __init__(self, schedules: List[Schedule], partial: int = None): [self._fusing_index] + self._common_indices + self._unfused_indices ) + self.reorder(self._common_indices + [self._fusing_index] + self._unfused_indices) + def print(self, per_index_fn: Callable[[LoopIndex], List[str]] = None): # TODO ... diff --git a/accera/python/accera/test/dsl_tests.py b/accera/python/accera/test/dsl_tests.py index 58cae933..b6019958 100644 --- a/accera/python/accera/test/dsl_tests.py +++ b/accera/python/accera/test/dsl_tests.py @@ -59,6 +59,10 @@ # TODO: Remove all @expectedFailure decorators as implementation converges with spec +def _get_test_mode(correctness_check: bool = False): + return Package.Mode.RELEASE if correctness_check else TEST_MODE + + class DSLTest_01Arrays(unittest.TestCase): def _verify_nest(self, nest, args: Tuple[Array], package_name, correctness_check_values=None) -> None: @@ -70,7 +74,7 @@ def _verify_nest(self, nest, args: Tuple[Array], package_name, correctness_check # build the HAT package with verifiers.VerifyPackage(self, package_name, output_dir) as v: - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=output_dir) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir) if correctness_check_values: v.check_correctness( function.name, @@ -178,7 +182,7 @@ def _(): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -263,7 +267,7 @@ def _(): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -312,7 +316,7 @@ def test_fn(A, B): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -329,7 +333,7 @@ def test_fn_wrong_role(A, B): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, fail_on_error=True, ) @@ -363,7 +367,7 @@ def _(): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -776,7 +780,7 @@ def _verify_helper(self, package, test_name, function_name=None, correctness_che output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name with verifiers.VerifyPackage(self, test_name, output_dir) as v: shutil.rmtree(output_dir, ignore_errors=True) - package.build(test_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=output_dir) + package.build(test_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir) if function_name and correctness_check_values: v.check_correctness( function_name, @@ -819,7 +823,7 @@ def _simple_runtimesize_loopnest_common(self, name, splits=[]) -> None: nest = Nest((M, )) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic 
def _(): @@ -1456,7 +1460,7 @@ def _(): package.build( "test_output_array_range_node2", format=TEST_FORMAT | Package.Format.MLIR_VERBOSE, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR ) @@ -1515,7 +1519,7 @@ def _(): package.build( "test_output_array_gather_node", format=TEST_FORMAT | Package.Format.MLIR_VERBOSE, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR ) @@ -1562,7 +1566,7 @@ def _(): package.build( "test_output_array_gather_node", format=TEST_FORMAT | Package.Format.MLIR_VERBOSE, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR ) @@ -1592,7 +1596,7 @@ def _build_nest(self, nest, args: Tuple[Array], package_name, correctness_check_ # build the HAT package output_dir = pathlib.Path(TEST_PACKAGE_DIR) / package_name with verifiers.VerifyPackage(self, package_name, output_dir) as v: - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=output_dir, _quiet=quiet) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir, _quiet=quiet) if correctness_check_values: v.check_correctness( function.name, @@ -2204,7 +2208,7 @@ def _(): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -2227,7 +2231,7 @@ def _(): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -2252,7 +2256,7 @@ def _verify_schedule(self, schedule, args: Tuple[Array], package_name, correctne # build the HAT package with verifiers.VerifyPackage(self, package_name, output_dir) as v: - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=output_dir) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir) if correctness_check_values: v.check_correctness( function.name, @@ -2514,7 +2518,7 @@ def test_schedule_pad_inner_index_no_bc_1(self) -> None: A = Array(role=Role.INPUT_OUTPUT, element_type=float, shape=(I, )) nest = Nest(shape=(I, )) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): @@ -2786,7 +2790,7 @@ def _(): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -2845,9 +2849,7 @@ def _(): # Create a fused schedule schedule = fuse(schedule0, schedule1) - f, i, j = schedule.get_indices() - - schedule.reorder(i, j, f) + i, j, f = schedule.get_indices() A_test = np.random.random(A.shape).astype(np.float32) B_test = np.random.random(B.shape).astype(np.float32) @@ -2910,8 +2912,7 @@ def _(): schedule1 = nest1.create_schedule() schedule = fuse((schedule0, schedule1), partial=2) - f, i, j, k = schedule.get_indices() - schedule.reorder(i, j, f, k) + i, j, f, k = schedule.get_indices() # unfused indices (k) must not precede the fusing index (f) with self.assertRaises(ValueError): @@ -2957,9 +2958,8 @@ def _(): s1 = n1.create_schedule() fs = fuse((s0, s1), partial=1) - f, i, j = fs.get_indices() + i, f, j = fs.get_indices() jj = fs.split(j, 2) - fs.reorder(i, f, j, jj) A_test_pre = np.random.random(A.shape).astype(np.float32) B_test_pre = np.random.random(B.shape).astype(np.float32) @@ -3008,8 +3008,6 @@ def _(): # be automatically end-padded with no-ops schedule = fuse(schedule0, schedule1) - f, i, j = schedule.get_indices() - schedule.reorder(i, j, f) # Emitted fused loop should look like: # for i in range(0, 16): @@ -3073,9 +3071,6 @@ def _(): 
# be automatically end-padded with no-ops schedule = fuse(schedule0, schedule1) - f, i, j = schedule.get_indices() - schedule.reorder(i, j, f) - # Emitted fused loop should look like: # for i in range(0, 16): # for j in range(0, 10): @@ -3138,7 +3133,7 @@ def _(): # Create a fused schedule: the smaller iteration space (nest1) should # be automatically end-padded with no-ops schedule = fuse(schedule0, schedule1) - f, i, j = schedule.get_indices() + i, j, f = schedule.get_indices() # computing the output block-by-block: # first computing C[0:4, 0:4] += A[0:4, 0:4] @@ -3418,7 +3413,7 @@ def test_multi_concat_fusing_1(self) -> None: # Create nest0 and schedule nest0 = Nest(A.shape) - i0 = nest0.get_indices() + i0, = nest0.get_indices() @nest0.iteration_logic def _(): @@ -3426,7 +3421,7 @@ def _(): # Create nest1 and schedule1 nest1 = Nest(B.shape) - i1 = nest1.get_indices() + i1, = nest1.get_indices() @nest1.iteration_logic def _(): @@ -3439,7 +3434,7 @@ def _(): fused1 = fuse([s0, s1], partial=0) nest2 = Nest(C.shape) - i2 = nest2.get_indices() + i2, = nest2.get_indices() @nest2.iteration_logic def _(): @@ -3450,7 +3445,7 @@ def _(): fused2 = fuse([fused1, s2], partial=0) nest3 = Nest(D.shape) - i3 = nest3.get_indices() + i3, = nest3.get_indices() @nest3.iteration_logic def _(): @@ -3607,8 +3602,6 @@ def _(): schedule1.reorder(i1, j1, ii1, jj1) schedule_01 = fuse((schedule0, schedule1), partial=2) - f, i, j, ii0, jj0, ii1, jj1 = schedule_01.get_indices() - schedule_01.reorder(i, j, f, ii0, jj0, ii1, jj1) # Create nest2 and schedule2 nest2 = Nest(shape=(M, N)) @@ -3641,12 +3634,8 @@ def _(): schedule3.reorder(i3, j3, ii3, jj3) schedule_23 = fuse((schedule2, schedule3), partial=2) - f_23, i_23, j_23, ii2, jj2, ii3, jj3 = schedule_23.get_indices() - schedule_23.reorder(i_23, j_23, f_23, ii2, jj2, ii3, jj3) schedule_0123 = fuse((schedule_01, schedule_23), partial=1) - f_0123, i_0123, j_01, f_01, ii0, jj0, ii1, jj1, j_23, f_23, ii2, jj2, ii3, jj3 = schedule_0123.get_indices() - schedule_0123.reorder(i_0123, f_0123, j_01, f_01, ii0, jj0, ii1, jj1, j_23, f_23, ii2, jj2, ii3, jj3) plan = schedule_0123.create_plan() @@ -3877,7 +3866,7 @@ def _pack_b(): ) matmul_kernel_nest = Nest((n_kernel_dim, )) - mmk_j = matmul_kernel_nest.get_indices() + mmk_j, = matmul_kernel_nest.get_indices() @matmul_kernel_nest.iteration_logic def _matmul(): @@ -4054,7 +4043,7 @@ def _verify_plan(self, plan, args: Tuple[Array], package_name, correctness_check # build the HAT package with verifiers.VerifyPackage(self, package_name, output_dir) as v: - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=output_dir) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir) if correctness_check_values: v.check_correctness( function.name, @@ -4263,7 +4252,7 @@ def _verify_plan(self, plan, args: Tuple[int], package_name, correctness_check_v output_dir = pathlib.Path(TEST_PACKAGE_DIR) / package_name with verifiers.VerifyPackage(self, package_name, output_dir) as v: - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=output_dir) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir) if correctness_check_values: v.check_correctness( function.name, @@ -4303,7 +4292,7 @@ def test_vectorize(self) -> None: my_target = Target(category=Target.Category.CPU, vector_bytes=16, vector_registers=2) nest = Nest(shape=(64, )) - i = nest.get_indices() + i, = nest.get_indices() 
@nest.iteration_logic def _(): @@ -4585,7 +4574,7 @@ def _verify_package(self, plan, args, package_name, correctness_check_values) -> output_dir = pathlib.Path(TEST_PACKAGE_DIR) / package_name with verifiers.VerifyPackage(self, package_name, output_dir) as v: - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=output_dir) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir) if correctness_check_values: v.check_correctness( function.name, @@ -4754,7 +4743,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -4819,7 +4808,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -4876,7 +4865,7 @@ def _(): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(correctness_check_values), output_dir=output_dir, ) if correctness_check_values: @@ -4959,7 +4948,7 @@ def _(): # build the HAT package with verifiers.VerifyPackage(self, package_name, output_dir) as v: - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=output_dir) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir) if correctness_check_values: v.check_correctness( function.name, @@ -5026,7 +5015,7 @@ def _(): package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(correctness_check_values), output_dir=output_dir, ) if correctness_check_values: @@ -5057,7 +5046,7 @@ def _(): package_ii.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(correctness_check_values), output_dir=output_dir, ) if correctness_check_values: @@ -5088,7 +5077,7 @@ def _(): package_partial.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(correctness_check_values), output_dir=output_dir, ) if correctness_check_values: @@ -5119,7 +5108,7 @@ def _(): package_partial_inner.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(correctness_check_values), output_dir=output_dir, ) if correctness_check_values: @@ -5170,7 +5159,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5219,7 +5208,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5281,7 +5270,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5325,12 +5314,8 @@ def _(): jj0_up = s0_up.split(j0_up, 16) fs = fuse((s0, s1), partial=1) - f, i, j, jj = fs.get_indices() - fs.reorder(i, f, j, jj) fs_up = fuse((s0_up, s1), partial=1) - f_up, i_up, j_up, jj_up = fs_up.get_indices() - fs_up.reorder(i_up, f_up, j_up, jj_up) package = Package() package_name = "test_fusion_parameterization_1" @@ -5365,7 +5350,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5437,7 +5422,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5470,7 +5455,7 @@ def _(): jj0 = s0.split(j0, P0) fs = fuse((s0, s1), partial=1) - f, i, j, jj = fs.get_indices() + i, f, j, jj = 
fs.get_indices() ii = fs.split(i, P1) fs.reorder(f, i, j, ii, jj) @@ -5497,7 +5482,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5530,7 +5515,7 @@ def _(): jj0 = s0.split(j0, P0) fs = fuse((s0, s1), partial=1) - f, i, j, jj = fs.get_indices() + i, f, j, jj = fs.get_indices() ii = fs.split(i, P1) fs.reorder(i, f, j, ii, jj) jjj = fs.split(jj, P2) @@ -5598,7 +5583,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5644,7 +5629,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5713,7 +5698,7 @@ def _(): package.build( name=package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -5846,7 +5831,7 @@ def test_HAT_packages(self) -> None: package.build( package_name, format=Package.Format.HAT_STATIC, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, platform=Package.Platform.RASPBIAN, ) @@ -5875,7 +5860,7 @@ def test_default_output_dir(self) -> None: package.add(plan, args=(A, ), base_name="func2") with verifiers.VerifyPackage(self, package_name): - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode()) def test_debug_mode_1(self) -> None: M = N = K = 16 @@ -5997,9 +5982,8 @@ def _(): schedule1 = nest1.create_schedule() schedule = fuse(schedule0, schedule1, partial=1) - f, i, j0, j1 = schedule.get_indices() + i, f, j0, j1 = schedule.get_indices() ii = schedule.split(i, 2) - schedule.reorder(i, ii, f, j0, j1) package = Package() package_name = "MyFusionDebugPackage" @@ -6051,16 +6035,12 @@ def _(): schedule1 = nest1.create_schedule() - # Reorder schedule1 before fusing - schedule1.reorder(j1, i1) - # Fuse schedule0 with the reordered schedule1 - schedule = fuse(schedule0, schedule1) - f, a, b = schedule.get_indices() - # Deliberately break logical equivalence # before: C[1,0] = C[1,0] * B[1,0] + A[1,0] # after: C[1,0] = (C[1,0] + A[1,0]) * B[1,0] - schedule.reorder(a, b, f) + schedule1.reorder(j1, i1) + # Fuse schedule0 with the reordered schedule1 + schedule = fuse(schedule0, schedule1) package = Package() package_name = "MyFusionDebugPackageIncorrect" @@ -6116,8 +6096,6 @@ def _(): schedule1 = nest1.create_schedule() schedule_f1 = fuse(schedule0, schedule1) - f, i, j = schedule_f1.get_indices() - schedule_f1.reorder(i, j, f) nest2 = Nest(shape=(M, N)) i2, j2 = nest2.get_indices() @@ -6182,8 +6160,6 @@ def _(): schedule1 = nest1.create_schedule() schedule_f1 = fuse(schedule0, schedule1) - f, i, j = schedule_f1.get_indices() - schedule_f1.reorder(i, j, f) nest2 = Nest(shape=(M, N)) i2, j2 = nest2.get_indices() @@ -6260,7 +6236,7 @@ def test_add_description(self) -> None: package.build( package_name, format=TEST_FORMAT, - mode=TEST_MODE, + mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR, ) @@ -6309,7 +6285,7 @@ def _(): make_test_fn(package, A, B, C) package_name = "test_logic_function_conditionals" with verifiers.VerifyPackage(self, package_name, TEST_PACKAGE_DIR): - package.build(package_name, format=TEST_FORMAT, mode=TEST_MODE, output_dir=TEST_PACKAGE_DIR) + package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(), output_dir=TEST_PACKAGE_DIR) class DSLTest_11AutoPlan(unittest.TestCase): diff --git a/accera/python/accera/test/mfma_tests.py 
b/accera/python/accera/test/mfma_tests.py index 588cc361..ad071760 100644 --- a/accera/python/accera/test/mfma_tests.py +++ b/accera/python/accera/test/mfma_tests.py @@ -315,7 +315,7 @@ def _matmul_relu_plan(self, M, N, K, block_tile, thread_tile, outer_tile_k, inne matmul_sched = matmul_nest.create_schedule() relu_sched = relu_nest.create_schedule() schedule = acc.fuse(matmul_sched, relu_sched, partial = 2) - f, i, j, k = schedule.get_indices() + i, j, f, k = schedule.get_indices() ii, jj, kk = schedule.tile({ i: block_tile[0], diff --git a/accera/python/accera/test/smoke_tests.py b/accera/python/accera/test/smoke_tests.py index aa0e2256..0f638ca7 100644 --- a/accera/python/accera/test/smoke_tests.py +++ b/accera/python/accera/test/smoke_tests.py @@ -89,7 +89,6 @@ def _(): schedule1 = nest1.create_schedule() schedule = fuse(schedule0, schedule1) - f, i, j = schedule.get_indices() plan = schedule.create_plan() # Create a package and add our function definition to it @@ -126,8 +125,6 @@ def _(): schedule1 = nest1.create_schedule() schedule = fuse(schedule0, schedule1) - f, i, j = schedule.get_indices() - schedule.reorder(i, j, f) plan = schedule.create_plan() # Create a package and add our function definition to it @@ -165,7 +162,6 @@ def _(): schedule1 = nest1.create_schedule() schedule = fuse((schedule0, schedule1), partial=2) - f, i, j, k = schedule.get_indices() plan = schedule.create_plan() # Create a package and add our function definition to it @@ -203,8 +199,6 @@ def _(): schedule1 = nest1.create_schedule() schedule = fuse((schedule0, schedule1), partial=2) - f, i, j, k = schedule.get_indices() - schedule.reorder(i, j, f, k) plan = schedule.create_plan() # Create a package and add our function definition to it @@ -254,8 +248,6 @@ def _(): schedule = fuse((schedule0, schedule1), partial=2) - f, i, j, ii0, k0, iii0, ii1 = schedule.get_indices() - schedule.reorder(i, j, f, ii0, k0, iii0, ii1) plan = schedule.create_plan() # Create a package and add our function definition to it @@ -338,8 +330,7 @@ def _(): schedule1.reorder(i1, k1, j1) schedule = fuse((schedule0, schedule1), partial=2) - f, i, j, k, l = schedule.get_indices() - schedule.reorder(i, j, f, k, l) + i, j, f, k, l = schedule.get_indices() ii, jj = schedule.tile({ i: 4, @@ -392,8 +383,6 @@ def _(): B[i2] += accum[0] * 1.2 fused = fuse((n.create_schedule() for n in [nest0, nest1, nest2]), partial=1) - f, i, j = fused.get_indices() - fused.reorder(i, f, j) plan = fused.create_plan() @@ -423,7 +412,7 @@ def test_multischedule_fusion2(self) -> None: nest1 = Nest(shape=(4, 8, 12)) nest2 = Nest(shape=(4, 8)) - i0 = nest0.get_indices() + i0, = nest0.get_indices() i1, j1, k1 = nest1.get_indices() i2, j2 = nest2.get_indices() @@ -440,22 +429,6 @@ def _(): B[i2, j2] *= accum[0] fused0 = fuse((nest1.create_schedule(), nest2.create_schedule()), partial=2) - ff0, if0, jf0, kf0 = fused0.get_indices() - - # equivalent: - # for ff0 in range(2): - # if ff0 == 0: - # for if0 in range(4): - # for jf0 in range(8): - # for kf0 in range(12): - # accum[0] += A[if0, jf0, kf0] * 0.2 - # if ff0 == 1: - # for if0 in range(4): - # for jf0 in range(8): - # B[if0, jf0] *= accum[0] - - fused0.reorder(if0, jf0, ff0, kf0) - # equivalent: # for if0 in range(4): # for jf0 in range(8): @@ -467,23 +440,6 @@ def _(): # B[if0, jf0] *= accum[0] fused1 = fuse((nest0.create_schedule(), fused0), partial=1) - ff1, if1, jf1, ff0f1, kf1 = fused1.get_indices() - # equivalent: - # for ff1 in range(2): - # if ff1 == 0: - # for if1 in range(4): - # accum[0] = B[if1, 0] * 
A[if1, 0, 0] - # if ff1 == 1: - # for if1 in range(4): - # for jf1 in range(8): - # for ff0f1 in range(2): - # if ff0f1 == 0: - # for kf0 in range(12): - # accum[0] += A[if0, jf0, kf0] * 0.2 - # if ff0f1 == 1: - # B[if1, jf1] *= accum[0] - - fused1.reorder(if1, ff1, jf1, ff0f1, kf1) # equivalent: # for if1 in range(4): # for ff1 in range(2): @@ -944,7 +900,7 @@ def _(): schedule0.reorder(i0, j0, k0) schedule1.reorder(i1, k1, j1) schedule = acc.fuse((schedule0, schedule1), partial=2) - f, i, j, k0, j1 = schedule.get_indices() + i, j, f, k0, j1 = schedule.get_indices() # TODO: support parameters # m, n = acc.create_parameters() @@ -1014,7 +970,7 @@ def _(): schedule0.reorder(i0, j0, jj0, k0) schedule1.reorder(i1, k1, j1, kk1) schedule = acc.fuse((schedule0, schedule1), partial=2) - f, i, j, jj0, k0, j1, kk1 = schedule.get_indices() + i, j, f, jj0, k0, j1, kk1 = schedule.get_indices() # TODO: support parameters # m, s = acc.create_parameters() @@ -1086,7 +1042,7 @@ def _(): schedule1.reorder(i1, k1, j1, kk1) schedule = acc.fuse((schedule0, schedule1), partial=2) - f, i, j, jj0, k0, j1, kk1 = schedule.get_indices() + i, j, f, jj0, k0, j1, kk1 = schedule.get_indices() # TODO: support parameters # m, t = acc.create_parameters() @@ -1248,7 +1204,7 @@ def test_offset_sub_array_packing_flat(self) -> None: package = Package() diagonal_fetch_nest = Nest(shape=(N,)) - diagonal_idx = diagonal_fetch_nest.get_indices() + diagonal_idx, = diagonal_fetch_nest.get_indices() @diagonal_fetch_nest.iteration_logic def _diag_fetch(): diag_vec = Output.sub_array(offsets=(0,), shape=(N,)) @@ -1306,7 +1262,7 @@ def test_offset_sub_array_packing_split_dim(self) -> None: package = Package() diagonal_fetch_nest = Nest(shape=(N,)) - diagonal_idx = diagonal_fetch_nest.get_indices() + diagonal_idx, = diagonal_fetch_nest.get_indices() @diagonal_fetch_nest.iteration_logic def _diag_fetch(): diag_vec = Output.sub_array(offsets=(0,), shape=(N,)) @@ -1365,7 +1321,7 @@ def test_offset_sub_array_packing_multiple_split_dim(self) -> None: package = Package() diagonal_fetch_nest = Nest(shape=(N,)) - diagonal_idx = diagonal_fetch_nest.get_indices() + diagonal_idx, = diagonal_fetch_nest.get_indices() @diagonal_fetch_nest.iteration_logic def _diag_fetch(): diag_vec = Output.sub_array(offsets=(0,), shape=(N,)) @@ -1435,7 +1391,7 @@ def test_shifting_shrinking_sub_array(self) -> None: package = Package() nest = Nest(shape=(N,)) - idx = nest.get_indices() + idx, = nest.get_indices() @nest.iteration_logic def _fn(): size = N - idx @@ -1474,7 +1430,7 @@ def test_dynamic_sub_array_split_dim_subfunction(self) -> None: current_outer_idx, extent = create_dimensions() inner_nest = Nest(shape=(extent,)) - inner_idx = inner_nest.get_indices() + inner_idx, = inner_nest.get_indices() @inner_nest.iteration_logic def _inner_fn(): full_idx = current_outer_idx + inner_idx @@ -1490,7 +1446,7 @@ def _inner_fn(): function_opts=INTERNAL_FUNCTION_OPTS) outer_nest = Nest(shape=(N,)) - outer_idx = outer_nest.get_indices() + outer_idx, = outer_nest.get_indices() @outer_nest.iteration_logic def _outer_fn(): extent_val = accmin(N - outer_idx, cast(tile_size, ScalarType.index)) @@ -2664,7 +2620,7 @@ def test_gpu_vec_add(self): C = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, )) nest = Nest(shape=(N, )) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): @@ -2710,7 +2666,7 @@ def _test_gpu_vec_add_boundary(self, N, splits, test_name): C = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, 
shape=(N, )) nest = Nest(shape=(N, )) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): @@ -2772,7 +2728,7 @@ def _test_cpu_vec_add_boundary(self, N, splits, test_name): C = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, )) nest = Nest(shape=(N, )) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): @@ -4038,7 +3994,7 @@ def test_gpu_barrier_opt(self) -> None: C = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, )) nest = Nest(shape=(N, )) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): @@ -6142,8 +6098,7 @@ def _(): schedule1.reorder(i1, j1, k1, ii1, jj1, kk1) schedule = fuse((schedule0, schedule1), partial=3) - f, i, j, k, ii0, jj0, kk0, ii1, jj1, kk1 = schedule.get_indices() - schedule.reorder(i, j, k, f, ii0, jj0, kk0, ii1, jj1, kk1) + i, j, k, f, ii0, jj0, kk0, ii1, jj1, kk1 = schedule.get_indices() plan = schedule.create_plan() plan._erase_loops([kk1]) @@ -6165,7 +6120,7 @@ def test_dynamic_size_redundant_split(self) -> None: output_arr = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(m_extent,)) nest = Nest((m_extent,)) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): output_arr[i] += input_arr[i] @@ -6209,7 +6164,7 @@ def test_dynamic_size_redundant_split_1(self) -> None: output_arr = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(m_extent,)) nest = Nest((m_extent,)) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): output_arr[i] += input_arr[i] @@ -6253,7 +6208,7 @@ def test_dynamic_size_split_1(self) -> None: output_arr = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(m_extent,)) nest = Nest((m_extent,)) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): output_arr[i] += input_arr[i] @@ -6297,7 +6252,7 @@ def test_dynamic_size_split_and_redundant_split_1(self) -> None: output_arr = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(m_extent,)) nest = Nest((m_extent,)) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _(): output_arr[i] += input_arr[i] @@ -6341,7 +6296,7 @@ def test_vectorized_masked_buffer_fill(self) -> None: Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.int32, shape=(N_output,)) package = Package() nest = Nest(shape=(N_output,)) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _nest(): @@ -6369,7 +6324,7 @@ def test_vectorized_masked_store(self) -> None: Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.int32, shape=(N_output,)) package = Package() nest = Nest(shape=(N_input,)) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _nest(): @@ -6395,7 +6350,7 @@ def test_vectorized_masked_accumulate(self) -> None: Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.int32, shape=(N_output,)) package = Package() nest = Nest(shape=(N_input,)) - i = nest.get_indices() + i, = nest.get_indices() @nest.iteration_logic def _nest(): diff --git a/accera/python/samples/MatrixMultiplication.py b/accera/python/samples/MatrixMultiplication.py index 788b811d..63f79177 100644 --- a/accera/python/samples/MatrixMultiplication.py +++ b/accera/python/samples/MatrixMultiplication.py @@ -119,12 +119,10 @@ def trans_no_trans(val: int, trans: bool): bias_c_idxs = bias_stack_idxs + (bias_j, ) if C is not None: - @bias_nest.iteration_logic def _(): 
Y[bias_y_idxs] = Scalar(beta) * C[bias_c_idxs] else: - @bias_nest.iteration_logic def _(): Y[bias_y_idxs] = Scalar(0.0) @@ -172,17 +170,12 @@ def _(): compute_schedule, scale_schedule, ), partial=len(stack) + 2) - idxs_f0 = fused0.get_indices() - f0, stack_f0, i_f0, j_f0, k_f0 = idxs_f0[0], tuple(idxs_f0[1:-3]), *idxs_f0[-3:] - fused0.reorder(*stack_f0, i_f0, j_f0, f0, k_f0) fused1 = fuse((bias_schedule, fused0), partial=len(stack) + 2) - idxs_f1 = fused1.get_indices() - f1, stack_f1, i_f1, j_f1, f0_f1, k_f1 = idxs_f1[0], tuple(idxs_f1[1:-4]), *idxs_f1[-4:] - - fused_Y_idxs = stack_f1 + (i_f1, j_f1) - fused_A_idxs = stack_f1 + (i_f1, k_f1) if not transA else (k_f1, i_f1) - fused_B_idxs = stack_f1 + (k_f1, j_f1) if not transB else (j_f1, k_f1) + f1 = fused1.get_fusing_index() + fused_dims = fused1.get_fused_indices() + stack_f1, i_f1, j_f1 = tuple(fused_dims[:-2]), *fused_dims[-2:] + f0_f1, k_f1 = fused1.get_unfused_indices() jj = fused1.split(j_f1, column_block) @@ -331,14 +324,10 @@ def _(): fused_schedule = fuse((bias_schedule, compute_schedule), partial=len(stack) + 2) - fused_idxs = fused_schedule.get_indices() - f, fused_stack_idxs, fused_i, fused_j, k = fused_idxs[0], tuple(fused_idxs[1:-3]), *fused_idxs[-3:] - - fused_Y_idxs = fused_stack_idxs + (fused_i, fused_j) - fused_A_idxs = fused_stack_idxs + \ - (fused_i, k) if not transA else (k, fused_i) - fused_B_idxs = fused_stack_idxs + \ - (k, fused_j) if not transB else (fused_j, k) + f = fused_schedule.get_fusing_index() + k = fused_schedule.get_unfused_indices()[0] + fused_indices = fused_schedule.get_fused_indices() + fused_stack_idxs, fused_i, fused_j = tuple(fused_indices[:-2]), *fused_indices[-2:] jj = fused_schedule.split(fused_j, column_block) kk = fused_schedule.split(k, inner_dim_block) diff --git a/accera/transforms/include/AcceraPasses.td b/accera/transforms/include/AcceraPasses.td index f535e8ed..a1edef9b 100644 --- a/accera/transforms/include/AcceraPasses.td +++ b/accera/transforms/include/AcceraPasses.td @@ -332,6 +332,57 @@ def LoopNestToValueFunc : Pass<"loopnest-to-value-func", "accera::ir::value::Val ]; } + +//===----------------------------------------------------------------------===// +// AcceraVectorizationPass +//===----------------------------------------------------------------------===// + +def AcceraVectorizationPass : Pass<"acc-vectorize", "::mlir::ModuleOp"> { + let summary = "Vectorize AffineForOps marked for vectorization"; + let description = [{ + This pass finds AffineForOps marked for vectorization and vectorizes the ops in those loops + }]; + let constructor = "accera::transforms::vectorization::createVectorizationPass()"; + let options = [ + Option<"printVecOpDetails", "print-vec-details", "bool", /*default=*/"false", + "Print details about op vectorization"> + ]; + let dependentDialects = [ + "accera::ir::value::ValueDialect", + "mlir::StandardOpsDialect", + "mlir::AffineDialect", + "mlir::scf::SCFDialect", + "mlir::vector::VectorDialect", + "mlir::memref::MemRefDialect" + ]; +} + + +//===----------------------------------------------------------------------===// +// AcceraVectorizationUnrollPass +//===----------------------------------------------------------------------===// + +def AcceraVectorizationUnrollPass : Pass<"acc-vectorize-unroll", "::mlir::ModuleOp"> { + let summary = "Unroll AffineForOps marked for vectorized unroll"; + let description = [{ + This pass finds AffineForOps marked for vectorized unroll and unrolls those loops op-by-op + }]; + let constructor = 
"accera::transforms::vectorization::createVectorizationUnrollPass()"; + let options = [ + Option<"printVecOpDetails", "print-vec-details", "bool", /*default=*/"false", + "Print details about op vectorization"> + ]; + let dependentDialects = [ + "accera::ir::value::ValueDialect", + "mlir::StandardOpsDialect", + "mlir::AffineDialect", + "mlir::scf::SCFDialect", + "mlir::vector::VectorDialect", + "mlir::memref::MemRefDialect" + ]; +} + + //===----------------------------------------------------------------------===// // ConvertVulkanLaunchFuncToVulkanCallsWithTiming //===----------------------------------------------------------------------===// diff --git a/accera/transforms/include/vectorization/VectorizationPass.h b/accera/transforms/include/vectorization/VectorizationPass.h index 40011245..0be49399 100644 --- a/accera/transforms/include/vectorization/VectorizationPass.h +++ b/accera/transforms/include/vectorization/VectorizationPass.h @@ -15,9 +15,17 @@ class RewritePatternSet; namespace accera::transforms::vectorization { +struct VectorizationPassOptions +{ + bool printVecOpDetails = false; +}; + void populateVectorizePatterns(bool printVectorizationDetails, mlir::RewritePatternSet& patterns); void populateVectorizeUnrollPatterns(bool printVectorizationDetails, mlir::RewritePatternSet& patterns); +std::unique_ptr createVectorizationPass(const VectorizationPassOptions& options); std::unique_ptr createVectorizationPass(); +std::unique_ptr createVectorizationUnrollPass(const VectorizationPassOptions& options); +std::unique_ptr createVectorizationUnrollPass(); } // namespace accera::transforms::vectorization diff --git a/accera/transforms/src/AcceraPasses.cpp b/accera/transforms/src/AcceraPasses.cpp index c0324ceb..ded2358b 100644 --- a/accera/transforms/src/AcceraPasses.cpp +++ b/accera/transforms/src/AcceraPasses.cpp @@ -152,7 +152,11 @@ void addAcceraToLLVMPassPipeline(OpPassManager& pm, const AcceraPassPipelineOpti pmAdaptor.addPass(value::createValueFuncToTargetPass({ options.dumpIntraPassIR.getValue(), options.basename + "ValueFuncToTargetPass_Subpasses" })); pmAdaptor.addPass(createSymbolDCEPass()); pmAdaptor.addPass(affine::createAffineSimplificationPass()); + pmAdaptor.addPass(createCanonicalizerPass()); + pmAdaptor.addPass(vectorization::createVectorizationPass({ options.printVecOpDetails.getValue() })); + pmAdaptor.addPass(vectorization::createVectorizationUnrollPass({ options.printVecOpDetails.getValue() })); pmAdaptor.addPass(value::createValueUnrollingPass()); + pmAdaptor.addPass(affine::createAffineSimplificationPass()); auto funcOpPM = pmAdaptor.nestPassManager([&]() -> OpPassManager& { return pm.nest().nest(); }); funcOpPM.addPass(createConvertLinalgToAffineLoopsPass()); diff --git a/accera/transforms/src/nest/LoopNestToValueFunc.cpp b/accera/transforms/src/nest/LoopNestToValueFunc.cpp index 3f7e0f34..4069237f 100644 --- a/accera/transforms/src/nest/LoopNestToValueFunc.cpp +++ b/accera/transforms/src/nest/LoopNestToValueFunc.cpp @@ -301,14 +301,6 @@ struct LoopNestToValueFuncPass : public accera::transforms::LoopNestToValueFuncB snapshotter.Snapshot("GPUIndexMapping", vFuncOp); } - { - RewritePatternSet patterns(context); - vectr::populateVectorizePatterns(printVecOpDetails, patterns); - utilir::FillCanonicalPatternsRecursively(vFuncOp, patterns); - (void)applyPatternsAndFoldGreedily(vFuncOp, std::move(patterns)); - snapshotter.Snapshot("Vectorize_Canonicalize", vFuncOp); - } - { RewritePatternSet patterns(context); tr::populateLoopSimplificationPatterns(patterns); @@ 
@@ -316,14 +308,6 @@ struct LoopNestToValueFuncPass : public accera::transforms::LoopNestToValueFuncB
             snapshotter.Snapshot("LoopSimplification", vFuncOp);
         }
 
-        {
-            RewritePatternSet patterns(context);
-            vectr::populateVectorizeUnrollPatterns(printVecOpDetails, patterns);
-            utilir::FillCanonicalPatternsRecursively(vFuncOp, patterns);
-            (void)applyPatternsAndFoldGreedily(vFuncOp, std::move(patterns));
-            snapshotter.Snapshot("VectorizeUnroll_Canonicalize", vFuncOp);
-        }
-
         {
             RewritePatternSet patterns(context);
             tr::populateLoopOptimizationPatterns(patterns);
diff --git a/accera/transforms/src/vectorization/VectorizationPass.cpp b/accera/transforms/src/vectorization/VectorizationPass.cpp
index c45118d8..3569ba92 100644
--- a/accera/transforms/src/vectorization/VectorizationPass.cpp
+++ b/accera/transforms/src/vectorization/VectorizationPass.cpp
@@ -7,6 +7,7 @@
 #include "AcceraPasses.h"
 #include "vectorization/VectorizationUtil.h"
+#include "nest/LoopNestToValue.h"
 
 #include
 #include
@@ -34,6 +35,7 @@ namespace v = accera::ir::value;
 using namespace accera::transforms;
 using namespace accera::ir::util;
 using namespace accera::utilities;
+using namespace accera::transforms::vectorization;
 
 using namespace mlir;
 
@@ -582,17 +584,82 @@ LogicalResult InPlaceUnrollAffineForOpConversion::matchAndRewrite(AffineForOp af
     return success();
 }
 
+namespace vectr = accera::transforms::vectorization;
 
-// TODO : implement
-// struct VectorizationPass : public accera::transforms::AcceraVectorizationPassBase<VectorizationPass>
-// {
-//     void runOnOperation() final
-//     {
-//         auto* context = &getContext();
-//         auto op = getOperation();
-//         // TODO : implement with LoopNestToValueFunc vectorization sequence
-//     }
-// };
+struct VectorizationPass : public accera::transforms::AcceraVectorizationPassBase<VectorizationPass>
+{
+    VectorizationPass(const vectr::VectorizationPassOptions& options = {})
+    {
+        printVecOpDetails = options.printVecOpDetails;
+    }
+
+    void runOnOperation() final
+    {
+        auto* context = &getContext();
+        auto op = getOperation();
+
+        mlir::GreedyRewriteConfig topDownConfig; // Some patterns require a top-down handling of ops to ensure relative orders stay consistent
+        topDownConfig.useTopDownTraversal = true;
+
+        {
+            RewritePatternSet patterns(context);
+            populateLoopSimplificationPatterns(patterns);
+            (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
+        }
+
+        {
+            RewritePatternSet patterns(context);
+            populateVectorizePatterns(printVecOpDetails, patterns);
+            util::FillCanonicalPatternsRecursively(op, patterns);
+            (void)applyPatternsAndFoldGreedily(op, std::move(patterns), topDownConfig);
+        }
+
+        {
+            RewritePatternSet patterns(context);
+            populateLoopSimplificationPatterns(patterns);
+            populateLoopOptimizationPatterns(patterns);
+            (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
+        }
+    }
+};
+
+
+struct VectorizationUnrollPass : public accera::transforms::AcceraVectorizationUnrollPassBase<VectorizationUnrollPass>
+{
+    VectorizationUnrollPass(const vectr::VectorizationPassOptions& options = {})
+    {
+        printVecOpDetails = options.printVecOpDetails;
+    }
+
+    void runOnOperation() final
+    {
+        auto* context = &getContext();
+        auto op = getOperation();
+
+        mlir::GreedyRewriteConfig topDownConfig; // Some patterns require a top-down handling of ops to ensure relative orders stay consistent
+        topDownConfig.useTopDownTraversal = true;
+
+        {
+            RewritePatternSet patterns(context);
+            populateLoopSimplificationPatterns(patterns);
+            (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
+        }
+
+        {
+            RewritePatternSet patterns(context);
+            populateVectorizeUnrollPatterns(printVecOpDetails, patterns);
+            util::FillCanonicalPatternsRecursively(op, patterns);
+            (void)applyPatternsAndFoldGreedily(op, std::move(patterns), topDownConfig);
+        }
+
+        {
+            RewritePatternSet patterns(context);
+            populateLoopSimplificationPatterns(patterns);
+            populateLoopOptimizationPatterns(patterns);
+            (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
+        }
+    }
+};
 
 } // namespace
 
@@ -608,9 +675,20 @@ void populateVectorizeUnrollPatterns(bool printVectorizationDetails, mlir::Rewri
     patterns.insert<InPlaceUnrollAffineForOpConversion>(patterns.getContext(), printVectorizationDetails);
 }
 
-// TODO : implement
-// std::unique_ptr<mlir::Pass> createVectorizationPass()
-// {
-//     return std::make_unique<VectorizationPass>();
-// }
+std::unique_ptr<mlir::Pass> createVectorizationPass(const VectorizationPassOptions& options)
+{
+    return std::make_unique<VectorizationPass>(options);
+}
+std::unique_ptr<mlir::Pass> createVectorizationPass()
+{
+    return std::make_unique<VectorizationPass>();
+}
+std::unique_ptr<mlir::Pass> createVectorizationUnrollPass(const VectorizationPassOptions& options)
+{
+    return std::make_unique<VectorizationUnrollPass>(options);
+}
+std::unique_ptr<mlir::Pass> createVectorizationUnrollPass()
+{
+    return std::make_unique<VectorizationUnrollPass>();
+}
 } // namespace accera::transforms::vectorization
diff --git a/accera/transforms/src/vectorization/VectorizationUtil.cpp b/accera/transforms/src/vectorization/VectorizationUtil.cpp
index a9608eb1..7958efc3 100644
--- a/accera/transforms/src/vectorization/VectorizationUtil.cpp
+++ b/accera/transforms/src/vectorization/VectorizationUtil.cpp
@@ -138,6 +138,34 @@ bool CanVectorizeOp(mlir::Operation* op,
     return result;
 }
 
+std::optional<mlir::Operation*> VectorizeGenericOp(mlir::PatternRewriter& rewriter,
+                                                   mlir::Operation* op,
+                                                   const VectorizedOpMap& vectorizedOps,
+                                                   std::vector<mlir::BlockAndValueMapping>& laneMappings,
+                                                   mlir::Value inductionVar,
+                                                   int64_t step,
+                                                   int64_t vectorSize)
+{
+    if (op == nullptr || op->getNumResults() != 1)
+    {
+        return std::nullopt;
+    }
+
+    auto loc = op->getLoc();
+    auto opResult = op->getResult(0);
+    auto elementType = opResult.getType();
+    if (elementType.isa<mlir::ShapedType>())
+    {
+        // Can't further vectorize this
+        return std::nullopt;
+    }
+    auto vectorType = mlir::VectorType::get({ vectorSize }, elementType);
+
+    // Broadcast the scalar result across all lanes of the vector
+    auto result = rewriter.create<mlir::vector::BroadcastOp>(loc, vectorType, opResult);
+
+    return result.getOperation();
+}
+
 std::optional<mlir::Operation*> GetVectorizedPredecessor(mlir::PatternRewriter& rewriter,
                                                          mlir::Operation* pred,
                                                          const VectorizedOpMap& vectorizedOps,
@@ -160,6 +188,12 @@
         }
     }
 
+    // If the predecessor op doesn't depend on this induction variable, then simply broadcast its result and return it
+    if (!ir::util::hasRecursiveUseOfOp(inductionVar, pred))
+    {
+        return VectorizeGenericOp(rewriter, pred, vectorizedOps, laneMappings, inductionVar, step, vectorSize);
+    }
+
     if (CanVectorizeOp(pred, vectorizedOps, laneMappings, inductionVar, step, vectorSize))
     {
         auto vecPred = VectorizeOp(rewriter, pred, vectorizedOps, laneMappings, inductionVar, step, vectorSize);
@@ -208,29 +242,6 @@ std::optional<mlir::Operation*> GetVectorizedPredecessor(mlir::PatternRewriter&
     return std::nullopt;
 }
 
-std::optional<mlir::Operation*> VectorizeGenericOp(mlir::PatternRewriter& rewriter,
-                                                   mlir::Operation* op,
-                                                   const VectorizedOpMap& vectorizedOps,
-                                                   std::vector<mlir::BlockAndValueMapping>& laneMappings,
-                                                   mlir::Value inductionVar,
-                                                   int64_t step,
-                                                   int64_t vectorSize)
-{
-    if (op == nullptr || op->getNumResults() != 1)
-    {
-        return std::nullopt;
-    }
-
-    auto loc = op->getLoc();
-    auto opResult = op->getResult(0);
-    auto elementType = opResult.getType();
-    auto vectorType = mlir::VectorType::get({ vectorSize }, elementType);
-
-    auto result = rewriter.create<mlir::vector::BroadcastOp>(loc, vectorType, opResult);
-    auto result = rewriter.create<mlir::vector::BroadcastOp>(loc, vectorType, opResult);
-
-    return result.getOperation();
-}
-
 std::optional<mlir::Operation*> VectorizeAllocaOp(mlir::PatternRewriter& rewriter,
                                                   mlir::memref::AllocaOp op,
                                                   const VectorizedOpMap& vectorizedOps,
@@ -368,17 +379,21 @@ std::optional GetConstantStrideBetweenUnrolledAccesses(mlir::PatternRew
     // Check if the temporary clones are all accessing sequential memory
     auto accessMapComposition = ir::util::GetIndexToMemoryLocationMap(rewriter.getContext(), op);

+    // Memref offset / shape symbols occur first in the symbol list, so get the insert position for them
+    auto symStartIndex = accessMapComposition.getNumDims();
+
     // For dynamically shaped memrefs, currently we only handle identity-mapped memrefs,
     // general dynamic memref support will come later.
     auto memRefType = op.memref().getType().template cast<mlir::MemRefType>();
-    std::vector<mlir::Value> strideSymbols;
+    std::vector<mlir::Value> strideSymbols = ir::util::GetDynamicOffsetSymbols(op.memref());
     if (!memRefType.hasStaticShape())
     {
         if (!ir::util::HasIdentityLayout(op.memref()))
         {
             return std::nullopt;
         }
-        strideSymbols = ir::util::GetIdentityMemrefStrideSymbols(rewriter, loc, op.memref());
+        auto shapeSymbols = ir::util::GetIdentityMemrefStrideSymbols(rewriter, loc, op.memref());
+        strideSymbols.insert(strideSymbols.end(), shapeSymbols.begin(), shapeSymbols.end());
     }

     std::optional<int64_t> stride = std::nullopt;
@@ -388,8 +403,8 @@
         std::vector<mlir::Value> currentIndicesVec(temporaryClones[unrollIdx].indices().begin(), temporaryClones[unrollIdx].indices().end());

         // Append any dynamic stride symbols since we're dealing with a flattened layout map
-        prevIndicesVec.insert(prevIndicesVec.end(), strideSymbols.begin(), strideSymbols.end());
-        currentIndicesVec.insert(currentIndicesVec.end(), strideSymbols.begin(), strideSymbols.end());
+        prevIndicesVec.insert(prevIndicesVec.begin() + symStartIndex, strideSymbols.begin(), strideSymbols.end());
+        currentIndicesVec.insert(currentIndicesVec.begin() + symStartIndex, strideSymbols.begin(), strideSymbols.end());

         auto prevAccess = ir::util::MultiDimAffineApply(rewriter, loc, accessMapComposition, prevIndicesVec);
         auto currentAccess = ir::util::MultiDimAffineApply(rewriter, loc, accessMapComposition, currentIndicesVec);
@@ -501,13 +516,15 @@ std::pair FlattenAccess(mlir::OpBuilder& builder, OpTy
     auto loc = accessOp->getLoc();
     auto flatCastMemref = FlattenMemRefCast(builder, loc, accessOp.memref());
     auto flattenMap = ir::util::GetIndexToMemoryLocationMap(builder.getContext(), accessOp);
-    std::vector<mlir::Value> strideSymbols;
+
+    auto symStartIndex = flattenMap.getNumDims();
+    std::vector<mlir::Value> strideSymbols = ir::util::GetDynamicOffsetSymbols(accessOp.memref());
     if (ir::util::HasIdentityLayout(accessOp.memref()))
     {
         strideSymbols = ir::util::GetIdentityMemrefStrideSymbols(builder, loc, accessOp.memref());
     }
     std::vector<mlir::Value> indicesAndStrideSymbols = indices;
-    indicesAndStrideSymbols.insert(indicesAndStrideSymbols.end(), strideSymbols.begin(), strideSymbols.end());
+    indicesAndStrideSymbols.insert(indicesAndStrideSymbols.begin() + symStartIndex, strideSymbols.begin(), strideSymbols.end());
     auto flatPosition = builder.create<mlir::AffineApplyOp>(loc, flattenMap, indicesAndStrideSymbols);
     return std::make_pair(flatCastMemref, flatPosition);
 }
@@ -1823,7 +1840,7 @@ mlir::LogicalResult vectorizeTwoRowInterleavedPack(mlir::AffineForOp affineForOp
     std::stack<mlir::Operation*> matchedOps;
     std::stack<mlir::Operation*> tempOps;
-    ir::util::TempOpCleanupGuard(&tempOps, rewriter);
+    ir::util::TempOpCleanupGuard tempOpGuard(&tempOps, rewriter);

     // Match j and k loop
     SmallVector<mlir::AffineForOp, 2> loops;
@@ -1861,6 +1878,10 @@ mlir::LogicalResult vectorizeTwoRowInterleavedPack(mlir::AffineForOp affineForOp
     int64_t unrollMax_jj = std::min(jj_numIters, (jj_end - jj_begin));
     int64_t unrollMax_kk = std::min(kk_numIters, (kk_end - kk_begin));

+    // Set the insertion point to the end of the inner loop (just before the terminator)
+    mlir::OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(innerLoop.getBody(), innerLoop.getBody()->getTerminator()->getIterator());
+
     // iterate on loop body from begin to end to match the ops list
     auto innerLoopBodyIter = innerLoop.getBody()->begin();
     auto innerLoopBodyEnd = innerLoop.getBody()->end();
@@ -1953,10 +1974,6 @@ mlir::LogicalResult vectorizeTwoRowInterleavedPack(mlir::AffineForOp affineForOp

     // So now we can create the new vectorized version of the loops

-    // Set the insertion point to the end of the inner loop (just before the terminator)
-    mlir::OpBuilder::InsertionGuard guard(rewriter);
-    rewriter.setInsertionPoint(innerLoop.getBody(), innerLoop.getBody()->getTerminator()->getIterator());
-
     // 1. create vector load of the input rows
     auto inputMemRefType = loadOp.getMemRefType();
     auto inputElementType = inputMemRefType.getElementType();
@@ -1967,7 +1984,6 @@ mlir::LogicalResult vectorizeTwoRowInterleavedPack(mlir::AffineForOp affineForOp
     for (int64_t kk_idx = kk_begin; kk_idx < kk_end; kk_idx += kk_step)
     {
         auto unrolledInductionVar_kk = rewriter.create<mlir::arith::ConstantIndexOp>(loadLoc, kk_idx);
-        tempOps.push(unrolledInductionVar_kk);
         mlir::BlockAndValueMapping kIterMapping;
         kIterMapping.map(kk_inductionVar, unrolledInductionVar_kk);
         auto clonedLoadOp = mlir::cast<mlir::AffineLoadOp>(rewriter.clone(*(loadOp.getOperation()), kIterMapping));
@@ -2534,7 +2550,7 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp,
     auto cmpOp = cast<v::CmpOp>(*loopBodyStart);
     auto cmpOpResult = cmpOp.result();
     matchedOps.push(cmpOp);
-
+    loopBodyStart++;
     // 2. match scf.if op
@@ -2582,7 +2598,7 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp,
     // match ops in else block
     auto storeElemType = storeOp.getMemRefType().getElementType();
-    mlir::Value paddingOpValue = rewriter.create<mlir::arith::ConstantOp>(loopOp.getLoc(), rewriter.getZeroAttr(storeElemType));
+    mlir::Value paddingOpValue = rewriter.create<mlir::arith::ConstantOp>(loopOp.getLoc(), rewriter.getZeroAttr(ir::util::ToSignlessMLIRType(rewriter, storeElemType)));

     v::CastOp elseCastOp;
     if (elseBlock != nullptr)
@@ -2675,6 +2691,7 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp,
     // create a default identity map for mapping 1:1 dimension
     mlir::AffineMap permutationMap = mlir::AffineMap::getMinorIdentityMap(1, 1, rewriter.getContext());
+    mlir::AffineMapAttr permutationMapAttr = mlir::AffineMapAttr::get(permutationMap);
     llvm::SmallVector<bool, 1> inbound_init;
     inbound_init.push_back(false);
     auto inbounds = rewriter.getBoolArrayAttr(inbound_init);
@@ -2693,11 +2710,18 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp,
     mlir::AffineStoreOpAdaptor adaptorStore{ storeOp };
     std::vector<mlir::Value> baseIndicesStore(adaptorStore.indices().begin(), adaptorStore.indices().end());

-    mlir::vector::StoreOp storeVecOp;
     auto [flatCastMemRefStore, flattenedPosStore] = FlattenAccess(rewriter, storeOp, baseIndicesStore);
-
-    rewriter.create<mlir::vector::StoreOp>(storeOp.getLoc(), valueToStore, flatCastMemRefStore, mlir::ValueRange{ flattenedPosStore });
+    if (elseBlock)
+    {
+        // vector store op for buffer fill case
+        rewriter.create<mlir::vector::StoreOp>(storeOp.getLoc(), valueToStore, flatCastMemRefStore, mlir::ValueRange{ flattenedPosStore });
+    }
+    else
+    {
+        // masked store op
+        rewriter.create<mlir::vector::TransferWriteOp>(storeOp.getLoc(), valueToStore, flatCastMemRefStore, mlir::ValueRange{ flattenedPosStore }, permutationMapAttr, mask, inbounds);
+    }

     // Set the step size for vectorized loop
     loopOp.setStep(iter_step * numIters);
diff --git a/docs/Manual/04 Fusing.md b/docs/Manual/04 Fusing.md
index 617716b3..17146fb8 100644
--- a/docs/Manual/04 Fusing.md
+++ b/docs/Manual/04 Fusing.md
@@ -15,7 +15,7 @@ schedule = acc.fuse(schedule0, schedule1, ...)
 *Full fusing* is the most straightforward, where each dimension is fused with the corresponding dimension from other schedules.

 ### Full fusing of same-shaped iteration spaces
-First, consider the simplest case where we fuse schedules with identical iteration space shapes. This fusing assigns a new dimension called *fusing dimension* to the fused schedule `schedule` that does not exist in the original schedules. By default, the fusing dimension is the first dimension in the fused schedule. Its size is equal to the number of fused schedules. The slices along the fusing dimension contain a copy of `schedule0`, `schedule1`. The first slice along the fusing dimension contains a copy of `schedule0`, the second slice contains a copy of `schedule1`, and so on. Since the fusing dimension is the first dimension, the fused schedule is logically equivalent to fully executing `schedule0`, followed by `schedule1`, and so on. We apply additional transformations to the fused schedule to interleave the original schedules.
+First, consider the simplest case where we fuse schedules with identical iteration space shapes. This fusing assigns a new dimension called *fusing dimension* to the fused schedule `schedule` that does not exist in the original schedules. By default, the fusing dimension is the last dimension in the fused schedule. Its size is equal to the number of fused schedules. The slices along the fusing dimension contain copies of the iteration logic of `schedule0`, `schedule1`, and so on: the first slice along the fusing dimension contains a copy of the iteration logic of `schedule0`, the second slice contains that of `schedule1`, and so on. Since the fusing dimension is the last dimension, the fused schedule is logically equivalent to executing an iteration of `schedule0`, followed by an iteration of `schedule1`, and so on.
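+Conceptually, this default order corresponds to the following sketch (an editor-added illustration; the (10, 10) sizes are taken from the example below):
+```python
+N = M = 10
+for i in range(N):
+    for j in range(M):
+        for f in range(2):  # fusing dimension f is iterated last
+            pass            # f == 0: schedule0's iteration (i, j); f == 1: schedule1's
+```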
 Consider a scenario where we want first to shift and then scale each element of a matrix. In other words, we want to perform the equivalent of the below Python code:
 ```python
@@ -69,36 +69,16 @@ Before fusing, both `schedule0` and `schedule1` have a shape (10, 10). Now, let
 ```python
 # Create a fused schedule
 schedule = acc.fuse(schedule0, schedule1)
-f, i, j = schedule.get_indices()
+i, j, f = schedule.get_indices()
 ```

-Fusing creates a new fused schedule `schedule` with a shape (2, 10, 10). It does not change `schedule0` and `schedule1`. The first dimension in `schedule` is the so-called fusing dimension `f`. Its slice (0, \*, \*) contains a copy of `schedule0`, and its slice (1, \*, \*) contains a copy of `schedule1`.
+Fusing creates a new fused schedule `schedule` with a shape (10, 10, 2). It does not change `schedule0` and `schedule1`. The last dimension in `schedule` is the so-called fusing dimension `f`. Its slice (\*, \*, 0) contains a copy of `schedule0`, and its slice (\*, \*, 1) contains a copy of `schedule1`.

-![Before fusing](../assets/viz/fuse1.png) | ![After fusing](../assets/viz/fuse1a.png)
+![Before fusing](../assets/viz/fuse1.png) | ![After fusing](../assets/viz/fuse1b.png)
 :-------------------------:|:-------------------------:
 *Before fusing* | *After `fuse(schedule0, schedule1)`*

 In loop form, `schedule` is now equivalent to the following Python code:
-```python
-# f = 0
-for i in range(10):
-    for j in range(10):
-        C[i, j] += A[i, j]
-# f = 1
-for i in range(10):
-    for j in range(10):
-        C[i, j] *= B[i, j]
-```
-Not much has happened until now since executing `schedule` as-is is equivalent to executing `schedule0` followed by `schedule1`. However, this can be changed by transforming the fused schedule. For example, we can recover `schedule_simple` by reordering the indices as follows:
-```python
-schedule.reorder(i, j, f)
-```
-
-![Before reorder](../assets/viz/fuse1a.png) | ![After reorder](../assets/viz/fuse1b.png)
-:-------------------------:|:-------------------------:
-*Before `reorder(i, j, f)`, order is (f, i, j)* | *After `reorder(i, j, f)`, order is (i, j, f)*
-
-The fusing dimension moves from the first position to the last position. Now, `schedule` is equivalent to the following Python code:
 ```python
 for i in range(10):
     for j in range(10):
@@ -141,7 +121,7 @@ The fusing dimension comes with certain constraints that are discussed from the
 Unlike other dimensions that allow parallelization, vectorization, or tensorization (see [Section 7](<07%20Plans%20-%20Operations%20and%20Optimizations.md>) ), none of these operations can be applied to the fusing dimension. The fusing dimension must be executed sequentially. This constraint enables the safety guarantee discussed below.

 ### Safety
-Before applying any subsequent transformations, the fused schedule is always logically equal to executing the original schedules sequentially. However, is it safe? Recall that a schedule is considered safe if the underlying logic is guaranteed to be unchanged regardless of the applied transformation. The safety of a fused schedule depends on circumstances that may break logic equivalence:
+Before applying any subsequent transformations, the fused schedule is always logically equivalent to executing the original schedules sequentially for each value of the fused dimensions. However, is it safe? Recall that a schedule is considered safe if the underlying logic is guaranteed to be unchanged regardless of the applied transformation. The safety of a fused schedule depends on circumstances that may break logic equivalence:

 Accera preserves the order of the fused schedules *for each value of the fused dimensions*, regardless of how the fused schedule is transformed. For example, in the example above, the fused dimensions are `i` and `j`. Therefore, for any concrete value of `i` and `j`, the corresponding operation from `schedule0` is guaranteed to execute before the corresponding operation from `schedule1`, regardless of how the fused schedule is transformed. More specifically, for each `i` and `j`, the operation `C[i, j] += A[i, j]` is guaranteed to execute before the operation `C[i, j] *= B[i, j]`, no matter how we transform the fused schedule. Since those are the only operations that interact with `C[i,j]`, the Accera guarantee is sufficient, and we can claim that the fused schedule is safe. With this assurance, the programmer can apply any sequence of transformations without worrying about the correctness of the resulting implementation.

@@ -151,11 +131,11 @@ However, not every fusing operation creates a safe schedule. For example, consid
 schedule1.reorder(j1, i1)
 # Fuse schedule0 with the reordered schedule1
 schedule_t = acc.fuse(schedule0, schedule1)
-f, a, b = schedule_t.get_indices()
+a, b, f = schedule_t.get_indices()
 ```
-In this unnatural example, `i0` and `j1` are fused and named `a`. Similarly,`i1` and `j0` are fused and named `b`. As mentioned above, Accera guarantees that, for each value of `a` and `b`, the operation `C[a, b] += A[a, b]` is executed before `C[b, a] *= B[b, a]`. The fusing operation itself preserves the logical equivalence. However, the underlying logic is changed if we transform the fused schedule as follows:
+In this unnatural example, `i0` and `j1` are fused and named `a`. Similarly, `i1` and `j0` are fused and named `b`. As mentioned above, Accera guarantees that, for each value of `a` and `b`, the operation `C[a, b] += A[a, b]` is executed before `C[b, a] *= B[b, a]`. The fusing operation itself preserves the logical equivalence. However, the underlying logic is changed by the transformation that was applied to `schedule1` before fusing:
 ```python
-schedule_t.reorder(a, b, f)
+schedule1.reorder(j1, i1)
 ```
 To understand this change in the logic, note that the resulting schedule is equivalent to the following Python code:
 ```python
@@ -175,16 +155,20 @@ Instead of fusing all the dimensions, we may want to fuse a subset of dimensions
 # Fuse the first s dimensions of three schedules
 schedule = acc.fuse((schedule0, schedule1, ...), partial=s)
 ```
-The order of the dimensions in the fused schedule is as follows: first the fusing dimension `f`, then the fused dimensions *s*, followed by the unfused dimensions of `schedule0`, `schedule1`, and so on.
+The order of the dimensions in the fused schedule is as follows: first the fused dimensions *s*, then the fusing dimension `f`, followed by the unfused dimensions of `schedule0`, `schedule1`, and so on.

 We can easily calculate the number of dimensions in the fused schedule. For example, if we fuse the first *s* dimensions of a *d0*-dimensional space `schedule0` and a *d1*-dimensional space `schedule1`, the fused iteration space will have *s* fused dimensions, *d0 + d1 - 2s* unfused dimensions, and the special fusing dimension `f`, for a total of *d0 + d1 - s + 1* dimensions.
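+As a quick sanity check of that arithmetic, here is an editor-added sketch (the sizes are hypothetical, chosen only to match the 4-dimensional and 3-dimensional example below):
+```python
+s, d0, d1 = 2, 4, 3              # fuse the first 2 dimensions of a 4-dim and a 3-dim schedule
+fused = s                        # fused dimensions
+unfused = (d0 - s) + (d1 - s)    # unfused dimensions contributed by each schedule
+total = fused + 1 + unfused      # plus the single fusing dimension f
+assert total == d0 + d1 - s + 1  # 6 dimensions
+```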
 The `fuse` operation uses padding to ensure that the fused iteration space is not jagged in any direction. For example, say that we partially fuse the first 2 dimensions of `schedule0`, which is 4-dimensional, and `schedule1`, which is 3-dimensional:
 ```python
 schedule = acc.fuse((schedule0, schedule1), partial=2)
-f, i, j, k, l, m = schedule.get_indices()
+i, j = schedule.get_fused_indices()
+f = schedule.get_fusing_index()
+k, l, m = schedule.get_unfused_indices()
+# Alternative way:
+# i, j, f, k, l, m = schedule.get_indices()
 ```
-The first dimension is the fusing dimensions `f` of size 2. Next comes the fused dimensions `i` and `j`, followed by the unfused dimensions `k` and `l` from `schedule0` and `m` from `schedule1`. The slice (0, \*, \*, \*, \*, 0) contains a copy of `schedule0`, the slice (1, \*, \*, 0, 0, \*) contains a copy of `schedule1`, and the rest of `schedule` is padded with empty elements. Note that full fusing is a special case of partial fusing, where `s` is the larger of the dimensions of `schedule0` and `schedule1`.
+First come the fused dimensions `i` and `j`. Next comes the fusing dimension `f` of size 2, followed by the unfused dimensions `k` and `l` from `schedule0` and `m` from `schedule1`. The slice (\*, \*, 0, \*, \*, 0) contains a copy of `schedule0`, the slice (\*, \*, 1, 0, 0, \*) contains a copy of `schedule1`, and the rest of `schedule` is padded with empty elements. Note that full fusing is a special case of partial fusing, where `s` is the larger of the dimensions of `schedule0` and `schedule1`.
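+In loop form, this padded schedule behaves roughly like the following editor-added sketch (the sizes `I, J, K, L, M` are hypothetical, since the example leaves the iteration space sizes unspecified; padded elements perform no work):
+```python
+I, J, K, L, M = 8, 8, 4, 4, 3
+for i in range(I):
+    for j in range(J):
+        for f in range(2):
+            if f == 0:          # slice (*, *, 0, *, *, 0): schedule0's logic over k, l
+                for k in range(K):
+                    for l in range(L):
+                        pass    # schedule0 work at (i, j, k, l)
+            else:               # slice (*, *, 1, 0, 0, *): schedule1's logic over m
+                for m in range(M):
+                    pass        # schedule1 work at (i, j, m)
+```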
 ### Constraint 2: the fusing dimension always precedes unfused dimensions
 Another constraint introduced by partial fusing is that the fusing dimension must precede all of the unfused dimensions in its dimension order. This constraint applies to dimensions derived from the fusing dimension and the unfused dimensions via splitting.

@@ -226,27 +210,22 @@ schedule1 = nest1.create_schedule()
 In `schedule0` and `schedule1`, the first dimension represents the rows of `C` and the second dimension represents the columns of `C`. Additionally, `schedule0` has a third dimension that `schedule1` does not have. Therefore, we fuse the first two dimensions of the iteration spaces and leave the third dimension of `schedule0` unfused.
 ```python
 schedule = acc.fuse((schedule0, schedule1), partial=2)
-f, i, j, k0 = schedule.get_indices()
+i, j = schedule.get_fused_indices()
+f = schedule.get_fusing_index()
+k0 = schedule.get_unfused_indices()[0]
+# Alternative way:
+# i, j, f, k0 = schedule.get_indices()
 ```
-The fused iteration space `schedule` has a shape of (2, 8, 8, 4). Its slice (0, \*, \*, \*) contains a copy of `schedule0`, the slice (1, \*, \*, 0) contains a copy of `schedule1`, and the rest of its elements are padded. Note that the code above overwrites the index `k0`, which initially was an index of `schedule0`. However, now it corresponds to the unfused index in `schedule`. Note that the name `k0` is a stylistic choice, we could have chosen a different name.
+The fused iteration space `schedule` has a shape of (8, 8, 2, 4). Its slice (\*, \*, 0, \*) contains a copy of `schedule0`, the slice (\*, \*, 1, 0) contains a copy of `schedule1`, and the rest of its elements are padded. Note that the code above overwrites the index `k0`, which initially was an index of `schedule0`. However, now it corresponds to the unfused index in `schedule`. Note that the name `k0` is a stylistic choice; we could have chosen a different name.

-![Before fusing](../assets/viz/fuse2.png) | ![After fusing](../assets/viz/fuse2a.png)
+![Before fusing](../assets/viz/fuse2.png) | ![After fusing](../assets/viz/fuse2b.png)
 :-------------------------:|:-------------------------:
 *Before fusing* | *After `fuse((schedule0, schedule1), partial=2)` (padded elements in blue)*

 #### Safety
 Is `schedule` safe? Recall that for each value of `i` and `j`, Accera guarantees that the corresponding work in `schedule0` (`C[i,j] += A[i,k0] * B[k0,j]` for all values of `k0`) is executed before the corresponding work in `schedule1` (`C[i,j] = max(C[i,j], 0)`), and this holds regardless of how the fused schedule is transformed. Since these are the only operations that touch `C[i,j]` and the `ReLU` operation is always executed last, this warrants that `schedule` is safe. Therefore, we can focus all of our attention on optimizing performance without worrying about correctness from this point onwards.

-Executing `schedule` as-is is equivalent to executing `schedule0` in its entirety, followed by executing `schedule1`. Suppose we want to interleave the two schedules and perform `relu` immediately after calculating each element of the matrix product. In that case, we reorder the dimensions such that `i` and `j` preceded `f`:
-```python
-schedule.reorder(i, j, f, k0)
-```
-
-![Before reorder](../assets/viz/fuse2a_A.png) | ![After reorder](../assets/viz/fuse2b.png)
-:-------------------------:|:-------------------------:
-*Before `reorder(i, j, f, k0)`* | *After `reorder(i, j, f, k0)`*
-
 The resulting schedule is now equivalent to the following Python code:

 ```python
@@ -254,7 +233,7 @@ for i in range(16):
     for j in range(10):
         # f = 0
         for k0 in range(11):
-           C[i,j] += A[i,k0] * B[k0,j]
+            C[i,j] += A[i,k0] * B[k0,j]
         # f = 1
         C[i,j] = max(C[i,j], 0)
 ```
@@ -307,16 +286,20 @@ The temporary array `C` stores the output of `schedule0`, which is then used as
 ```python
 schedule1.reorder(i1, k1, j1)
 schedule = acc.fuse((schedule0, schedule1), partial=2)
-f, i, j, k0, j1 = schedule.get_indices()
+i, j = schedule.get_fused_indices()
+f = schedule.get_fusing_index()
+k0, j1 = schedule.get_unfused_indices()
+# Alternative way:
+# i, j, f, k0, j1 = schedule.get_indices()
 ```

 ![Before reorder(i1, k1, j1)](../assets/viz/fuse3.png) | ![After reorder(i1, k1, j1)](../assets/viz/fuse3a.png)
 :-------------------------:|:-------------------------:
 *Before `reorder(i1, k1, j1)`* | *After `reorder(i1, k1, j1)`*

-The fused iteration space has a shape of (2, 4, 6, 5, 10). `f` is the fusing dimension, `i` is the result of fusing `i0` and `i1`, and `j` is the result of fusing `j0` and `k1`. On the other hand, `k0` is the unfused dimension from `schedule0`, and `j1` is the unfused dimension from `schedule1`. The slice (0, \*, \*, \*, 0) contains a copy of `schedule0` and the slice (1, \*, \*, 0, \*) contains a copy of `schedule1`. The rest of the iteration space is padded with empty elements.
+The fused iteration space has a shape of (4, 6, 2, 5, 10). `i` is the result of fusing `i0` and `i1`, `j` is the result of fusing `j0` and `k1`, and `f` is the fusing dimension. On the other hand, `k0` is the unfused dimension from `schedule0`, and `j1` is the unfused dimension from `schedule1`. The slice (\*, \*, 0, \*, 0) contains a copy of `schedule0` and the slice (\*, \*, 1, 0, \*) contains a copy of `schedule1`. The rest of the iteration space is padded with empty elements.
-![After fusing](../assets/viz/fuse3b.png)
+![After fusing](../assets/viz/fuse3c.png)

 *After `fuse((schedule0, schedule1), partial=2)` (White elements represent `C += A @ B`; purple elements are `E += C @ D`; blue elements are padding.)*

@@ -325,24 +308,6 @@ Is `schedule` safe? Again, recall that for each value of `i` and `j`, Accera gua
 Initially, the fused schedule is equivalent to the following Python code:
 ```python
-# f = 0
-for i in range(4):
-    for j in range(6):
-        for k0 in range(5):
-            C[i, j] += A[i, k0] * B[k0, j]
-# f = 1
-for i in range(4):
-    for j in range(6):
-        for j1 in range(10):
-            E[i, j1] += C[i, j] * D[j, j1]
-```
-
-We can now manipulate the fused schedule in various ways. For example, we can do all the work to create one element of `C` and then immediately do all the work that uses this element before moving on to the next element.
-```python
-schedule.reorder(i, j, f, k0, j1)
-```
-This schedule is equivalent to the following Python loops:
-```python
 for i in range(4):
     for j in range(6):
         for f in range(2):
@@ -371,10 +336,6 @@ for i in range(4):

 The advantage of this schedule is that only one element of `C` is active at any time in the computation. Accera can reuse the same memory location to store the active element of `C` instead of storing all of `C` in physical memory.

-![After reorder(i, j, f, k0, j1)](../assets/viz/fuse3c.png)
-
-*After `reorder(i, j, f, k0, j1)` (White elements represent `C += A @ B`; purple elements are `E += C @ D`; blue elements are padding.)*
-
 #### Tiling
 As a further optimization, we can compute a 2×3 block of `C`. Do all the work that uses this block and then move on to the next block:
 ```python
diff --git a/docs/Reference/accera.md b/docs/Reference/accera.md
index 2f45af16..75bafae7 100644
--- a/docs/Reference/accera.md
+++ b/docs/Reference/accera.md
@@ -135,6 +135,18 @@ A scheduled (ordered) loop nest with no target-specific implementation details.
 * [`skew`]() `(index, reference_index)`
 * [`split`]() `(index, size)`
 * [`tile`]() `(indices, sizes)`
+* [`get_indices`]() `()`
+
+---
+
+## `class accera.FusedSchedule`
+
+Child class of `class accera.Schedule` created as a result of fusing multiple schedules.
+
+### Methods (in addition to the inherited functions from `class accera.Schedule`)
+* [`get_fusing_index`]() `()`
+* [`get_fused_indices`]() `()`
+* [`get_unfused_indices`]() `()`

 ---
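+For example, an illustrative sketch of the new accessors in use (assuming `schedule0` and `schedule1` are schedules fused on their first two dimensions, as in the manual):
+
+```python
+fused = acc.fuse((schedule0, schedule1), partial=2)
+i, j = fused.get_fused_indices()       # the fused dimensions come first
+f = fused.get_fusing_index()           # then the fusing dimension
+unfused = fused.get_unfused_indices()  # any remaining unfused dimensions
+```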
diff --git a/docs/Reference/classes/Array/Role.md b/docs/Reference/classes/Array/Role.md
deleted file mode 100644
index 608e7ffd..00000000
--- a/docs/Reference/classes/Array/Role.md
+++ /dev/null
@@ -1,18 +0,0 @@
-[//]: # (Project: Accera)
-[//]: # (Version: v1.2)
-
-# Accera v1.2 Reference
-
-## `accera.Array.Role`
-
-type | description
---- | ---
-`accera.Array.Role.CONST` | A constant array (immutable internally scoped) whose contents are known at compile-time.
-`accera.Array.Role.INPUT` | An input array (immutable external-scope).
-`accera.Array.Role.INPUT_OUTPUT` | An input/output array (mutable external-scope).
-`accera.Array.Role.OUTPUT` | An output array (mutable external-scope) which is allocated at runtime.
-`accera.Array.Role.TEMP` | A temporary array (mutable internal-scope).
-
-
-
diff --git a/docs/Reference/classes/Dimension/Role.md b/docs/Reference/classes/Dimension/Role.md
deleted file mode 100644
index e184cbe1..00000000
--- a/docs/Reference/classes/Dimension/Role.md
+++ /dev/null
@@ -1,15 +0,0 @@
-[//]: # (Project: Accera)
-[//]: # (Version: v1.2)
-
-# Accera v1.2 Reference
-
-## `accera.Dimension.Role`
-
-type | description
---- | ---
-`accera.Dimension.Role.INPUT` | An input dimension (immutable and provided as an Accera function argument).
-`accera.Dimension.Role.OUTPUT` | An output dimension (mutable and updated by an Accera function).
-
-
-
diff --git a/docs/Reference/classes/FusedSchedule/get_fused_indices.md b/docs/Reference/classes/FusedSchedule/get_fused_indices.md
new file mode 100644
index 00000000..6bf2217e
--- /dev/null
+++ b/docs/Reference/classes/FusedSchedule/get_fused_indices.md
@@ -0,0 +1,18 @@
+[//]: # (Project: Accera)
+[//]: # (Version: v1.2)
+
+# Accera v1.2 Reference
+
+## `accera.FusedSchedule.get_fused_indices()`
+Gets the fused indices of a fused schedule.
+
+## Returns
+Tuple of `Index`
+
+## Examples
+
+```python
+i, j = fused_schedule.get_fused_indices()
+```
+
+
diff --git a/docs/Reference/classes/FusedSchedule/get_fusing_index.md b/docs/Reference/classes/FusedSchedule/get_fusing_index.md
new file mode 100644
index 00000000..f8d04f00
--- /dev/null
+++ b/docs/Reference/classes/FusedSchedule/get_fusing_index.md
@@ -0,0 +1,18 @@
+[//]: # (Project: Accera)
+[//]: # (Version: v1.2)
+
+# Accera v1.2 Reference
+
+## `accera.FusedSchedule.get_fusing_index()`
+Gets the fusing index of a fused schedule.
+
+## Returns
+Instance of `Index`
+
+## Examples
+
+```python
+f = fused_schedule.get_fusing_index()
+```
+
+
diff --git a/docs/Reference/classes/FusedSchedule/get_unfused_indices.md b/docs/Reference/classes/FusedSchedule/get_unfused_indices.md
new file mode 100644
index 00000000..8d465d4f
--- /dev/null
+++ b/docs/Reference/classes/FusedSchedule/get_unfused_indices.md
@@ -0,0 +1,18 @@
+[//]: # (Project: Accera)
+[//]: # (Version: v1.2)
+
+# Accera v1.2 Reference
+
+## `accera.FusedSchedule.get_unfused_indices()`
+Gets the unfused indices of a fused schedule.
+
+## Returns
+Tuple of `Index`
+
+## Examples
+
+```python
+k, l = fused_schedule.get_unfused_indices()
+```
+
+
diff --git a/docs/Reference/classes/Schedule/get_indices.md b/docs/Reference/classes/Schedule/get_indices.md
new file mode 100644
index 00000000..bf3c615d
--- /dev/null
+++ b/docs/Reference/classes/Schedule/get_indices.md
@@ -0,0 +1,20 @@
+[//]: # (Project: Accera)
+[//]: # (Version: v1.2)
+
+# Accera v1.2 Reference
+
+## `accera.Schedule.get_indices()`
+Gets the iteration space dimensions for a schedule.
+
+## Returns
+Tuple of `Index`
+
+## Examples
+
+Get the iteration space dimensions for a 3-dimensional nest:
+
+```python
+i, j, k = schedule.get_indices()
+```
+
+
diff --git a/docs/Reference/functions/fuse.md b/docs/Reference/functions/fuse.md
index b5633139..d96a4902 100644
--- a/docs/Reference/functions/fuse.md
+++ b/docs/Reference/functions/fuse.md
@@ -17,7 +17,7 @@ argument | description | type/default
 `partial` | The number of dimensions to fuse. If not specified, all dimensions will be fused | non-negative integer

 ## Returns
-The fused `Schedule`
+Instance of `FusedSchedule`

 ## Examples

@@ -26,10 +26,6 @@ Full fusing of same-shaped iteration spaces:
 ```python
 # Fuse all dimensions of schedule0 and schedule1
 schedule = acc.fuse(schedule0, schedule1)
-f, i, j = schedule.get_indices()
-
-# Reorder the indices so that the fused dimension is the innermost
-schedule.reorder(i, j, f)
 ```

 Partial iteration space fusing:
@@ -37,10 +33,6 @@ Partial iteration space fusing:
 ```python
 # Fuse the first two dimensions of schedule0 and schedule1
 schedule = acc.fuse((schedule0, schedule1), partial=2)
-f, i, j, k = schedule.get_indices()
-
-# Reorder the indices to interleave the schedules
-schedule.reorder(i, j, f, k)
 ```

diff --git a/docs/Reference/safety_analysis.md b/docs/Reference/safety_analysis.md
index 88edea57..1f12b273 100644
--- a/docs/Reference/safety_analysis.md
+++ b/docs/Reference/safety_analysis.md
@@ -15,12 +15,12 @@ Fusing is another way to create a schedule (see [Section 4 of the Accera manual]
 ```python
 schedule = acc.fuse((schedule0, schedule1, ...), partial=m)
 ```
-At this point, `schedule` is equivalent to sequentially executing the individual schedules. However, is the fused `schedule` safe? In other words, does `schedule` guarantee the preservation of underlying logic, regardless of the applied transformation?
+At this point, `schedule` is equivalent to sequentially executing the individual schedules for each iteration of the fused dimensions. However, is the fused `schedule` safe? In other words, does `schedule` guarantee the preservation of underlying logic, regardless of the applied transformation?

 The dimensions of `schedule` fall into three categories:
-* *Fusing dimensions*: at first, this category contains a single dimension, the first dimension of `schedule`. However, if this dimension is split, its derived dimensions are added to this category.
-* *Fused dimensions*: at first, this category contains the next *m* dimensions of `schedule`. If any of these dimensions are split, the derived dimensions are also added to this category.
+* *Fused dimensions*: at first, this category contains the first *m* dimensions of `schedule`. If any of these dimensions are split, the derived dimensions are also added to this category.
+* *Fusing dimensions*: at first, this category contains a single dimension, the dimension of `schedule` that immediately follows the fused dimensions. However, if this dimension is split, its derived dimensions are added to this category.
 * *Unfused dimensions*: all the remaining dimensions.

 Note that the individual schedules being fused may have been created by previous fusing operations. The categories above relate to the role of each dimension in the *current* fusing operation.
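+For example, the categories for the fused schedule constructed in the next section are (an illustrative sketch using that example's indices):
+
+```python
+schedule = acc.fuse((schedule0, schedule1), partial=2)
+i, j, f, k0, k1 = schedule.get_indices()
+# fused dimensions:   i, j   (plus any dimensions later split from them)
+# fusing dimension:   f      (plus any dimensions later split from it)
+# unfused dimensions: k0, k1
+```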
@@ -39,7 +39,7 @@ To make the theorem less abstract, we demonstrate how it applies to a simple exa
 i0, j0, k0 = schedule0.get_indices() # redundant operation, included for clarity
 i1, j1, k1 = schedule1.get_indices() # redundant operation, included for clarity
 schedule = acc.fuse((schedule0, schedule1), partial=2)
-f, i, j, k0, k1 = schedule.get_indices()
+i, j, f, k0, k1 = schedule.get_indices()
 ```
 Next, say that we transform `schedule` by tiling dimensions `j` and `k0` to reorder the dimensions as follows:
 ```python
diff --git a/requirements.txt b/requirements.txt
index 274e44b7..67c1481b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ tomlkit>=0.11.1, <0.11.5
 wheel
 pybind11>=2.6.0
 six
-conan
+conan<2.0.0
 lit
 packaging
 pytest