Squashed commit of the following:

commit 1691b1d75b89703542514ab102fa2316a40d0ca4 Author: Mason Remy <masonr@microsoft.com> Date: Thu Feb 23 20:16:51 2023 +0000 Merged PR 3107: Make vectorization happen after inlining and simplification Make vectorization happen after inlining and simplification This change fills out the vectorization passes and removes vectorization from LoopNestToValueFunc. Some bugs were exposed that this also fixes. Since vectorization is now a separate pass, mlir filecheck lit tests can be run more easily. This change adds the initial file with one test, but we should continue expanding this test suite. commit 752615f6351db126e605666c72b309c5ccf436d6 Author: JUBI TANEJA <jubitaneja@microsoft.com> Date: Thu Feb 23 06:06:20 2023 +0000 Merged PR 3108: extend vectorization for masked store case commit ab0e23cb0fd7d9186b228ffe3462263eb4bdc3f0 Author: Mason Remy <masonr@microsoft.com> Date: Wed Feb 22 20:04:51 2023 +0000 Merged PR 3109: Set conan version < 2.0.0 Our infra isn't set up for the new conan 2 behavior, so fix our usage to version 1 until we take the upgrade intentionally commit 2737012b6f9441929accd9a180efe939dfeebf6f Author: Captain Jack Sparrow <ritdas@microsoft.com> Date: Wed Feb 22 06:33:32 2023 +0000 Merged PR 3104: Position fusing dim after the fused dimensions Position fusing dim after the fused dimensions commit 15c45b048b453edea05fbfdde253a9acecb5f96a Author: Chuck Jacobs <cjacobs@microsoft.com> Date: Tue Feb 21 21:55:21 2023 +0000 Merged PR 3096: Add "RelWithDebInfo"-like option to accc This PR adds another option to the `Options` flag for `AcceraProject.generate_and_emit` to keep some debug info (the frame pointers) around when building the Accera project. This can be helpful when trying to interpret perf profiler output.
microsoft · Feb 24, 2023 · 604e745 · 604e745
1 parent 2a6cae4
commit 604e745
Show file tree

Hide file tree

Showing 29 changed files with 661 additions and 396 deletions.
diff --git a/README.md b/README.md
@@ -77,7 +77,7 @@ No installation is required. This will launch a Jupyter notebook with the quicks
     package.add(schedule, args=(A, B, C), base_name="matmul_relu_fusion_naive")
 
     # transform the schedule, add to the package
-    f, i, j, k = schedule.get_indices()
+    i, j, f, k = schedule.get_indices()
     ii, jj = schedule.tile({
         i: 16,
         j: 16

diff --git a/accera/acc-opt/test/vectorization.mlir b/accera/acc-opt/test/vectorization.mlir
@@ -0,0 +1,84 @@
+// RUN: acc-opt --verify-each=false --acc-vectorize %s | FileCheck %s
+
+module @test_accera_vectorization attributes {accv.target_device_features = "-avx512pf,-tsxldtrk,+cx16,+sahf,-tbm,-avx512ifma,-sha,+crc32,-fma4,-vpclmulqdq,-prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-amx-tile,-uintr,-gfni,+popcnt,-widekl,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512er,-avxvnni,-avx512fp16,-avx512vnni,-amx-bf16,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-xsavec,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,-rdseed,-waitpkg,-kl,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,-serialize,-hreset,+invpcid,-avx512cd,+avx,-vaes,-avx512bf16,+cx8,+fma,-rtm,+bmi,-enqcmd,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,+fxsr,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,-amx-int8,+movbe,-avx512vp2intersect,+xsaveopt,-avx512dq,+sse2,-adx,+sse"} {
+    accv.module "test_accera_vectorization"  {
+
+        // Single-op cases:
+        // TODO : implement test cases for these
+        // mlir::memref::AllocaOp
+        // mlir::arith::ConstantOp
+        // mlir::memref::LoadOp sequential
+        // mlir::memref::LoadOp non-sequential
+        // mlir::memref::StoreOp sequential
+        // mlir::memref::StoreOp non-sequential
+        // mlir::affine::AffineLoadOp sequential
+        // mlir::affine::AffineLoadOp non-sequential
+        // mlir::affine::AffineStoreOp sequential
+        // mlir::affine::AffineStoreOp non-sequential
+        // mlir::SelectOp
+        // mlir::arith::ShLIOp
+        // mlir::arith::FPToSIOp
+        // mlir::arith::ExtSIOp
+        // mlir::math::AbsOp
+        // mlir::math::ExpOp
+        // value::CastOp
+        // value::RoundOp
+        // value::BitCastOp
+        // value::BinOp
+        // value::CmpOp
+        // value::ReferenceGlobalOp
+
+        // Special cases:
+        // TODO : implement test cases for these
+        // horizontal reduction
+        // multi-loop sequential cast
+        // two-row interleaved pack
+        // vpmaddwd avx 2
+        // vpmaddwd avx 512
+        // masked load
+        // two-row interleaved masked load and pack
+
+
+        // CHECK-LABEL: builtin.func nested @test_view_split_dim_interleaved_pack
+        builtin.func nested @test_view_split_dim_interleaved_pack(%arg0: memref<1885x256xui8> loc(unknown), %arg1: memref<483840xui8> loc(unknown)) attributes {accv.dyn_arg_size_refs = [[-1, -1], [-1]], accv.usages = [1 : i8, 0 : i8], args_name = ["", ""], args_size = ["1885*256", "483840"], args_symbol = ["args_symbol_name_0", "args_symbol_name_1"], exec_target = 0 : i64} {
+            %c1024 = arith.constant 1024 : index
+            %c1 = arith.constant 1 : index
+            %c482816 = arith.constant 482816 : index
+            %c98304 = arith.constant 98304 : index
+            %c2 = arith.constant 2 : index
+            %c16 = arith.constant 16 : index
+            %c192 = arith.constant 192 : index
+            affine.for %arg2 = 0 to 1536 step 384 {
+                %0 = "accv.view"(%arg1, %c482816, %c1024, %c1) {operand_segment_sizes = dense<1> : vector<4xi32>} : (memref<483840xui8>, index, index, index) -> memref<482816xui8, affine_map<(d0) -> (d0 + 1024)>>
+                %1 = affine.apply affine_map<(d0) -> (d0 * 256)>(%arg2)
+                %2 = "accv.view"(%0, %c98304, %1, %c1) {operand_segment_sizes = dense<1> : vector<4xi32>} : (memref<482816xui8, affine_map<(d0) -> (d0 + 1024)>>, index, index, index) -> memref<98304xui8, affine_map<(d0)[s0] -> (d0 + s0 + 1024)>>
+                %3 = "accv.split_dim"(%2, %c2) {dim = 0 : i64} : (memref<98304xui8, affine_map<(d0)[s0] -> (d0 + s0 + 1024)>>, index) -> memref<49152x2xui8, affine_map<(d0, d1)[s0] -> (d0 * 2 + d1 + s0 + 1024)>>
+                %4 = "accv.split_dim"(%3, %c16) {dim = 0 : i64} : (memref<49152x2xui8, affine_map<(d0, d1)[s0] -> (d0 * 2 + d1 + s0 + 1024)>>, index) -> memref<3072x16x2xui8, affine_map<(d0, d1, d2)[s0] -> ((d0 * 16 + d1) * 2 + d2 + s0 + 1024)>>
+                %5 = "accv.split_dim"(%4, %c192) {dim = 0 : i64} : (memref<3072x16x2xui8, affine_map<(d0, d1, d2)[s0] -> ((d0 * 16 + d1) * 2 + d2 + s0 + 1024)>>, index) -> memref<16x192x16x2xui8, affine_map<(d0, d1, d2, d3)[s0] -> (((d0 * 192 + d1) * 16 + d2) * 2 + d3 + s0 + 1024)>>
+                // CHECK: affine.for %arg3 = 0 to 256 step 16 {
+                affine.for %arg3 = 0 to 256 step 16 {
+                    // CHECK-NEXT: affine.for %arg4 = 0 to 384 step 2 {
+                    affine.for %arg4 = 0 to 384 step 2 {
+                        affine.for %arg5 = 0 to 16 {
+                            affine.for %arg6 = 0 to 2 {
+                                %8 = affine.load %arg0[%arg6 + %arg4 + symbol(%arg2), %arg5 + %arg3] : memref<1885x256xui8>
+                                affine.store %8, %5[symbol(%arg3) floordiv 16, symbol(%arg4) floordiv 2, %arg5, %arg6] : memref<16x192x16x2xui8, affine_map<(d0, d1, d2, d3)[s0] -> (((d0 * 192 + d1) * 16 + d2) * 2 + d3 + s0 + 1024)>>
+                            } {beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (2)>, index = #accln<"index{j_i,245}">, kernels = ["_cache_fill"], operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, subdomainIndexOrder = [#accln<"index{i,240}">, #accln<"index{j,241}">], subdomainSize = [16, 2]}
+                        } {accxp_vectorizationInfo = #accxp<"vectorizationinfo{32,16,0}">, beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (16)>, index = #accln<"index{i_i,243}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, scheduledIndex = #accln<"index{i_i,243}">, subdomainIndexOrder = [#accln<"index{i,240}">, #accln<"index{j,241}">], subdomainSize = [16, 2]}
+                        // CHECK-NEXT: %6 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [482560], strides: [1] : memref<1885x256xui8> to memref<482560xui8>
+                        // CHECK-NEXT: %7 = affine.apply #map6(%arg4, %c0, %arg3)[%arg2]
+                        // CHECK-NEXT: %8 = vector.load %6[%7] : memref<482560xui8>, vector<16xui8>
+                        // CHECK-NEXT: %9 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [482560], strides: [1] : memref<1885x256xui8> to memref<482560xui8>
+                        // CHECK-NEXT: %10 = affine.apply #map7(%arg4, %c0, %arg3)[%arg2]
+                        // CHECK-NEXT: %11 = vector.load %9[%10] : memref<482560xui8>, vector<16xui8>
+                        // CHECK-NEXT: %12 = vector.shuffle %8, %11 [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] : vector<16xui8>, vector<16xui8>
+                        // CHECK-NEXT: %13 = memref.reinterpret_cast %5 to offset: [0], sizes: [98304], strides: [1] : memref<16x192x16x2xui8, #map5> to memref<98304xui8>
+                        // CHECK-NEXT: %14 = affine.apply #map8(%c0, %c0, %arg2)[%arg3, %arg4]
+                        // CHECK-NEXT: vector.store %12, %13[%14] : memref<98304xui8>, vector<32xui8>
+                    } {beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (384)>, index = #accln<"index{i_i_o,257}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, subdomainIndexOrder = [#accln<"index{i,254}">], subdomainSize = [-1]}
+                } {beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (256)>, index = #accln<"index{i_i_o,262}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, subdomainIndexOrder = [#accln<"index{i,259}">], subdomainSize = [-1]}
+            } {beginMap = affine_map<() -> (0)>, endMap = affine_map<() -> (1536)>, index = #accln<"index{i_o,268}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, subdomainIndexOrder = [#accln<"index{i,266}">, #accln<"index{j,267}">], subdomainSize = [1885, 256]}
+            return
+        }
+    }
+}
diff --git a/accera/accc/accc.py b/accera/accc/accc.py
@@ -105,6 +105,8 @@ def bstr(val):
 
 OPT_DISABLE_LOOP_UNROLLING_ARGS = ["--disable-loop-unrolling"]
 
+LLVM_KEEP_DEBUG_INFO_ARGS = ["--frame-pointer=all"]
+
 LLVM_TOOLING_OPTS = {
     SystemTarget.HOST.value: ["-O3", "-mcpu=native"],
     SystemTarget.RPI4.value: [
@@ -137,9 +139,16 @@ def bstr(val):
 DEFAULT_LLC_ARGS = DEFAULT_LLVM_TOOLING_OPTS + ["-relocation-model=pic"]
 
 class Options(Flag):
-    NONE = auto() # (enable auto unroll | low precision float)
+    NONE = auto() # (enable auto unroll | low precision float | no debug info)
     DISABLE_AUTO_UNROLL = auto()
     HIGH_PRECISION_FLOATING_POINT_OPS = auto()
+    KEEP_DEBUG_INFO = auto()
+
+def _get_common_debug_info_options_args(options: Options):  # extra LLVM args appended to both the opt and llc argument lists
+    if options & Options.KEEP_DEBUG_INFO:  # flag set: keep frame pointers (--frame-pointer=all)
+        return LLVM_KEEP_DEBUG_INFO_ARGS
+    else:
+        return []  # flag not set: contribute no extra arguments
 
 def _get_common_fp_options_args(options: Options):
     if options & Options.HIGH_PRECISION_FLOATING_POINT_OPS:
@@ -154,13 +163,15 @@ def _get_options_opt_args(options: Options):
         args += OPT_DISABLE_LOOP_UNROLLING_ARGS
 
     args += _get_common_fp_options_args(options)
+    args += _get_common_debug_info_options_args(options)
 
     return args
 
 def _get_options_llc_args(options: Options):
     args = []
 
     args += _get_common_fp_options_args(options)
+    args += _get_common_debug_info_options_args(options)
 
     return args
 

diff --git a/accera/ir/include/IRUtil.h b/accera/ir/include/IRUtil.h
@@ -461,5 +461,7 @@ namespace util
 
     bool IsTerminalOp(mlir::Operation* op);
 
+    std::vector<mlir::Value> GetDynamicOffsetSymbols(mlir::Value val);
+
 } // namespace util
 } // namespace accera::ir
diff --git a/accera/ir/src/IRUtil.cpp b/accera/ir/src/IRUtil.cpp
@@ -1304,35 +1304,55 @@ namespace util
             return shape;
         }
 
-        // Currently this utility only supports dynamic memrefs that are function arguments with dimension size handles which are
-        // also function arguments
-        if (!memref.isa<mlir::BlockArgument>())
+        // Currently this utility only supports dynamic memrefs that are alloc ops with shape args or
+        // function arguments with dimension size handles which are also function arguments
+        if (auto allocOp = memref.getDefiningOp<ir::value::AllocOp>())
         {
-            throw LogicException(LogicExceptionErrors::notImplemented, "Currently only supports function arguments for dynamic memref shape resolution");
+            // Assumes the operands to AllocOp are ordered in the logical shape order where they're needed
+            unsigned currentOperandIndex = 0;
+            auto allocOperands = allocOp.operands();
+            for (unsigned dimIdx = 0; dimIdx < memrefType.getRank(); ++dimIdx)
+            {
+                if (memrefType.isDynamicDim(dimIdx))
+                {
+                    shape.push_back(allocOperands[currentOperandIndex++]);
+                }
+                else
+                {
+                    shape.push_back(memrefType.getDimSize(dimIdx));
+                }
+            }
         }
-        auto memrefBlockArg = memref.cast<mlir::BlockArgument>();
-        auto memrefFuncArgIdx = memrefBlockArg.getArgNumber();
-        auto blockParentOp = memrefBlockArg.getOwner()->getParentOp();
+        else if (memref.isa<mlir::BlockArgument>())
+        {
+            auto memrefBlockArg = memref.cast<mlir::BlockArgument>();
+            auto memrefFuncArgIdx = memrefBlockArg.getArgNumber();
+            auto blockParentOp = memrefBlockArg.getOwner()->getParentOp();
 
-        auto allFuncArgs = memrefBlockArg.getOwner()->getArguments();
-        std::vector<mlir::Type> allFuncArgTypes;
-        allFuncArgTypes.reserve(allFuncArgs.size());
-        std::transform(allFuncArgs.begin(), allFuncArgs.end(), std::back_inserter(allFuncArgTypes), [](mlir::Value val) { return val.getType(); });
+            auto allFuncArgs = memrefBlockArg.getOwner()->getArguments();
+            std::vector<mlir::Type> allFuncArgTypes;
+            allFuncArgTypes.reserve(allFuncArgs.size());
+            std::transform(allFuncArgs.begin(), allFuncArgs.end(), std::back_inserter(allFuncArgTypes), [](mlir::Value val) { return val.getType(); });
 
-        std::vector<std::vector<int64_t>> dynamicArgSizeRefs = ParseDynamicArgSizeReferences(blockParentOp, allFuncArgTypes);
+            std::vector<std::vector<int64_t>> dynamicArgSizeRefs = ParseDynamicArgSizeReferences(blockParentOp, allFuncArgTypes);
 
-        for (unsigned dimIdx = 0; dimIdx < memrefType.getRank(); ++dimIdx)
-        {
-            if (memrefType.isDynamicDim(dimIdx))
-            {
-                auto shapeRefArgIdx = dynamicArgSizeRefs[memrefFuncArgIdx][dimIdx];
-                shape.push_back(allFuncArgs[shapeRefArgIdx]);
-            }
-            else
+            for (unsigned dimIdx = 0; dimIdx < memrefType.getRank(); ++dimIdx)
             {
-                shape.push_back(memrefType.getDimSize(dimIdx));
+                if (memrefType.isDynamicDim(dimIdx))
+                {
+                    auto shapeRefArgIdx = dynamicArgSizeRefs[memrefFuncArgIdx][dimIdx];
+                    shape.push_back(allFuncArgs[shapeRefArgIdx]);
+                }
+                else
+                {
+                    shape.push_back(memrefType.getDimSize(dimIdx));
+                }
             }
         }
+        else
+        {
+            throw LogicException(LogicExceptionErrors::notImplemented, "Currently only supports local allocations or function arguments for dynamic memref shape resolution");
+        }
         return shape;
     }
 
@@ -1495,5 +1515,30 @@ namespace util
         return op->getNumResults() == 0;
     }
 
+    std::vector<mlir::Value> GetDynamicOffsetSymbols(mlir::Value val)
+    {
+        std::vector<mlir::Value> offsetSymbols;
+        // Collect the non-constant (dynamic) offset operands of the view that val is derived from; the result stays empty if val has no defining op or no ViewOp is reached
+        if (auto memrefSrcOp = val.getDefiningOp())
+        {
+            // Currently only handles a chain of zero or more value::SplitDimOp ops on top of a value::ViewOp: walk past the split_dims, then inspect that one view
+            while (auto splitDimOp = mlir::dyn_cast_or_null<ir::value::SplitDimOp>(memrefSrcOp))
+            {
+                memrefSrcOp = splitDimOp.getViewSource().getDefiningOp();
+            }
+            if (auto viewOp = mlir::dyn_cast_or_null<ir::value::ViewOp>(memrefSrcOp))
+            {
+                for (auto offset : viewOp.offsets())
+                {
+                    if (!offset.getDefiningOp<mlir::arith::ConstantOp>()) // anything not produced by an arith constant is treated as dynamic
+                    {
+                        offsetSymbols.push_back(offset);
+                    }
+                }
+            }
+        }
+        return offsetSymbols;
+    }
+
 } // namespace util
 } // namespace accera::ir
diff --git a/accera/ir/src/value/ValueCanonicalization.cpp b/accera/ir/src/value/ValueCanonicalization.cpp
@@ -245,6 +245,55 @@ mlir::Value constantBuildHelper<mlir::arith::ConstantFloatOp>(mlir::OpBuilder& b
 
 struct ValueBinOpSimplification : public mlir::OpRewritePattern<v::BinOp>
 {
+    mlir::Value handlePartiallyConstantBoolOp(mlir::PatternRewriter& rewriter, v::BinaryOpPredicate pred, mlir::Value lhs, mlir::Value rhs) const
+    {
+        auto lhsCast = lhs.getDefiningOp<mlir::arith::ConstantIntOp>();
+        auto rhsCast = rhs.getDefiningOp<mlir::arith::ConstantIntOp>();
+        if ((lhsCast == nullptr && rhsCast == nullptr) ||
+            (lhsCast != nullptr && rhsCast != nullptr))
+        {
+            return nullptr;
+        }
+        // Exactly one of lhsCast / rhsCast is non-null here: this helper only folds ops with one constant-int operand and one other operand
+        auto constOperand = lhsCast == nullptr ? rhsCast : lhsCast; // the constant side
+        auto otherOperand = lhsCast == nullptr ? lhs : rhs; // the remaining (non-constant-int) side
+        auto type = constOperand.getType();
+        if (!type.isa<mlir::IntegerType>() || type.cast<mlir::IntegerType>().getWidth() != 1) // only i1 (boolean) constants are simplified
+        {
+            return nullptr;
+        }
+
+        auto boolResult = constOperand.value() != 0; // truth value of the constant operand
+        auto loc = lhs.getLoc();
+        switch (pred)
+        {
+        case v::BinaryOpPredicate::LOGICAL_AND:
+            if (boolResult)
+            {
+                // (arg AND true) == (arg)
+                return otherOperand;
+            }
+            else
+            {
+                // (arg AND false) == (false)
+                return rewriter.create<mlir::arith::ConstantIntOp>(loc, static_cast<int64_t>(boolResult), 1 /* bitwidth = i1 for boolean values */);
+            }
+        case v::BinaryOpPredicate::LOGICAL_OR:
+            if (boolResult)
+            {
+                // (arg OR true) == (true)
+                return rewriter.create<mlir::arith::ConstantIntOp>(loc, static_cast<int64_t>(boolResult), 1 /* bitwidth = i1 for boolean values */);
+            }
+            else
+            {
+                // (arg OR false) == (arg)
+                return otherOperand;
+            }
+        default:
+            return nullptr; // any other predicate: no simplification, signalled by a null Value
+        }
+    }
+
     using OpRewritePattern::OpRewritePattern;
 
     mlir::arith::ConstantOp handleConstantBoolOp(mlir::PatternRewriter& rewriter, v::BinaryOpPredicate pred, mlir::arith::ConstantOp lhs, mlir::arith::ConstantOp rhs) const
@@ -314,6 +363,7 @@ struct ValueBinOpSimplification : public mlir::OpRewritePattern<v::BinOp>
         // TODO : if we lowered BinOps to MLIR earlier than other value dialect ops, the built-in arithmetic canonicalizations and lowerings would handle this
         auto lhs = op.lhs();
         auto rhs = op.rhs();
+        auto pred = op.getPredicate();
         auto resultElementType = accera::ir::util::GetElementType(op.result().getType());
         auto lhsElementType = accera::ir::util::GetElementType(lhs.getType());
         auto rhsElementType = accera::ir::util::GetElementType(rhs.getType());
@@ -327,28 +377,33 @@ struct ValueBinOpSimplification : public mlir::OpRewritePattern<v::BinOp>
         {
             if (auto rhsConstantOp = rhs.getDefiningOp<mlir::arith::ConstantOp>())
             {
-                if (mlir::Value intOp = handleConstantOp<mlir::arith::ConstantIntOp>(rewriter, op.getPredicate(), lhsConstantOp, rhsConstantOp))
+                if (mlir::Value intOp = handleConstantOp<mlir::arith::ConstantIntOp>(rewriter, pred, lhsConstantOp, rhsConstantOp))
                 {
                     rewriter.replaceOp(op, { intOp });
                     return mlir::success();
                 }
-                else if (mlir::Value indexOp = handleConstantOp<mlir::arith::ConstantIndexOp>(rewriter, op.getPredicate(), lhsConstantOp, rhsConstantOp))
+                else if (mlir::Value indexOp = handleConstantOp<mlir::arith::ConstantIndexOp>(rewriter, pred, lhsConstantOp, rhsConstantOp))
                 {
                     rewriter.replaceOp(op, { indexOp });
                     return mlir::success();
                 }
-                else if (mlir::Value floatOp = handleConstantOp<mlir::arith::ConstantFloatOp>(rewriter, op.getPredicate(), lhsConstantOp, rhsConstantOp))
+                else if (mlir::Value floatOp = handleConstantOp<mlir::arith::ConstantFloatOp>(rewriter, pred, lhsConstantOp, rhsConstantOp))
                 {
                     rewriter.replaceOp(op, { floatOp });
                     return mlir::success();
                 }
-                else if (mlir::Value boolOp = handleConstantBoolOp(rewriter, op.getPredicate(), lhsConstantOp, rhsConstantOp))
+                else if (mlir::Value boolOp = handleConstantBoolOp(rewriter, pred, lhsConstantOp, rhsConstantOp))
                 {
                     rewriter.replaceOp(op, { boolOp });
                     return mlir::success();
                 }
             }
         }
+        if (auto partiallyConstReplaceVal = handlePartiallyConstantBoolOp(rewriter, pred, lhs, rhs))
+        {
+            rewriter.replaceOp(op, { partiallyConstReplaceVal });
+            return mlir::success();
+        }
         return mlir::failure();
     }
 

diff --git a/accera/python/accera/lang/Nest.py b/accera/python/accera/lang/Nest.py
@@ -84,10 +84,7 @@ def get_indices(self) -> Union[List[LoopIndex], LoopIndex]:
             else:
                 self._shape[0][1].name = names[0]
 
-        if len(self._shape) > 1:
-            return [idx for _, idx in self._shape]
-        else:
-            return self._shape[0][1]
+        return [idx for _, idx in self._shape]
 
     def iteration_logic(self, logic: Callable = None, predicate=None, placement=None):
         """Adds iteration logic to the nest